X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/8e91d6e5b9c82efb626869d6618b240d2ae8ad05..a1a9ee0a7240087e202a7855e470573de0e59c09:/math/mpx-mul4-amd64-sse2.S

diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 8b8cd414..bd8ff2f9 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -25,15 +25,13 @@
 /// MA 02111-1307, USA.
 
 ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-///--------------------------------------------------------------------------
-/// Prologue.
-
 	.arch	pentium4
+
 	.text
 
 ///--------------------------------------------------------------------------
@@ -96,7 +94,7 @@
 .macro	mulcore	r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
 	// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
 	// of the product in registers D0, D1, D2, D3.
-	pshufd	\d0, \r, SHUF(3, \i, 3, \i) // (r_i, ?; r_i, ?)
+	pshufd	\d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
   .ifnes "\d1", "nil"
 	movdqa	\d1, \slo		// (s'_0, s'_1; s''_0, s''_1)
   .endif
@@ -163,7 +161,7 @@
 	// lane 0 or 1 of D; the high two lanes of D are clobbered.  On
 	// completion, XMM3 is clobbered.  If CC is `nil', then the
 	// contribution which would have been added to it is left in C.
-	pshufd	xmm3, \c, SHUF(2, 3, 3, 3) // (?, ?; ?, t = c'' mod B)
+	pshufd	xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
 	psrldq	xmm3, 12		// (t, 0; 0, 0) = (t; 0)
 	pslldq	xmm3, 2			// (t b; 0)
 	paddq	\c, xmm3		// (c' + t b; c'')
@@ -209,11 +207,11 @@
 	punpcklwd \c, \z		// (c'_0, c''_0; c'_1, c''_1)
 	punpckhwd \d, \z		// (c'_2, c''_2; c'_3, c''_3)
   .endif
-	pshufd	\a, \a, SHUF(3, 1, 2, 0) // (a'_0, a'_1; a''_0, a''_1)
-	pshufd	\b, \b, SHUF(3, 1, 2, 0) // (a'_2, a'_3; a''_2, a''_3)
+	pshufd	\a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
+	pshufd	\b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
   .ifnes "\c", "nil"
-	pshufd	\c, \c, SHUF(3, 1, 2, 0) // (c'_0, c'_1; c''_0, c''_1)
-	pshufd	\d, \d, SHUF(3, 1, 2, 0) // (c'_2, c'_3; c''_2, c''_3)
+	pshufd	\c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
+	pshufd	\d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
   .endif
 .endm
 
@@ -321,7 +319,6 @@ INTFUNC(carryprop)
 	movdqu	[rdi], xmm0
 
 	ret
-
 ENDFUNC
 
 INTFUNC(dmul4)
@@ -359,7 +356,6 @@ INTFUNC(dmul4)
 	movdqu	[rdi], xmm6
 
 	ret
-
 ENDFUNC
 
 INTFUNC(dmla4)
@@ -400,7 +396,6 @@ INTFUNC(dmla4)
 	movdqu	[rdi], xmm6
 
 	ret
-
 ENDFUNC
 
 INTFUNC(mul4zc)
@@ -431,7 +426,6 @@ INTFUNC(mul4zc)
 	movdqu	[rdi], xmm6
 
 	ret
-
 ENDFUNC
 
 INTFUNC(mul4)
@@ -464,7 +458,6 @@ INTFUNC(mul4)
 	movdqu	[rdi], xmm6
 
 	ret
-
 ENDFUNC
 
 INTFUNC(mla4zc)
@@ -500,7 +493,6 @@ INTFUNC(mla4zc)
 	movdqu	[rdi], xmm6
 
 	ret
-
 ENDFUNC
 
 INTFUNC(mla4)
@@ -535,7 +527,6 @@ INTFUNC(mla4)
 	movdqu	[rdi], xmm6
 
 	ret
-
 ENDFUNC
 
 INTFUNC(mmul4)
@@ -559,7 +550,6 @@ INTFUNC(mmul4)
 	mulcore	xmm4, 0,   xmm8,  xmm9,  xmm12, xmm13, xmm14, xmm15
 	propout	xmm7, lo,		 xmm12, xmm13
 	jmp	5f
-
 ENDFUNC
 
 INTFUNC(mmla4)
@@ -577,10 +567,10 @@ INTFUNC(mmla4)
 	movdqu	xmm4, [rax]
 #if ABI_WIN
 	stalloc	48 + 8			// space for the carries
-#  define STKTMP(i) [rsp + i]
+#  define STKTMP(i) [SP + i]
 #endif
 #if ABI_SYSV
-#  define STKTMP(i) [rsp + i - 48 - 8]	// use red zone
+#  define STKTMP(i) [SP + i - 48 - 8]	// use red zone
 #endif
   endprologue
 
@@ -746,7 +736,6 @@ INTFUNC(mont4)
 	// And, with that, we're done.
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 ///--------------------------------------------------------------------------
@@ -785,7 +774,6 @@ FUNC(mpx_umul4_amd64_sse2)
   endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
@@ -813,8 +801,7 @@ FUNC(mpx_umul4_amd64_sse2)
   endprologue
 
 	mov	rdi, DV
-	mov	BVL, [rsp + 224]
-
+	mov	BVL, [SP + 224]
 #endif
 
 	// Prepare for the first iteration.
@@ -880,7 +867,6 @@ FUNC(mpx_umul4_amd64_sse2)
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6,    0
 	rstrxmm	xmm7,   16
 	rstrxmm	xmm8,   32
@@ -895,7 +881,6 @@ FUNC(mpx_umul4_amd64_sse2)
 	stfree	160 + 8
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -948,7 +933,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
   endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
@@ -980,9 +964,8 @@ FUNC(mpxmont_mul4_amd64_sse2)
   endprologue
 
 	mov	rdi, DV
-	mov	N, [rsp + 224]
-	mov	MI, [rsp + 232]
-
+	mov	N, [SP + 224]
+	mov	MI, [SP + 232]
 #endif
 
 	// Establish the expanded operands.
@@ -1064,7 +1047,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6,    0
 	rstrxmm	xmm7,   16
 	rstrxmm	xmm8,   32
@@ -1080,7 +1062,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
 	popreg	r12
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -1136,7 +1117,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	// c			rcx	r9
 
 #if ABI_SYSV
-
 #  define DVL rax
 #  define DVL4 rsi
 #  define MI r8
@@ -1151,11 +1131,9 @@ FUNC(mpxmont_redc4_amd64_sse2)
   endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
-
 #  define DVL rax
 #  define DVL4 rdx
 #  define MI r10
@@ -1185,8 +1163,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
   endprologue
 
 	mov	rdi, DV
-	mov	MI, [rsp + 224]
-
+	mov	MI, [SP + 224]
 #endif
 
 	// Establish the expanded operands and the blocks-of-4 dv limit.
@@ -1269,7 +1246,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6,    0
 	rstrxmm	xmm7,   16
 	rstrxmm	xmm8,   32
@@ -1285,7 +1261,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	popreg	r12
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -1329,9 +1304,9 @@ ENDFUNC
 #  define ARG6 STKARG(2)
 #  define ARG7 STKARG(3)
 #  define ARG8 STKARG(4)
-#  define STKARG_OFFSET 40
+#  define STKARG_OFFSET 224
 #endif
-#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
+#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
 
 //		  sysv				win
 //		  dmul  smul  mmul  mont	dmul  smul  mmul  mont
@@ -1386,7 +1361,7 @@ ENDFUNC
 	mov	rbx, r8
 	movdqu	xmm8, [r9]
 	movdqu	xmm10, [rax]
-	mov	r8, STKARG(1)
+	mov	r8d, STKARG(1)
 	mov	r9, STKARG(2)
 	mov	r10, rdx
 	mov	r11, rcx
@@ -1395,7 +1370,7 @@ ENDFUNC
   .ifeqs "\mode", "mont"
 	mov	rbx, rcx
 	movdqu	xmm8, [r8]
-	mov	r8, r9
+	mov	r8d, r9d
 	mov	r9, STKARG(0)
 	mov	r10, rdx
 	mov	rcx, rsi
@@ -1423,16 +1398,16 @@ ENDFUNC
 	mov	rbx, r9
 	movdqu	xmm8, [r10]
 	movdqu	xmm10, [r11]
-	mov	r8, STKARG(2)
-	mov	r9, STKARG(3)
 	mov	r11, r8
+	mov	r8d, STKARG(2)
+	mov	r9, STKARG(3)
   .endif
   .ifeqs "\mode", "smul"
 	mov	rdi, rcx
 	mov	rcx, rdx
 	mov	rbx, r8
 	movdqu	xmm10, [r9]
-	mov	r8, STKARG(0)
+	mov	r8d, STKARG(0)
 	mov	r9, STKARG(1)
   .endif
   .ifeqs "\mode", "mmul"
@@ -1443,10 +1418,10 @@ ENDFUNC
 	mov	rbx, STKARG(0)
 	movdqu	xmm8, [r10]
 	movdqu	xmm10, [r11]
-	mov	r8, STKARG(3)
-	mov	r9, STKARG(4)
 	mov	r10, r8
 	mov	r11, r9
+	mov	r8d, STKARG(3)
+	mov	r9, STKARG(4)
   .endif
   .ifeqs "\mode", "mont"
 	mov	r10, STKARG(0)
@@ -1454,9 +1429,9 @@ ENDFUNC
 	mov	rcx, rdx
 	mov	rbx, r9
 	movdqu	xmm8, [r10]
-	mov	r8, STKARG(1)
-	mov	r9, STKARG(2)
 	mov	r10, r8
+	mov	r8d, STKARG(1)
+	mov	r9, STKARG(2)
   .endif
 #endif
 
@@ -1550,6 +1525,16 @@ FUNC(test_mul4)
 	testepilogue
 ENDFUNC
 
+FUNC(test_mul4zc)			// test wrapper: runs the internal `mul4zc' routine inside the shared test scaffolding
+	testprologue smul		// `smul' argument mode, the same one test_mla4 uses
+	testldcarry			// NOTE(review): presumably loads the incoming carries -- confirm against the macro
+	testtop	nil			// `nil' operand here, whereas test_mmul4 passes r11
+	call	mul4zc			// routine under test; NOTE(review): `zc' presumably = zero carry-in -- confirm
+	testtail			// NOTE(review): presumably closes what testtop opened -- confirm against the macro
+	testcarryout			// NOTE(review): presumably writes the resulting carries back -- confirm
+	testepilogue			// scaffolding teardown; macro is defined outside this hunk
+ENDFUNC
+
 FUNC(test_mla4)
 	testprologue smul
 	testldcarry
@@ -1560,6 +1545,16 @@ FUNC(test_mla4)
 	testepilogue
 ENDFUNC
 
+FUNC(test_mla4zc)			// test wrapper: runs the internal `mla4zc' routine inside the shared test scaffolding
+	testprologue smul		// `smul' argument mode, the same one test_mla4 uses
+	testldcarry			// NOTE(review): presumably loads the incoming carries -- confirm against the macro
+	testtop	nil			// `nil' operand here, whereas test_mmul4 passes r11
+	call	mla4zc			// routine under test; NOTE(review): `zc' presumably = zero carry-in -- confirm
+	testtail			// NOTE(review): presumably closes what testtop opened -- confirm against the macro
+	testcarryout			// NOTE(review): presumably writes the resulting carries back -- confirm
+	testepilogue			// scaffolding teardown; macro is defined outside this hunk
+ENDFUNC
+
 FUNC(test_mmul4)
 	testprologue mmul
 	testtop	r11