X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/a117c06f5ee62cbe7812769703eada01843f76ca..a1a9ee0a7240087e202a7855e470573de0e59c09:/math/mpx-mul4-amd64-sse2.S

diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S
index 64460ca9..bd8ff2f9 100644
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -25,15 +25,13 @@
 /// MA 02111-1307, USA.
 
 ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-///--------------------------------------------------------------------------
-/// Prologue.
-
 	.arch	pentium4
+
 	.text
 
 ///--------------------------------------------------------------------------
@@ -321,7 +319,6 @@ INTFUNC(carryprop)
 
 	movdqu	[rdi], xmm0
 	ret
-
 ENDFUNC
 
 INTFUNC(dmul4)
@@ -359,7 +356,6 @@ INTFUNC(dmul4)
 
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(dmla4)
@@ -400,7 +396,6 @@ INTFUNC(dmla4)
 
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mul4zc)
@@ -431,7 +426,6 @@ INTFUNC(mul4zc)
 
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mul4)
@@ -464,7 +458,6 @@ INTFUNC(mul4)
 
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mla4zc)
@@ -500,7 +493,6 @@ INTFUNC(mla4zc)
 
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mla4)
@@ -535,7 +527,6 @@ INTFUNC(mla4)
 
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 INTFUNC(mmul4)
@@ -559,7 +550,6 @@ INTFUNC(mmul4)
 	mulcore	xmm4, 0,   xmm8, xmm9, xmm12, xmm13, xmm14, xmm15
 	propout	xmm7, lo,  xmm12, xmm13
 	jmp	5f
-
 ENDFUNC
 
 INTFUNC(mmla4)
@@ -577,10 +567,10 @@ INTFUNC(mmla4)
 	movdqu	xmm4, [rax]
 #if ABI_WIN
 	stalloc	48 + 8			// space for the carries
-#  define STKTMP(i) [rsp + i]
+#  define STKTMP(i) [SP + i]
 #endif
 #if ABI_SYSV
-#  define STKTMP(i) [rsp + i - 48 - 8]	// use red zone
+#  define STKTMP(i) [SP + i - 48 - 8]	// use red zone
 #endif
 
 	endprologue
@@ -746,7 +736,6 @@ INTFUNC(mont4)
 	// And, with that, we're done.
 	movdqu	[rdi], xmm6
 	ret
-
 ENDFUNC
 
 ///--------------------------------------------------------------------------
@@ -785,7 +774,6 @@ FUNC(mpx_umul4_amd64_sse2)
 	endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
@@ -813,8 +801,7 @@ FUNC(mpx_umul4_amd64_sse2)
 	endprologue
 
 	mov	rdi, DV
-	mov	BVL, [rsp + 224]
-
+	mov	BVL, [SP + 224]
 #endif
 
 	// Prepare for the first iteration.
@@ -880,7 +867,6 @@
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6, 0
 	rstrxmm	xmm7, 16
 	rstrxmm	xmm8, 32
@@ -895,7 +881,6 @@
 	stfree	160 + 8
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -948,7 +933,6 @@ FUNC(mpxmont_mul4_amd64_sse2)
 	endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
@@ -980,9 +964,8 @@ FUNC(mpxmont_mul4_amd64_sse2)
 	endprologue
 
 	mov	rdi, DV
-	mov	N, [rsp + 224]
-	mov	MI, [rsp + 232]
-
+	mov	N, [SP + 224]
+	mov	MI, [SP + 232]
 #endif
 
 	// Establish the expanded operands.
@@ -1064,7 +1047,6 @@
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6, 0
 	rstrxmm	xmm7, 16
 	rstrxmm	xmm8, 32
@@ -1080,7 +1062,6 @@
 	popreg	r12
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -1136,7 +1117,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
 	// c		rcx			r9
 
 #if ABI_SYSV
-
 #  define DVL	rax
 #  define DVL4	rsi
 #  define MI	r8
@@ -1151,11 +1131,9 @@
 	endprologue
 
 	mov	DV, rdi
-
 #endif
 
 #if ABI_WIN
-
 #  define DVL	rax
 #  define DVL4	rdx
 #  define MI	r10
@@ -1185,8 +1163,7 @@
 	endprologue
 
 	mov	rdi, DV
-	mov	MI, [rsp + 224]
-
+	mov	MI, [SP + 224]
 #endif
 
 	// Establish the expanded operands and the blocks-of-4 dv limit.
@@ -1269,7 +1246,6 @@ FUNC(mpxmont_redc4_amd64_sse2)
 #endif
 
 #if ABI_WIN
-
 	rstrxmm	xmm6, 0
 	rstrxmm	xmm7, 16
 	rstrxmm	xmm8, 32
@@ -1285,7 +1261,6 @@
 	popreg	r12
 	popreg	rdi
 	popreg	rbx
-
 #endif
 
 	ret
@@ -1329,9 +1304,9 @@ ENDFUNC
 #  define ARG6 STKARG(2)
 #  define ARG7 STKARG(3)
 #  define ARG8 STKARG(4)
-#  define STKARG_OFFSET 40
+#  define STKARG_OFFSET 224
 #endif
-#define STKARG(i) [rsp + STKARG_OFFSET + 8*(i)]
+#define STKARG(i) [SP + STKARG_OFFSET + 8*(i)]
 
 //				sysv			win
 //			dmul smul mmul mont	dmul smul mmul mont
@@ -1386,7 +1361,7 @@ ENDFUNC
 	mov	rbx, r8
 	movdqu	xmm8, [r9]
 	movdqu	xmm10, [rax]
-	mov	r8, STKARG(1)
+	mov	r8d, STKARG(1)
 	mov	r9, STKARG(2)
 	mov	r10, rdx
 	mov	r11, rcx
@@ -1395,7 +1370,7 @@
 	.ifeqs	"\mode", "mont"
 	mov	rbx, rcx
 	movdqu	xmm8, [r8]
-	mov	r8, r9
+	mov	r8d, r9d
 	mov	r9, STKARG(0)
 	mov	r10, rdx
 	mov	rcx, rsi
@@ -1423,16 +1398,16 @@
 	mov	rbx, r9
 	movdqu	xmm8, [r10]
 	movdqu	xmm10, [r11]
-	mov	r8, STKARG(2)
-	mov	r9, STKARG(3)
 	mov	r11, r8
+	mov	r8d, STKARG(2)
+	mov	r9, STKARG(3)
 	.endif
 	.ifeqs	"\mode", "smul"
 	mov	rdi, rcx
 	mov	rcx, rdx
 	mov	rbx, r8
 	movdqu	xmm10, [r9]
-	mov	r8, STKARG(0)
+	mov	r8d, STKARG(0)
 	mov	r9, STKARG(1)
 	.endif
 	.ifeqs	"\mode", "mmul"
@@ -1443,10 +1418,10 @@
 	mov	rbx, STKARG(0)
 	movdqu	xmm8, [r10]
 	movdqu	xmm10, [r11]
-	mov	r8, STKARG(3)
-	mov	r9, STKARG(4)
 	mov	r10, r8
 	mov	r11, r9
+	mov	r8d, STKARG(3)
+	mov	r9, STKARG(4)
 	.endif
 	.ifeqs	"\mode", "mont"
 	mov	r10, STKARG(0)
@@ -1454,9 +1429,9 @@
 	mov	rcx, rdx
 	mov	rbx, r9
 	movdqu	xmm8, [r10]
-	mov	r8, STKARG(1)
-	mov	r9, STKARG(2)
 	mov	r10, r8
+	mov	r8d, STKARG(1)
+	mov	r9, STKARG(2)
 	.endif
 #endif
 
@@ -1550,6 +1525,16 @@ FUNC(test_mul4)
 	testepilogue
 ENDFUNC
 
+FUNC(test_mul4zc)
+	testprologue	smul
+	testldcarry
+	testtop	nil
+	call	mul4zc
+	testtail
+	testcarryout
+	testepilogue
+ENDFUNC
+
 FUNC(test_mla4)
 	testprologue	smul
 	testldcarry
@@ -1560,6 +1545,16 @@ FUNC(test_mla4)
 	testepilogue
 ENDFUNC
 
+FUNC(test_mla4zc)
+	testprologue	smul
+	testldcarry
+	testtop	nil
+	call	mla4zc
+	testtail
+	testcarryout
+	testepilogue
+ENDFUNC
+
 FUNC(test_mmul4)
 	testprologue	mmul
 	testtop	r11
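
A note on the recurring "mov r8d, STKARG(i)" fixes in the dispatch hunks above; the fragment below is an illustrative sketch reusing the patch's STKARG macro, not part of the patch itself. Under both the SysV and Windows x86-64 ABIs, the upper 32 bits of a register or stack slot carrying a 32-bit argument are undefined, so such an argument must be fetched with a 32-bit move, which zero-extends through the full 64-bit register; the old 64-bit loads could pick up garbage in bits 32-63. The reorderings in the same hunks fix a separate bug: an incoming register must be copied aside before a STKARG load overwrites it. The STKARG_OFFSET change from 40 to 224 appears consistent with the [SP + 224] references in the function bodies, addressing the stack arguments relative to the post-prologue stack pointer rather than its value at entry.

	mov	r8, STKARG(1)		// wrong: for a 32-bit argument,
					//   bits 32-63 of the stack slot
					//   are undefined
	mov	r8d, STKARG(1)		// right: the 32-bit write
					//   zero-extends through r8

	mov	r11, r8			// copy the caller's r8 aside first...
	mov	r8d, STKARG(2)		// ...then it is safe to overwrite it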