(x86 asm): Zero the high parts of the ?MM registers if available.
[catacomb] / math / mpx-mul4-x86-sse2.S
index 14052fd..cdc3596 100644 (file)
@@ -678,6 +678,14 @@ ENDFUNC
 ///--------------------------------------------------------------------------
 /// Bulk multipliers.
 
+FUNC(mpx_umul4_x86_avx)                // AVX entry point placed directly ahead of mpx_umul4_x86_sse2
+       .arch   .avx                    // let the assembler accept AVX instructions here
+       vzeroupper                      // zero the high parts of the vector registers
+  endprologue
+       // and drop through into mpx_umul4_x86_sse2, which follows (no ret here)
+       .arch   pentium4                // back to pentium4 for the code below
+ENDFUNC
+
 FUNC(mpx_umul4_x86_sse2)
        // void mpx_umul4_x86_sse2(mpw *dv, const mpw *av, const mpw *avl,
        //                         const mpw *bv, const mpw *bvl);
@@ -778,6 +786,14 @@ FUNC(mpx_umul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_mul4_x86_avx)             // AVX entry point placed directly ahead of mpxmont_mul4_x86_sse2
+       .arch   .avx                    // let the assembler accept AVX instructions here
+       vzeroupper                      // zero the high parts of the vector registers
+  endprologue
+       // and drop through into mpxmont_mul4_x86_sse2, which follows (no ret here)
+       .arch   pentium4                // back to pentium4 for the code below
+ENDFUNC
+
 FUNC(mpxmont_mul4_x86_sse2)
        // void mpxmont_mul4_x86_sse2(mpw *dv, const mpw *av, const mpw *bv,
        //                           const mpw *nv, size_t n, const mpw *mi);
@@ -919,6 +935,14 @@ FUNC(mpxmont_mul4_x86_sse2)
 
 ENDFUNC
 
+FUNC(mpxmont_redc4_x86_avx)            // AVX entry point placed directly ahead of mpxmont_redc4_x86_sse2
+       .arch   .avx                    // let the assembler accept AVX instructions here
+       vzeroupper                      // zero the high parts of the vector registers
+  endprologue
+       // and drop through into mpxmont_redc4_x86_sse2, which follows (no ret here)
+       .arch   pentium4                // back to pentium4 for the code below
+ENDFUNC
+
 FUNC(mpxmont_redc4_x86_sse2)
        // void mpxmont_redc4_x86_sse2(mpw *dv, mpw *dvl, const mpw *nv,
        //                             size_t n, const mpw *mi);
@@ -1068,7 +1092,7 @@ ENDFUNC
        mov     [ebx + ecx*8 + 4], edx
 .endm
 
-.macro testprologue
+.macro testprologue n
        pushreg ebp
        pushreg ebx
        pushreg esi
@@ -1077,11 +1101,14 @@ ENDFUNC
        and     esp, ~15
        sub     esp, 3*32 + 4*4
   endprologue
+       mov     eax, \n
+       mov     [esp + 104], eax
        // vars:
        //      esp +   0 = v expanded
        //      esp +  32 = y expanded
        //      esp +  64 = ? expanded
        //      esp +  96 = cycles
+       //      esp + 104 = count
 .endm
 
 .macro testepilogue
@@ -1139,8 +1166,8 @@ ENDFUNC
   .endif
 .endm
 
-.macro testtail cyv, n
-       cystore esp + 96, \cyv, \n
+.macro testtail cyv                    // the count operand is no longer a macro parameter
+       cystore esp + 96, \cyv, esp + 104       // cycles slot, cyv operand, count slot in the frame
        jnz     0b
 .endm
 
@@ -1152,60 +1179,60 @@ ENDFUNC
 .endm
 
 FUNC(test_dmul4)
-       testprologue
+       testprologue [ebp + 44]         // copy [ebp + 44] into the count slot at esp + 104
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmul4
-       testtail [ebp + 48], [ebp + 44]
+       testtail [ebp + 48]             // count operand is now the frame slot at esp + 104
        testcarryout [ebp + 24]
        testepilogue
 ENDFUNC
 
 FUNC(test_dmla4)
-       testprologue
+       testprologue [ebp + 44]         // copy [ebp + 44] into the count slot at esp + 104
        testldcarry [ebp + 24]
        testexpand [ebp + 36], [ebp + 40]
        mov     edi, [ebp + 20]
        testtop [ebp + 28], [ebp + 32]
        call    dmla4
-       testtail [ebp + 48], [ebp + 44]
+       testtail [ebp + 48]             // count operand is now the frame slot at esp + 104
        testcarryout [ebp + 24]
        testepilogue
 ENDFUNC
 
 FUNC(test_mul4)
-       testprologue
+       testprologue [ebp + 36]         // copy [ebp + 36] into the count slot at esp + 104
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mul4
-       testtail [ebp + 40], [ebp + 36]
+       testtail [ebp + 40]             // count operand is now the frame slot at esp + 104
        testcarryout [ebp + 24]
        testepilogue
 ENDFUNC
 
 FUNC(test_mla4)
-       testprologue
+       testprologue [ebp + 36]         // copy [ebp + 36] into the count slot at esp + 104
        testldcarry [ebp + 24]
        testexpand nil, [ebp + 32]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 28]
        call    mla4
-       testtail [ebp + 40], [ebp + 36]
+       testtail [ebp + 40]             // count operand is now the frame slot at esp + 104
        testcarryout [ebp + 24]
        testepilogue
 ENDFUNC
 
 FUNC(test_mmul4)
-       testprologue
+       testprologue [ebp + 48]
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmul4
-       testtail [ebp + 52], [ebp + 48]
+       testtail [ebp + 52]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 64]
        movdqa  xmm1, [esp + 80]
@@ -1216,12 +1243,12 @@ FUNC(test_mmul4)
 ENDFUNC
 
 FUNC(test_mmla4)
-       testprologue
+       testprologue [ebp + 48]
        testexpand [ebp + 40], [ebp + 44]
        mov     edi, [ebp + 20]
        testtop [ebp + 32], [ebp + 36], mont
        call    mmla4
-       testtail [ebp + 52], [ebp + 48]
+       testtail [ebp + 52]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 64]
        movdqa  xmm1, [esp + 80]
@@ -1232,12 +1259,12 @@ FUNC(test_mmla4)
 ENDFUNC
 
 FUNC(test_mont4)
-       testprologue
+       testprologue [ebp + 40]
        testexpand nil, [ebp + 36]
        mov     edi, [ebp + 20]
        testtop nil, [ebp + 32], mont
        call    mont4
-       testtail [ebp + 44], [ebp + 40]
+       testtail [ebp + 44]
        mov     edi, [ebp + 28]
        movdqa  xmm0, [esp + 64]
        movdqa  xmm1, [esp + 80]