math/mpx-mul4-*.S: Use more portable type syntax for ambiguous instructions.

[catacomb] / math / mpx-mul4-amd64-sse2.S
diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S

index a37aba6..5a748c6 100644 (file)
--- a/math/mpx-mul4-amd64-sse2.S
+++ b/math/mpx-mul4-amd64-sse2.S
@@ -1155,7 +1155,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
         // outer loop dv        r10     rcx
         // outer loop dv limit  r11     r11
         // nv base              rdx     r8
-       // nv limit             r9      r12*
+       // nv limit             r9      r10*
         // n                    rcx     r9
         // c                    rcx     r9
  
@@ -1183,14 +1183,13 @@ FUNC(mpxmont_redc4_amd64_sse2)
  #  define DV rcx
  #  define DVLO r11
  #  define NV r8
-#  define NVL r12
+#  define NVL r10
  #  define N r9
  #  define C r9d
  
         pushreg rbx
         pushreg rdi
-       pushreg r12
-       stalloc 160
+       stalloc 168
  
         savexmm xmm6,    0
         savexmm xmm7,   16
@@ -1252,15 +1251,17 @@ FUNC(mpxmont_redc4_amd64_sse2)
         // Continue carry propagation until the end of the buffer.
  0:     add     [rdi], C
         mov     C, 0                    // preserves flags
-       adcd    [rdi + 4], 0
-       adcd    [rdi + 8], 0
-       adcd    [rdi + 12], 0
+       adc     dword ptr [rdi + 4], 0
+       adc     dword ptr [rdi + 8], 0
+       adc     dword ptr [rdi + 12], 0
         adc     C, 0
         add     rdi, 16
         cmp     rdi, DVL4
         jb      0b
  
-       // Deal with the tail end.
+       // Deal with the tail end.  Note that the actual destination length
+       // won't be an exact number of blocks of four, so it's safe to just
+       // drop through here.
  7:     add     [rdi], C
         mov     C, 0
         add     rdi, 4
@@ -1268,13 +1269,11 @@ FUNC(mpxmont_redc4_amd64_sse2)
         cmp     rdi, DVL
         jb      7b
  
-       // All done for this iteration.  Start the next.  (This must have at
-       // least one follow-on iteration, or we'd not have started this outer
-       // loop.)
-8:     mov     rdi, DV                 // -> Z = dv[i]
-       mov     rbx, NV                 // -> X = nv[0]
-       cmp     rdi, DVLO               // all done yet?
+       // All done for this iteration.  Start the next.
+       cmp     DV, DVLO                // all done yet?
         jae     9f
+       mov     rdi, DV                 // -> Z = dv[i]
+       mov     rbx, NV                 // -> X = nv[0]
         add     DV, 16
         call    mont4
         add     rdi, 16
@@ -1300,8 +1299,7 @@ FUNC(mpxmont_redc4_amd64_sse2)
         rstrxmm xmm14, 128
         rstrxmm xmm15, 144
  
-       stfree  160
-       popreg  r12
+       stfree  168
         popreg  rdi
         popreg  rbx
  #endif
@@ -1603,6 +1601,8 @@ FUNC(test_mmul4)
         testtop r11
         call    mmul4
         testtail
+       pshufd  xmm10, xmm10, SHUF(0, 2, 1, 3)
+       pshufd  xmm11, xmm11, SHUF(0, 2, 1, 3)
         movdqu  [r10 +  0], xmm10
         movdqu  [r10 + 16], xmm11
         testcarryout
@@ -1614,6 +1614,8 @@ FUNC(test_mmla4)
         testtop r11
         call    mmla4
         testtail
+       pshufd  xmm10, xmm10, SHUF(0, 2, 1, 3)
+       pshufd  xmm11, xmm11, SHUF(0, 2, 1, 3)
         movdqu  [r10 +  0], xmm10
         movdqu  [r10 + 16], xmm11
         testcarryout
@@ -1625,6 +1627,8 @@ FUNC(test_mont4)
         testtop
         call    mont4
         testtail
+       pshufd  xmm10, xmm10, SHUF(0, 2, 1, 3)
+       pshufd  xmm11, xmm11, SHUF(0, 2, 1, 3)
         movdqu  [r10 +  0], xmm10
         movdqu  [r10 + 16], xmm11
         testcarryout