base/asm-common.h, *.S: Use consistent little-endian notation for SIMD regs.

[catacomb] / symm / gcm-x86ish-pclmul.S
diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S

index 837abbd..fadeca5 100644 (file)
--- a/symm/gcm-x86ish-pclmul.S
+++ b/symm/gcm-x86ish-pclmul.S
@@ -113,21 +113,21 @@
         // use Karatsuba's identity here, but I suspect that loses more in
-       // the shifting, bit-twiddling, and dependency chains that it gains
+       // the shifting, bit-twiddling, and dependency chains than it gains
         // in saving a multiplication which otherwise pipelines well.
-       // xmm0 =                       // (u_1; u_0)
-       // xmm1 =                       // (v_1; v_0)
-       movdqa  xmm2, xmm1              // (v_1; v_0) again
-       movdqa  xmm3, xmm0              // (u_1; u_0) again
-       movdqa  xmm4, xmm0              // (u_1; u_0) yet again
+       // xmm0 =                       // (u_0; u_1)
+       // xmm1 =                       // (v_0; v_1)
+       movdqa  xmm2, xmm1              // (v_0; v_1) again
+       movdqa  xmm3, xmm0              // (u_0; u_1) again
+       movdqa  xmm4, xmm0              // (u_0; u_1) yet again
         pclmulhqlqdq xmm2, xmm0         // u_1 v_0
         pclmullqlqdq xmm0, xmm1         // u_1 v_1
         pclmulhqlqdq xmm3, xmm1         // u_0 v_1
         pclmulhqhqdq xmm4, xmm1         // u_0 v_0
  
         // Arrange the pieces to form a double-precision polynomial.
-       pxor    xmm2, xmm3              // (m_1; m_0) = u_1 v_0 + u_0 v_1
-       movdqa  xmm1, xmm2              // (m_1; m_0) again
-       pslldq  xmm2, 8                 // (0; m_1)
-       psrldq  xmm1, 8                 // (m_0; 0)
+       pxor    xmm2, xmm3              // (m_0; m_1) = u_1 v_0 + u_0 v_1
+       movdqa  xmm1, xmm2              // (m_0; m_1) again
+       pslldq  xmm2, 8                 // (m_1; 0)
+       psrldq  xmm1, 8                 // (0; m_0)
         pxor    xmm0, xmm2              // z_1 = u_1 v_1 + m_1
         pxor    xmm1, xmm4              // z_0 = u_0 v_0 + t^64 m_0
  
@@ -158,9 +158,9 @@
         // word together, and then the low bits, everything will be fine.
  
         // First, shift the high bits down.
-       movdqa  xmm2, xmm0              // (x_7, x_6; x_5, x_4) again
-       movdqa  xmm3, xmm0              // (x_7, x_6; x_5, x_4) yet again
-       movdqa  xmm4, xmm0              // (x_7, x_6; x_5, x_4) again again
+       movdqa  xmm2, xmm0              // (x_4, x_5; x_6, x_7) again
+       movdqa  xmm3, xmm0              // (x_4, x_5; x_6, x_7) yet again
+       movdqa  xmm4, xmm0              // (x_4, x_5; x_6, x_7) again again
         pslld   xmm2, 31                // the b_i for t
         pslld   xmm3, 30                // the b_i for t^2
         pslld   xmm4, 25                // the b_i for t^7
@@ -196,13 +196,13 @@
         // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
  
         // First, we must detach the top (`low'!) half of the result.
-       movdqa  xmm0, xmm1              // (x_3, x_2; x_1, x_0) again
-       psrldq  xmm1, 8                 // (x_1, x_0; 0, 0)
+       movdqa  xmm0, xmm1              // (x_0, x_1; x_2, x_3) again
+       psrldq  xmm1, 8                 // (0, 0; x_0, x_1)
  
         // Next, shift the high bits down.
-       movdqa  xmm2, xmm0              // (x_3, x_2; ?, ?) again
-       movdqa  xmm3, xmm0              // (x_3, x_2; ?, ?) yet again
-       movdqa  xmm4, xmm0              // (x_3, x_2; ?, ?) again again
+       movdqa  xmm2, xmm0              // (?, ?; x_2, x_3) again
+       movdqa  xmm3, xmm0              // (?, ?; x_2, x_3) yet again
+       movdqa  xmm4, xmm0              // (?, ?; x_2, x_3) again again
         pslld   xmm2, 31                // b_i for t
         pslld   xmm3, 29                // b_i for t^3
         pslld   xmm4, 28                // b_i for t^4
@@ -239,11 +239,11 @@
         // shift both of them up by four bytes before we start.  This will
         // mean that the high 64 bits of the result (from GCM's viewpoint)
         // will be zero.
-       // xmm0 =                       // (0, u_2; u_1, u_0)
-       // xmm1 =                       // (0, v_2; v_1, v_0)
-       movdqa  xmm2, xmm1              // (0, v_2; v_1, v_0) again
-       movdqa  xmm3, xmm0              // (0, u_2; u_1, u_0) again
-       movdqa  xmm4, xmm0              // (0, u_2; u_1, u_0) yet again
+       // xmm0 =                       // (u_0, u_1; u_2, 0)
+       // xmm1 =                       // (v_0, v_1; v_2, 0)
+       movdqa  xmm2, xmm1              // (v_0, v_1; v_2, 0) again
+       movdqa  xmm3, xmm0              // (u_0, u_1; u_2, 0) again
+       movdqa  xmm4, xmm0              // (u_0, u_1; u_2, 0) yet again
         pclmulhqlqdq xmm2, xmm0         // u_2 (v_1 t^32 + v_0) = e_0
-       pclmullqlqdq xmm0, xmm1         // u_2 v_2 = d = (0; d)
+       pclmullqlqdq xmm0, xmm1         // u_2 v_2 = d = (d; 0)
         pclmulhqlqdq xmm3, xmm1         // v_2 (u_1 t^32 + u_0) = e_1
@@ -255,10 +255,10 @@
         // registers.  The answer we want is d t^128 + e t^64 + f, where e =
         // e_0 + e_1.
         //
-       // The place values for the two halves are (t^160, t^128; t^96, ?)
-       // and (?, t^64; t^32, 1).  But we also want to shift the high part
+       // The place values for the two halves are (?, t^96; t^128, t^160)
+       // and (1, t^32; t^64, ?).  But we also want to shift the high part
         // left by a word, for symmetry's sake.
-       psrldq  xmm0, 8                 // (d; 0) = d t^128
+       psrldq  xmm0, 8                 // (0; d) = d t^128
         pxor    xmm2, xmm3              // e = (e_0 + e_1)
         movdqa  xmm1, xmm4              // f again
         pxor    xmm0, xmm2              // d t^128 + e t^64
@@ -308,15 +308,15 @@
         // are unimportant.  Clobbers xmm2--xmm7.
  
         // Start multiplying and accumulating pieces of product.
-       // xmm0 =                       // (u_2; u_1)
-       // xmm1 =                       // (u_0; ?)
-       // xmm2 =                       // (v_2; v_1)
-       // xmm3 =                       // (v_0; ?)
-       movdqa  xmm4, xmm0              // (u_2; u_1) again
-       movdqa  xmm5, xmm0              // (u_2; u_1) yet again
-       movdqa  xmm6, xmm0              // (u_2; u_1) again again
-       movdqa  xmm7, xmm3              // (v_0; ?) again
-       punpcklqdq xmm3, xmm1           // (v_0; u_0)
+       // xmm0 =                       // (u_1; u_2)
+       // xmm1 =                       // (?; u_0)
+       // xmm2 =                       // (v_1; v_2)
+       // xmm3 =                       // (?; v_0)
+       movdqa  xmm4, xmm0              // (u_1; u_2) again
+       movdqa  xmm5, xmm0              // (u_1; u_2) yet again
+       movdqa  xmm6, xmm0              // (u_1; u_2) again again
+       movdqa  xmm7, xmm3              // (?; v_0) again
+       punpcklqdq xmm3, xmm1           // (u_0; v_0)
         pclmulhqhqdq xmm4, xmm2         // u_1 v_1
         pclmullqlqdq xmm1, xmm2         // u_0 v_2
         pclmullqhqdq xmm5, xmm2         // u_2 v_1
@@ -324,7 +324,7 @@
         pxor    xmm1, xmm4              // u_0 v_2 + u_1 v_1
         pclmullqlqdq xmm7, xmm0         // u_2 v_0
         pxor    xmm5, xmm6              // b = u_2 v_1 + u_1 v_2
-       movdqa  xmm6, xmm0              // (u_2; u_1) like a bad penny
+       movdqa  xmm6, xmm0              // (u_1; u_2) like a bad penny
         pxor    xmm1, xmm7              // c = u_0 v_2 + u_1 v_1 + u_2 v_0
         pclmullqlqdq xmm0, xmm2         // a = u_2 v_2
         pclmulhqlqdq xmm6, xmm3         // u_1 v_0
@@ -334,50 +334,50 @@
  
         // Next, the piecing together of the product.  There's significant
         // work here to leave the completed pieces in sensible registers.
-       // xmm0 =                       // (a_1; a_0) = a = u_2 v_2
-       // xmm5 =                       // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
-       // xmm1 =                       // (c_1; c_0) = c = u_0 v_2 +
+       // xmm0 =                       // (a_0; a_1) = a = u_2 v_2
+       // xmm5 =                       // (b_0; b_1) = b = u_1 v_2 + u_2 v_1
+       // xmm1 =                       // (c_0; c_1) = c = u_0 v_2 +
                                         //      u_1 v_1 + u_2 v_0
-       // xmm6 =                       // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
-       // xmm3 =                       // (e_1; e_0) = e = u_0 v_0
+       // xmm6 =                       // (d_0; d_1) = d = u_0 v_1 + u_1 v_0
+       // xmm3 =                       // (e_0; e_1) = e = u_0 v_0
         // xmm2, xmm4, xmm7 spare
-       movdqa  xmm2, xmm6              // (d_1; d_0) again
-       movdqa  xmm4, xmm5              // (b_1; b_0) again
-       pslldq  xmm6, 8                 // (0; d_1)
-       psrldq  xmm5, 8                 // (b_0; 0)
-       psrldq  xmm2, 8                 // (d_0; 0)
-       pslldq  xmm4, 8                 // (0; b_1)
-       pxor    xmm5, xmm6              // (b_0; d_1)
-       pxor    xmm0, xmm4              // (x_5; x_4) = (a_1; a_0 + b_1)
-       pxor    xmm2, xmm3              // (x_1; x_0) = (e_1 + d_0; e_0)
-       pxor    xmm1, xmm5             // (x_3; x_2) = (b_0 + c_1; c_0 + d_1)
+       movdqa  xmm2, xmm6              // (d_0; d_1) again
+       movdqa  xmm4, xmm5              // (b_0; b_1) again
+       pslldq  xmm6, 8                 // (d_1; 0)
+       psrldq  xmm5, 8                 // (0; b_0)
+       psrldq  xmm2, 8                 // (0; d_0)
+       pslldq  xmm4, 8                 // (b_1; 0)
+       pxor    xmm5, xmm6              // (d_1; b_0)
+       pxor    xmm0, xmm4              // (x_4; x_5) = (a_0 + b_1; a_1)
+       pxor    xmm2, xmm3              // (x_0; x_1) = (e_0; e_1 + d_0)
+       pxor    xmm1, xmm5             // (x_2; x_3) = (c_0 + d_1; b_0 + c_1)
  
-       // Next, the reduction.  Our polynomial this time is p(x) = t^192 +
+       // Next, the reduction.  Our polynomial this time is p(t) = t^192 +
         // t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
         // 128-bit case.  I don't know why.
  
         // First, shift the high bits down.
-       // xmm0 =                       // (x_5; x_4)
-       // xmm1 =                       // (x_3; x_2)
-       // xmm2 =                       // (x_1; x_0)
+       // xmm0 =                       // (x_4; x_5)
+       // xmm1 =                       // (x_2; x_3)
+       // xmm2 =                       // (x_0; x_1)
         // xmm3--xmm7 spare
-       movdqa  xmm3, xmm0              // (x_5; x_4) copy
-       movdqa  xmm4, xmm0              // (x_5; x_4) copy
-       movdqa  xmm5, xmm0              // (x_5; x_4) copy
-       pslld   xmm3, 31                // (x_5; x_4) b_i for t
-       pslld   xmm4, 30                // (x_5; x_4) b_i for t^2
-       pslld   xmm5, 25                // (x_5; x_4) b_i for t^7
-        movq   xmm6, xmm1              // (x_3; 0) copy
+       movdqa  xmm3, xmm0              // (x_4; x_5) copy
+       movdqa  xmm4, xmm0              // (x_4; x_5) copy
+       movdqa  xmm5, xmm0              // (x_4; x_5) copy
+       pslld   xmm3, 31                // (x_4; x_5) b_i for t
+       pslld   xmm4, 30                // (x_4; x_5) b_i for t^2
+       pslld   xmm5, 25                // (x_4; x_5) b_i for t^7
+        movq   xmm6, xmm1              // (0; x_3) copy
         pxor    xmm3, xmm4
-        movq   xmm7, xmm1              // (x_3; 0) copy
+        movq   xmm7, xmm1              // (0; x_3) copy
         pxor    xmm3, xmm5
-        movq   xmm5, xmm1              // (x_3; 0) copy
-       movdqa  xmm4, xmm3              // (x_5; x_4) b_i combined
-        pslld  xmm6, 31                // (x_3; 0) b_i for t
-        pslld  xmm7, 30                // (x_3; 0) b_i for t^2
-        pslld  xmm5, 25                // (x_3; 0) b_i for t^7
-       psrldq  xmm3, 12                // (x_5; x_4) low contrib
-       pslldq  xmm4, 4                 // (x_5; x_4) high contrib
+        movq   xmm5, xmm1              // (0; x_3) copy
+       movdqa  xmm4, xmm3              // (x_4; x_5) b_i combined
+        pslld  xmm6, 31                // (0; x_3) b_i for t
+        pslld  xmm7, 30                // (0; x_3) b_i for t^2
+        pslld  xmm5, 25                // (0; x_3) b_i for t^7
+       psrldq  xmm3, 12                // (x_4; x_5) low contrib
+       pslldq  xmm4, 4                 // (x_4; x_5) high contrib
          pxor   xmm6, xmm7
         pxor    xmm2, xmm3
          pxor   xmm6, xmm5
@@ -387,17 +387,17 @@
  
         // And finally shift the low bits up.  Unfortunately, we also have to
         // split the low bits out.
-       // xmm0 =                       // (x'_5; x'_4)
-       // xmm1 =                       // (x'_3; x'_2)
-       // xmm2 =                       // (x'_1; x'_0)
-        movdqa xmm5, xmm1              // copies of (x'_3; x'_2)
+       // xmm0 =                       // (x'_4; x'_5)
+       // xmm1 =                       // (x'_2; x'_3)
+       // xmm2 =                       // (x'_0; x'_1)
+        movdqa xmm5, xmm1              // copies of (x'_2; x'_3)
          movdqa xmm6, xmm1
          movdqa xmm7, xmm1
-         psrldq xmm1, 8                // bring down (x'_2; ?)
-       movdqa  xmm3, xmm0              // copies of (x'_5; x'_4)
+         psrldq xmm1, 8                // bring down (?; x'_2)
+       movdqa  xmm3, xmm0              // copies of (x'_4; x'_5)
         movdqa  xmm4, xmm0
-         punpcklqdq  xmm1, xmm2        // (x'_2; x'_1)
-         psrldq xmm2, 8                // (x'_0; ?)
+         punpcklqdq  xmm1, xmm2        // (x'_1; x'_2)
+         psrldq xmm2, 8                // (?; x'_0)
          pxor   xmm2, xmm5              // low half and unit contrib
         pxor    xmm1, xmm0
          psrld  xmm5, 1
@@ -412,7 +412,7 @@
         pxor    xmm0, xmm4
          pxor   xmm5, xmm2              // mix everything together
         pxor    xmm0, xmm1
-        movq   xmm1, xmm5              // shunt (z_0; ?) into proper place
+        movq   xmm1, xmm5              // shunt (?; z_0) into proper place
  .endm
  
  .macro mul256
@@ -442,10 +442,10 @@
         // On x86, there aren't quite enough registers, so spill one for a
         // bit.  On AMD64, we can keep on going, so it's all good.
  
-       // xmm0 =                       // u_1 = (u_11; u_10)
-       // xmm1 =                       // u_0 = (u_01; u_00)
-       // xmm2 =                       // v_1 = (v_11; v_10)
-       // xmm3 =                       // v_0 = (v_01; v_00)
+       // xmm0 =                       // u_1 = (u_10; u_11)
+       // xmm1 =                       // u_0 = (u_00; u_01)
+       // xmm2 =                       // v_1 = (v_10; v_11)
+       // xmm3 =                       // v_0 = (v_00; v_01)
         movdqa  xmm4, xmm0              // u_1 again
  #if CPUFAM_X86
         movdqa  [SP + 0], xmm3
@@ -453,8 +453,8 @@
         movdqa  xmm8, xmm3
  #  define V0 xmm8
  #endif
-       pxor    xmm4, xmm1              // u_* = (u_01 + u_11; u_00 + u_10)
-       pxor    xmm3, xmm2              // v_* = (v_01 + v_11; v_00 + v_10)
+       pxor    xmm4, xmm1              // u_* = (u_00 + u_10; u_01 + u_11)
+       pxor    xmm3, xmm2              // v_* = (v_00 + v_10; v_01 + v_11)
  
         // Start by building the cross product, q = u_* v_*.
         movdqa  xmm7, xmm4              // more copies of u_*
@@ -588,7 +588,7 @@
  //     the /last/ byte in the block.  If the block size is not a multiple of
  //     16 bytes, then there must be padding.  96-bit blocks are weird: the
  //     padding is inserted at the /least/ significant end, so the register
-//     holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
+//     holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most
  //     significant end.
  //
  //   * The `words' format consists of a sequence of bytes, as in the
@@ -613,9 +613,9 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul)
    endprologue
         movdqu  xmm0, [A]
         movdqu  xmm1, [K]
-       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
         mul128
-       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
         movdqu  [A], xmm0
         ret
  ENDFUNC
@@ -653,9 +653,9 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul)
    endprologue
         movq    xmm0, [A]
         movq    xmm1, [K]
-       pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
+       pshufd  xmm0, xmm0, SHUF(3, 3, 0, 1)
         mul64
-       pshufd  xmm0, xmm0, SHUF(1, 0, 3, 3)
+       pshufd  xmm0, xmm0, SHUF(3, 3, 0, 1)
         movq    [A], xmm0
         ret
  ENDFUNC
@@ -696,9 +696,9 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul)
         movd    xmm2, [A + 8]
         movdqu  xmm1, [K]
         punpcklqdq xmm0, xmm2
-       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
         mul96
-       pshufd  xmm1, xmm0, SHUF(3, 2, 1, 0)
+       pshufd  xmm1, xmm0, SHUF(0, 1, 2, 3)
         psrldq  xmm0, 4
         movq    [A + 0], xmm1
         movd    [A + 8], xmm0
@@ -750,11 +750,11 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul)
         movq    xmm1, [A + 0]
         movdqu  xmm2, [K + 0]
         movq    xmm3, [K + 16]
-       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
-       pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
+       pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
+       pshufd  xmm1, xmm1, SHUF(3, 3, 0, 1)
         mul192
-       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
-       pshufd  xmm1, xmm1, SHUF(1, 0, 3, 3)
+       pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
+       pshufd  xmm1, xmm1, SHUF(3, 3, 0, 1)
         movdqu  [A + 8], xmm0
         movq    [A + 0], xmm1
  #if CPUFAM_AMD64 && ABI_WIN
@@ -824,11 +824,11 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul)
         movdqu  xmm1, [A + 0]
         movdqu  xmm2, [K + 0]
         movdqu  xmm3, [K + 16]
-       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
-       pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
+       pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
+       pshufd  xmm1, xmm1, SHUF(0, 1, 2, 3)
         mul256
-       pshufd  xmm0, xmm0, SHUF(3, 2, 1, 0)
-       pshufd  xmm1, xmm1, SHUF(3, 2, 1, 0)
+       pshufd  xmm0, xmm0, SHUF(0, 1, 2, 3)
+       pshufd  xmm1, xmm1, SHUF(0, 1, 2, 3)
         movdqu  [A + 16], xmm0
         movdqu  [A + 0], xmm1
  #if CPUFAM_X86