This makes operations which involve changing one's perspective on the
SIMD processing elements significantly easier to follow. In particular,
I hope that this removes a layer of brain-twisting from the GCM code.
* Adjust all of the register-contents diagrams so that less
significant elements are on the right, rather than on the left.
* Change the x86 `SHUF' macro so that the desired pieces are listed in
decreasing significance order, so `SHUF(3, 2, 1, 0)' would be a
no-op.
I would, of course, continue to use big-endian notation on a target
which actually used a big-endian ordering natively, but we don't
currently support any of them.
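For illustration (a quick sketch of my own, not part of the patch), the
new SHUF encoding can be modelled in portable C: SHUF(3, 2, 1, 0)
encodes as 0xe4, which is indeed the identity permutation for `pshufd'.

    #include <assert.h>
    #include <stdint.h>

    #define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))

    /* Model `pshufd dst, src, imm8': destination element i is source
     * element (imm8 >> 2i) & 3, for 0 <= i < 4. */
    static void pshufd_model(uint32_t d[4], const uint32_t s[4], int imm8)
    {
        int i;
        for (i = 0; i < 4; i++) d[i] = s[(imm8 >> (2*i)) & 3];
    }

    int main(void)
    {
        uint32_t s[4] = { 10, 11, 12, 13 }, d[4];
        int i;

        assert(SHUF(3, 2, 1, 0) == 0xe4);       /* the identity immediate */
        pshufd_model(d, s, SHUF(3, 2, 1, 0));   /* no-op */
        for (i = 0; i < 4; i++) assert(d[i] == s[i]);
        pshufd_model(d, s, SHUF(0, 1, 2, 3));   /* full reversal */
        for (i = 0; i < 4; i++) assert(d[i] == s[3 - i]);
        return 0;
    }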
# define INTADDR__1(addr, got) addr
#endif
-// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate,
+// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate,
// suitable for use in `pshufd' or `shufps', which copies element A
// (0 <= A < 4) of the source to element 0 of the destination, element B to
// element 1, element C to element 2, and element D to element 3.
-#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
+#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
// Map register names to their individual pieces.
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v'_1 v''_0 v''_1
-/// 16 v'_2 v'_3 v''_2 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v''_0 v'_1 v'_0
+/// 16 v''_3 v''_2 v'_3 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
- pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
+ pshufd \d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i)
- movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
+ movdqa \d1, \slo // (s''_1, s''_0; s'_1, s'_0)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
+ movdqa \d3, \shi // (s''_3, s''_2; s'_3, s'_2)
.endif
.ifnes "\d1", "nil"
- psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
+ psrldq \d1, 4 // (0, s''_1; s''_0, s'_1)
.endif
.ifnes "\d2", "nil"
- movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i)
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
+ psrldq \d3, 4 // (0, s''_3; s''_2, s'_3)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3)
.endif
.ifnes "\d2", "nil"
- pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
+ pmuludq \d2, \shi // (r_i s''_2; r_i s'_2)
- pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
+ pmuludq \d0, \slo // (r_i s''_0; r_i s'_0)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
- pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
- pslldq xmm3, 2 // (t b; 0)
- paddq \c, xmm3 // (c' + t b; c'')
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?)
+ psrldq xmm3, 12 // (0, 0; 0, t) = (0; t)
+ pslldq xmm3, 2 // (0; t b)
+ paddq \c, xmm3 // (c''; c' + t b)
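Aside, for illustration only (my sketch, not part of the patch): a
scalar C model of the carry fold performed by the four instructions
above, assuming b = 2^16 and B = 2^32 as elsewhere in this code. The
low 32 bits of the double-prime carry are folded into the prime carry
with weight b; the rest of c'' is handled separately.

    #include <assert.h>
    #include <stdint.h>

    /* Given the carry pair (c''; c'), return c' + (c'' mod B) b. */
    static uint64_t fold_carry(uint64_t c_prime, uint64_t c_dblprime)
    {
        uint64_t t = c_dblprime & 0xffffffff;   /* t = c'' mod B */
        return c_prime + (t << 16);             /* c' + t b */
    }

    int main(void)
    {
        assert(fold_carry(5, 0x100000003ull) == 5 + (3ull << 16));
        return 0;
    }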
.ifeqs "\pos", "lo"
movdqa \d, \c
.else
// of the value represented in C are written at POS in D, and the
// remaining bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?; c'' b)
- pslldq \c, 8 // (0; c')
- paddq \t, \c // (?; c' + c'' b)
- psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
+ psllq \t, 16 // (c'' b; ?)
+ pslldq \c, 8 // (c'; 0)
+ paddq \t, \c // (c' + c'' b; ?)
+ psrldq \t, 8 // (0; c' + c'' b) = (0; c)
.ifeqs "\pos", "lo"
movdqa \d, \t
.else
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1; a_2, a_3)
+ movdqa \b, \a // (a_3, a_2; a_1, a_0)
- movdqa \d, \c // (c_0, c_1; c_2, c_3)
+ movdqa \d, \c // (c_3, c_2; c_1, c_0)
- punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
+ punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0)
+ punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2)
- punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
+ punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0)
+ punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2)
- pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2)
- pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2)
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0; y'_2)
- punpckhqdq \c0, \c2 // (y''_0; y''_2)
- punpcklqdq \u, \c3 // (y'_1; y'_3)
- punpckhqdq \c1, \c3 // (y''_1; y''_3)
+ punpcklqdq \t, \c2 // (y'_2; y'_0)
+ punpckhqdq \c0, \c2 // (y''_2; y''_0)
+ punpcklqdq \u, \c3 // (y'_3; y'_1)
+ punpckhqdq \c1, \c3 // (y''_3; y''_1)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0; y''_2)
- psrlq \c1, 16 // high parts of (y''_1; y''_3)
- psrlq \c2, 32 // low parts of (y''_0; y''_2)
- psrlq \c3, 32 // low parts of (y''_1; y''_3)
+ psrlq \c0, 16 // high parts of (y''_2; y''_0)
+ psrlq \c1, 16 // high parts of (y''_3; y''_1)
+ psrlq \c2, 32 // low parts of (y''_2; y''_0)
+ psrlq \c3, 32 // low parts of (y''_3; y''_1)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0; y''_1)
+ pslldq \c1, 8 // high part of (y''_1; 0)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0; y_2)
- paddq \u, \c0 // (y_1; y_3)
+ paddq \t, \c1 // and up: (y_2; y_0)
+ paddq \u, \c0 // (y_3; y_1)
- psrldq \hi, 8 // high part of (y''_3; 0)
+ psrldq \hi, 8 // high part of (0; y''_3)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0; ?)
- movdqa \lo, \t // (y^*_0, ?; ?, ?)
- psrldq \t, 8 // (y_2; 0)
+ movdqa \c3, \t // (?; y_0)
+ movdqa \lo, \t // (?, ?; ?, y^*_0)
+ psrldq \t, 8 // (0; y_2)
psrlq \c3, 32 // (?; floor(y_0/B))
paddq \c3, \u // (?; y_1 + floor(y_0/B))
- movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
- psrldq \u, 8 // (y_3; 0)
+ movdqa \c1, \c3 // (?, ?; ?, y^*_1)
+ psrldq \u, 8 // (0; y_3)
psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2))
paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2))
- punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0)
psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3))
paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3))
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM15, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [rdi + 0] // (a_0; 0)
- movd xmm1, [rdi + 4] // (a_1; 0)
- movd xmm2, [rdi + 8] // (a_2; 0)
- movd xmm15, [rdi + 12] // (a_3; 0)
- paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
- paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
- paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
+ movd xmm0, [rdi + 0] // (0; a_0)
+ movd xmm1, [rdi + 4] // (0; a_1)
+ movd xmm2, [rdi + 8] // (0; a_2)
+ movd xmm15, [rdi + 12] // (0; a_3)
+ paddq xmm12, xmm0 // (c''_0; c'_0 + a_0)
+ paddq xmm13, xmm1 // (c''_1; c'_1 + a_1)
+ paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2)
.endm
///--------------------------------------------------------------------------
mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm10, xmm11, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm8, xmm9, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
- movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
- movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
- movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
+ movdqu xmm12, [rcx + 0] // (c''_0; c'_0)
+ movdqu xmm13, [rcx + 16] // (c''_1; c'_1)
+ movdqu xmm14, [rcx + 32] // (c''_2; c'_2)
.endm
.macro testtop u=nil
testtop r11
call mmul4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop r11
call mmla4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop
call mont4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v''_0 v'_1 v''_1
-/// 16 v'_2 v''_2 v'_3 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v'_1 v''_0 v'_0
+/// 16 v''_3 v'_3 v''_2 v'_2
///
/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(1)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(2)] // = n
ldr r6, [STKARG(3)] // -> cyv
vld1.32 {q4}, [r3]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(0)] // = n
ldr r6, [STKARG(1)] // -> cyv
ldr r14, [STKARG(1)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(2)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(3)] // = n
ldr r6, [STKARG(4)] // -> cyv
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r5, [STKARG(1)] // = n
ldr r6, [STKARG(2)] // -> cyv
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v''_0 v'_1 v''_1
-/// 16 v'_2 v''_2 v'_3 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v'_1 v''_0 v'_0
+/// 16 v''_3 v'_3 v''_2 v'_2
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
// leaving a carry in CG.
//
// In detail, what happens is as follows. Suppose initially that ZLO =
-// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
+// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i;
// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
// a circuit depth of 3; I don't know how to do better.
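Aside, for illustration only (my sketch, not part of the patch): the two
identities above are easy to check numerically, with b = 2^16 and
B = b^2 = 2^32.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t b = 1ull << 16, B = 1ull << 32;  /* B = b^2 */
        uint64_t zp = 0x123456789aull;    /* z'_i, arbitrary test value */
        uint64_t zpp = 0xfedcba9876ull;   /* z''_i, arbitrary test value */
        uint64_t t = zp + b*zpp;          /* small enough not to overflow */

        assert(t/b == zp/b + zpp);      /* floor(t/b) = floor(z'_i/b) + z''_i */
        assert(t/B == (zp/b + zpp)/b);  /* the amount carried onto z'_{i+1} */
        return 0;
    }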
.ifeqs "\mode", "dmul"
ldr q2, [x4]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x2 // -> u
.ifeqs "\mode", "smul"
ldr q4, [x3]
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
// x2 // -> x
mov x3, x1 // -> c
.ifeqs "\mode", "mmul"
ldr q2, [x5]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
- zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x3 // -> u
.ifeqs "\mode", "mont"
ldr q6, [x4]
- zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3)
- zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0)
mov x4, x2 // -> y
mov x2, x3 // -> x
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v'_1 v''_0 v''_1
-/// 16 v'_2 v'_3 v''_2 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v''_0 v'_1 v'_0
+/// 16 v''_3 v''_2 v'_3 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
- movd \d0, \r // (r_i, 0; 0, 0)
+ movd \d0, \r // (0, 0; 0, r_i)
- movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
+ movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
+ movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2)
- pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
+ pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i)
- psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
+ psrldq \d1, 4 // (0, s''_1; s''_0, s'_1)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
movdqa \d2, \d3 // another copy of (s''_3, s''_2; ...)
.else
- movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i)
.endif
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
+ psrldq \d3, 4 // (0, s''_3; s''_2, s'_3)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
+ pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2)
.else
pmuludq \d2, [\s + 16]
.endif
.endif
- pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
+ pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// carry registers. On completion, XMM3 is clobbered. If CC is
// `nil', then the contribution which would have been added to it is
// left in C.
- pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
- pslldq xmm3, 2 // (t b; 0)
- paddq \c, xmm3 // (c' + t b; c'')
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?)
+ psrldq xmm3, 12 // (0, 0; 0, t) = (0; t)
+ pslldq xmm3, 2 // (0; t b)
+ paddq \c, xmm3 // (c''; c' + t b)
movd \d, \c
psrlq \c, 32 // floor(c/B)
.ifnes "\cc", "nil"
// of the value represented in C are written to D, and the remaining
// bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?; c'' b)
- pslldq \c, 8 // (0; c')
- paddq \t, \c // (?; c' + c'' b)
- psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
+ psllq \t, 16 // (c'' b; ?)
+ pslldq \c, 8 // (c'; 0)
+ paddq \t, \c // (c' + c'' b; ?)
+ psrldq \t, 8 // (0; c' + c'' b) = (0; c)
movd \d, \t
psrldq \t, 4 // (0; floor(c/B))
.endm
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1; a_2, a_3)
+ movdqa \b, \a // (a_3, a_2; a_1, a_0)
- movdqa \d, \c // (c_0, c_1; c_2, c_3)
+ movdqa \d, \c // (c_3, c_2; c_1, c_0)
- punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
+ punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0)
+ punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2)
- punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
+ punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0)
+ punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2)
- pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2)
- pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2)
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0; y'_2)
- punpckhqdq \c0, \c2 // (y''_0; y''_2)
- punpcklqdq \u, \c3 // (y'_1; y'_3)
- punpckhqdq \c1, \c3 // (y''_1; y''_3)
+ punpcklqdq \t, \c2 // (y'_2; y'_0)
+ punpckhqdq \c0, \c2 // (y''_2; y''_0)
+ punpcklqdq \u, \c3 // (y'_3; y'_1)
+ punpckhqdq \c1, \c3 // (y''_3; y''_1)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0; y''_2)
- psrlq \c1, 16 // high parts of (y''_1; y''_3)
- psrlq \c2, 32 // low parts of (y''_0; y''_2)
- psrlq \c3, 32 // low parts of (y''_1; y''_3)
+ psrlq \c0, 16 // high parts of (y''_2; y''_0)
+ psrlq \c1, 16 // high parts of (y''_3; y''_1)
+ psrlq \c2, 32 // low parts of (y''_2; y''_0)
+ psrlq \c3, 32 // low parts of (y''_3; y''_1)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0; y''_1)
+ pslldq \c1, 8 // high part of (y''_1; 0)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0; y_2)
- paddq \u, \c0 // (y_1; y_3)
+ paddq \t, \c1 // and up: (y_2; y_0)
+ paddq \u, \c0 // (y_3; y_1)
- psrldq \hi, 8 // high part of (y''_3; 0)
+ psrldq \hi, 8 // high part of (0; y''_3)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0; ?)
- movdqa \lo, \t // (y^*_0, ?; ?, ?)
- psrldq \t, 8 // (y_2; 0)
+ movdqa \c3, \t // (?; y_0)
+ movdqa \lo, \t // (?, ?; ?, y^*_0)
+ psrldq \t, 8 // (0; y_2)
psrlq \c3, 32 // (?; floor(y_0/B))
paddq \c3, \u // (?; y_1 + floor(y_0/B))
- movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
- psrldq \u, 8 // (y_3; 0)
+ movdqa \c1, \c3 // (?, ?; ?, y^*_1)
+ psrldq \u, 8 // (0; y_3)
psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2))
paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2))
- punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0)
psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3))
paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3))
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM7, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [edi + 0] // (a_0; 0)
- movd xmm1, [edi + 4] // (a_1; 0)
- movd xmm2, [edi + 8] // (a_2; 0)
- movd xmm7, [edi + 12] // (a_3; 0)
-
- paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
- paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
- paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
+ movd xmm0, [edi + 0] // (0; a_0)
+ movd xmm1, [edi + 4] // (0; a_1)
+ movd xmm2, [edi + 8] // (0; a_2)
+ movd xmm7, [edi + 12] // (0; a_3)
+
+ paddq xmm4, xmm0 // (c''_0; c'_0 + a_0)
+ paddq xmm5, xmm1 // (c''_1; c'_1 + a_1)
+ paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2)
.endm
///--------------------------------------------------------------------------
.macro testldcarry c
mov ecx, \c // -> c
- movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
- movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
- movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
+ movdqu xmm4, [ecx + 0] // (c''_0; c'_0)
+ movdqu xmm5, [ecx + 16] // (c''_1; c'_1)
+ movdqu xmm6, [ecx + 32] // (c''_2; c'_2)
.endm
.macro testexpand v=nil, y=nil
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
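// Aside (not part of the patch): SSE2 has no packed-rotate instruction,
// so the movdqa/pslld/psrld trio above computes the rotation b <<<= 7
// as (b << 7) | (b >> 25); the two shifted copies are ORed back
// together by a following instruction, not shown here.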
//
// The shuffles have quite high latency, so they've mostly been
// pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
+ pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
// Apply the diagonal quarterround to each of the columns
// simultaneously.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
// Finally, finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// to wait for the shuffles.
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
// Decrement the loop counter and see if we should go round again.
sub NR, 2
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // q0 = // (u_0; u_1)
- // q1 = // (v_0; v_1)
+ // q0 = // (u_1; u_0)
+ // q1 = // (v_1; v_0)
vmull.p64 q2, d1, d2 // u_1 v_0
vmull.p64 q3, d0, d3 // u_0 v_1
- vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1
- vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0
+ vmull.p64 q8, d1, d3 // (t_1; x_3) = u_1 v_1
+ vmull.p64 q9, d0, d2 // (x_0; t_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0
+ veor q2, q2, q3 // (m_0; m_1) = u_0 v_1 + u_1 v_0
veor d17, d17, d4 // x_2 = t_1 + m_1
veor d18, d18, d5 // x_1 = t_0 + m_0
- // q8 = // (x_3; x_2)
- // q9 = // (x_1; x_0)
+ // q8 = // (x_2; x_3)
+ // q9 = // (x_0; x_1)
// One-and-a-half problems remain.
//
// This is an inconvenient size. There's nothing for it but to do
// four multiplications, as if for the 128-bit case.
- // q0 = // (u_0 + u_1 t^32; u_2)
- // q1 = // (v_0 + v_1 t^32; v_2)
+ // q0 = // (u_2; u_0 + u_1 t^32)
+ // q1 = // (v_2; v_0 + v_1 t^32)
vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0
vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1
- vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d)
+ vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (d; 0)
vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1
// Piece the product together.
- veor d17, d17, d22 // q8 = // (x_5; x_4)
+ veor d17, d17, d22 // q8 = // (x_4; x_5)
- veor d19, d19, d24 // q9 = // (x_3; x_2)
- veor d20, d20, d25 // q10 = // (x_1; x_0)
+ veor d19, d19, d24 // q9 = // (x_2; x_3)
+ veor d20, d20, d25 // q10 = // (x_0; x_1)
// Next, the reduction. Our polynomial this time is p(x) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // q8 = // (y_5; y_4)
- // q9 = // (y_3; y_2)
- // q10 = // (y_1; y_0)
- vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t
+ // q8 = // (y_4; y_5)
+ // q9 = // (y_2; y_3)
+ // q10 = // (y_0; y_1)
+ vshl.u64 q11, q8, #63 // (y_4; y_5) b_i for t
vshl.u64 d28, d18, #63 // y_3 b_i for t
- vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2
+ vshl.u64 q12, q8, #62 // (y_4; y_5) b_i for t^2
vshl.u64 d29, d18, #62 // y_3 b_i for t^2
- vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7
+ vshl.u64 q13, q8, #57 // (y_4; y_5) b_i for t^7
vshl.u64 d30, d18, #57 // y_3 b_i for t^7
veor q11, q11, q12 // mix them all together
veor d28, d28, d29
// And finally shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_5; y'_4)
- // q9 = // (y'_3; y'_2)
- // q10 = // (y'_1; y'_0)
- vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t
+ // q8 = // (y'_4; y'_5)
+ // q9 = // (y'_2; y'_3)
+ // q10 = // (y'_0; y'_1)
+ vshr.u64 q11, q8, #1 // (y_4; y_5) a_i for t
vshr.u64 d28, d18, #1 // y'_3 a_i for t
- vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2
+ vshr.u64 q12, q8, #2 // (y_4; y_5) a_i for t^2
vshr.u64 d29, d18, #2 // y'_3 a_i for t^2
- vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7
+ vshr.u64 q13, q8, #7 // (y_4; y_5) a_i for t^7
vshr.u64 d30, d18, #7 // y'_3 a_i for t^7
veor q8, q8, q11
veor d18, d18, d28
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
- // q0 = // u_0 = (u_00; u_01)
- // q1 = // u_1 = (u_10; u_11)
- // q2 = // v_0 = (v_00; v_01)
- // q3 = // v_1 = (v_10; v_11)
+ // q0 = // u_0 = (u_01; u_00)
+ // q1 = // u_1 = (u_11; u_10)
+ // q2 = // v_0 = (v_01; v_00)
+ // q3 = // v_1 = (v_11; v_10)
- veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11)
- veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11)
+ veor q8, q0, q1 // u_* = (u_01 + u_11; u_00 + u_10)
+ veor q9, q2, q3 // v_* = (v_01 + v_11; v_00 + v_10)
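// Aside (not part of the patch): this is Karatsuba over GF(2)[t].
// Since (u_0 + u_1) (v_0 + v_1) = u_0 v_0 + u_1 v_1 + (u_0 v_1 + u_1 v_0),
// the middle coefficient is recovered as u_* v_* + u_0 v_0 + u_1 v_1,
// saving one of the four 128-bit multiplications.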
// Start by building the cross product, q = u_* v_*.
vmull.p64 q14, d16, d19 // u_*0 v_*1
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
// First, shift the high bits down.
- // q8 = // (y_7; y_6)
- // q9 = // (y_5; y_4)
- // q10 = // (y_3; y_2)
- // q11 = // (y_1; y_0)
- vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2
- vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2
- vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5
- vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5
- vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10
- vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10
+ // q8 = // (y_6; y_7)
+ // q9 = // (y_4; y_5)
+ // q10 = // (y_2; y_3)
+ // q11 = // (y_0; y_1)
+ vshl.u64 q0, q8, #62 // (y_6; y_7) b_i for t^2
+ vshl.u64 q12, q9, #62 // (y_4; y_5) b_i for t^2
+ vshl.u64 q1, q8, #59 // (y_6; y_7) b_i for t^5
+ vshl.u64 q13, q9, #59 // (y_4; y_5) b_i for t^5
+ vshl.u64 q2, q8, #54 // (y_6; y_7) b_i for t^10
+ vshl.u64 q14, q9, #54 // (y_4; y_5) b_i for t^10
veor q0, q0, q1 // mix the contributions together
veor q12, q12, q13
veor q0, q0, q2
// And then shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_7; y'_6)
- // q9 = // (y'_5; y'_4)
- // q10 = // (y'_3; y'_2)
- // q11 = // (y'_1; y'_0)
- vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2
- vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2
- vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5
- vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5
- vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10
- vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10
+ // q8 = // (y'_6; y'_7)
+ // q9 = // (y'_4; y'_5)
+ // q10 = // (y'_2; y'_3)
+ // q11 = // (y'_0; y'_1)
+ vshr.u64 q0, q8, #2 // (y_6; y_7) a_i for t^2
+ vshr.u64 q12, q9, #2 // (y'_4; y_5) a_i for t^2
+ vshr.u64 q1, q8, #5 // (y_6; y_7) a_i for t^5
+ vshr.u64 q13, q9, #5 // (y_4; y_5) a_i for t^5
+ vshr.u64 q2, q8, #10 // (y_6; y_7) a_i for t^10
+ vshr.u64 q14, q9, #10 // (y_4; y_5) a_i for t^10
veor q8, q8, q0 // mix the contributions together
veor q1, q1, q2
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // v0 = // (u_0; u_1)
- // v1/v2 = // (v_0; v_1)
+ // v0 = // (u_1; u_0)
+ // v1/v2 = // (v_1; v_0)
pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0
pmull v4.1q, v0.1d, v2.1d // u_0 v_1
- pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1
- pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0
+ pmull2 v5.1q, v0.2d, v2.2d // (x_3; t_1) = u_1 v_1
+ pmull v6.1q, v0.1d, v1.1d // (t_0; x_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0
- vshr128 v4, v3, 64 // (m_1; 0)
- vshl128 v3, v3, 64 // (0; m_0)
- eor v1.16b, v5.16b, v4.16b // (x_2; x_3)
- eor v0.16b, v6.16b, v3.16b // (x_0; x_1)
+ eor v3.16b, v3.16b, v4.16b // (m_1; m_0) = u_0 v_1 + u_1 v_0
+ vshr128 v4, v3, 64 // (0; m_1)
+ vshl128 v3, v3, 64 // (m_0; 0)
+ eor v1.16b, v5.16b, v4.16b // (x_3; x_2)
+ eor v0.16b, v6.16b, v3.16b // (x_1; x_0)
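// Aside (not part of the patch): what the five instructions above
// compute.  Over GF(2)[t], with u = u_1 t^64 + u_0 and v = v_1 t^64 + v_0,
//
//	u v = u_1 v_1 t^128 + (u_0 v_1 + u_1 v_0) t^64 + u_0 v_0,
//
// so the cross term m is split into halves m_1 and m_0, which are XORed
// into the bottom of the high product and the top of the low product
// respectively.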
// And now the only remaining difficulty is that the result needs to
// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128
// leave with z = u v in x2. Clobbers x2--x4.
// The multiplication is thankfully easy.
- // v0 = // (u; ?)
- // v1 = // (v; ?)
+ // v0 = // (?; u)
+ // v1 = // (?; v)
pmull v0.1q, v0.1d, v1.1d // u v
// Now we must reduce. This is essentially the same as the 128-bit
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
- // v0 = // (u_0 + u_1 t^32; u_2)
+ // v0 = // (u_2; u_0 + u_1 t^32)
// v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32)
// v2 = // (v_2; v_2)
pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) t^32 = e_0
pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) t^32 = e_1
- pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0)
+ pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (0; d)
pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
// Clobbers v16--v25.
// Start multiplying and accumulating pieces of product.
- // v0 = // (u_0; u_1)
- // v1 = // (u_2; ?)
+ // v0 = // (u_1; u_0)
+ // v1 = // (?; u_2)
// v2 = // (v_0; v_0)
// v3 = // (v_1; v_1)
// v4 = // (v_2; v_2)
eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
// Piece the product together.
- // v16 = // (a_0; a_1)
- // v19 = // (b_0; b_1)
- // v17 = // (c_0; c_1)
- // v20 = // (d_0; d_1)
- // v18 = // (e_0; e_1)
- vshl128 v21, v19, 64 // (0; b_0)
- ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
- vshr128 v23, v20, 64 // (d_1; 0)
- eor v16.16b, v16.16b, v21.16b // (x_0; x_1)
- eor v17.16b, v17.16b, v22.16b // (x_2; x_3)
- eor v18.16b, v18.16b, v23.16b // (x_4; x_5)
+ // v16 = // (a_1; a_0)
+ // v19 = // (b_1; b_0)
+ // v17 = // (c_1; c_0)
+ // v20 = // (d_1; d_0)
+ // v18 = // (e_1; e_0)
+ vshl128 v21, v19, 64 // (b_0; 0)
+ ext v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1)
+ vshr128 v23, v20, 64 // (0; d_1)
+ eor v16.16b, v16.16b, v21.16b // (x_1; x_0)
+ eor v17.16b, v17.16b, v22.16b // (x_3; x_2)
+ eor v18.16b, v18.16b, v23.16b // (x_5; x_4)
// Next, the reduction. Our polynomial this time is p(x) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // v16 = // (y_0; y_1)
- // v17 = // (y_2; y_3)
- // v18 = // (y_4; y_5)
- mov v19.d[0], v17.d[1] // (y_3; ?)
+ // v16 = // (y_1; y_0)
+ // v17 = // (y_3; y_2)
+ // v18 = // (y_5; y_4)
+ mov v19.d[0], v17.d[1] // (?; y_3)
ushr v23.2d, v18.2d, #63 // hi b_i for t
ushr d20, d19, #63 // lo b_i for t
// Permute the high pieces while we fold in the b_i.
eor v17.16b, v17.16b, v23.16b
vshl128 v20, v20, 64
- mov v19.d[0], v18.d[1] // (y_5; ?)
- ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
+ mov v19.d[0], v18.d[1] // (?; y_5)
+ ext v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3)
eor v16.16b, v16.16b, v20.16b
// And finally shift the low bits up.
- // v16 = // (y'_0; y'_1)
- // v17 = // (y'_2; ?)
- // v18 = // (y'_3; y'_4)
- // v19 = // (y'_5; ?)
+ // v16 = // (y'_1; y'_0)
+ // v17 = // (?; y'_2)
+ // v18 = // (y'_4; y'_3)
+ // v19 = // (?; y'_5)
shl v20.2d, v18.2d, #1
shl d23, d19, #1
shl v21.2d, v18.2d, #2
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
- // v0 = // u_0 = (u_00; u_01)
- // v1 = // u_1 = (u_10; u_11)
+ // v0 = // u_0 = (u_01; u_00)
+ // v1 = // u_1 = (u_11; u_10)
// v2 = // (v_00; v_00)
// v3 = // (v_01; v_01)
// v4 = // (v_10; v_10)
// v5 = // (v_11; v_11)
- eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
+ eor v28.16b, v0.16b, v1.16b // u_* = (u_01 + u_11; u_00 + u_10)
eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11
// Now we must reduce. This is essentially the same as the 192-bit
// case above, but more complicated because everything is bigger.
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
- // v16 = // (y_0; y_1)
- // v17 = // (y_2; y_3)
- // v18 = // (y_4; y_5)
- // v19 = // (y_6; y_7)
- ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2
- ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2
- ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5
- ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5
- ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10
- ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10
+ // v16 = // (y_1; y_0)
+ // v17 = // (y_3; y_2)
+ // v18 = // (y_5; y_4)
+ // v19 = // (y_7; y_6)
+ ushr v24.2d, v18.2d, #62 // (y_5; y_4) b_i for t^2
+ ushr v25.2d, v19.2d, #62 // (y_7; y_6) b_i for t^2
+ ushr v26.2d, v18.2d, #59 // (y_5; y_4) b_i for t^5
+ ushr v27.2d, v19.2d, #59 // (y_7; y_6) b_i for t^5
+ ushr v28.2d, v18.2d, #54 // (y_5; y_4) b_i for t^10
+ ushr v29.2d, v19.2d, #54 // (y_7; y_6) b_i for t^10
eor v24.16b, v24.16b, v26.16b // mix the contributions together
eor v25.16b, v25.16b, v27.16b
eor v24.16b, v24.16b, v28.16b
eor v16.16b, v16.16b, v24.16b
// And then shift the low bits up.
- // v16 = // (y'_0; y'_1)
- // v17 = // (y'_2; y'_3)
- // v18 = // (y'_4; y'_5)
- // v19 = // (y'_6; y'_7)
- shl v24.2d, v18.2d, #2 // (y'_4; y_5) a_i for t^2
- shl v25.2d, v19.2d, #2 // (y_6; y_7) a_i for t^2
- shl v26.2d, v18.2d, #5 // (y'_4; y_5) a_i for t^5
- shl v27.2d, v19.2d, #5 // (y_6; y_7) a_i for t^5
- shl v28.2d, v18.2d, #10 // (y'_4; y_5) a_i for t^10
- shl v29.2d, v19.2d, #10 // (y_6; y_7) a_i for t^10
+ // v16 = // (y'_1; y'_0)
+ // v17 = // (y'_3; y'_2)
+ // v18 = // (y'_5; y'_4)
+ // v19 = // (y'_7; y'_6)
+ shl v24.2d, v18.2d, #2 // (y_5; y'_4) a_i for t^2
+ shl v25.2d, v19.2d, #2 // (y_7; y_6) a_i for t^2
+ shl v26.2d, v18.2d, #5 // (y_5; y'_4) a_i for t^5
+ shl v27.2d, v19.2d, #5 // (y_7; y_6) a_i for t^5
+ shl v28.2d, v18.2d, #10 // (y_5; y'_4) a_i for t^10
+ shl v29.2d, v19.2d, #10 // (y_7; y_6) a_i for t^10
eor v18.16b, v18.16b, v24.16b // mix the contributions together
eor v19.16b, v19.16b, v25.16b
eor v26.16b, v26.16b, v28.16b
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // xmm0 = // (u_1; u_0)
- // xmm1 = // (v_1; v_0)
- movdqa xmm2, xmm1 // (v_1; v_0) again
- movdqa xmm3, xmm0 // (u_1; u_0) again
- movdqa xmm4, xmm0 // (u_1; u_0) yet again
+ // xmm0 = // (u_0; u_1)
+ // xmm1 = // (v_0; v_1)
+ movdqa xmm2, xmm1 // (v_0; v_1) again
+ movdqa xmm3, xmm0 // (u_0; u_1) again
+ movdqa xmm4, xmm0 // (u_0; u_1) yet again
pclmulhqlqdq xmm2, xmm0 // u_1 v_0
pclmullqlqdq xmm0, xmm1 // u_1 v_1
pclmulhqlqdq xmm3, xmm1 // u_0 v_1
pclmulhqhqdq xmm4, xmm1 // u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1
- movdqa xmm1, xmm2 // (m_1; m_0) again
- pslldq xmm2, 8 // (0; m_1)
- psrldq xmm1, 8 // (m_0; 0)
+ pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1
+ movdqa xmm1, xmm2 // (m_0; m_1) again
+ pslldq xmm2, 8 // (m_1; 0)
+ psrldq xmm1, 8 // (0; m_0)
pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1
pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0
// word together, and then the low bits, everything will be fine.
// First, shift the high bits down.
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
- movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again
- movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again
+ movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again
+ movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again
+ movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again
pslld xmm2, 31 // the b_i for t
pslld xmm3, 30 // the b_i for t^2
pslld xmm4, 25 // the b_i for t^7
// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
// First, we must detach the top (`low'!) half of the result.
- movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again
- psrldq xmm1, 8 // (x_1, x_0; 0, 0)
+ movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again
+ psrldq xmm1, 8 // (0, 0; x_0, x_1)
// Next, shift the high bits down.
- movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again
- movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again
- movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again
+ movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again
+ movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again
+ movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again
pslld xmm2, 31 // b_i for t
pslld xmm3, 29 // b_i for t^3
pslld xmm4, 28 // b_i for t^4
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
- // xmm0 = // (0, u_2; u_1, u_0)
- // xmm1 = // (0, v_2; v_1, v_0)
- movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again
- movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again
- movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again
+ // xmm0 = // (u_0, u_1; u_2, 0)
+ // xmm1 = // (v_0, v_1; v_2, 0)
+ movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again
+ movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again
+ movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again
pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0
pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (d; 0)
pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1
// registers. The answer we want is d t^128 + e t^64 + f, where e =
// e_0 + e_1.
//
- // The place values for the two halves are (t^160, t^128; t^96, ?)
- // and (?, t^64; t^32, 1). But we also want to shift the high part
+ // The place values for the two halves are (?, t^96; t^128, t^160)
+ // and (1, t^32; t^64, ?). But we also want to shift the high part
// left by a word, for symmetry's sake.
- psrldq xmm0, 8 // (d; 0) = d t^128
+ psrldq xmm0, 8 // (0; d) = d t^128
pxor xmm2, xmm3 // e = (e_0 + e_1)
movdqa xmm1, xmm4 // f again
pxor xmm0, xmm2 // d t^128 + e t^64
// are unimportant. Clobbers xmm2--xmm7.
// Start multiplying and accumulating pieces of product.
- // xmm0 = // (u_2; u_1)
- // xmm1 = // (u_0; ?)
- // xmm2 = // (v_2; v_1)
- // xmm3 = // (v_0; ?)
- movdqa xmm4, xmm0 // (u_2; u_1) again
- movdqa xmm5, xmm0 // (u_2; u_1) yet again
- movdqa xmm6, xmm0 // (u_2; u_1) again again
- movdqa xmm7, xmm3 // (v_0; ?) again
- punpcklqdq xmm3, xmm1 // (v_0; u_0)
+ // xmm0 = // (u_1; u_2)
+ // xmm1 = // (?; u_0)
+ // xmm2 = // (v_1; v_2)
+ // xmm3 = // (?; v_0)
+ movdqa xmm4, xmm0 // (u_1; u_2) again
+ movdqa xmm5, xmm0 // (u_1; u_2) yet again
+ movdqa xmm6, xmm0 // (u_1; u_2) again again
+ movdqa xmm7, xmm3 // (?; v_0) again
+ punpcklqdq xmm3, xmm1 // (u_0; v_0)
pclmulhqhqdq xmm4, xmm2 // u_1 v_1
pclmullqlqdq xmm1, xmm2 // u_0 v_2
pclmullqhqdq xmm5, xmm2 // u_2 v_1
pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1
pclmullqlqdq xmm7, xmm0 // u_2 v_0
pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
- movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny
+ movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny
pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
pclmulhqlqdq xmm6, xmm3 // u_1 v_0
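
Underneath the register juggling this is plain schoolbook multiplication of
three 64-bit limbs by three: nine carry-less products, grouped by output
position into a, ..., e.  A hedged intrinsics sketch, taking each limb in
the low half of its own register (names are mine; the real code keeps two
limbs per register):

	#include <wmmintrin.h>

	struct prod3 { __m128i a, b, c, d, e; };

	static struct prod3 mul3x3(__m128i u0, __m128i u1, __m128i u2,
				   __m128i v0, __m128i v1, __m128i v2)
	{
		struct prod3 p;
		p.a = _mm_clmulepi64_si128(u2, v2, 0x00);
		p.b = _mm_xor_si128(_mm_clmulepi64_si128(u2, v1, 0x00),
				    _mm_clmulepi64_si128(u1, v2, 0x00));
		p.c = _mm_xor_si128(_mm_clmulepi64_si128(u2, v0, 0x00),
			_mm_xor_si128(_mm_clmulepi64_si128(u1, v1, 0x00),
				      _mm_clmulepi64_si128(u0, v2, 0x00)));
		p.d = _mm_xor_si128(_mm_clmulepi64_si128(u1, v0, 0x00),
				    _mm_clmulepi64_si128(u0, v1, 0x00));
		p.e = _mm_clmulepi64_si128(u0, v0, 0x00);
		return p;
	}
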
// Next, the piecing together of the product. There's significant
// work here to leave the completed pieces in sensible registers.
- // xmm0 = // (a_1; a_0) = a = u_2 v_2
- // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
- // xmm1 = // (c_1; c_0) = c = u_0 v_2 +
+ // xmm0 = // (a_0; a_1) = a = u_2 v_2
+ // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1
+ // xmm1 = // (c_0; c_1) = c = u_0 v_2 +
- // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
- // xmm3 = // (e_1; e_0) = e = u_0 v_0
+ // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0
+ // xmm3 = // (e_0; e_1) = e = u_0 v_0
// xmm2, xmm4, xmm7 spare
- movdqa xmm2, xmm6 // (d_1; d_0) again
- movdqa xmm4, xmm5 // (b_1; b_0) again
- pslldq xmm6, 8 // (0; d_1)
- psrldq xmm5, 8 // (b_0; 0)
- psrldq xmm2, 8 // (d_0; 0)
- pslldq xmm4, 8 // (0; b_1)
- pxor xmm5, xmm6 // (b_0; d_1)
- pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1)
- pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0)
- pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1)
+ movdqa xmm2, xmm6 // (d_0; d_1) again
+ movdqa xmm4, xmm5 // (b_0; b_1) again
+ pslldq xmm6, 8 // (d_1; 0)
+ psrldq xmm5, 8 // (0; b_0)
+ psrldq xmm2, 8 // (0; d_0)
+ pslldq xmm4, 8 // (b_1; 0)
+ pxor xmm5, xmm6 // (d_1; b_0)
+ pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1)
+ pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0)
+ pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1)
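
Each group is a 128-bit value straddling two 64-bit words of the result, and
adjacent groups overlap by one word, so the six output words are just XORs
of half-products.  The same combination in scalar form, with each product P
given as the pair (P_0; P_1) of the comments:

	#include <stdint.h>

	/* Hedged sketch of the recombination above; a..e are the five
	 * grouped products, each as the pair (P_0; P_1). */
	static void combine(uint64_t x[6],
			    const uint64_t a[2], const uint64_t b[2],
			    const uint64_t c[2], const uint64_t d[2],
			    const uint64_t e[2])
	{
		x[5] = a[1];
		x[4] = a[0] ^ b[1];
		x[3] = b[0] ^ c[1];
		x[2] = c[0] ^ d[1];
		x[1] = d[0] ^ e[1];
		x[0] = e[0];
	}
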
// Next, the reduction.  Our polynomial this time is p(t) = t^192 +
// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
// 128-bit case: the shift counts depend only on the low-degree terms
// t^7 + t^2 + t + 1, and those are common to both polynomials.
// First, shift the high bits down.
- // xmm0 = // (x_5; x_4)
- // xmm1 = // (x_3; x_2)
- // xmm2 = // (x_1; x_0)
+ // xmm0 = // (x_4; x_5)
+ // xmm1 = // (x_2; x_3)
+ // xmm2 = // (x_0; x_1)
- movdqa xmm3, xmm0 // (x_5; x_4) copy
- movdqa xmm4, xmm0 // (x_5; x_4) copy
- movdqa xmm5, xmm0 // (x_5; x_4) copy
- pslld xmm3, 31 // (x_5; x_4) b_i for t
- pslld xmm4, 30 // (x_5; x_4) b_i for t^2
- pslld xmm5, 25 // (x_5; x_4) b_i for t^7
- movq xmm6, xmm1 // (x_3; 0) copy
+ movdqa xmm3, xmm0 // (x_4; x_5) copy
+ movdqa xmm4, xmm0 // (x_4; x_5) copy
+ movdqa xmm5, xmm0 // (x_4; x_5) copy
+ pslld xmm3, 31 // (x_4; x_5) b_i for t
+ pslld xmm4, 30 // (x_4; x_5) b_i for t^2
+ pslld xmm5, 25 // (x_4; x_5) b_i for t^7
+ movq xmm6, xmm1 // (0; x_3) copy
- movq xmm7, xmm1 // (x_3; 0) copy
+ movq xmm7, xmm1 // (0; x_3) copy
- movq xmm5, xmm1 // (x_3; 0) copy
- movdqa xmm4, xmm3 // (x_5; x_4) b_i combined
- pslld xmm6, 31 // (x_3; 0) b_i for t
- pslld xmm7, 30 // (x_3; 0) b_i for t^2
- pslld xmm5, 25 // (x_3; 0) b_i for t^7
- psrldq xmm3, 12 // (x_5; x_4) low contrib
- pslldq xmm4, 4 // (x_5; x_4) high contrib
+ movq xmm5, xmm1 // (0; x_3) copy
+ movdqa xmm4, xmm3 // (x_4; x_5) b_i combined
+ pslld xmm6, 31 // (0; x_3) b_i for t
+ pslld xmm7, 30 // (0; x_3) b_i for t^2
+ pslld xmm5, 25 // (0; x_3) b_i for t^7
+ psrldq xmm3, 12 // (x_4; x_5) low contrib
+ pslldq xmm4, 4 // (x_4; x_5) high contrib
pxor xmm6, xmm7
pxor xmm2, xmm3
pxor xmm6, xmm5
// And finally shift the low bits up. Unfortunately, we also have to
// split the low bits out.
- // xmm0 = // (x'_5; x'_4)
- // xmm1 = // (x'_3; x'_2)
- // xmm2 = // (x'_1; x'_0)
- movdqa xmm5, xmm1 // copies of (x'_3; x'_2)
+ // xmm0 = // (x'_4; x'_5)
+ // xmm1 = // (x'_2; x'_3)
+ // xmm2 = // (x'_0; x'_1)
+ movdqa xmm5, xmm1 // copies of (x'_2; x'_3)
movdqa xmm6, xmm1
movdqa xmm7, xmm1
- psrldq xmm1, 8 // bring down (x'_2; ?)
- movdqa xmm3, xmm0 // copies of (x'_5; x'_4)
+ psrldq xmm1, 8 // bring down (?; x'_2)
+ movdqa xmm3, xmm0 // copies of (x'_4; x'_5)
- punpcklqdq xmm1, xmm2 // (x'_2; x'_1)
- psrldq xmm2, 8 // (x'_0; ?)
+ punpcklqdq xmm1, xmm2 // (x'_1; x'_2)
+ psrldq xmm2, 8 // (?; x'_0)
pxor xmm2, xmm5 // low half and unit contrib
pxor xmm1, xmm0
psrld xmm5, 1
pxor xmm0, xmm4
pxor xmm5, xmm2 // mix everything together
pxor xmm0, xmm1
- movq xmm1, xmm5 // shunt (z_0; ?) into proper place
+ movq xmm1, xmm5 // shunt (?; z_0) into proper place
// On x86, there aren't quite enough registers, so spill one for a
// bit. On AMD64, we can keep on going, so it's all good.
- // xmm0 = // u_1 = (u_11; u_10)
- // xmm1 = // u_0 = (u_01; u_00)
- // xmm2 = // v_1 = (v_11; v_10)
- // xmm3 = // v_0 = (v_01; v_00)
+ // xmm0 = // u_1 = (u_10; u_11)
+ // xmm1 = // u_0 = (u_00; u_01)
+ // xmm2 = // v_1 = (v_10; v_11)
+ // xmm3 = // v_0 = (v_00; v_01)
movdqa xmm4, xmm0 // u_1 again
#if CPUFAM_X86
movdqa [SP + 0], xmm3
movdqa xmm8, xmm3
# define V0 xmm8
#endif
- pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10)
- pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10)
+ pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11)
+ pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11)
// Start by building the cross product, q = u_* v_*.
movdqa xmm7, xmm4 // more copies of u_*
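
This is Karatsuba's trick over GF(2)[t]: since u_1 v_0 + u_0 v_1 =
u_* v_* + u_0 v_0 + u_1 v_1, the cross product costs one multiplication
rather than two, and with XOR as addition there are no carries to spoil it.
A hedged sketch of the same idea one level down, on a 128-bit operand split
into 64-bit halves so that a single `pclmulqdq' does each half-multiply:

	#include <wmmintrin.h>

	/* One Karatsuba layer: three carry-less multiplications instead
	 * of four.  The caller still has to XOR the three 128-bit parts
	 * together at 64-bit offsets. */
	static void kmul(__m128i u, __m128i v,
			 __m128i *lo, __m128i *mid, __m128i *hi)
	{
		__m128i us = _mm_xor_si128(u, _mm_srli_si128(u, 8)); /* u_0 + u_1 */
		__m128i vs = _mm_xor_si128(v, _mm_srli_si128(v, 8)); /* v_0 + v_1 */
		*lo = _mm_clmulepi64_si128(u, v, 0x00);		     /* u_0 v_0 */
		*hi = _mm_clmulepi64_si128(u, v, 0x11);		     /* u_1 v_1 */
		*mid = _mm_xor_si128(_mm_clmulepi64_si128(us, vs, 0x00),
				     _mm_xor_si128(*lo, *hi));	     /* cross */
	}
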
// the /last/ byte in the block. If the block size is not a multiple of
// 16 bytes, then there must be padding. 96-bit blocks are weird: the
// padding is inserted at the /least/ significant end, so the register
-// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
+// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most
// significant end.
//
// * The `words' format consists of a sequence of bytes, as in the
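
To make the 96-bit `blocks' layout just described concrete: the three 32-bit
pieces sit in the upper cells and the padding in the least significant one.
A hedged illustration (x0, x1, x2 stand for the block's three words, in
whatever byte order has already been arranged):

	#include <emmintrin.h>
	#include <stdint.h>

	/* _mm_set_epi32 lists its arguments most significant first, so
	 * this matches the (x_2, x_1; x_0, 0) diagram above. */
	static __m128i blk96(uint32_t x0, uint32_t x1, uint32_t x2)
		{ return _mm_set_epi32(x2, x1, x0, 0); }
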
endprologue
movdqu xmm0, [A]
movdqu xmm1, [K]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
movdqu [A], xmm0
ret
ENDFUNC
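
A pleasant side effect of the new argument order: SHUF(d, c, b, a) now
encodes exactly the same immediate as Intel's own _MM_SHUFFLE(d, c, b, a),
which also lists indices in decreasing significance.  The word reversal used
by these byte-swapping entry points, SHUF(0, 1, 2, 3) = 0x1b, looks like
this with intrinsics:

	#include <emmintrin.h>

	/* Reverse the order of the four 32-bit words in x. */
	static __m128i rev_words(__m128i x)
		{ return _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)); }
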
endprologue
movq xmm0, [A]
movq xmm1, [K]
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
movq [A], xmm0
ret
ENDFUNC
movd xmm2, [A + 8]
movdqu xmm1, [K]
punpcklqdq xmm0, xmm2
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
- pshufd xmm1, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm0, SHUF(0, 1, 2, 3)
psrldq xmm0, 4
movq [A + 0], xmm1
movd [A + 8], xmm0
movq xmm1, [A + 0]
movdqu xmm2, [K + 0]
movq xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
movdqu [A + 8], xmm0
movq [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
movdqu xmm1, [A + 0]
movdqu xmm2, [K + 0]
movdqu xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
movdqu [A + 16], xmm0
movdqu [A + 0], xmm1
#if CPUFAM_X86
// Fourth word of the cycle, and seven or eight words of key. Do a
// byte substitution.
movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(3, 0, 1, 2)
+ pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
jmp 2f
// First word of the cycle. This is the complicated piece.
1: movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(1, 2, 3, 0)
+ pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movd eax, xmm1
xor al, [RCON]
inc RCON
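
Here `aeskeygenassist' is being used purely for its S-box: the word
rotations are done by hand with `pshufd', and the round constant is applied
separately from the table at RCON.  For contrast, a hedged sketch of the
usual whole-register formulation of one AES-128 key-schedule round (helper
name mine):

	#include <emmintrin.h>
	#include <wmmintrin.h>

	/* Call as key = expand128(key, _mm_aeskeygenassist_si128(key, 0x01));
	 * the round constant must be a compile-time immediate. */
	static __m128i expand128(__m128i key, __m128i assist)
	{
		/* Broadcast RotWord(SubWord(w3)) ^ rcon to every lane. */
		assist = _mm_shuffle_epi32(assist, _MM_SHUFFLE(3, 3, 3, 3));
		/* Running XOR of the four key words. */
		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
		return _mm_xor_si128(key, assist);
	}
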
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
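
SSE2 has no vector rotate, so each <<< becomes a left shift, a right shift,
and a combine (OR or XOR serve equally, since the two shifted parts share no
set bits).  The step above, as a hedged intrinsics sketch:

	#include <emmintrin.h>

	/* d ^= (c + b) <<< 13, lane-wise across four 32-bit words. */
	static __m128i mix13(__m128i d, __m128i c, __m128i b)
	{
		__m128i t = _mm_add_epi32(c, b);
		return _mm_xor_si128(d,
			_mm_xor_si128(_mm_slli_epi32(t, 13),
				      _mm_srli_epi32(t, 19)));
	}
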
// a ^= (d + c) <<< 18
movdqa xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
+ pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// input. This can be done by juggling values in registers, with the
// following fancy footwork: some row rotations, a transpose, and
// some more rotations.
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12
movdqa xmm4, xmm0
movdqa xmm5, xmm3
punpckhdq xmm1, xmm3 // 5, 6, 7, 4
punpckhdq xmm2, xmm5 // 15, 12, 13, 14
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7
- pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11
- pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7
+ pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11
+ pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15
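
The transpose in the middle of this footwork is the standard unpack dance:
interleave rows pairwise at 32 bits, then again at 64 bits.  A hedged
sketch, leaving the surrounding row rotations to the caller:

	#include <emmintrin.h>

	/* Transpose a 4x4 matrix of 32-bit words held in r[0..3]. */
	static void transpose4(__m128i r[4])
	{
		__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); /* a0 b0 a1 b1 */
		__m128i t1 = _mm_unpacklo_epi32(r[2], r[3]); /* c0 d0 c1 d1 */
		__m128i t2 = _mm_unpackhi_epi32(r[0], r[1]); /* a2 b2 a3 b3 */
		__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); /* c2 d2 c3 d3 */
		r[0] = _mm_unpacklo_epi64(t0, t1);	     /* a0 b0 c0 d0 */
		r[1] = _mm_unpackhi_epi64(t0, t1);	     /* a1 b1 c1 d1 */
		r[2] = _mm_unpacklo_epi64(t2, t3);	     /* a2 b2 c2 d2 */
		r[3] = _mm_unpackhi_epi64(t2, t3);	     /* a3 b3 c3 d3 */
	}
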
// Finally we have to write out the result.
movdqu [OUT + 0], xmm0