From: Mark Wooding Date: Sat, 3 Feb 2024 23:02:22 +0000 (+0000) Subject: base/asm-common.h, *.S: Use consistent little-endian notation for SIMD regs. X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/981a9e5d5e3af6c06ad8b3f821928852068227e4 base/asm-common.h, *.S: Use consistent little-endian notation for SIMD regs. This makes operations which involve changing one's perspective about the SIMD processing elements make significantly more sense. In particular, I hope that this removes a layer of brain-twisting from the GCM code. * Adjust all of the register-contents diagrams so that less significant elements are on the right, rather than on the left. * Change the x86 `SHUF' macro so that the desired pieces are listed in decreasing significance order, so `SHUF(3, 2, 1, 0)' would be a no-op. I would, of course, continue to use big-endian notation on a target which actually used a big-endian ordering natively, but we don't currently support any of them. --- diff --git a/base/asm-common.h b/base/asm-common.h index b4d4a909..9257d762 100644 --- a/base/asm-common.h +++ b/base/asm-common.h @@ -222,11 +222,11 @@ name: # define INTADDR__1(addr, got) addr #endif -// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate, +// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate, // suitable for use in `pshufd' or `shufpd', which copies element A // (0 <= A < 4) of the source to element 0 of the destination, element B to // element 1, element C to element 2, and element D to element 3. -#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d)) +#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a)) // Map register names to their individual pieces. diff --git a/math/mpx-mul4-amd64-sse2.S b/math/mpx-mul4-amd64-sse2.S index 5a748c60..d4726afa 100644 --- a/math/mpx-mul4-amd64-sse2.S +++ b/math/mpx-mul4-amd64-sse2.S @@ -59,9 +59,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v'_1 v''_0 v''_1 -/// 16 v'_2 v'_3 v''_2 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v''_0 v'_1 v'_0 +/// 16 v''_3 v''_2 v'_3 v'_2 /// /// A `pmuludqd' instruction ignores the odd positions in its operands; thus, /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting @@ -137,32 +137,32 @@ .macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil // Multiply R_I by the expanded operand SLO/SHI, and leave the pieces // of the product in registers D0, D1, D2, D3. - pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?) + pshufd \d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i) .ifnes "\d1", "nil" - movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1) + movdqa \d1, \slo // (s''_1, s''_0; s'_1, s'_0) .endif .ifnes "\d3", "nil" - movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3) + movdqa \d3, \shi // (s''_3, s''_2; s'_3, s'_2) .endif .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) + psrldq \d1, 4 // (0, s''_1; s''_0, s'_1) .endif .ifnes "\d2", "nil" - movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) 
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i) .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) + psrldq \d3, 4 // (0, s''_3; s''_2, s'_3) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) + pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) + pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3) .endif .ifnes "\d2", "nil" - pmuludq \d2, \shi // (r_i s'_2; r_i s''_2) + pmuludq \d2, \shi // (r_i s''_2; r_i s'_2) .endif - pmuludq \d0, \slo // (r_i s'_0; r_i s''_0) + pmuludq \d0, \slo // (r_i s''_0; r_i s'_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@ -204,10 +204,10 @@ // lane 0 or 1 of D; the high two lanes of D are clobbered. On // completion, XMM3 is clobbered. If CC is `nil', then the // contribution which would have been added to it is left in C. - pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0) - pslldq xmm3, 2 // (t b; 0) - paddq \c, xmm3 // (c' + t b; c'') + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) + psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) + pslldq xmm3, 2 // (0; t b) + paddq \c, xmm3 // (c''; c' + t b) .ifeqs "\pos", "lo" movdqa \d, \c .else @@ -224,10 +224,10 @@ // of the value represented in C are written at POS in D, and the // remaining bits are left at the bottom of T. movdqa \t, \c - psllq \t, 16 // (?; c'' b) - pslldq \c, 8 // (0; c') - paddq \t, \c // (?; c' + c'' b) - psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) + psllq \t, 16 // (c'' b; ?) + pslldq \c, 8 // (c'; 0) + paddq \t, \c // (c' + c'' b; ?) + psrldq \t, 8 // (0; c' + c'' b) = (0; c) .ifeqs "\pos", "lo" movdqa \d, \t .else @@ -240,21 +240,21 @@ // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. - movdqa \b, \a // (a_0, a_1; a_2, a_3) + movdqa \b, \a // (a_3, a_2; a_1, a_0) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1; c_2, c_3) + movdqa \d, \c // (c_3, c_2; c_1, c_0) .endif - punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) + punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) + punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) + punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) + punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) .endif - pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) .endif .endm @@ -270,10 +270,10 @@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0; y'_2) - punpckhqdq \c0, \c2 // (y''_0; y''_2) - punpcklqdq \u, \c3 // (y'_1; y'_3) - punpckhqdq \c1, \c3 // (y''_1; y''_3) + punpcklqdq \t, \c2 // (y'_2; y'_0) + punpckhqdq \c0, \c2 // (y''_2; y''_0) + punpcklqdq \u, \c3 // (y'_3; y'_1) + punpckhqdq \c1, \c3 // (y''_3; y''_1) // Now split the double-prime pieces. 
The high (up to) 48 bits will // go up; the low 16 bits go down. @@ -281,43 +281,43 @@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0; y''_2) - psrlq \c1, 16 // high parts of (y''_1; y''_3) - psrlq \c2, 32 // low parts of (y''_0; y''_2) - psrlq \c3, 32 // low parts of (y''_1; y''_3) + psrlq \c0, 16 // high parts of (y''_2; y''_0) + psrlq \c1, 16 // high parts of (y''_3; y''_1) + psrlq \c2, 32 // low parts of (y''_2; y''_0) + psrlq \c3, 32 // low parts of (y''_3; y''_1) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0; y''_1) + pslldq \c1, 8 // high part of (y''_1; 0) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0; y_2) - paddq \u, \c0 // (y_1; y_3) + paddq \t, \c1 // and up: (y_2; y_0) + paddq \u, \c0 // (y_3; y_1) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3; 0) + psrldq \hi, 8 // high part of (0; y''_3) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0; ?) - movdqa \lo, \t // (y^*_0, ?; ?, ?) - psrldq \t, 8 // (y_2; 0) + movdqa \c3, \t // (?; y_0) + movdqa \lo, \t // (?, ?; ?, y^*_0) + psrldq \t, 8 // (0; y_2) psrlq \c3, 32 // (floor(y_0/B); ?) paddq \c3, \u // (y_1 + floor(y_0/B); ?) - movdqa \c1, \c3 // (y^*_1, ?; ?, ?) - psrldq \u, 8 // (y_3; 0) + movdqa \c1, \c3 // (?, ?; ?, y^*_1) + psrldq \u, 8 // (0; y_3) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) - punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?) + punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif - punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?) + punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t @@ -334,13 +334,13 @@ // On exit, the carry registers, including XMM15, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. 
- movd xmm0, [rdi + 0] // (a_0; 0) - movd xmm1, [rdi + 4] // (a_1; 0) - movd xmm2, [rdi + 8] // (a_2; 0) - movd xmm15, [rdi + 12] // (a_3; 0) - paddq xmm12, xmm0 // (c'_0 + a_0; c''_0) - paddq xmm13, xmm1 // (c'_1 + a_1; c''_1) - paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b) + movd xmm0, [rdi + 0] // (0; a_0) + movd xmm1, [rdi + 4] // (0; a_1) + movd xmm2, [rdi + 8] // (0; a_2) + movd xmm15, [rdi + 12] // (0; a_3) + paddq xmm12, xmm0 // (c''_0; c'_0 + a_0) + paddq xmm13, xmm1 // (c''_1; c'_1 + a_1) + paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) .endm ///-------------------------------------------------------------------------- @@ -654,8 +654,8 @@ INTFUNC(mmla4) mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) + punpckldq xmm12, xmm15 // (0, w_1; 0, w_0) + punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2) mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1 accum xmm5, xmm6 @@ -667,10 +667,10 @@ INTFUNC(mmla4) mulcore xmm7, 3, xmm10, xmm11, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) + punpckldq xmm12, xmm2 // (0, 0; 0, w_0) + punpckldq xmm14, xmm2 // (0, 0; 0, w_2) + punpckhdq xmm13, xmm2 // (0, 0; 0, w_1) + punpckhdq xmm15, xmm2 // (0, 0; 0, w_3) // That's lots of pieces. Now we have to assemble the answer. squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@ -736,8 +736,8 @@ INTFUNC(mont4) mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2 accum xmm4, xmm5, xmm6 - punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0) - punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0) + punpckldq xmm12, xmm15 // (0, w_1; 0, w_0) + punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2) mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1 accum xmm5, xmm6 @@ -749,10 +749,10 @@ INTFUNC(mont4) mulcore xmm7, 3, xmm8, xmm9, xmm0 accum xmm6 - punpckldq xmm12, xmm2 // (w_0, 0; 0, 0) - punpckldq xmm14, xmm2 // (w_2, 0; 0, 0) - punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0) - punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0) + punpckldq xmm12, xmm2 // (0, 0; 0, w_0) + punpckldq xmm14, xmm2 // (0, 0; 0, w_2) + punpckhdq xmm13, xmm2 // (0, 0; 0, w_1) + punpckhdq xmm15, xmm2 // (0, 0; 0, w_3) // That's lots of pieces. Now we have to assemble the answer. 
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10 @@ -1511,9 +1511,9 @@ ENDFUNC .endm .macro testldcarry - movdqu xmm12, [rcx + 0] // (c'_0; c''_0) - movdqu xmm13, [rcx + 16] // (c'_1; c''_1) - movdqu xmm14, [rcx + 32] // (c'_2; c''_2) + movdqu xmm12, [rcx + 0] // (c''_0; c'_0) + movdqu xmm13, [rcx + 16] // (c''_1; c'_1) + movdqu xmm14, [rcx + 32] // (c''_2; c'_2) .endm .macro testtop u=nil @@ -1601,8 +1601,8 @@ FUNC(test_mmul4) testtop r11 call mmul4 testtail - pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) - pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) + pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) + pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1614,8 +1614,8 @@ FUNC(test_mmla4) testtop r11 call mmla4 testtail - pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) - pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) + pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) + pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout @@ -1627,8 +1627,8 @@ FUNC(test_mont4) testtop call mont4 testtail - pshufd xmm10, xmm10, SHUF(0, 2, 1, 3) - pshufd xmm11, xmm11, SHUF(0, 2, 1, 3) + pshufd xmm10, xmm10, SHUF(3, 1, 2, 0) + pshufd xmm11, xmm11, SHUF(3, 1, 2, 0) movdqu [r10 + 0], xmm10 movdqu [r10 + 16], xmm11 testcarryout diff --git a/math/mpx-mul4-arm-neon.S b/math/mpx-mul4-arm-neon.S index efca7902..8aa01bc0 100644 --- a/math/mpx-mul4-arm-neon.S +++ b/math/mpx-mul4-arm-neon.S @@ -60,9 +60,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v''_0 v'_1 v''_1 -/// 16 v'_2 v''_2 v'_3 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v'_1 v''_0 v'_0 +/// 16 v''_3 v'_3 v''_2 v'_2 /// /// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act @@ -1012,12 +1012,12 @@ ENDFUNC ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 - vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(1)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 - vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(2)] // = n ldr r6, [STKARG(3)] // -> cyv @@ -1029,7 +1029,7 @@ ENDFUNC vld1.32 {q4}, [r3] vmov.i32 q5, #0 - vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(0)] // = n ldr r6, [STKARG(1)] // -> cyv @@ -1044,12 +1044,12 @@ ENDFUNC ldr r14, [STKARG(1)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 - vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r14, [STKARG(2)] // -> yy vld1.32 {q4}, [r14] vmov.i32 q5, #0 - vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1) + vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0) ldr r5, [STKARG(3)] // = n ldr r6, [STKARG(4)] // -> cyv @@ -1065,7 +1065,7 @@ ENDFUNC ldr r14, [STKARG(0)] // -> vv vld1.32 {q2}, [r14] vmov.i32 q3, #0 - vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1) + vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0) ldr r5, [STKARG(1)] // = n ldr r6, [STKARG(2)] // -> cyv diff --git a/math/mpx-mul4-arm64-simd.S b/math/mpx-mul4-arm64-simd.S index 60eed208..ee33a002 100644 --- a/math/mpx-mul4-arm64-simd.S +++ b/math/mpx-mul4-arm64-simd.S @@ -57,9 +57,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD /// operands, as follows. 
/// -/// Offset 0 4 8 12 -/// 0 v'_0 v''_0 v'_1 v''_1 -/// 16 v'_2 v''_2 v'_3 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v'_1 v''_0 v'_0 +/// 16 v''_3 v'_3 v''_2 v'_2 /// /// The `umull' and `umlal' instructions can multiply a vector of two 32-bit /// values by a 32-bit scalar, giving two 64-bit results; thus, it will act @@ -230,7 +230,7 @@ // leaving a carry in CG. // // In detail, what happens is as follows. Suppose initially that ZLO = -// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i; +// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i; // observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and // add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has // a circuit depth of 3; I don't know how to do better. @@ -1032,12 +1032,12 @@ ENDFUNC .ifeqs "\mode", "dmul" ldr q2, [x4] - zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) - zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2) + zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0) ldr q4, [x5] - zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) mov x16, x1 mov x1, x2 // -> u @@ -1050,8 +1050,8 @@ ENDFUNC .ifeqs "\mode", "smul" ldr q4, [x3] - zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) // x2 // -> x mov x3, x1 // -> c @@ -1061,12 +1061,12 @@ ENDFUNC .ifeqs "\mode", "mmul" ldr q2, [x5] - zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3) - zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1) + zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2) + zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0) ldr q6, [x6] - zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3) - zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1) + zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2) + zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0) mov x16, x1 mov x1, x3 // -> u @@ -1082,8 +1082,8 @@ ENDFUNC .ifeqs "\mode", "mont" ldr q6, [x4] - zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3) - zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1) + zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2) + zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0) mov x4, x2 // -> y mov x2, x3 // -> x diff --git a/math/mpx-mul4-x86-sse2.S b/math/mpx-mul4-x86-sse2.S index 916adef9..0964de9f 100644 --- a/math/mpx-mul4-x86-sse2.S +++ b/math/mpx-mul4-x86-sse2.S @@ -58,9 +58,9 @@ /// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE /// operands, as follows. /// -/// Offset 0 4 8 12 -/// 0 v'_0 v'_1 v''_0 v''_1 -/// 16 v'_2 v'_3 v''_2 v''_3 +/// Offset 12 8 4 0 +/// 0 v''_1 v''_0 v'_1 v'_0 +/// 16 v''_3 v''_2 v'_3 v'_2 /// /// A `pmuludq' instruction ignores the odd positions in its operands; thus, /// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting @@ -135,41 +135,41 @@ .macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil // Load a word r_i from R, multiply by the expanded operand [S], and // leave the pieces of the product in registers D0, D1, D2, D3. 
- movd \d0, \r // (r_i, 0; 0, 0) + movd \d0, \r // (0, 0; 0, r_i) .ifnes "\d1", "nil" - movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1) + movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0) .endif .ifnes "\d3", "nil" - movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3) + movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2) .endif - pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?) + pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i) .ifnes "\d1", "nil" - psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0) + psrldq \d1, 4 // (0, s''_1; s''_0, s'_1) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...) .else - movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?) + movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i) .endif .endif .ifnes "\d3", "nil" - psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0) + psrldq \d3, 4 // (0, s''_3; s''_2, s'_3) .endif .ifnes "\d1", "nil" - pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1) + pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1) .endif .ifnes "\d3", "nil" - pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3) + pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3) .endif .ifnes "\d2", "nil" .ifnes "\d3", "nil" - pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2) + pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2) .else pmuludq \d2, [\s + 16] .endif .endif - pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0) + pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0) .endm .macro accum c0, c1=nil, c2=nil, c3=nil @@ -210,10 +210,10 @@ // carry registers. On completion, XMM3 is clobbered. If CC is // `nil', then the contribution which would have been added to it is // left in C. - pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B) - psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0) - pslldq xmm3, 2 // (t b; 0) - paddq \c, xmm3 // (c' + t b; c'') + pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?) + psrldq xmm3, 12 // (0, 0; 0, t) = (0; t) + pslldq xmm3, 2 // (0; t b) + paddq \c, xmm3 // (c''; c' + t b) movd \d, \c psrlq \c, 32 // floor(c/B) .ifnes "\cc", "nil" @@ -226,10 +226,10 @@ // of the value represented in C are written to D, and the remaining // bits are left at the bottom of T. movdqa \t, \c - psllq \t, 16 // (?; c'' b) - pslldq \c, 8 // (0; c') - paddq \t, \c // (?; c' + c'' b) - psrldq \t, 8 // (c' + c'' b; 0) = (c; 0) + psllq \t, 16 // (c'' b; ?) + pslldq \c, 8 // (c'; 0) + paddq \t, \c // (c' + c'' b; ?) + psrldq \t, 8 // (0; c' + c'' b) = (0; c) movd \d, \t psrldq \t, 4 // (floor(c/B); 0) .endm @@ -238,21 +238,21 @@ // On entry, A and C hold packed 128-bit values, and Z is zero. On // exit, A:B and C:D together hold the same values in expanded // form. If C is `nil', then only expand A to A:B. 
- movdqa \b, \a // (a_0, a_1; a_2, a_3) + movdqa \b, \a // (a_3, a_2; a_1, a_0) .ifnes "\c", "nil" - movdqa \d, \c // (c_0, c_1; c_2, c_3) + movdqa \d, \c // (c_3, c_2; c_1, c_0) .endif - punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1) - punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3) + punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0) + punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2) .ifnes "\c", "nil" - punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1) - punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3) + punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0) + punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2) .endif - pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1) - pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3) + pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0) + pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2) .ifnes "\c", "nil" - pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1) - pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3) + pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0) + pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2) .endif .endm @@ -268,10 +268,10 @@ // we can do that, we must gather them together. movdqa \t, \c0 movdqa \u, \c1 - punpcklqdq \t, \c2 // (y'_0; y'_2) - punpckhqdq \c0, \c2 // (y''_0; y''_2) - punpcklqdq \u, \c3 // (y'_1; y'_3) - punpckhqdq \c1, \c3 // (y''_1; y''_3) + punpcklqdq \t, \c2 // (y'_2; y'_0) + punpckhqdq \c0, \c2 // (y''_2; y''_0) + punpcklqdq \u, \c3 // (y'_3; y'_1) + punpckhqdq \c1, \c3 // (y''_3; y''_1) // Now split the double-prime pieces. The high (up to) 48 bits will // go up; the low 16 bits go down. @@ -279,43 +279,43 @@ movdqa \c3, \c1 psllq \c2, 48 psllq \c3, 48 - psrlq \c0, 16 // high parts of (y''_0; y''_2) - psrlq \c1, 16 // high parts of (y''_1; y''_3) - psrlq \c2, 32 // low parts of (y''_0; y''_2) - psrlq \c3, 32 // low parts of (y''_1; y''_3) + psrlq \c0, 16 // high parts of (y''_2; y''_0) + psrlq \c1, 16 // high parts of (y''_3; y''_1) + psrlq \c2, 32 // low parts of (y''_2; y''_0) + psrlq \c3, 32 // low parts of (y''_3; y''_1) .ifnes "\hi", "nil" movdqa \hi, \c1 .endif - pslldq \c1, 8 // high part of (0; y''_1) + pslldq \c1, 8 // high part of (y''_1; 0) paddq \t, \c2 // propagate down paddq \u, \c3 - paddq \t, \c1 // and up: (y_0; y_2) - paddq \u, \c0 // (y_1; y_3) + paddq \t, \c1 // and up: (y_2; y_0) + paddq \u, \c0 // (y_3; y_1) .ifnes "\hi", "nil" - psrldq \hi, 8 // high part of (y''_3; 0) + psrldq \hi, 8 // high part of (0; y''_3) .endif // Finally extract the answer. This complicated dance is better than // storing to memory and loading, because the piecemeal stores // inhibit store forwarding. - movdqa \c3, \t // (y_0; ?) - movdqa \lo, \t // (y^*_0, ?; ?, ?) - psrldq \t, 8 // (y_2; 0) + movdqa \c3, \t // (?; y_0) + movdqa \lo, \t // (?, ?; ?, y^*_0) + psrldq \t, 8 // (0; y_2) psrlq \c3, 32 // (floor(y_0/B); ?) paddq \c3, \u // (y_1 + floor(y_0/B); ?) - movdqa \c1, \c3 // (y^*_1, ?; ?, ?) - psrldq \u, 8 // (y_3; 0) + movdqa \c1, \c3 // (?, ?; ?, y^*_1) + psrldq \u, 8 // (0; y_3) psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?) paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?) - punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?) + punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0) psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?) .ifnes "\hi", "nil" movdqa \t, \c3 pxor \u, \u .endif - punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?) 
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1) .ifnes "\hi", "nil" psrlq \t, 32 // very high bits of y paddq \hi, \t @@ -332,14 +332,14 @@ // On exit, the carry registers, including XMM7, are updated to hold // C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other // registers are preserved. - movd xmm0, [edi + 0] // (a_0; 0) - movd xmm1, [edi + 4] // (a_1; 0) - movd xmm2, [edi + 8] // (a_2; 0) - movd xmm7, [edi + 12] // (a_3; 0) - - paddq xmm4, xmm0 // (c'_0 + a_0; c''_0) - paddq xmm5, xmm1 // (c'_1 + a_1; c''_1) - paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b) + movd xmm0, [edi + 0] // (0; a_0) + movd xmm1, [edi + 4] // (0; a_1) + movd xmm2, [edi + 8] // (0; a_2) + movd xmm7, [edi + 12] // (0; a_3) + + paddq xmm4, xmm0 // (c''_0; c'_0 + a_0) + paddq xmm5, xmm1 // (c''_1; c'_1 + a_1) + paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2) .endm ///-------------------------------------------------------------------------- @@ -1148,9 +1148,9 @@ ENDFUNC .macro testldcarry c mov ecx, \c // -> c - movdqu xmm4, [ecx + 0] // (c'_0; c''_0) - movdqu xmm5, [ecx + 16] // (c'_1; c''_1) - movdqu xmm6, [ecx + 32] // (c'_2; c''_2) + movdqu xmm4, [ecx + 0] // (c''_0; c'_0) + movdqu xmm5, [ecx + 16] // (c''_1; c'_1) + movdqu xmm6, [ecx + 32] // (c''_2; c'_2) .endm .macro testexpand v=nil, y=nil @@ -1286,8 +1286,8 @@ FUNC(test_mmul4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] - pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) - pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1304,8 +1304,8 @@ FUNC(test_mmla4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] - pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) - pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] @@ -1322,8 +1322,8 @@ FUNC(test_mont4) mov edi, [BP + 28] movdqa xmm0, [SP + 64] movdqa xmm1, [SP + 80] - pshufd xmm0, xmm0, SHUF(0, 2, 1, 3) - pshufd xmm1, xmm1, SHUF(0, 2, 1, 3) + pshufd xmm0, xmm0, SHUF(3, 1, 2, 0) + pshufd xmm1, xmm1, SHUF(3, 1, 2, 0) movdqu [edi], xmm0 movdqu [edi + 16], xmm1 testcarryout [BP + 24] diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S index 974ec5b5..13a1848c 100644 --- a/symm/chacha-x86ish-sse2.S +++ b/symm/chacha-x86ish-sse2.S @@ -164,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) + pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -184,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2) // // The shuffles have quite high latency, so they've mostly been // pushed upwards. The remaining one can't be moved, though. - pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) + pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) // Apply the diagonal quarterround to each of the columns // simultaneously. @@ -215,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -226,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2) // Finally, finish off undoing the transpose, and we're done for this // doubleround. 
Again, most of this was done above so we don't have // to wait for the shuffles. - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // Decrement the loop counter and see if we should go round again. sub NR, 2 diff --git a/symm/gcm-arm-crypto.S b/symm/gcm-arm-crypto.S index d5a58f89..8494e42b 100644 --- a/symm/gcm-arm-crypto.S +++ b/symm/gcm-arm-crypto.S @@ -99,19 +99,19 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // q0 = // (u_0; u_1) - // q1 = // (v_0; v_1) + // q0 = // (u_1; u_0) + // q1 = // (v_1; v_0) vmull.p64 q2, d1, d2 // u_1 v_0 vmull.p64 q3, d0, d3 // u_0 v_1 - vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1 - vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0 + vmull.p64 q8, d1, d3 // (t_1; x_3) = u_1 v_1 + vmull.p64 q9, d0, d2 // (x_0; t_0) = u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0 + veor q2, q2, q3 // (m_0; m_1) = u_0 v_1 + u_1 v_0 veor d17, d17, d4 // x_2 = t_1 + m_1 veor d18, d18, d5 // x_1 = t_0 + m_0 - // q8 = // (x_3; x_2) - // q9 = // (x_1; x_0) + // q8 = // (x_2; x_3) + // q9 = // (x_0; x_1) // One-and-a-half problems remain. // @@ -198,11 +198,11 @@ // This is an inconvenient size. There's nothing for it but to do // four multiplications, as if for the 128-bit case. - // q0 = // (u_0 + u_1 t^32; u_2) - // q1 = // (v_0 + v_1 t^32; v_2) + // q0 = // (u_2; u_0 + u_1 t^32) + // q1 = // (v_2; v_0 + v_1 t^32) vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0 vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1 - vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d) + vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (d; 0) vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32 // + u_1 v_1 t^64 = f @@ -279,24 +279,24 @@ veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1 // Piece the product together. - veor d17, d17, d22 // q8 = // (x_5; x_4) + veor d17, d17, d22 // q8 = // (x_4; x_5) veor d18, d18, d23 - veor d19, d19, d24 // q9 = // (x_3; x_2) - veor d20, d20, d25 // q10 = // (x_1; x_0) + veor d19, d19, d24 // q9 = // (x_2; x_3) + veor d20, d20, d25 // q10 = // (x_0; x_1) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. - // q8 = // (y_5; y_4) - // q9 = // (y_3; y_2) - // q10 = // (y_1; y_0) - vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t + // q8 = // (y_4; y_5) + // q9 = // (y_2; y_3) + // q10 = // (y_0; y_1) + vshl.u64 q11, q8, #63 // (y_4; y_5) b_i for t vshl.u64 d28, d18, #63 // y_3 b_i for t - vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2 + vshl.u64 q12, q8, #62 // (y_4; y_5) b_i for t^2 vshl.u64 d29, d18, #62 // y_3 b_i for t^2 - vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7 + vshl.u64 q13, q8, #57 // (y_4; y_5) b_i for t^7 vshl.u64 d30, d18, #57 // y_3 b_i for t^7 veor q11, q11, q12 // mix them all together veor d28, d28, d29 @@ -307,14 +307,14 @@ // And finally shift the low bits up. Also, switch the order of the // pieces for output. 
- // q8 = // (y'_5; y'_4) - // q9 = // (y'_3; y'_2) - // q10 = // (y'_1; y'_0) - vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t + // q8 = // (y'_4; y'_5) + // q9 = // (y'_2; y'_3) + // q10 = // (y'_0; y'_1) + vshr.u64 q11, q8, #1 // (y_4; y_5) a_i for t vshr.u64 d28, d18, #1 // y'_3 a_i for t - vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2 + vshr.u64 q12, q8, #2 // (y_4; y_5) a_i for t^2 vshr.u64 d29, d18, #2 // y'_3 a_i for t^2 - vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7 + vshr.u64 q13, q8, #7 // (y_4; y_5) a_i for t^7 vshr.u64 d30, d18, #7 // y'_3 a_i for t^7 veor q8, q8, q11 veor d18, d18, d28 @@ -348,13 +348,13 @@ // 128-bit multiplications already, and Karatsuba is too annoying // there, so there'll be 12 multiplications altogether, rather than // the 16 we'd have if we did this the naïve way. - // q0 = // u_0 = (u_00; u_01) - // q1 = // u_1 = (u_10; u_11) - // q2 = // v_0 = (v_00; v_01) - // q3 = // v_1 = (v_10; v_11) + // q0 = // u_0 = (u_01; u_00) + // q1 = // u_1 = (u_11; u_10) + // q2 = // v_0 = (v_01; v_00) + // q3 = // v_1 = (v_11; v_10) - veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11) - veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11) + veor q8, q0, q1 // u_* = (u_01 + u_11; u_00 + u_10) + veor q9, q2, q3 // v_* = (v_01 + v_11; v_00 + v_10) // Start by building the cross product, q = u_* v_*. vmull.p64 q14, d16, d19 // u_*0 v_*1 @@ -398,16 +398,16 @@ // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. // First, shift the high bits down. - // q8 = // (y_7; y_6) - // q9 = // (y_5; y_4) - // q10 = // (y_3; y_2) - // q11 = // (y_1; y_0) - vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2 - vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2 - vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5 - vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5 - vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10 - vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10 + // q8 = // (y_6; y_7) + // q9 = // (y_4; y_5) + // q10 = // (y_2; y_3) + // q11 = // (y_0; y_1) + vshl.u64 q0, q8, #62 // (y_6; y_7) b_i for t^2 + vshl.u64 q12, q9, #62 // (y_4; y_5) b_i for t^2 + vshl.u64 q1, q8, #59 // (y_6; y_7) b_i for t^5 + vshl.u64 q13, q9, #59 // (y_4; y_5) b_i for t^5 + vshl.u64 q2, q8, #54 // (y_6; y_7) b_i for t^10 + vshl.u64 q14, q9, #54 // (y_4; y_5) b_i for t^10 veor q0, q0, q1 // mix the contributions together veor q12, q12, q13 veor q0, q0, q2 @@ -419,16 +419,16 @@ // And then shift the low bits up. Also, switch the order of the // pieces for output. 
- // q8 = // (y'_7; y'_6) - // q9 = // (y'_5; y'_4) - // q10 = // (y'_3; y'_2) - // q11 = // (y'_1; y'_0) - vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2 - vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2 - vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5 - vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5 - vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10 - vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10 + // q8 = // (y'_6; y'_7) + // q9 = // (y'_4; y'_5) + // q10 = // (y'_2; y'_3) + // q11 = // (y'_0; y'_1) + vshr.u64 q0, q8, #2 // (y_6; y_7) a_i for t^2 + vshr.u64 q12, q9, #2 // (y'_4; y_5) a_i for t^2 + vshr.u64 q1, q8, #5 // (y_6; y_7) a_i for t^5 + vshr.u64 q13, q9, #5 // (y_4; y_5) a_i for t^5 + vshr.u64 q2, q8, #10 // (y_6; y_7) a_i for t^10 + vshr.u64 q14, q9, #10 // (y_4; y_5) a_i for t^10 veor q8, q8, q0 // mix the contributions together veor q1, q1, q2 diff --git a/symm/gcm-arm64-pmull.S b/symm/gcm-arm64-pmull.S index dcd8c450..0e4bd798 100644 --- a/symm/gcm-arm64-pmull.S +++ b/symm/gcm-arm64-pmull.S @@ -71,19 +71,19 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // v0 = // (u_0; u_1) - // v1/v2 = // (v_0; v_1) + // v0 = // (u_1; u_0) + // v1/v2 = // (v_1; v_0) pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0 pmull v4.1q, v0.1d, v2.1d // u_0 v_1 - pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1 - pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0 + pmull2 v5.1q, v0.2d, v2.2d // (x_3; t_1) = u_1 v_1 + pmull v6.1q, v0.1d, v1.1d // (t_0; x_0) = u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0 - vshr128 v4, v3, 64 // (m_1; 0) - vshl128 v3, v3, 64 // (0; m_0) - eor v1.16b, v5.16b, v4.16b // (x_2; x_3) - eor v0.16b, v6.16b, v3.16b // (x_0; x_1) + eor v3.16b, v3.16b, v4.16b // (m_1; m_0) = u_0 v_1 + u_1 v_0 + vshr128 v4, v3, 64 // (0; m_1) + vshl128 v3, v3, 64 // (m_0; 0) + eor v1.16b, v5.16b, v4.16b // (x_3; x_2) + eor v0.16b, v6.16b, v3.16b // (x_1; x_0) // And now the only remaining difficulty is that the result needs to // be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 @@ -137,8 +137,8 @@ // leave with z = u v in x2. Clobbers x2--x4. // The multiplication is thankfully easy. - // v0 = // (u; ?) - // v1 = // (v; ?) + // v0 = // (?; u) + // v1 = // (?; v) pmull v0.1q, v0.1d, v1.1d // u v // Now we must reduce. This is essentially the same as the 128-bit @@ -176,12 +176,12 @@ // shift both of them up by four bytes before we start. This will // mean that the high 64 bits of the result (from GCM's viewpoint) // will be zero. - // v0 = // (u_0 + u_1 t^32; u_2) + // v0 = // (u_2; u_0 + u_1 t^32) // v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32) // v2 = // (v_2; v_2) pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) t^32 = e_0 pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) t^32 = e_1 - pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0) + pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (0; d) pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32 // + u_1 v_1 t^64 = f @@ -238,8 +238,8 @@ // Clobbers v16--v25. // Start multiplying and accumulating pieces of product. - // v0 = // (u_0; u_1) - // v1 = // (u_2; ?) + // v0 = // (u_1; u_0) + // v1 = // (?; u_2) // v2 = // (v_0; v_0) // v3 = // (v_1; v_1) // v4 = // (v_2; v_2) @@ -262,27 +262,27 @@ eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1 // Piece the product together. 
- // v16 = // (a_0; a_1) - // v19 = // (b_0; b_1) - // v17 = // (c_0; c_1) - // v20 = // (d_0; d_1) - // v18 = // (e_0; e_1) - vshl128 v21, v19, 64 // (0; b_0) - ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0) - vshr128 v23, v20, 64 // (d_1; 0) - eor v16.16b, v16.16b, v21.16b // (x_0; x_1) - eor v17.16b, v17.16b, v22.16b // (x_2; x_3) - eor v18.16b, v18.16b, v23.16b // (x_2; x_3) + // v16 = // (a_1; a_0) + // v19 = // (b_1; b_0) + // v17 = // (c_1; c_0) + // v20 = // (d_1; d_0) + // v18 = // (e_1; e_0) + vshl128 v21, v19, 64 // (b_0; 0) + ext v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1) + vshr128 v23, v20, 64 // (0; d_1) + eor v16.16b, v16.16b, v21.16b // (x_1; x_0) + eor v17.16b, v17.16b, v22.16b // (x_3; x_2) + eor v18.16b, v18.16b, v23.16b // (x_3; x_2) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. - // v16 = // (y_0; y_1) - // v17 = // (y_2; y_3) - // v18 = // (y_4; y_5) - mov v19.d[0], v17.d[1] // (y_3; ?) + // v16 = // (y_1; y_0) + // v17 = // (y_3; y_2) + // v18 = // (y_5; y_4) + mov v19.d[0], v17.d[1] // (?; y_3) ushr v23.2d, v18.2d, #63 // hi b_i for t ushr d20, d19, #63 // lo b_i for t @@ -298,15 +298,15 @@ // Permute the high pieces while we fold in the b_i. eor v17.16b, v17.16b, v23.16b vshl128 v20, v20, 64 - mov v19.d[0], v18.d[1] // (y_5; ?) - ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4) + mov v19.d[0], v18.d[1] // (?; y_5) + ext v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3) eor v16.16b, v16.16b, v20.16b // And finally shift the low bits up. - // v16 = // (y'_0; y'_1) - // v17 = // (y'_2; ?) - // v18 = // (y'_3; y'_4) - // v19 = // (y'_5; ?) + // v16 = // (y'_1; y'_0) + // v17 = // (?; y'_2) + // v18 = // (y'_4; y'_3) + // v19 = // (?; y'_5) shl v20.2d, v18.2d, #1 shl d23, d19, #1 shl v21.2d, v18.2d, #2 @@ -345,14 +345,14 @@ // 128-bit multiplications already, and Karatsuba is too annoying // there, so there'll be 12 multiplications altogether, rather than // the 16 we'd have if we did this the naïve way. - // v0 = // u_0 = (u_00; u_01) - // v1 = // u_1 = (u_10; u_11) + // v0 = // u_0 = (u_01; u_00) + // v1 = // u_1 = (u_11; u_10) // v2 = // (v_00; v_00) // v3 = // (v_01; v_01) // v4 = // (v_10; v_10) // v5 = // (v_11; v_11) - eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11) + eor v28.16b, v0.16b, v1.16b // u_* = (u_01 + u_11; u_00 + u_10) eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10 eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11 @@ -402,16 +402,16 @@ // Now we must reduce. This is essentially the same as the 192-bit // case above, but more complicated because everything is bigger. // The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1. 
- // v16 = // (y_0; y_1) - // v17 = // (y_2; y_3) - // v18 = // (y_4; y_5) - // v19 = // (y_6; y_7) - ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2 - ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2 - ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5 - ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5 - ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10 - ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10 + // v16 = // (y_1; y_0) + // v17 = // (y_3; y_2) + // v18 = // (y_5; y_4) + // v19 = // (y_7; y_6) + ushr v24.2d, v18.2d, #62 // (y_5; y_4) b_i for t^2 + ushr v25.2d, v19.2d, #62 // (y_7; y_6) b_i for t^2 + ushr v26.2d, v18.2d, #59 // (y_5; y_4) b_i for t^5 + ushr v27.2d, v19.2d, #59 // (y_7; y_6) b_i for t^5 + ushr v28.2d, v18.2d, #54 // (y_5; y_4) b_i for t^10 + ushr v29.2d, v19.2d, #54 // (y_7; y_6) b_i for t^10 eor v24.16b, v24.16b, v26.16b // mix the contributions together eor v25.16b, v25.16b, v27.16b eor v24.16b, v24.16b, v28.16b @@ -424,16 +424,16 @@ eor v16.16b, v16.16b, v24.16b // And then shift the low bits up. - // v16 = // (y'_0; y'_1) - // v17 = // (y'_2; y'_3) - // v18 = // (y'_4; y'_5) - // v19 = // (y'_6; y'_7) - shl v24.2d, v18.2d, #2 // (y'_4; y_5) a_i for t^2 - shl v25.2d, v19.2d, #2 // (y_6; y_7) a_i for t^2 - shl v26.2d, v18.2d, #5 // (y'_4; y_5) a_i for t^5 - shl v27.2d, v19.2d, #5 // (y_6; y_7) a_i for t^5 - shl v28.2d, v18.2d, #10 // (y'_4; y_5) a_i for t^10 - shl v29.2d, v19.2d, #10 // (y_6; y_7) a_i for t^10 + // v16 = // (y'_1; y'_0) + // v17 = // (y'_3; y'_2) + // v18 = // (y'_5; y'_4) + // v19 = // (y'_7; y'_6) + shl v24.2d, v18.2d, #2 // (y_5; y'_4) a_i for t^2 + shl v25.2d, v19.2d, #2 // (y_7; y_6) a_i for t^2 + shl v26.2d, v18.2d, #5 // (y_5; y'_4) a_i for t^5 + shl v27.2d, v19.2d, #5 // (y_7; y_6) a_i for t^5 + shl v28.2d, v18.2d, #10 // (y_5; y'_4) a_i for t^10 + shl v29.2d, v19.2d, #10 // (y_7; y_6) a_i for t^10 eor v18.16b, v18.16b, v24.16b // mix the contributions together eor v19.16b, v19.16b, v25.16b eor v26.16b, v26.16b, v28.16b diff --git a/symm/gcm-x86ish-pclmul.S b/symm/gcm-x86ish-pclmul.S index 837abbdd..fadeca58 100644 --- a/symm/gcm-x86ish-pclmul.S +++ b/symm/gcm-x86ish-pclmul.S @@ -113,21 +113,21 @@ // use Karatsuba's identity here, but I suspect that loses more in // the shifting, bit-twiddling, and dependency chains that it gains // in saving a multiplication which otherwise pipelines well. - // xmm0 = // (u_1; u_0) - // xmm1 = // (v_1; v_0) - movdqa xmm2, xmm1 // (v_1; v_0) again - movdqa xmm3, xmm0 // (u_1; u_0) again - movdqa xmm4, xmm0 // (u_1; u_0) yet again + // xmm0 = // (u_0; u_1) + // xmm1 = // (v_0; v_1) + movdqa xmm2, xmm1 // (v_0; v_1) again + movdqa xmm3, xmm0 // (u_0; u_1) again + movdqa xmm4, xmm0 // (u_0; u_1) yet again pclmulhqlqdq xmm2, xmm0 // u_1 v_0 pclmullqlqdq xmm0, xmm1 // u_1 v_1 pclmulhqlqdq xmm3, xmm1 // u_0 v_1 pclmulhqhqdq xmm4, xmm1 // u_0 v_0 // Arrange the pieces to form a double-precision polynomial. - pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1 - movdqa xmm1, xmm2 // (m_1; m_0) again - pslldq xmm2, 8 // (0; m_1) - psrldq xmm1, 8 // (m_0; 0) + pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1 + movdqa xmm1, xmm2 // (m_0; m_1) again + pslldq xmm2, 8 // (m_1; 0) + psrldq xmm1, 8 // (0; m_0) pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1 pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0 @@ -158,9 +158,9 @@ // word together, and then the low bits, everything will be fine. // First, shift the high bits down. 
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again - movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again - movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again + movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again + movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again + movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again pslld xmm2, 31 // the b_i for t pslld xmm3, 30 // the b_i for t^2 pslld xmm4, 25 // the b_i for t^7 @@ -196,13 +196,13 @@ // polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1. // First, we must detach the top (`low'!) half of the result. - movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again - psrldq xmm1, 8 // (x_1, x_0; 0, 0) + movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again + psrldq xmm1, 8 // (0, 0; x_0, x_1) // Next, shift the high bits down. - movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again - movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again - movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again + movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again + movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again + movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again pslld xmm2, 31 // b_i for t pslld xmm3, 29 // b_i for t^3 pslld xmm4, 28 // b_i for t^4 @@ -239,11 +239,11 @@ // shift both of them up by four bytes before we start. This will // mean that the high 64 bits of the result (from GCM's viewpoint) // will be zero. - // xmm0 = // (0, u_2; u_1, u_0) - // xmm1 = // (0, v_2; v_1, v_0) - movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again - movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again - movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again + // xmm0 = // (u_0, u_1; u_2, 0) + // xmm1 = // (v_0, v_1; v_2, 0) + movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again + movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again + movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0 pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d) pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1 @@ -255,10 +255,10 @@ // registers. The answer we want is d t^128 + e t^64 + f, where e = // e_0 + e_1. // - // The place values for the two halves are (t^160, t^128; t^96, ?) - // and (?, t^64; t^32, 1). But we also want to shift the high part + // The place values for the two halves are (?, t^96; t^128, t^160) + // and (1, t^32; t^64, ?). But we also want to shift the high part // left by a word, for symmetry's sake. - psrldq xmm0, 8 // (d; 0) = d t^128 + psrldq xmm0, 8 // (0; d) = d t^128 pxor xmm2, xmm3 // e = (e_0 + e_1) movdqa xmm1, xmm4 // f again pxor xmm0, xmm2 // d t^128 + e t^64 @@ -308,15 +308,15 @@ // are unimportant. Clobbers xmm2--xmm7. // Start multiplying and accumulating pieces of product. - // xmm0 = // (u_2; u_1) - // xmm1 = // (u_0; ?) - // xmm2 = // (v_2; v_1) - // xmm3 = // (v_0; ?) - movdqa xmm4, xmm0 // (u_2; u_1) again - movdqa xmm5, xmm0 // (u_2; u_1) yet again - movdqa xmm6, xmm0 // (u_2; u_1) again again - movdqa xmm7, xmm3 // (v_0; ?) 
again - punpcklqdq xmm3, xmm1 // (v_0; u_0) + // xmm0 = // (u_1; u_2) + // xmm1 = // (?; u_0) + // xmm2 = // (v_1; v_2) + // xmm3 = // (?; v_0) + movdqa xmm4, xmm0 // (u_1; u_2) again + movdqa xmm5, xmm0 // (u_1; u_2) yet again + movdqa xmm6, xmm0 // (u_1; u_2) again again + movdqa xmm7, xmm3 // (?; v_0) again + punpcklqdq xmm3, xmm1 // (u_0; v_0) pclmulhqhqdq xmm4, xmm2 // u_1 v_1 pclmullqlqdq xmm1, xmm2 // u_0 v_2 pclmullqhqdq xmm5, xmm2 // u_2 v_1 @@ -324,7 +324,7 @@ pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1 pclmullqlqdq xmm7, xmm0 // u_2 v_0 pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2 - movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny + movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0 pclmullqlqdq xmm0, xmm2 // a = u_2 v_2 pclmulhqlqdq xmm6, xmm3 // u_1 v_0 @@ -334,50 +334,50 @@ // Next, the piecing together of the product. There's significant // work here to leave the completed pieces in sensible registers. - // xmm0 = // (a_1; a_0) = a = u_2 v_2 - // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1 - // xmm1 = // (c_1; c_0) = c = u_0 v_2 + + // xmm0 = // (a_0; a_1) = a = u_2 v_2 + // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1 + // xmm1 = // (c_0; c_1) = c = u_0 v_2 + // u_1 v_1 + u_2 v_0 - // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0 - // xmm3 = // (e_1; e_0) = e = u_0 v_0 + // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0 + // xmm3 = // (e_0; e_1) = e = u_0 v_0 // xmm2, xmm4, xmm7 spare - movdqa xmm2, xmm6 // (d_1; d_0) again - movdqa xmm4, xmm5 // (b_1; b_0) again - pslldq xmm6, 8 // (0; d_1) - psrldq xmm5, 8 // (b_0; 0) - psrldq xmm2, 8 // (d_0; 0) - pslldq xmm4, 8 // (0; b_1) - pxor xmm5, xmm6 // (b_0; d_1) - pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1) - pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0) - pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1) + movdqa xmm2, xmm6 // (d_0; d_1) again + movdqa xmm4, xmm5 // (b_0; b_1) again + pslldq xmm6, 8 // (d_1; 0) + psrldq xmm5, 8 // (0; b_0) + psrldq xmm2, 8 // (0; d_0) + pslldq xmm4, 8 // (b_1; 0) + pxor xmm5, xmm6 // (d_1; b_0) + pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1) + pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0) + pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1) // Next, the reduction. Our polynomial this time is p(x) = t^192 + // t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the // 128-bit case. I don't know why. // First, shift the high bits down. 
- // xmm0 = // (x_5; x_4) - // xmm1 = // (x_3; x_2) - // xmm2 = // (x_1; x_0) + // xmm0 = // (x_4; x_5) + // xmm1 = // (x_2; x_3) + // xmm2 = // (x_0; x_1) // xmm3--xmm7 spare - movdqa xmm3, xmm0 // (x_5; x_4) copy - movdqa xmm4, xmm0 // (x_5; x_4) copy - movdqa xmm5, xmm0 // (x_5; x_4) copy - pslld xmm3, 31 // (x_5; x_4) b_i for t - pslld xmm4, 30 // (x_5; x_4) b_i for t^2 - pslld xmm5, 25 // (x_5; x_4) b_i for t^7 - movq xmm6, xmm1 // (x_3; 0) copy + movdqa xmm3, xmm0 // (x_4; x_5) copy + movdqa xmm4, xmm0 // (x_4; x_5) copy + movdqa xmm5, xmm0 // (x_4; x_5) copy + pslld xmm3, 31 // (x_4; x_5) b_i for t + pslld xmm4, 30 // (x_4; x_5) b_i for t^2 + pslld xmm5, 25 // (x_4; x_5) b_i for t^7 + movq xmm6, xmm1 // (0; x_3) copy pxor xmm3, xmm4 - movq xmm7, xmm1 // (x_3; 0) copy + movq xmm7, xmm1 // (0; x_3) copy pxor xmm3, xmm5 - movq xmm5, xmm1 // (x_3; 0) copy - movdqa xmm4, xmm3 // (x_5; x_4) b_i combined - pslld xmm6, 31 // (x_3; 0) b_i for t - pslld xmm7, 30 // (x_3; 0) b_i for t^2 - pslld xmm5, 25 // (x_3; 0) b_i for t^7 - psrldq xmm3, 12 // (x_5; x_4) low contrib - pslldq xmm4, 4 // (x_5; x_4) high contrib + movq xmm5, xmm1 // (0; x_3) copy + movdqa xmm4, xmm3 // (x_4; x_5) b_i combined + pslld xmm6, 31 // (0; x_3) b_i for t + pslld xmm7, 30 // (0; x_3) b_i for t^2 + pslld xmm5, 25 // (0; x_3) b_i for t^7 + psrldq xmm3, 12 // (x_4; x_5) low contrib + pslldq xmm4, 4 // (x_4; x_5) high contrib pxor xmm6, xmm7 pxor xmm2, xmm3 pxor xmm6, xmm5 @@ -387,17 +387,17 @@ // And finally shift the low bits up. Unfortunately, we also have to // split the low bits out. - // xmm0 = // (x'_5; x'_4) - // xmm1 = // (x'_3; x'_2) - // xmm2 = // (x'_1; x'_0) - movdqa xmm5, xmm1 // copies of (x'_3; x'_2) + // xmm0 = // (x'_4; x'_5) + // xmm1 = // (x'_2; x'_3) + // xmm2 = // (x'_0; x'_1) + movdqa xmm5, xmm1 // copies of (x'_2; x'_3) movdqa xmm6, xmm1 movdqa xmm7, xmm1 - psrldq xmm1, 8 // bring down (x'_2; ?) - movdqa xmm3, xmm0 // copies of (x'_5; x'_4) + psrldq xmm1, 8 // bring down (?; x'_2) + movdqa xmm3, xmm0 // copies of (x'_4; x'_5) movdqa xmm4, xmm0 - punpcklqdq xmm1, xmm2 // (x'_2; x'_1) - psrldq xmm2, 8 // (x'_0; ?) + punpcklqdq xmm1, xmm2 // (x'_1; x'_2) + psrldq xmm2, 8 // (?; x'_0) pxor xmm2, xmm5 // low half and unit contrib pxor xmm1, xmm0 psrld xmm5, 1 @@ -412,7 +412,7 @@ pxor xmm0, xmm4 pxor xmm5, xmm2 // mix everything together pxor xmm0, xmm1 - movq xmm1, xmm5 // shunt (z_0; ?) into proper place + movq xmm1, xmm5 // shunt (?; z_0) into proper place .endm .macro mul256 @@ -442,10 +442,10 @@ // On x86, there aren't quite enough registers, so spill one for a // bit. On AMD64, we can keep on going, so it's all good. - // xmm0 = // u_1 = (u_11; u_10) - // xmm1 = // u_0 = (u_01; u_00) - // xmm2 = // v_1 = (v_11; v_10) - // xmm3 = // v_0 = (v_01; v_00) + // xmm0 = // u_1 = (u_10; u_11) + // xmm1 = // u_0 = (u_00; u_01) + // xmm2 = // v_1 = (v_10; v_11) + // xmm3 = // v_0 = (v_00; v_01) movdqa xmm4, xmm0 // u_1 again #if CPUFAM_X86 movdqa [SP + 0], xmm3 @@ -453,8 +453,8 @@ movdqa xmm8, xmm3 # define V0 xmm8 #endif - pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10) - pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10) + pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11) + pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11) // Start by building the cross product, q = u_* v_*. movdqa xmm7, xmm4 // more copies of u_* @@ -588,7 +588,7 @@ // the /last/ byte in the block. If the block size is not a multiple of // 16 bytes, then there must be padding. 
96-bit blocks are weird: the // padding is inserted at the /least/ significant end, so the register -// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most +// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most // significant end. // // * The `words' format consists of a sequence of bytes, as in the @@ -613,9 +613,9 @@ SSEFUNC(gcm_mulk_128b_x86ish_pclmul) endprologue movdqu xmm0, [A] movdqu xmm1, [K] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul128 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) movdqu [A], xmm0 ret ENDFUNC @@ -653,9 +653,9 @@ SSEFUNC(gcm_mulk_64b_x86ish_pclmul) endprologue movq xmm0, [A] movq xmm1, [K] - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) mul64 - pshufd xmm0, xmm0, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(3, 3, 0, 1) movq [A], xmm0 ret ENDFUNC @@ -696,9 +696,9 @@ SSEFUNC(gcm_mulk_96b_x86ish_pclmul) movd xmm2, [A + 8] movdqu xmm1, [K] punpcklqdq xmm0, xmm2 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) mul96 - pshufd xmm1, xmm0, SHUF(3, 2, 1, 0) + pshufd xmm1, xmm0, SHUF(0, 1, 2, 3) psrldq xmm0, 4 movq [A + 0], xmm1 movd [A + 8], xmm0 @@ -750,11 +750,11 @@ SSEFUNC(gcm_mulk_192b_x86ish_pclmul) movq xmm1, [A + 0] movdqu xmm2, [K + 0] movq xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) mul192 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(1, 0, 3, 3) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(3, 3, 0, 1) movdqu [A + 8], xmm0 movq [A + 0], xmm1 #if CPUFAM_AMD64 && ABI_WIN @@ -824,11 +824,11 @@ SSEFUNC(gcm_mulk_256b_x86ish_pclmul) movdqu xmm1, [A + 0] movdqu xmm2, [K + 0] movdqu xmm3, [K + 16] - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) mul256 - pshufd xmm0, xmm0, SHUF(3, 2, 1, 0) - pshufd xmm1, xmm1, SHUF(3, 2, 1, 0) + pshufd xmm0, xmm0, SHUF(0, 1, 2, 3) + pshufd xmm1, xmm1, SHUF(0, 1, 2, 3) movdqu [A + 16], xmm0 movdqu [A + 0], xmm1 #if CPUFAM_X86 diff --git a/symm/rijndael-x86ish-aesni.S b/symm/rijndael-x86ish-aesni.S index f5e5cc9c..ad9236a8 100644 --- a/symm/rijndael-x86ish-aesni.S +++ b/symm/rijndael-x86ish-aesni.S @@ -209,16 +209,16 @@ FUNC(rijndael_setup_x86ish_aesni) // Fourth word of the cycle, and seven or eight words of key. Do a // byte substitution. movd xmm0, eax - pshufd xmm0, xmm0, SHUF(3, 0, 1, 2) + pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 jmp 2f // First word of the cycle. This is the complicated piece. 
1: movd xmm0, eax - pshufd xmm0, xmm0, SHUF(1, 2, 3, 0) + pshufd xmm0, xmm0, SHUF(0, 3, 2, 1) aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movd eax, xmm1 xor al, [RCON] inc RCON diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S index 26bab892..ccf912b6 100644 --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@ -180,7 +180,7 @@ FUNC(salsa20_core_x86ish_sse2) // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -189,9 +189,9 @@ FUNC(salsa20_core_x86ish_sse2) // a ^= (d + c) <<< 18 movdqa xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -235,7 +235,7 @@ FUNC(salsa20_core_x86ish_sse2) // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm3 - pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) + pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -244,9 +244,9 @@ FUNC(salsa20_core_x86ish_sse2) // a ^= (d + c) <<< 18 movdqa xmm4, xmm1 - pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) + pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) paddd xmm4, xmm2 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -270,9 +270,9 @@ FUNC(salsa20_core_x86ish_sse2) // input. This can be done by juggling values in registers, with the // following fancy footwork: some row rotations, a transpose, and // some more rotations. - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14 - pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13 - pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12 + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14 + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13 + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12 movdqa xmm4, xmm0 movdqa xmm5, xmm3 @@ -288,9 +288,9 @@ FUNC(salsa20_core_x86ish_sse2) punpckhdq xmm1, xmm3 // 5, 6, 7, 4 punpckhdq xmm2, xmm5 // 15, 12, 13, 14 - pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7 - pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11 - pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15 + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7 + pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11 + pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15 // Finally we have to write out the result. movdqu [OUT + 0], xmm0
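
A minimal off-target sketch of the revised `SHUF' convention described in the commit message and in base/asm-common.h above. The macro body is copied verbatim from the new asm-common.h; the simulate_pshufd() helper is hypothetical scaffolding that models the documented `pshufd' semantics (destination element i takes source element (imm8 >> 2i) & 3), so the asserts check that SHUF(3, 2, 1, 0) is now a no-op and that SHUF(0, 1, 2, 3) reverses the 32-bit lanes, as used by the GCM entry points above.

        /* Sketch only: host-side model of the new SHUF() argument order. */
        #include <assert.h>
        #include <stdint.h>
        #include <stdio.h>

        /* The macro exactly as defined in base/asm-common.h after this change:
         * arguments are listed in decreasing significance order. */
        #define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))

        /* Hypothetical model of `pshufd dst, src, imm8': destination element i
         * is source element (imm8 >> 2i) & 3. */
        static void simulate_pshufd(uint32_t dst[4], const uint32_t src[4], int imm8)
        {
          for (int i = 0; i < 4; i++)
            dst[i] = src[(imm8 >> (2*i)) & 3];
        }

        int main(void)
        {
          /* Little-endian element order: x[0] is the least significant lane. */
          const uint32_t x[4] = { 0x00, 0x11, 0x22, 0x33 };
          uint32_t y[4];

          /* SHUF(3, 2, 1, 0) should be the identity under the new convention. */
          simulate_pshufd(y, x, SHUF(3, 2, 1, 0));
          for (int i = 0; i < 4; i++) assert(y[i] == x[i]);

          /* SHUF(0, 1, 2, 3) reverses the lanes, matching the word-reversal
           * shuffles used in the GCM `mulk' entry points above. */
          simulate_pshufd(y, x, SHUF(0, 1, 2, 3));
          for (int i = 0; i < 4; i++) assert(y[i] == x[3 - i]);

          printf("SHUF(3,2,1,0) = %#x (identity), SHUF(0,1,2,3) = %#x (reverse)\n",
                 SHUF(3, 2, 1, 0), SHUF(0, 1, 2, 3));
          return 0;
        }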