# define INTADDR__1(addr, got) addr
#endif
-// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate,
+// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate,
// suitable for use in `pshufd' or `shufpd', which copies element A
// (0 <= A < 4) of the source to element 0 of the destination, element B to
// element 1, element C to element 2, and element D to element 3.
-#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
+#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
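+// As a quick sanity check on the new argument order, here is a tiny
+// hypothetical C test (not part of the build) spelling out a few of
+// the immediates used below.
+//
+//	#include <assert.h>
+//	#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
+//
+//	int main(void)
+//	{
+//	  assert(SHUF(3, 2, 1, 0) == 0xe4);	/* identity permutation */
+//	  assert(SHUF(3, 1, 2, 0) == 0xd8);	/* swap middle words */
+//	  assert(SHUF(0, 1, 2, 3) == 0x1b);	/* reverse all four words */
+//	  return (0);
+//	}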
// Map register names to their individual pieces.
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v'_1 v''_0 v''_1
-/// 16 v'_2 v'_3 v''_2 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v''_0 v'_1 v'_0
+/// 16 v''_3 v''_2 v'_3 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
- pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
+ pshufd \d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i)
.ifnes "\d1", "nil"
- movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
+ movdqa \d1, \slo // (s''_1, s''_0; s'_1, s'_0)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
+ movdqa \d3, \shi // (s''_3, s''_2; s'_3, s'_2)
.endif
.ifnes "\d1", "nil"
- psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
+ psrldq \d1, 4 // (0, s''_1; s''_0, s'_1)
.endif
.ifnes "\d2", "nil"
- movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i)
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
+ psrldq \d3, 4 // (0, s''_3; s''_2, s'_3)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3)
.endif
.ifnes "\d2", "nil"
- pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
+ pmuludq \d2, \shi // (r_i s''_2; r_i s'_2)
.endif
- pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
+ pmuludq \d0, \slo // (r_i s''_0; r_i s'_0)
.endm
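+// For concreteness, a hypothetical scalar model of `mulcore' in C (a
+// sketch for cross-checking, not part of the build).  Each 32-bit
+// word s_j of the operand splits as s_j = s'_j + 2^16 s''_j; output
+// register D_j then carries (r_i s''_j; r_i s'_j), one 64-bit
+// product per lane.
+//
+//	#include <stdint.h>
+//
+//	void mulcore_model(uint32_t r, const uint32_t s[4],
+//			   uint64_t dlo[4], uint64_t dhi[4])
+//	{
+//	  int j;
+//	  for (j = 0; j < 4; j++) {
+//	    dlo[j] = (uint64_t)r*(s[j] & 0xffff);  /* r s'_j, low lane */
+//	    dhi[j] = (uint64_t)r*(s[j] >> 16);     /* r s''_j, high lane */
+//	  }
+//	}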
.macro accum c0, c1=nil, c2=nil, c3=nil
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
- pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
- pslldq xmm3, 2 // (t b; 0)
- paddq \c, xmm3 // (c' + t b; c'')
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?)
+ psrldq xmm3, 12 // (0, 0; 0, t) = (0; t)
+ pslldq xmm3, 2 // (0; t b)
+ paddq \c, xmm3 // (c''; c' + t b)
.ifeqs "\pos", "lo"
movdqa \d, \c
.else
// of the value represented in C are written at POS in D, and the
// remaining bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?; c'' b)
- pslldq \c, 8 // (0; c')
- paddq \t, \c // (?; c' + c'' b)
- psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
+ psllq \t, 16 // (c'' b; ?)
+ pslldq \c, 8 // (c'; 0)
+ paddq \t, \c // (c' + c'' b; ?)
+ psrldq \t, 8 // (0; c' + c'' b) = (0; c)
.ifeqs "\pos", "lo"
movdqa \d, \t
.else
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1; a_2, a_3)
+ movdqa \b, \a // (a_3, a_2; a_1, a_0)
.ifnes "\c", "nil"
- movdqa \d, \c // (c_0, c_1; c_2, c_3)
+ movdqa \d, \c // (c_3, c_2; c_1, c_0)
.endif
- punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
+ punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0)
+ punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2)
.ifnes "\c", "nil"
- punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
+ punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0)
+ punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2)
.endif
- pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2)
.ifnes "\c", "nil"
- pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2)
.endif
.endm
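+// A hypothetical C model of the expansion step above (illustrative
+// only): unpack the four 32-bit words a_j of a 128-bit operand so
+// that out[0] holds the zero-extended pieces of words 0 and 1, and
+// out[1] those of words 2 and 3, in the cell layout tabulated at the
+// top of the file.
+//
+//	#include <stdint.h>
+//
+//	void expand_model(const uint32_t a[4], uint32_t out[2][4])
+//	{
+//	  int i;
+//	  for (i = 0; i < 2; i++) {
+//	    out[i][0] = a[2*i] & 0xffff;	/* a'_{2i}, offset 0 */
+//	    out[i][1] = a[2*i + 1] & 0xffff;	/* a'_{2i+1}, offset 4 */
+//	    out[i][2] = a[2*i] >> 16;		/* a''_{2i}, offset 8 */
+//	    out[i][3] = a[2*i + 1] >> 16;	/* a''_{2i+1}, offset 12 */
+//	  }
+//	}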
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0; y'_2)
- punpckhqdq \c0, \c2 // (y''_0; y''_2)
- punpcklqdq \u, \c3 // (y'_1; y'_3)
- punpckhqdq \c1, \c3 // (y''_1; y''_3)
+ punpcklqdq \t, \c2 // (y'_2; y'_0)
+ punpckhqdq \c0, \c2 // (y''_2; y''_0)
+ punpcklqdq \u, \c3 // (y'_3; y'_1)
+ punpckhqdq \c1, \c3 // (y''_3; y''_1)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0; y''_2)
- psrlq \c1, 16 // high parts of (y''_1; y''_3)
- psrlq \c2, 32 // low parts of (y''_0; y''_2)
- psrlq \c3, 32 // low parts of (y''_1; y''_3)
+ psrlq \c0, 16 // high parts of (y''_2; y''_0)
+ psrlq \c1, 16 // high parts of (y''_3; y''_1)
+ psrlq \c2, 32 // low parts of (y''_2; y''_0)
+ psrlq \c3, 32 // low parts of (y''_3; y''_1)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0; y''_1)
+ pslldq \c1, 8 // high part of (y''_1; 0)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0; y_2)
- paddq \u, \c0 // (y_1; y_3)
+ paddq \t, \c1 // and up: (y_2; y_0)
+ paddq \u, \c0 // (y_3; y_1)
.ifnes "\hi", "nil"
- psrldq \hi, 8 // high part of (y''_3; 0)
+ psrldq \hi, 8 // high part of (0; y''_3)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0; ?)
- movdqa \lo, \t // (y^*_0, ?; ?, ?)
- psrldq \t, 8 // (y_2; 0)
+ movdqa \c3, \t // (?; y_0)
+ movdqa \lo, \t // (?, ?; ?, y^*_0)
+ psrldq \t, 8 // (0; y_2)
- psrlq \c3, 32 // (floor(y_0/B); ?)
- paddq \c3, \u // (y_1 + floor(y_0/B); ?)
+ psrlq \c3, 32 // (?; floor(y_0/B))
+ paddq \c3, \u // (?; y_1 + floor(y_0/B))
- movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
- psrldq \u, 8 // (y_3; 0)
+ movdqa \c1, \c3 // (?, ?; ?, y^*_1)
+ psrldq \u, 8 // (0; y_3)
- psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?)
- paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?)
+ psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2))
+ paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2))
- punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0)
- psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
- paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
+ psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3))
+ paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3))
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM15, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [rdi + 0] // (a_0; 0)
- movd xmm1, [rdi + 4] // (a_1; 0)
- movd xmm2, [rdi + 8] // (a_2; 0)
- movd xmm15, [rdi + 12] // (a_3; 0)
- paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
- paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
- paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
+ movd xmm0, [rdi + 0] // (0; a_0)
+ movd xmm1, [rdi + 4] // (0; a_1)
+ movd xmm2, [rdi + 8] // (0; a_2)
+ movd xmm15, [rdi + 12] // (0; a_3)
+ paddq xmm12, xmm0 // (c''_0; c'_0 + a_0)
+ paddq xmm13, xmm1 // (c''_1; c'_1 + a_1)
+ paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2)
.endm
///--------------------------------------------------------------------------
mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm10, xmm11, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm8, xmm9, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
.endm
.macro testldcarry
- movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
- movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
- movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
+ movdqu xmm12, [rcx + 0] // (c''_0; c'_0)
+ movdqu xmm13, [rcx + 16] // (c''_1; c'_1)
+ movdqu xmm14, [rcx + 32] // (c''_2; c'_2)
.endm
.macro testtop u=nil
testtop r11
call mmul4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop r11
call mmla4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop
call mont4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v''_0 v'_1 v''_1
-/// 16 v'_2 v''_2 v'_3 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v'_1 v''_0 v'_0
+/// 16 v''_3 v'_3 v''_2 v'_2
///
/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, they will act
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(1)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(2)] // = n
ldr r6, [STKARG(3)] // -> cyv
vld1.32 {q4}, [r3]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(0)] // = n
ldr r6, [STKARG(1)] // -> cyv
ldr r14, [STKARG(1)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(2)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(3)] // = n
ldr r6, [STKARG(4)] // -> cyv
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r5, [STKARG(1)] // = n
ldr r6, [STKARG(2)] // -> cyv
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v''_0 v'_1 v''_1
-/// 16 v'_2 v''_2 v'_3 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v'_1 v''_0 v'_0
+/// 16 v''_3 v'_3 v''_2 v'_2
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, they will act
// leaving a carry in CG.
//
// In detail, what happens is as follows. Suppose initially that ZLO =
-// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
+// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i;
// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
// a circuit depth of 3; I don't know how to do better.
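+//
+// As a worked version of that step, here is a hypothetical C sketch
+// (bounds analysis omitted), with b = 2^16 and B = 2^32:
+//
+//	#include <stdint.h>
+//
+//	uint32_t carry_step(uint64_t zlo_i, uint64_t zhi_i,
+//			    uint64_t *zlo_i1)
+//	{
+//	  uint64_t t = zlo_i + (zhi_i << 16);	/* t = z'_i + b z''_i */
+//	  *zlo_i1 += t >> 32;			/* z'_{i+1} += floor(t/B) */
+//	  return ((uint32_t)t);			/* emit z_i = t mod B */
+//	}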
.ifeqs "\mode", "dmul"
ldr q2, [x4]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
ldr q4, [x5]
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x2 // -> u
.ifeqs "\mode", "smul"
ldr q4, [x3]
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
// x2 // -> x
mov x3, x1 // -> c
.ifeqs "\mode", "mmul"
ldr q2, [x5]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
ldr q6, [x6]
- zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x3 // -> u
.ifeqs "\mode", "mont"
ldr q6, [x4]
- zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3)
- zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0)
mov x4, x2 // -> y
mov x2, x3 // -> x
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v'_1 v''_0 v''_1
-/// 16 v'_2 v'_3 v''_2 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v''_0 v'_1 v'_0
+/// 16 v''_3 v''_2 v'_3 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
- movd \d0, \r // (r_i, 0; 0, 0)
+ movd \d0, \r // (0, 0; 0, r_i)
.ifnes "\d1", "nil"
- movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
+ movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
+ movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2)
.endif
- pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
+ pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i)
.ifnes "\d1", "nil"
- psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
+ psrldq \d1, 4 // (0, s''_1; s''_0, s'_1)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- movdqa \d2, \d3 // another copy of (s'_2, s'_3; ...)
+ movdqa \d2, \d3 // another copy of (s''_3, s''_2; ...)
.else
- movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i)
.endif
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
+ psrldq \d3, 4 // (0, s''_3; s''_2, s'_3)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
+ pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2)
.else
pmuludq \d2, [\s + 16]
.endif
.endif
- pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
+ pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// carry registers. On completion, XMM3 is clobbered. If CC is
// `nil', then the contribution which would have been added to it is
// left in C.
- pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
- pslldq xmm3, 2 // (t b; 0)
- paddq \c, xmm3 // (c' + t b; c'')
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?)
+ psrldq xmm3, 12 // (0, 0; 0, t) = (0; t)
+ pslldq xmm3, 2 // (0; t b)
+ paddq \c, xmm3 // (c''; c' + t b)
movd \d, \c
psrlq \c, 32 // floor(c/B)
.ifnes "\cc", "nil"
// of the value represented in C are written to D, and the remaining
// bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?; c'' b)
- pslldq \c, 8 // (0; c')
- paddq \t, \c // (?; c' + c'' b)
- psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
+ psllq \t, 16 // (c'' b; ?)
+ pslldq \c, 8 // (c'; 0)
+ paddq \t, \c // (c' + c'' b; ?)
+ psrldq \t, 8 // (0; c' + c'' b) = (0; c)
movd \d, \t
- psrldq \t, 4 // (floor(c/B); 0)
+ psrldq \t, 4 // (0; floor(c/B))
.endm
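+// For cross-checking, a hypothetical C model of the carry fold in
+// `accum' above (not part of the build): a carry holds c = c' + b c''
+// with b = 2^16, one addend per 64-bit lane; fold the low 16 bits of
+// c'' into c', emit c mod B, and shift both lanes to leave floor(c/B)
+// in the same representation.
+//
+//	#include <stdint.h>
+//
+//	uint32_t carry_fold(uint64_t *clo, uint64_t *chi)
+//	{
+//	  uint32_t z;
+//	  *clo += (uint64_t)(uint32_t)*chi << 16; /* c' += b (c'' mod B) */
+//	  z = (uint32_t)*clo;			  /* z = c mod B */
+//	  *clo >>= 32; *chi >>= 32;		  /* c <- floor(c/B) */
+//	  return (z);
+//	}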
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1; a_2, a_3)
+ movdqa \b, \a // (a_3, a_2; a_1, a_0)
.ifnes "\c", "nil"
- movdqa \d, \c // (c_0, c_1; c_2, c_3)
+ movdqa \d, \c // (c_3, c_2; c_1, c_0)
.endif
- punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
+ punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0)
+ punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2)
.ifnes "\c", "nil"
- punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
+ punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0)
+ punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2)
.endif
- pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2)
.ifnes "\c", "nil"
- pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2)
.endif
.endm
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0; y'_2)
- punpckhqdq \c0, \c2 // (y''_0; y''_2)
- punpcklqdq \u, \c3 // (y'_1; y'_3)
- punpckhqdq \c1, \c3 // (y''_1; y''_3)
+ punpcklqdq \t, \c2 // (y'_2; y'_0)
+ punpckhqdq \c0, \c2 // (y''_2; y''_0)
+ punpcklqdq \u, \c3 // (y'_3; y'_1)
+ punpckhqdq \c1, \c3 // (y''_3; y''_1)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0; y''_2)
- psrlq \c1, 16 // high parts of (y''_1; y''_3)
- psrlq \c2, 32 // low parts of (y''_0; y''_2)
- psrlq \c3, 32 // low parts of (y''_1; y''_3)
+ psrlq \c0, 16 // high parts of (y''_2; y''_0)
+ psrlq \c1, 16 // high parts of (y''_3; y''_1)
+ psrlq \c2, 32 // low parts of (y''_2; y''_0)
+ psrlq \c3, 32 // low parts of (y''_3; y''_1)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0; y''_1)
+ pslldq \c1, 8 // high part of (y''_1; 0)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0; y_2)
- paddq \u, \c0 // (y_1; y_3)
+ paddq \t, \c1 // and up: (y_2; y_0)
+ paddq \u, \c0 // (y_3; y_1)
.ifnes "\hi", "nil"
- psrldq \hi, 8 // high part of (y''_3; 0)
+ psrldq \hi, 8 // high part of (0; y''_3)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0; ?)
- movdqa \lo, \t // (y^*_0, ?; ?, ?)
- psrldq \t, 8 // (y_2; 0)
+ movdqa \c3, \t // (?; y_0)
+ movdqa \lo, \t // (?, ?; ?, y^*_0)
+ psrldq \t, 8 // (0; y_2)
- psrlq \c3, 32 // (floor(y_0/B); ?)
- paddq \c3, \u // (y_1 + floor(y_0/B); ?)
+ psrlq \c3, 32 // (?; floor(y_0/B))
+ paddq \c3, \u // (?; y_1 + floor(y_0/B))
- movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
- psrldq \u, 8 // (y_3; 0)
+ movdqa \c1, \c3 // (?, ?; ?, y^*_1)
+ psrldq \u, 8 // (0; y_3)
- psrlq \c3, 32 // (floor((y_1 B + y_0)/B^2; ?)
- paddq \c3, \t // (y_2 + floor((y_1 B + y_0)/B^2; ?)
+ psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2))
+ paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2))
- punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0)
- psrlq \c3, 32 // (floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
- paddq \c3, \u // (y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3; ?)
+ psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3))
+ paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3))
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM7, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [edi + 0] // (a_0; 0)
- movd xmm1, [edi + 4] // (a_1; 0)
- movd xmm2, [edi + 8] // (a_2; 0)
- movd xmm7, [edi + 12] // (a_3; 0)
-
- paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
- paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
- paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
+ movd xmm0, [edi + 0] // (0; a_0)
+ movd xmm1, [edi + 4] // (0; a_1)
+ movd xmm2, [edi + 8] // (0; a_2)
+ movd xmm7, [edi + 12] // (0; a_3)
+
+ paddq xmm4, xmm0 // (c''_0; c'_0 + a_0)
+ paddq xmm5, xmm1 // (c''_1; c'_1 + a_1)
+ paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2)
.endm
///--------------------------------------------------------------------------
.macro testldcarry c
mov ecx, \c // -> c
- movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
- movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
- movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
+ movdqu xmm4, [ecx + 0] // (c''_0; c'_0)
+ movdqu xmm5, [ecx + 16] // (c''_1; c'_1)
+ movdqu xmm6, [ecx + 32] // (c''_2; c'_2)
.endm
.macro testexpand v=nil, y=nil
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
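+// For reference, the scalar quarterround being computed four at a
+// time here is, as a hypothetical C rendering:
+//
+//	#include <stdint.h>
+//
+//	static uint32_t rotl32(uint32_t x, int n)
+//	  { return ((x << n) | (x >> (32 - n))); }
+//
+//	static void quarterround(uint32_t *a, uint32_t *b,
+//				 uint32_t *c, uint32_t *d)
+//	{
+//	  *a += *b; *d ^= *a; *d = rotl32(*d, 16);
+//	  *c += *d; *b ^= *c; *b = rotl32(*b, 12);
+//	  *a += *b; *d ^= *a; *d = rotl32(*d, 8);
+//	  *c += *d; *b ^= *c; *b = rotl32(*b, 7);
+//	}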
//
// The shuffles have quite high latency, so they've mostly been
// pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
+ pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
// Apply the diagonal quarterround to each of the columns
// simultaneously.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
// Finally, finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// to wait for the shuffles.
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
// Decrement the loop counter and see if we should go round again.
sub NR, 2
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // q0 = // (u_0; u_1)
- // q1 = // (v_0; v_1)
+ // q0 = // (u_1; u_0)
+ // q1 = // (v_1; v_0)
vmull.p64 q2, d1, d2 // u_1 v_0
vmull.p64 q3, d0, d3 // u_0 v_1
- vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1
- vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0
+ vmull.p64 q8, d1, d3 // (t_1; x_3) = u_1 v_1
+ vmull.p64 q9, d0, d2 // (x_0; t_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0
+ veor q2, q2, q3 // (m_0; m_1) = u_0 v_1 + u_1 v_0
veor d17, d17, d4 // x_2 = t_1 + m_1
veor d18, d18, d5 // x_1 = t_0 + m_0
- // q8 = // (x_3; x_2)
- // q9 = // (x_1; x_0)
+ // q8 = // (x_2; x_3)
+ // q9 = // (x_0; x_1)
// One-and-a-half problems remain.
//
// This is an inconvenient size. There's nothing for it but to do
// four multiplications, as if for the 128-bit case.
- // q0 = // (u_0 + u_1 t^32; u_2)
- // q1 = // (v_0 + v_1 t^32; v_2)
+ // q0 = // (u_2; u_0 + u_1 t^32)
+ // q1 = // (v_2; v_0 + v_1 t^32)
vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0
vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1
- vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d)
+ vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (d; 0)
vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1
// Piece the product together.
- veor d17, d17, d22 // q8 = // (x_5; x_4)
+ veor d17, d17, d22 // q8 = // (x_4; x_5)
veor d18, d18, d23
- veor d19, d19, d24 // q9 = // (x_3; x_2)
- veor d20, d20, d25 // q10 = // (x_1; x_0)
+ veor d19, d19, d24 // q9 = // (x_2; x_3)
+ veor d20, d20, d25 // q10 = // (x_0; x_1)
// Next, the reduction. Our polynomial this time is p(x) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // q8 = // (y_5; y_4)
- // q9 = // (y_3; y_2)
- // q10 = // (y_1; y_0)
- vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t
+ // q8 = // (y_4; y_5)
+ // q9 = // (y_2; y_3)
+ // q10 = // (y_0; y_1)
+ vshl.u64 q11, q8, #63 // (y_4; y_5) b_i for t
vshl.u64 d28, d18, #63 // y_3 b_i for t
- vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2
+ vshl.u64 q12, q8, #62 // (y_4; y_5) b_i for t^2
vshl.u64 d29, d18, #62 // y_3 b_i for t^2
- vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7
+ vshl.u64 q13, q8, #57 // (y_4; y_5) b_i for t^7
vshl.u64 d30, d18, #57 // y_3 b_i for t^7
veor q11, q11, q12 // mix them all together
veor d28, d28, d29
// And finally shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_5; y'_4)
- // q9 = // (y'_3; y'_2)
- // q10 = // (y'_1; y'_0)
- vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t
+ // q8 = // (y'_4; y'_5)
+ // q9 = // (y'_2; y'_3)
+ // q10 = // (y'_0; y'_1)
+ vshr.u64 q11, q8, #1 // (y_4; y_5) a_i for t
vshr.u64 d28, d18, #1 // y'_3 a_i for t
- vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2
+ vshr.u64 q12, q8, #2 // (y_4; y_5) a_i for t^2
vshr.u64 d29, d18, #2 // y'_3 a_i for t^2
- vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7
+ vshr.u64 q13, q8, #7 // (y_4; y_5) a_i for t^7
vshr.u64 d30, d18, #7 // y'_3 a_i for t^7
veor q8, q8, q11
veor d18, d18, d28
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
- // q0 = // u_0 = (u_00; u_01)
- // q1 = // u_1 = (u_10; u_11)
- // q2 = // v_0 = (v_00; v_01)
- // q3 = // v_1 = (v_10; v_11)
+ // q0 = // u_0 = (u_01; u_00)
+ // q1 = // u_1 = (u_11; u_10)
+ // q2 = // v_0 = (v_01; v_00)
+ // q3 = // v_1 = (v_11; v_10)
- veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11)
- veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11)
+ veor q8, q0, q1 // u_* = (u_01 + u_11; u_00 + u_10)
+ veor q9, q2, q3 // v_* = (v_01 + v_11; v_00 + v_10)
// Start by building the cross product, q = u_* v_*.
vmull.p64 q14, d16, d19 // u_*0 v_*1
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
// First, shift the high bits down.
- // q8 = // (y_7; y_6)
- // q9 = // (y_5; y_4)
- // q10 = // (y_3; y_2)
- // q11 = // (y_1; y_0)
- vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2
- vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2
- vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5
- vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5
- vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10
- vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10
+ // q8 = // (y_6; y_7)
+ // q9 = // (y_4; y_5)
+ // q10 = // (y_2; y_3)
+ // q11 = // (y_0; y_1)
+ vshl.u64 q0, q8, #62 // (y_6; y_7) b_i for t^2
+ vshl.u64 q12, q9, #62 // (y_4; y_5) b_i for t^2
+ vshl.u64 q1, q8, #59 // (y_6; y_7) b_i for t^5
+ vshl.u64 q13, q9, #59 // (y_4; y_5) b_i for t^5
+ vshl.u64 q2, q8, #54 // (y_6; y_7) b_i for t^10
+ vshl.u64 q14, q9, #54 // (y_4; y_5) b_i for t^10
veor q0, q0, q1 // mix the contributions together
veor q12, q12, q13
veor q0, q0, q2
// And then shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_7; y'_6)
- // q9 = // (y'_5; y'_4)
- // q10 = // (y'_3; y'_2)
- // q11 = // (y'_1; y'_0)
- vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2
- vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2
- vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5
- vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5
- vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10
- vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10
+ // q8 = // (y'_6; y'_7)
+ // q9 = // (y'_4; y'_5)
+ // q10 = // (y'_2; y'_3)
+ // q11 = // (y'_0; y'_1)
+ vshr.u64 q0, q8, #2 // (y_6; y_7) a_i for t^2
+ vshr.u64 q12, q9, #2 // (y_4; y_5) a_i for t^2
+ vshr.u64 q1, q8, #5 // (y_6; y_7) a_i for t^5
+ vshr.u64 q13, q9, #5 // (y_4; y_5) a_i for t^5
+ vshr.u64 q2, q8, #10 // (y_6; y_7) a_i for t^10
+ vshr.u64 q14, q9, #10 // (y_4; y_5) a_i for t^10
veor q8, q8, q0 // mix the contributions together
veor q1, q1, q2
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // v0 = // (u_0; u_1)
- // v1/v2 = // (v_0; v_1)
+ // v0 = // (u_1; u_0)
+ // v1/v2 = // (v_1; v_0)
pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0
pmull v4.1q, v0.1d, v2.1d // u_0 v_1
- pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1
- pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0
+ pmull2 v5.1q, v0.2d, v2.2d // (x_3; t_1) = u_1 v_1
+ pmull v6.1q, v0.1d, v1.1d // (t_0; x_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0
- vshr128 v4, v3, 64 // (m_1; 0)
- vshl128 v3, v3, 64 // (0; m_0)
- eor v1.16b, v5.16b, v4.16b // (x_2; x_3)
- eor v0.16b, v6.16b, v3.16b // (x_0; x_1)
+ eor v3.16b, v3.16b, v4.16b // (m_1; m_0) = u_0 v_1 + u_1 v_0
+ vshr128 v4, v3, 64 // (0; m_1)
+ vshl128 v3, v3, 64 // (m_0; 0)
+ eor v1.16b, v5.16b, v4.16b // (x_3; x_2)
+ eor v0.16b, v6.16b, v3.16b // (x_1; x_0)
// And now the only remaining difficulty is that the result needs to
// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128
// leave with z = u v in x2. Clobbers x2--x4.
// The multiplication is thankfully easy.
- // v0 = // (u; ?)
- // v1 = // (v; ?)
+ // v0 = // (?; u)
+ // v1 = // (?; v)
pmull v0.1q, v0.1d, v1.1d // u v
// Now we must reduce. This is essentially the same as the 128-bit
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
- // v0 = // (u_0 + u_1 t^32; u_2)
+ // v0 = // (u_2; u_0 + u_1 t^32)
// v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32)
// v2 = // (v_2; v_2)
pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) t^32 = e_0
pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) t^32 = e_1
- pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0)
+ pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (0; d)
pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
// Clobbers v16--v25.
// Start multiplying and accumulating pieces of product.
- // v0 = // (u_0; u_1)
- // v1 = // (u_2; ?)
+ // v0 = // (u_1; u_0)
+ // v1 = // (?; u_2)
// v2 = // (v_0; v_0)
// v3 = // (v_1; v_1)
// v4 = // (v_2; v_2)
eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
// Piece the product together.
- // v16 = // (a_0; a_1)
- // v19 = // (b_0; b_1)
- // v17 = // (c_0; c_1)
- // v20 = // (d_0; d_1)
- // v18 = // (e_0; e_1)
- vshl128 v21, v19, 64 // (0; b_0)
- ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
- vshr128 v23, v20, 64 // (d_1; 0)
- eor v16.16b, v16.16b, v21.16b // (x_0; x_1)
- eor v17.16b, v17.16b, v22.16b // (x_2; x_3)
- eor v18.16b, v18.16b, v23.16b // (x_2; x_3)
+ // v16 = // (a_1; a_0)
+ // v19 = // (b_1; b_0)
+ // v17 = // (c_1; c_0)
+ // v20 = // (d_1; d_0)
+ // v18 = // (e_1; e_0)
+ vshl128 v21, v19, 64 // (b_0; 0)
+ ext v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1)
+ vshr128 v23, v20, 64 // (0; d_1)
+ eor v16.16b, v16.16b, v21.16b // (x_1; x_0)
+ eor v17.16b, v17.16b, v22.16b // (x_3; x_2)
+ eor v18.16b, v18.16b, v23.16b // (x_5; x_4)
// Next, the reduction. Our polynomial this time is p(x) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // v16 = // (y_0; y_1)
- // v17 = // (y_2; y_3)
- // v18 = // (y_4; y_5)
- mov v19.d[0], v17.d[1] // (y_3; ?)
+ // v16 = // (y_1; y_0)
+ // v17 = // (y_3; y_2)
+ // v18 = // (y_5; y_4)
+ mov v19.d[0], v17.d[1] // (?; y_3)
ushr v23.2d, v18.2d, #63 // hi b_i for t
ushr d20, d19, #63 // lo b_i for t
// Permute the high pieces while we fold in the b_i.
eor v17.16b, v17.16b, v23.16b
vshl128 v20, v20, 64
- mov v19.d[0], v18.d[1] // (y_5; ?)
- ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
+ mov v19.d[0], v18.d[1] // (?; y_5)
+ ext v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3)
eor v16.16b, v16.16b, v20.16b
// And finally shift the low bits up.
- // v16 = // (y'_0; y'_1)
- // v17 = // (y'_2; ?)
- // v18 = // (y'_3; y'_4)
- // v19 = // (y'_5; ?)
+ // v16 = // (y'_1; y'_0)
+ // v17 = // (?; y'_2)
+ // v18 = // (y'_4; y'_3)
+ // v19 = // (?; y'_5)
shl v20.2d, v18.2d, #1
shl d23, d19, #1
shl v21.2d, v18.2d, #2
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
- // v0 = // u_0 = (u_00; u_01)
- // v1 = // u_1 = (u_10; u_11)
+ // v0 = // u_0 = (u_01; u_00)
+ // v1 = // u_1 = (u_11; u_10)
// v2 = // (v_00; v_00)
// v3 = // (v_01; v_01)
// v4 = // (v_10; v_10)
// v5 = // (v_11; v_11)
- eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
+ eor v28.16b, v0.16b, v1.16b // u_* = (u_01 + u_11; u_00 + u_10)
eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11
// Now we must reduce. This is essentially the same as the 192-bit
// case above, but more complicated because everything is bigger.
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
- // v16 = // (y_0; y_1)
- // v17 = // (y_2; y_3)
- // v18 = // (y_4; y_5)
- // v19 = // (y_6; y_7)
- ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2
- ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2
- ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5
- ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5
- ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10
- ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10
+ // v16 = // (y_1; y_0)
+ // v17 = // (y_3; y_2)
+ // v18 = // (y_5; y_4)
+ // v19 = // (y_7; y_6)
+ ushr v24.2d, v18.2d, #62 // (y_5; y_4) b_i for t^2
+ ushr v25.2d, v19.2d, #62 // (y_7; y_6) b_i for t^2
+ ushr v26.2d, v18.2d, #59 // (y_5; y_4) b_i for t^5
+ ushr v27.2d, v19.2d, #59 // (y_7; y_6) b_i for t^5
+ ushr v28.2d, v18.2d, #54 // (y_5; y_4) b_i for t^10
+ ushr v29.2d, v19.2d, #54 // (y_7; y_6) b_i for t^10
eor v24.16b, v24.16b, v26.16b // mix the contributions together
eor v25.16b, v25.16b, v27.16b
eor v24.16b, v24.16b, v28.16b
eor v16.16b, v16.16b, v24.16b
// And then shift the low bits up.
- // v16 = // (y'_0; y'_1)
- // v17 = // (y'_2; y'_3)
- // v18 = // (y'_4; y'_5)
- // v19 = // (y'_6; y'_7)
- shl v24.2d, v18.2d, #2 // (y'_4; y_5) a_i for t^2
- shl v25.2d, v19.2d, #2 // (y_6; y_7) a_i for t^2
- shl v26.2d, v18.2d, #5 // (y'_4; y_5) a_i for t^5
- shl v27.2d, v19.2d, #5 // (y_6; y_7) a_i for t^5
- shl v28.2d, v18.2d, #10 // (y'_4; y_5) a_i for t^10
- shl v29.2d, v19.2d, #10 // (y_6; y_7) a_i for t^10
+ // v16 = // (y'_1; y'_0)
+ // v17 = // (y'_3; y'_2)
+ // v18 = // (y'_5; y'_4)
+ // v19 = // (y'_7; y'_6)
+ shl v24.2d, v18.2d, #2 // (y_5; y_4) a_i for t^2
+ shl v25.2d, v19.2d, #2 // (y_7; y_6) a_i for t^2
+ shl v26.2d, v18.2d, #5 // (y_5; y_4) a_i for t^5
+ shl v27.2d, v19.2d, #5 // (y_7; y_6) a_i for t^5
+ shl v28.2d, v18.2d, #10 // (y_5; y_4) a_i for t^10
+ shl v29.2d, v19.2d, #10 // (y_7; y_6) a_i for t^10
eor v18.16b, v18.16b, v24.16b // mix the contributions together
eor v19.16b, v19.16b, v25.16b
eor v26.16b, v26.16b, v28.16b
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // xmm0 = // (u_1; u_0)
- // xmm1 = // (v_1; v_0)
- movdqa xmm2, xmm1 // (v_1; v_0) again
- movdqa xmm3, xmm0 // (u_1; u_0) again
- movdqa xmm4, xmm0 // (u_1; u_0) yet again
+ // xmm0 = // (u_0; u_1)
+ // xmm1 = // (v_0; v_1)
+ movdqa xmm2, xmm1 // (v_0; v_1) again
+ movdqa xmm3, xmm0 // (u_0; u_1) again
+ movdqa xmm4, xmm0 // (u_0; u_1) yet again
pclmulhqlqdq xmm2, xmm0 // u_1 v_0
pclmullqlqdq xmm0, xmm1 // u_1 v_1
pclmulhqlqdq xmm3, xmm1 // u_0 v_1
pclmulhqhqdq xmm4, xmm1 // u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1
- movdqa xmm1, xmm2 // (m_1; m_0) again
- pslldq xmm2, 8 // (0; m_1)
- psrldq xmm1, 8 // (m_0; 0)
+ pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1
+ movdqa xmm1, xmm2 // (m_0; m_1) again
+ pslldq xmm2, 8 // (m_1; 0)
+ psrldq xmm1, 8 // (0; m_0)
pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1
pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0
// word together, and then the low bits, everything will be fine.
// First, shift the high bits down.
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
- movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again
- movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again
+ movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again
+ movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again
+ movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again
pslld xmm2, 31 // the b_i for t
pslld xmm3, 30 // the b_i for t^2
pslld xmm4, 25 // the b_i for t^7
// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
// First, we must detach the top (`low'!) half of the result.
- movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again
- psrldq xmm1, 8 // (x_1, x_0; 0, 0)
+ movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again
+ psrldq xmm1, 8 // (0, 0; x_0, x_1)
// Next, shift the high bits down.
- movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again
- movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again
- movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again
+ movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again
+ movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again
+ movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again
pslld xmm2, 31 // b_i for t
pslld xmm3, 29 // b_i for t^3
pslld xmm4, 28 // b_i for t^4
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
- // xmm0 = // (0, u_2; u_1, u_0)
- // xmm1 = // (0, v_2; v_1, v_0)
- movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again
- movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again
- movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again
+ // xmm0 = // (u_0, u_1; u_2, 0)
+ // xmm1 = // (v_0, v_1; v_2, 0)
+ movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again
+ movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again
+ movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again
pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0
- pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (0; d)
+ pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (d; 0)
pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1
// registers. The answer we want is d t^128 + e t^64 + f, where e =
// e_0 + e_1.
//
- // The place values for the two halves are (t^160, t^128; t^96, ?)
- // and (?, t^64; t^32, 1). But we also want to shift the high part
+ // The place values for the two halves are (?, t^96; t^128, t^160)
+ // and (1, t^32; t^64, ?). But we also want to shift the high part
// left by a word, for symmetry's sake.
- psrldq xmm0, 8 // (d; 0) = d t^128
+ psrldq xmm0, 8 // (0; d) = d t^128
pxor xmm2, xmm3 // e = (e_0 + e_1)
movdqa xmm1, xmm4 // f again
pxor xmm0, xmm2 // d t^128 + e t^64
// are unimportant. Clobbers xmm2--xmm7.
// Start multiplying and accumulating pieces of product.
- // xmm0 = // (u_2; u_1)
- // xmm1 = // (u_0; ?)
- // xmm2 = // (v_2; v_1)
- // xmm3 = // (v_0; ?)
- movdqa xmm4, xmm0 // (u_2; u_1) again
- movdqa xmm5, xmm0 // (u_2; u_1) yet again
- movdqa xmm6, xmm0 // (u_2; u_1) again again
- movdqa xmm7, xmm3 // (v_0; ?) again
- punpcklqdq xmm3, xmm1 // (v_0; u_0)
+ // xmm0 = // (u_1; u_2)
+ // xmm1 = // (?; u_0)
+ // xmm2 = // (v_1; v_2)
+ // xmm3 = // (?; v_0)
+ movdqa xmm4, xmm0 // (u_1; u_2) again
+ movdqa xmm5, xmm0 // (u_1; u_2) yet again
+ movdqa xmm6, xmm0 // (u_1; u_2) again again
+ movdqa xmm7, xmm3 // (?; v_0) again
+ punpcklqdq xmm3, xmm1 // (u_0; v_0)
pclmulhqhqdq xmm4, xmm2 // u_1 v_1
pclmullqlqdq xmm1, xmm2 // u_0 v_2
pclmullqhqdq xmm5, xmm2 // u_2 v_1
pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1
pclmullqlqdq xmm7, xmm0 // u_2 v_0
pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
- movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny
+ movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny
pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
pclmulhqlqdq xmm6, xmm3 // u_1 v_0
// Next, the piecing together of the product. There's significant
// work here to leave the completed pieces in sensible registers.
- // xmm0 = // (a_1; a_0) = a = u_2 v_2
- // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
- // xmm1 = // (c_1; c_0) = c = u_0 v_2 +
+ // xmm0 = // (a_0; a_1) = a = u_2 v_2
+ // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1
+ // xmm1 = // (c_0; c_1) = c = u_0 v_2 +
// u_1 v_1 + u_2 v_0
- // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
- // xmm3 = // (e_1; e_0) = e = u_0 v_0
+ // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0
+ // xmm3 = // (e_0; e_1) = e = u_0 v_0
// xmm2, xmm4, xmm7 spare
- movdqa xmm2, xmm6 // (d_1; d_0) again
- movdqa xmm4, xmm5 // (b_1; b_0) again
- pslldq xmm6, 8 // (0; d_1)
- psrldq xmm5, 8 // (b_0; 0)
- psrldq xmm2, 8 // (d_0; 0)
- pslldq xmm4, 8 // (0; b_1)
- pxor xmm5, xmm6 // (b_0; d_1)
- pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1)
- pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0)
- pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1)
+ movdqa xmm2, xmm6 // (d_0; d_1) again
+ movdqa xmm4, xmm5 // (b_0; b_1) again
+ pslldq xmm6, 8 // (d_1; 0)
+ psrldq xmm5, 8 // (0; b_0)
+ psrldq xmm2, 8 // (0; d_0)
+ pslldq xmm4, 8 // (b_1; 0)
+ pxor xmm5, xmm6 // (d_1; b_0)
+ pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1)
+ pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0)
+ pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1)
// Next, the reduction. Our polynomial this time is p(x) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // xmm0 = // (x_5; x_4)
- // xmm1 = // (x_3; x_2)
- // xmm2 = // (x_1; x_0)
+ // xmm0 = // (x_4; x_5)
+ // xmm1 = // (x_2; x_3)
+ // xmm2 = // (x_0; x_1)
// xmm3--xmm7 spare
- movdqa xmm3, xmm0 // (x_5; x_4) copy
- movdqa xmm4, xmm0 // (x_5; x_4) copy
- movdqa xmm5, xmm0 // (x_5; x_4) copy
- pslld xmm3, 31 // (x_5; x_4) b_i for t
- pslld xmm4, 30 // (x_5; x_4) b_i for t^2
- pslld xmm5, 25 // (x_5; x_4) b_i for t^7
- movq xmm6, xmm1 // (x_3; 0) copy
+ movdqa xmm3, xmm0 // (x_4; x_5) copy
+ movdqa xmm4, xmm0 // (x_4; x_5) copy
+ movdqa xmm5, xmm0 // (x_4; x_5) copy
+ pslld xmm3, 31 // (x_4; x_5) b_i for t
+ pslld xmm4, 30 // (x_4; x_5) b_i for t^2
+ pslld xmm5, 25 // (x_4; x_5) b_i for t^7
+ movq xmm6, xmm1 // (0; x_3) copy
pxor xmm3, xmm4
- movq xmm7, xmm1 // (x_3; 0) copy
+ movq xmm7, xmm1 // (0; x_3) copy
pxor xmm3, xmm5
- movq xmm5, xmm1 // (x_3; 0) copy
- movdqa xmm4, xmm3 // (x_5; x_4) b_i combined
- pslld xmm6, 31 // (x_3; 0) b_i for t
- pslld xmm7, 30 // (x_3; 0) b_i for t^2
- pslld xmm5, 25 // (x_3; 0) b_i for t^7
- psrldq xmm3, 12 // (x_5; x_4) low contrib
- pslldq xmm4, 4 // (x_5; x_4) high contrib
+ movq xmm5, xmm1 // (0; x_3) copy
+ movdqa xmm4, xmm3 // (x_4; x_5) b_i combined
+ pslld xmm6, 31 // (0; x_3) b_i for t
+ pslld xmm7, 30 // (0; x_3) b_i for t^2
+ pslld xmm5, 25 // (0; x_3) b_i for t^7
+ psrldq xmm3, 12 // (x_4; x_5) low contrib
+ pslldq xmm4, 4 // (x_4; x_5) high contrib
pxor xmm6, xmm7
pxor xmm2, xmm3
pxor xmm6, xmm5
// And finally shift the low bits up. Unfortunately, we also have to
// split the low bits out.
- // xmm0 = // (x'_5; x'_4)
- // xmm1 = // (x'_3; x'_2)
- // xmm2 = // (x'_1; x'_0)
- movdqa xmm5, xmm1 // copies of (x'_3; x'_2)
+ // xmm0 = // (x'_4; x'_5)
+ // xmm1 = // (x'_2; x'_3)
+ // xmm2 = // (x'_0; x'_1)
+ movdqa xmm5, xmm1 // copies of (x'_2; x'_3)
movdqa xmm6, xmm1
movdqa xmm7, xmm1
- psrldq xmm1, 8 // bring down (x'_2; ?)
- movdqa xmm3, xmm0 // copies of (x'_5; x'_4)
+ psrldq xmm1, 8 // bring down (?; x'_2)
+ movdqa xmm3, xmm0 // copies of (x'_4; x'_5)
movdqa xmm4, xmm0
- punpcklqdq xmm1, xmm2 // (x'_2; x'_1)
- psrldq xmm2, 8 // (x'_0; ?)
+ punpcklqdq xmm1, xmm2 // (x'_1; x'_2)
+ psrldq xmm2, 8 // (?; x'_0)
pxor xmm2, xmm5 // low half and unit contrib
pxor xmm1, xmm0
psrld xmm5, 1
pxor xmm0, xmm4
pxor xmm5, xmm2 // mix everything together
pxor xmm0, xmm1
- movq xmm1, xmm5 // shunt (z_0; ?) into proper place
+ movq xmm1, xmm5 // shunt (?; z_0) into proper place
.endm
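+// A slow bitwise reference for the carry-less multiplications done
+// with the `pclmul*' instructions above (hypothetical C, for cross-
+// checking only):
+//
+//	#include <stdint.h>
+//
+//	static void clmul64(uint64_t u, uint64_t v,
+//			    uint64_t *hi, uint64_t *lo)
+//	{
+//	  uint64_t h = 0, l = 0;
+//	  int i;
+//	  for (i = 0; i < 64; i++)
+//	    if ((v >> i) & 1) {
+//	      l ^= u << i;			/* low bits of u t^i */
+//	      if (i) h ^= u >> (64 - i);	/* high bits of u t^i */
+//	    }
+//	  *hi = h; *lo = l;
+//	}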
.macro mul256
// On x86, there aren't quite enough registers, so spill one for a
// bit. On AMD64, we can keep on going, so it's all good.
- // xmm0 = // u_1 = (u_11; u_10)
- // xmm1 = // u_0 = (u_01; u_00)
- // xmm2 = // v_1 = (v_11; v_10)
- // xmm3 = // v_0 = (v_01; v_00)
+ // xmm0 = // u_1 = (u_10; u_11)
+ // xmm1 = // u_0 = (u_00; u_01)
+ // xmm2 = // v_1 = (v_10; v_11)
+ // xmm3 = // v_0 = (v_00; v_01)
movdqa xmm4, xmm0 // u_1 again
#if CPUFAM_X86
movdqa [SP + 0], xmm3
movdqa xmm8, xmm3
# define V0 xmm8
#endif
- pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10)
- pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10)
+ pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11)
+ pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11)
// Start by building the cross product, q = u_* v_*.
movdqa xmm7, xmm4 // more copies of u_*
// the /last/ byte in the block. If the block size is not a multiple of
// 16 bytes, then there must be padding. 96-bit blocks are weird: the
// padding is inserted at the /least/ significant end, so the register
-// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
+// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most
// significant end.
//
// * The `words' format consists of a sequence of bytes, as in the
endprologue
movdqu xmm0, [A]
movdqu xmm1, [K]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
mul128
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
movdqu [A], xmm0
ret
ENDFUNC
endprologue
movq xmm0, [A]
movq xmm1, [K]
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
mul64
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
movq [A], xmm0
ret
ENDFUNC
movd xmm2, [A + 8]
movdqu xmm1, [K]
punpcklqdq xmm0, xmm2
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
mul96
- pshufd xmm1, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm0, SHUF(0, 1, 2, 3)
psrldq xmm0, 4
movq [A + 0], xmm1
movd [A + 8], xmm0
movq xmm1, [A + 0]
movdqu xmm2, [K + 0]
movq xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
mul192
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
movdqu [A + 8], xmm0
movq [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
movdqu xmm1, [A + 0]
movdqu xmm2, [K + 0]
movdqu xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
mul256
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
movdqu [A + 16], xmm0
movdqu [A + 0], xmm1
#if CPUFAM_X86
// Fourth word of the cycle, and seven or eight words of key. Do a
// byte substitution.
movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(3, 0, 1, 2)
+ pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
jmp 2f
// First word of the cycle. This is the complicated piece.
1: movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(1, 2, 3, 0)
+ pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movd eax, xmm1
xor al, [RCON]
inc RCON
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
+ pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
paddd xmm4, xmm2
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
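+// For reference, the scalar quarterround being computed four at a
+// time here is, as a hypothetical C rendering:
+//
+//	#include <stdint.h>
+//
+//	static uint32_t rotl32(uint32_t x, int n)
+//	  { return ((x << n) | (x >> (32 - n))); }
+//
+//	static void quarterround(uint32_t *a, uint32_t *b,
+//				 uint32_t *c, uint32_t *d)
+//	{
+//	  *b ^= rotl32(*a + *d, 7);
+//	  *c ^= rotl32(*b + *a, 9);
+//	  *d ^= rotl32(*c + *b, 13);
+//	  *a ^= rotl32(*d + *c, 18);
+//	}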
// input. This can be done by juggling values in registers, with the
// following fancy footwork: some row rotations, a transpose, and
// some more rotations.
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12
movdqa xmm4, xmm0
movdqa xmm5, xmm3
punpckhdq xmm1, xmm3 // 5, 6, 7, 4
punpckhdq xmm2, xmm5 // 15, 12, 13, 14
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7
- pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11
- pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7
+ pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11
+ pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15
// Finally we have to write out the result.
movdqu [OUT + 0], xmm0