This makes operations which involve changing one's perspective on the
SIMD processing elements significantly easier to follow. In particular,
I hope that this removes a layer of brain-twisting from the GCM code.
* Adjust all of the register-contents diagrams so that less
significant elements are on the right, rather than on the left.
* Change the x86 `SHUF' macro so that the desired pieces are listed in
decreasing significance order, so `SHUF(3, 2, 1, 0)' would be a
no-op.
I would, of course, continue to use big-endian notation on a target
which actually used a big-endian ordering natively, but we don't
currently support any of them.
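For illustration (a quick sketch of my own, not part of the patch), the
new SHUF encoding can be modelled in portable C: SHUF(3, 2, 1, 0)
encodes as 0xe4, which is indeed the identity permutation for `pshufd'.

    #include <assert.h>
    #include <stdint.h>

    #define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))

    /* Model `pshufd dst, src, imm8': destination element i is source
     * element (imm8 >> 2i) & 3, for 0 <= i < 4. */
    static void pshufd_model(uint32_t d[4], const uint32_t s[4], int imm8)
    {
        int i;
        for (i = 0; i < 4; i++) d[i] = s[(imm8 >> (2*i)) & 3];
    }

    int main(void)
    {
        uint32_t s[4] = { 10, 11, 12, 13 }, d[4];
        int i;

        assert(SHUF(3, 2, 1, 0) == 0xe4);       /* the identity immediate */
        pshufd_model(d, s, SHUF(3, 2, 1, 0));   /* no-op */
        for (i = 0; i < 4; i++) assert(d[i] == s[i]);
        pshufd_model(d, s, SHUF(0, 1, 2, 3));   /* full reversal */
        for (i = 0; i < 4; i++) assert(d[i] == s[3 - i]);
        return 0;
    }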
# define INTADDR__1(addr, got) addr
#endif
-// Permutations for SIMD instructions. SHUF(A, B, C, D) is an immediate,
+// Permutations for SIMD instructions. SHUF(D, C, B, A) is an immediate,
// suitable for use in `pshufd' or `shufps', which copies element A
// (0 <= A < 4) of the source to element 0 of the destination, element B to
// element 1, element C to element 2, and element D to element 3.
-#define SHUF(a, b, c, d) ((a) + 4*(b) + 16*(c) + 64*(d))
+#define SHUF(d, c, b, a) (64*(d) + 16*(c) + 4*(b) + (a))
// Map register names to their individual pieces.
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v'_1 v''_0 v''_1
-/// 16 v'_2 v'_3 v''_2 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v''_0 v'_1 v'_0
+/// 16 v''_3 v''_2 v'_3 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
.macro mulcore r, i, slo, shi, d0, d1=nil, d2=nil, d3=nil
// Multiply R_I by the expanded operand SLO/SHI, and leave the pieces
// of the product in registers D0, D1, D2, D3.
- pshufd \d0, \r, SHUF(\i, 3, \i, 3) // (r_i, ?; r_i, ?)
+ pshufd \d0, \r, SHUF(3, \i, 3, \i) // (?, r_i; ?, r_i)
- movdqa \d1, \slo // (s'_0, s'_1; s''_0, s''_1)
+ movdqa \d1, \slo // (s''_1, s''_0; s'_1, s'_0)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, \shi // (s'_2, s'_3; s''_2, s''_3)
+ movdqa \d3, \shi // (s''_3, s''_2; s'_3, s'_2)
.endif
.ifnes "\d1", "nil"
- psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
+ psrldq \d1, 4 // (0, s''_1; s''_0, s'_1)
.endif
.ifnes "\d2", "nil"
- movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i)
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
+ psrldq \d3, 4 // (0, s''_3; s''_2, s'_3)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3)
.endif
.ifnes "\d2", "nil"
- pmuludq \d2, \shi // (r_i s'_2; r_i s''_2)
+ pmuludq \d2, \shi // (r_i s''_2; r_i s'_2)
- pmuludq \d0, \slo // (r_i s'_0; r_i s''_0)
+ pmuludq \d0, \slo // (r_i s''_0; r_i s'_0)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// lane 0 or 1 of D; the high two lanes of D are clobbered. On
// completion, XMM3 is clobbered. If CC is `nil', then the
// contribution which would have been added to it is left in C.
- pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0; 0, 0) = (t; 0)
- pslldq xmm3, 2 // (t b; 0)
- paddq \c, xmm3 // (c' + t b; c'')
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?)
+ psrldq xmm3, 12 // (0, 0; 0, t) = (0; t)
+ pslldq xmm3, 2 // (0; t b)
+ paddq \c, xmm3 // (c''; c' + t b)
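Aside, for illustration only (my sketch, not part of the patch): a
scalar C model of the carry fold performed by the four instructions
above, assuming b = 2^16 and B = 2^32 as elsewhere in this code. The
low 32 bits of the double-prime carry are folded into the prime carry
with weight b; the rest of c'' is handled separately.

    #include <assert.h>
    #include <stdint.h>

    /* Given the carry pair (c''; c'), return c' + (c'' mod B) b. */
    static uint64_t fold_carry(uint64_t c_prime, uint64_t c_dblprime)
    {
        uint64_t t = c_dblprime & 0xffffffff;   /* t = c'' mod B */
        return c_prime + (t << 16);             /* c' + t b */
    }

    int main(void)
    {
        assert(fold_carry(5, 0x100000003ull) == 5 + (3ull << 16));
        return 0;
    }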
.ifeqs "\pos", "lo"
movdqa \d, \c
.else
// of the value represented in C are written at POS in D, and the
// remaining bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?; c'' b)
- pslldq \c, 8 // (0; c')
- paddq \t, \c // (?; c' + c'' b)
- psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
+ psllq \t, 16 // (c'' b; ?)
+ pslldq \c, 8 // (c'; 0)
+ paddq \t, \c // (c' + c'' b; ?)
+ psrldq \t, 8 // (0; c' + c'' b) = (0; c)
.ifeqs "\pos", "lo"
movdqa \d, \t
.else
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1; a_2, a_3)
+ movdqa \b, \a // (a_3, a_2; a_1, a_0)
- movdqa \d, \c // (c_0, c_1; c_2, c_3)
+ movdqa \d, \c // (c_3, c_2; c_1, c_0)
- punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
+ punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0)
+ punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2)
- punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
+ punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0)
+ punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2)
- pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2)
- pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2)
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0; y'_2)
- punpckhqdq \c0, \c2 // (y''_0; y''_2)
- punpcklqdq \u, \c3 // (y'_1; y'_3)
- punpckhqdq \c1, \c3 // (y''_1; y''_3)
+ punpcklqdq \t, \c2 // (y'_2; y'_0)
+ punpckhqdq \c0, \c2 // (y''_2; y''_0)
+ punpcklqdq \u, \c3 // (y'_3; y'_1)
+ punpckhqdq \c1, \c3 // (y''_3; y''_1)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0; y''_2)
- psrlq \c1, 16 // high parts of (y''_1; y''_3)
- psrlq \c2, 32 // low parts of (y''_0; y''_2)
- psrlq \c3, 32 // low parts of (y''_1; y''_3)
+ psrlq \c0, 16 // high parts of (y''_2; y''_0)
+ psrlq \c1, 16 // high parts of (y''_3; y''_1)
+ psrlq \c2, 32 // low parts of (y''_2; y''_0)
+ psrlq \c3, 32 // low parts of (y''_3; y''_1)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0; y''_1)
+ pslldq \c1, 8 // high part of (y''_1; 0)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0; y_2)
- paddq \u, \c0 // (y_1; y_3)
+ paddq \t, \c1 // and up: (y_2; y_0)
+ paddq \u, \c0 // (y_3; y_1)
- psrldq \hi, 8 // high part of (y''_3; 0)
+ psrldq \hi, 8 // high part of (0; y''_3)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0; ?)
- movdqa \lo, \t // (y^*_0, ?; ?, ?)
- psrldq \t, 8 // (y_2; 0)
+ movdqa \c3, \t // (?; y_0)
+ movdqa \lo, \t // (?, ?; ?, y^*_0)
+ psrldq \t, 8 // (0; y_2)
psrlq \c3, 32 // (?; floor(y_0/B))
paddq \c3, \u // (?; y_1 + floor(y_0/B))
- movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
- psrldq \u, 8 // (y_3; 0)
+ movdqa \c1, \c3 // (?, ?; ?, y^*_1)
+ psrldq \u, 8 // (0; y_3)
psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2))
paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2))
- punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0)
psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3))
paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3))
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM15, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [rdi + 0] // (a_0; 0)
- movd xmm1, [rdi + 4] // (a_1; 0)
- movd xmm2, [rdi + 8] // (a_2; 0)
- movd xmm15, [rdi + 12] // (a_3; 0)
- paddq xmm12, xmm0 // (c'_0 + a_0; c''_0)
- paddq xmm13, xmm1 // (c'_1 + a_1; c''_1)
- paddq xmm14, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
+ movd xmm0, [rdi + 0] // (0; a_0)
+ movd xmm1, [rdi + 4] // (0; a_1)
+ movd xmm2, [rdi + 8] // (0; a_2)
+ movd xmm15, [rdi + 12] // (0; a_3)
+ paddq xmm12, xmm0 // (c''_0; c'_0 + a_0)
+ paddq xmm13, xmm1 // (c''_1; c'_1 + a_1)
+ paddq xmm14, xmm2 // (c''_2 + a_3 b; c'_2 + a_2)
.endm
///--------------------------------------------------------------------------
mulcore xmm7, 1, xmm10, xmm11, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm10, xmm11, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm10, xmm11, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
mulcore xmm7, 1, xmm8, xmm9, xmm0, xmm1, xmm2
accum xmm4, xmm5, xmm6
- punpckldq xmm12, xmm15 // (w_0, 0; w_1, 0)
- punpckhdq xmm14, xmm15 // (w_2, 0; w_3, 0)
+ punpckldq xmm12, xmm15 // (0, w_1; 0, w_0)
+ punpckhdq xmm14, xmm15 // (0, w_3; 0, w_2)
mulcore xmm7, 2, xmm8, xmm9, xmm0, xmm1
accum xmm5, xmm6
mulcore xmm7, 3, xmm8, xmm9, xmm0
accum xmm6
- punpckldq xmm12, xmm2 // (w_0, 0; 0, 0)
- punpckldq xmm14, xmm2 // (w_2, 0; 0, 0)
- punpckhdq xmm13, xmm2 // (w_1, 0; 0, 0)
- punpckhdq xmm15, xmm2 // (w_3, 0; 0, 0)
+ punpckldq xmm12, xmm2 // (0, 0; 0, w_0)
+ punpckldq xmm14, xmm2 // (0, 0; 0, w_2)
+ punpckhdq xmm13, xmm2 // (0, 0; 0, w_1)
+ punpckhdq xmm15, xmm2 // (0, 0; 0, w_3)
// That's lots of pieces. Now we have to assemble the answer.
squash xmm3, xmm4, xmm5, xmm6, xmm0, xmm1, xmm10
- movdqu xmm12, [rcx + 0] // (c'_0; c''_0)
- movdqu xmm13, [rcx + 16] // (c'_1; c''_1)
- movdqu xmm14, [rcx + 32] // (c'_2; c''_2)
+ movdqu xmm12, [rcx + 0] // (c''_0; c'_0)
+ movdqu xmm13, [rcx + 16] // (c''_1; c'_1)
+ movdqu xmm14, [rcx + 32] // (c''_2; c'_2)
.endm
.macro testtop u=nil
testtop r11
call mmul4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop r11
call mmla4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
testtop
call mont4
testtail
- pshufd xmm10, xmm10, SHUF(0, 2, 1, 3)
- pshufd xmm11, xmm11, SHUF(0, 2, 1, 3)
+ pshufd xmm10, xmm10, SHUF(3, 1, 2, 0)
+ pshufd xmm11, xmm11, SHUF(3, 1, 2, 0)
movdqu [r10 + 0], xmm10
movdqu [r10 + 16], xmm11
testcarryout
/// pieces are placed into 32-bit cells, and arranged as two 128-bit NEON
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v''_0 v'_1 v''_1
-/// 16 v'_2 v''_2 v'_3 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v'_1 v''_0 v'_0
+/// 16 v''_3 v'_3 v''_2 v'_2
///
/// The `vmull' and `vmlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(1)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(2)] // = n
ldr r6, [STKARG(3)] // -> cyv
vld1.32 {q4}, [r3]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(0)] // = n
ldr r6, [STKARG(1)] // -> cyv
ldr r14, [STKARG(1)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r14, [STKARG(2)] // -> yy
vld1.32 {q4}, [r14]
vmov.i32 q5, #0
- vzip.16 q4, q5 // (y'_0, y''_0; y'_1, y''_1)
+ vzip.16 q4, q5 // (y''_1, y'_1; y''_0, y'_0)
ldr r5, [STKARG(3)] // = n
ldr r6, [STKARG(4)] // -> cyv
ldr r14, [STKARG(0)] // -> vv
vld1.32 {q2}, [r14]
vmov.i32 q3, #0
- vzip.16 q2, q3 // (v'_0, v''_0; v'_1, v''_1)
+ vzip.16 q2, q3 // (v''_1, v'_1; v''_0, v'_0)
ldr r5, [STKARG(1)] // = n
ldr r6, [STKARG(2)] // -> cyv
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SIMD
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v''_0 v'_1 v''_1
-/// 16 v'_2 v''_2 v'_3 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v'_1 v''_0 v'_0
+/// 16 v''_3 v'_3 v''_2 v'_2
///
/// The `umull' and `umlal' instructions can multiply a vector of two 32-bit
/// values by a 32-bit scalar, giving two 64-bit results; thus, it will act
// leaving a carry in CG.
//
// In detail, what happens is as follows. Suppose initially that ZLO =
-// (z'_i; z''_i) and ZHI = (z'_{i+1}; z''_{i+1}). Let t = z'_i + b z''_i;
+// (z''_i; z'_i) and ZHI = (z''_{i+1}; z'_{i+1}). Let t = z'_i + b z''_i;
// observe that floor(t/b) = floor(z'_i/b) + z''_i. Let z_i = t mod B, and
// add floor(t/B) = floor((floor(z'_i/b) + z''_i)/b) onto z'_{i+1}. This has
// a circuit depth of 3; I don't know how to do better.
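Aside, for illustration only (my sketch, not part of the patch): the two
identities above are easy to check numerically, with b = 2^16 and
B = b^2 = 2^32.

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t b = 1ull << 16, B = 1ull << 32;  /* B = b^2 */
        uint64_t zp = 0x123456789aull;    /* z'_i, arbitrary test value */
        uint64_t zpp = 0xfedcba9876ull;   /* z''_i, arbitrary test value */
        uint64_t t = zp + b*zpp;          /* small enough not to overflow */

        assert(t/b == zp/b + zpp);      /* floor(t/b) = floor(z'_i/b) + z''_i */
        assert(t/B == (zp/b + zpp)/b);  /* the amount carried onto z'_{i+1} */
        return 0;
    }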
.ifeqs "\mode", "dmul"
ldr q2, [x4]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x2 // -> u
.ifeqs "\mode", "smul"
ldr q4, [x3]
- zip2 v5.8h, v4.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v4.8h, v4.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v5.8h, v4.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v4.8h, v4.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
// x2 // -> x
mov x3, x1 // -> c
.ifeqs "\mode", "mmul"
ldr q2, [x5]
- zip2 v3.8h, v2.8h, v31.8h // (v'_2, v''_2; v'_3, v''_3)
- zip1 v2.8h, v2.8h, v31.8h // (v'_0, v''_0; v'_1, v''_1)
+ zip2 v3.8h, v2.8h, v31.8h // (v''_3, v'_3; v''_2, v'_2)
+ zip1 v2.8h, v2.8h, v31.8h // (v''_1, v'_1; v''_0, v'_0)
- zip2 v7.8h, v6.8h, v31.8h // (y'_2, y''_2; y'_3, y''_3)
- zip1 v6.8h, v6.8h, v31.8h // (y'_0, y''_0; y'_1, y''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (y''_3, y'_3; y''_2, y'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (y''_1, y'_1; y''_0, y'_0)
mov x16, x1
mov x1, x3 // -> u
.ifeqs "\mode", "mont"
ldr q6, [x4]
- zip2 v7.8h, v6.8h, v31.8h // (m'_2, m''_2; m'_3, m''_3)
- zip1 v6.8h, v6.8h, v31.8h // (m'_0, m''_0; m'_1, m''_1)
+ zip2 v7.8h, v6.8h, v31.8h // (m''_3, m'_3; m''_2, m'_2)
+ zip1 v6.8h, v6.8h, v31.8h // (m''_1, m'_1; m''_0, m'_0)
mov x4, x2 // -> y
mov x2, x3 // -> x
/// pieces are placed into 32-bit cells, and arranged as two 128-bit SSE
/// operands, as follows.
///
-/// Offset 0 4 8 12
-/// 0 v'_0 v'_1 v''_0 v''_1
-/// 16 v'_2 v'_3 v''_2 v''_3
+/// Offset 12 8 4 0
+/// 0 v''_1 v''_0 v'_1 v'_0
+/// 16 v''_3 v''_2 v'_3 v'_2
///
/// A `pmuludq' instruction ignores the odd positions in its operands; thus,
/// it will act on (say) v'_0 and v''_0 in a single instruction. Shifting
.macro mulcore r, s, d0, d1=nil, d2=nil, d3=nil
// Load a word r_i from R, multiply by the expanded operand [S], and
// leave the pieces of the product in registers D0, D1, D2, D3.
- movd \d0, \r // (r_i, 0; 0, 0)
+ movd \d0, \r // (0, 0; 0, r_i)
- movdqa \d1, [\s] // (s'_0, s'_1; s''_0, s''_1)
+ movdqa \d1, [\s] // (s''_1, s''_0; s'_1, s'_0)
.endif
.ifnes "\d3", "nil"
- movdqa \d3, [\s + 16] // (s'_2, s'_3; s''_2, s''_3)
+ movdqa \d3, [\s + 16] // (s''_3, s''_2; s'_3, s'_2)
- pshufd \d0, \d0, SHUF(0, 3, 0, 3) // (r_i, ?; r_i, ?)
+ pshufd \d0, \d0, SHUF(3, 0, 3, 0) // (?, r_i; ?, r_i)
- psrldq \d1, 4 // (s'_1, s''_0; s''_1, 0)
+ psrldq \d1, 4 // (0, s''_1; s''_0, s'_1)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
movdqa \d2, \d3 // another copy of (s''_3, s''_2; ...)
.else
- movdqa \d2, \d0 // another copy of (r_i, ?; r_i, ?)
+ movdqa \d2, \d0 // another copy of (?, r_i; ?, r_i)
.endif
.endif
.ifnes "\d3", "nil"
- psrldq \d3, 4 // (s'_3, s''_2; s''_3, 0)
+ psrldq \d3, 4 // (0, s''_3; s''_2, s'_3)
.endif
.ifnes "\d1", "nil"
- pmuludq \d1, \d0 // (r_i s'_1; r_i s''_1)
+ pmuludq \d1, \d0 // (r_i s''_1; r_i s'_1)
.endif
.ifnes "\d3", "nil"
- pmuludq \d3, \d0 // (r_i s'_3; r_i s''_3)
+ pmuludq \d3, \d0 // (r_i s''_3; r_i s'_3)
.endif
.ifnes "\d2", "nil"
.ifnes "\d3", "nil"
- pmuludq \d2, \d0 // (r_i s'_2; r_i s''_2)
+ pmuludq \d2, \d0 // (r_i s''_2; r_i s'_2)
.else
pmuludq \d2, [\s + 16]
.endif
.endif
- pmuludq \d0, [\s] // (r_i s'_0; r_i s''_0)
+ pmuludq \d0, [\s] // (r_i s''_0; r_i s'_0)
.endm
.macro accum c0, c1=nil, c2=nil, c3=nil
// carry registers. On completion, XMM3 is clobbered. If CC is
// `nil', then the contribution which would have been added to it is
// left in C.
- pshufd xmm3, \c, SHUF(3, 3, 3, 2) // (?, ?; ?, t = c'' mod B)
- psrldq xmm3, 12 // (t, 0; 0, 0) = (t, 0)
- pslldq xmm3, 2 // (t b; 0)
- paddq \c, xmm3 // (c' + t b; c'')
+ pshufd xmm3, \c, SHUF(2, 3, 3, 3) // (t = c'' mod B, ?; ?, ?)
+ psrldq xmm3, 12 // (0, 0; 0, t) = (0; t)
+ pslldq xmm3, 2 // (0; t b)
+ paddq \c, xmm3 // (c''; c' + t b)
movd \d, \c
psrlq \c, 32 // floor(c/B)
.ifnes "\cc", "nil"
// of the value represented in C are written to D, and the remaining
// bits are left at the bottom of T.
movdqa \t, \c
- psllq \t, 16 // (?; c'' b)
- pslldq \c, 8 // (0; c')
- paddq \t, \c // (?; c' + c'' b)
- psrldq \t, 8 // (c' + c'' b; 0) = (c; 0)
+ psllq \t, 16 // (c'' b; ?)
+ pslldq \c, 8 // (c'; 0)
+ paddq \t, \c // (c' + c'' b; ?)
+ psrldq \t, 8 // (0; c' + c'' b) = (0; c)
movd \d, \t
psrldq \t, 4 // (0; floor(c/B))
.endm
// On entry, A and C hold packed 128-bit values, and Z is zero. On
// exit, A:B and C:D together hold the same values in expanded
// form. If C is `nil', then only expand A to A:B.
- movdqa \b, \a // (a_0, a_1; a_2, a_3)
+ movdqa \b, \a // (a_3, a_2; a_1, a_0)
- movdqa \d, \c // (c_0, c_1; c_2, c_3)
+ movdqa \d, \c // (c_3, c_2; c_1, c_0)
- punpcklwd \a, \z // (a'_0, a''_0; a'_1, a''_1)
- punpckhwd \b, \z // (a'_2, a''_2; a'_3, a''_3)
+ punpcklwd \a, \z // (a''_1, a'_1; a''_0, a'_0)
+ punpckhwd \b, \z // (a''_3, a'_3; a''_2, a'_2)
- punpcklwd \c, \z // (c'_0, c''_0; c'_1, c''_1)
- punpckhwd \d, \z // (c'_2, c''_2; c'_3, c''_3)
+ punpcklwd \c, \z // (c''_1, c'_1; c''_0, c'_0)
+ punpckhwd \d, \z // (c''_3, c'_3; c''_2, c'_2)
- pshufd \a, \a, SHUF(0, 2, 1, 3) // (a'_0, a'_1; a''_0, a''_1)
- pshufd \b, \b, SHUF(0, 2, 1, 3) // (a'_2, a'_3; a''_2, a''_3)
+ pshufd \a, \a, SHUF(3, 1, 2, 0) // (a''_1, a''_0; a'_1, a'_0)
+ pshufd \b, \b, SHUF(3, 1, 2, 0) // (a''_3, a''_2; a'_3, a'_2)
- pshufd \c, \c, SHUF(0, 2, 1, 3) // (c'_0, c'_1; c''_0, c''_1)
- pshufd \d, \d, SHUF(0, 2, 1, 3) // (c'_2, c'_3; c''_2, c''_3)
+ pshufd \c, \c, SHUF(3, 1, 2, 0) // (c''_1, c''_0; c'_1, c'_0)
+ pshufd \d, \d, SHUF(3, 1, 2, 0) // (c''_3, c''_2; c'_3, c'_2)
// we can do that, we must gather them together.
movdqa \t, \c0
movdqa \u, \c1
- punpcklqdq \t, \c2 // (y'_0; y'_2)
- punpckhqdq \c0, \c2 // (y''_0; y''_2)
- punpcklqdq \u, \c3 // (y'_1; y'_3)
- punpckhqdq \c1, \c3 // (y''_1; y''_3)
+ punpcklqdq \t, \c2 // (y'_2; y'_0)
+ punpckhqdq \c0, \c2 // (y''_2; y''_0)
+ punpcklqdq \u, \c3 // (y'_3; y'_1)
+ punpckhqdq \c1, \c3 // (y''_3; y''_1)
// Now split the double-prime pieces. The high (up to) 48 bits will
// go up; the low 16 bits go down.
movdqa \c3, \c1
psllq \c2, 48
psllq \c3, 48
- psrlq \c0, 16 // high parts of (y''_0; y''_2)
- psrlq \c1, 16 // high parts of (y''_1; y''_3)
- psrlq \c2, 32 // low parts of (y''_0; y''_2)
- psrlq \c3, 32 // low parts of (y''_1; y''_3)
+ psrlq \c0, 16 // high parts of (y''_2; y''_0)
+ psrlq \c1, 16 // high parts of (y''_3; y''_1)
+ psrlq \c2, 32 // low parts of (y''_2; y''_0)
+ psrlq \c3, 32 // low parts of (y''_3; y''_1)
.ifnes "\hi", "nil"
movdqa \hi, \c1
.endif
- pslldq \c1, 8 // high part of (0; y''_1)
+ pslldq \c1, 8 // high part of (y''_1; 0)
paddq \t, \c2 // propagate down
paddq \u, \c3
- paddq \t, \c1 // and up: (y_0; y_2)
- paddq \u, \c0 // (y_1; y_3)
+ paddq \t, \c1 // and up: (y_2; y_0)
+ paddq \u, \c0 // (y_3; y_1)
- psrldq \hi, 8 // high part of (y''_3; 0)
+ psrldq \hi, 8 // high part of (0; y''_3)
.endif
// Finally extract the answer. This complicated dance is better than
// storing to memory and loading, because the piecemeal stores
// inhibit store forwarding.
- movdqa \c3, \t // (y_0; ?)
- movdqa \lo, \t // (y^*_0, ?; ?, ?)
- psrldq \t, 8 // (y_2; 0)
+ movdqa \c3, \t // (?; y_0)
+ movdqa \lo, \t // (?, ?; ?, y^*_0)
+ psrldq \t, 8 // (0; y_2)
psrlq \c3, 32 // (?; floor(y_0/B))
paddq \c3, \u // (?; y_1 + floor(y_0/B))
- movdqa \c1, \c3 // (y^*_1, ?; ?, ?)
- psrldq \u, 8 // (y_3; 0)
+ movdqa \c1, \c3 // (?, ?; ?, y^*_1)
+ psrldq \u, 8 // (0; y_3)
psrlq \c3, 32 // (?; floor((y_1 B + y_0)/B^2))
paddq \c3, \t // (?; y_2 + floor((y_1 B + y_0)/B^2))
- punpckldq \lo, \c3 // (y^*_0, y^*_2; ?, ?)
+ punpckldq \lo, \c3 // (?, ?; y^*_2, y^*_0)
psrlq \c3, 32 // (?; floor((y_2 B^2 + y_1 B + y_0)/B^3))
paddq \c3, \u // (?; y_3 + floor((y_2 B^2 + y_1 B + y_0)/B^3))
.ifnes "\hi", "nil"
movdqa \t, \c3
pxor \u, \u
.endif
- punpckldq \c1, \c3 // (y^*_1, y^*_3; ?, ?)
+ punpckldq \c1, \c3 // (?, ?; y^*_3, y^*_1)
.ifnes "\hi", "nil"
psrlq \t, 32 // very high bits of y
paddq \hi, \t
// On exit, the carry registers, including XMM7, are updated to hold
// C + A; XMM0, XMM1, XMM2, and XMM3 are clobbered. The other
// registers are preserved.
- movd xmm0, [edi + 0] // (a_0; 0)
- movd xmm1, [edi + 4] // (a_1; 0)
- movd xmm2, [edi + 8] // (a_2; 0)
- movd xmm7, [edi + 12] // (a_3; 0)
-
- paddq xmm4, xmm0 // (c'_0 + a_0; c''_0)
- paddq xmm5, xmm1 // (c'_1 + a_1; c''_1)
- paddq xmm6, xmm2 // (c'_2 + a_2; c''_2 + a_3 b)
+ movd xmm0, [edi + 0] // (0; a_0)
+ movd xmm1, [edi + 4] // (0; a_1)
+ movd xmm2, [edi + 8] // (0; a_2)
+ movd xmm7, [edi + 12] // (0; a_3)
+
+ paddq xmm4, xmm0 // (c''_0; c'_0 + a_0)
+ paddq xmm5, xmm1 // (c''_1; c'_1 + a_1)
+ paddq xmm6, xmm2 // (c''_2 + a_3 b; c'_2 + a_2)
.endm
///--------------------------------------------------------------------------
.macro testldcarry c
mov ecx, \c // -> c
- movdqu xmm4, [ecx + 0] // (c'_0; c''_0)
- movdqu xmm5, [ecx + 16] // (c'_1; c''_1)
- movdqu xmm6, [ecx + 32] // (c'_2; c''_2)
+ movdqu xmm4, [ecx + 0] // (c''_0; c'_0)
+ movdqu xmm5, [ecx + 16] // (c''_1; c'_1)
+ movdqu xmm6, [ecx + 32] // (c''_2; c'_2)
.endm
.macro testexpand v=nil, y=nil
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
mov edi, [BP + 28]
movdqa xmm0, [SP + 64]
movdqa xmm1, [SP + 80]
- pshufd xmm0, xmm0, SHUF(0, 2, 1, 3)
- pshufd xmm1, xmm1, SHUF(0, 2, 1, 3)
+ pshufd xmm0, xmm0, SHUF(3, 1, 2, 0)
+ pshufd xmm1, xmm1, SHUF(3, 1, 2, 0)
movdqu [edi], xmm0
movdqu [edi + 16], xmm1
testcarryout [BP + 24]
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
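// Aside (not part of the patch): SSE2 has no packed-rotate instruction,
// so the movdqa/pslld/psrld trio above computes the rotation b <<<= 7
// as (b << 7) | (b >> 25); the two shifted copies are ORed back
// together by a following instruction, not shown here.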
//
// The shuffles have quite high latency, so they've mostly been
// pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
+ pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
// Apply the diagonal quarterround to each of the columns
// simultaneously.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
// Finally, finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// to wait for the shuffles.
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
// Decrement the loop counter and see if we should go round again.
sub NR, 2
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // q0 = // (u_0; u_1)
- // q1 = // (v_0; v_1)
+ // q0 = // (u_1; u_0)
+ // q1 = // (v_1; v_0)
vmull.p64 q2, d1, d2 // u_1 v_0
vmull.p64 q3, d0, d3 // u_0 v_1
- vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1
- vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0
+ vmull.p64 q8, d1, d3 // (t_1; x_3) = u_1 v_1
+ vmull.p64 q9, d0, d2 // (x_0; t_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0
+ veor q2, q2, q3 // (m_0; m_1) = u_0 v_1 + u_1 v_0
veor d17, d17, d4 // x_2 = t_1 + m_1
veor d18, d18, d5 // x_1 = t_0 + m_0
- // q8 = // (x_3; x_2)
- // q9 = // (x_1; x_0)
+ // q8 = // (x_2; x_3)
+ // q9 = // (x_0; x_1)
// One-and-a-half problems remain.
//
// This is an inconvenient size. There's nothing for it but to do
// four multiplications, as if for the 128-bit case.
- // q0 = // (u_0 + u_1 t^32; u_2)
- // q1 = // (v_0 + v_1 t^32; v_2)
+ // q0 = // (u_2; u_0 + u_1 t^32)
+ // q1 = // (v_2; v_0 + v_1 t^32)
vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0
vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1
- vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d)
+ vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (d; 0)
vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1
// Piece the product together.
- veor d17, d17, d22 // q8 = // (x_5; x_4)
+ veor d17, d17, d22 // q8 = // (x_4; x_5)
- veor d19, d19, d24 // q9 = // (x_3; x_2)
- veor d20, d20, d25 // q10 = // (x_1; x_0)
+ veor d19, d19, d24 // q9 = // (x_2; x_3)
+ veor d20, d20, d25 // q10 = // (x_0; x_1)
// Next, the reduction. Our polynomial this time is p(x) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // q8 = // (y_5; y_4)
- // q9 = // (y_3; y_2)
- // q10 = // (y_1; y_0)
- vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t
+ // q8 = // (y_4; y_5)
+ // q9 = // (y_2; y_3)
+ // q10 = // (y_0; y_1)
+ vshl.u64 q11, q8, #63 // (y_4; y_5) b_i for t
vshl.u64 d28, d18, #63 // y_3 b_i for t
- vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2
+ vshl.u64 q12, q8, #62 // (y_4; y_5) b_i for t^2
vshl.u64 d29, d18, #62 // y_3 b_i for t^2
- vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7
+ vshl.u64 q13, q8, #57 // (y_4; y_5) b_i for t^7
vshl.u64 d30, d18, #57 // y_3 b_i for t^7
veor q11, q11, q12 // mix them all together
veor d28, d28, d29
// And finally shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_5; y'_4)
- // q9 = // (y'_3; y'_2)
- // q10 = // (y'_1; y'_0)
- vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t
+ // q8 = // (y'_4; y'_5)
+ // q9 = // (y'_2; y'_3)
+ // q10 = // (y'_0; y'_1)
+ vshr.u64 q11, q8, #1 // (y_4; y_5) a_i for t
vshr.u64 d28, d18, #1 // y'_3 a_i for t
- vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2
+ vshr.u64 q12, q8, #2 // (y_4; y_5) a_i for t^2
vshr.u64 d29, d18, #2 // y'_3 a_i for t^2
- vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7
+ vshr.u64 q13, q8, #7 // (y_4; y_5) a_i for t^7
vshr.u64 d30, d18, #7 // y'_3 a_i for t^7
veor q8, q8, q11
veor d18, d18, d28
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
- // q0 = // u_0 = (u_00; u_01)
- // q1 = // u_1 = (u_10; u_11)
- // q2 = // v_0 = (v_00; v_01)
- // q3 = // v_1 = (v_10; v_11)
+ // q0 = // u_0 = (u_01; u_00)
+ // q1 = // u_1 = (u_11; u_10)
+ // q2 = // v_0 = (v_01; v_00)
+ // q3 = // v_1 = (v_11; v_10)
- veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11)
- veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11)
+ veor q8, q0, q1 // u_* = (u_01 + u_11; u_00 + u_10)
+ veor q9, q2, q3 // v_* = (v_01 + v_11; v_00 + v_10)
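// Aside (not part of the patch): this is Karatsuba over GF(2)[t].
// Since (u_0 + u_1) (v_0 + v_1) = u_0 v_0 + u_1 v_1 + (u_0 v_1 + u_1 v_0),
// the middle coefficient is recovered as u_* v_* + u_0 v_0 + u_1 v_1,
// saving one of the four 128-bit multiplications.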
// Start by building the cross product, q = u_* v_*.
vmull.p64 q14, d16, d19 // u_*0 v_*1
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
// First, shift the high bits down.
- // q8 = // (y_7; y_6)
- // q9 = // (y_5; y_4)
- // q10 = // (y_3; y_2)
- // q11 = // (y_1; y_0)
- vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2
- vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2
- vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5
- vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5
- vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10
- vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10
+ // q8 = // (y_6; y_7)
+ // q9 = // (y_4; y_5)
+ // q10 = // (y_2; y_3)
+ // q11 = // (y_0; y_1)
+ vshl.u64 q0, q8, #62 // (y_6; y_7) b_i for t^2
+ vshl.u64 q12, q9, #62 // (y_4; y_5) b_i for t^2
+ vshl.u64 q1, q8, #59 // (y_6; y_7) b_i for t^5
+ vshl.u64 q13, q9, #59 // (y_4; y_5) b_i for t^5
+ vshl.u64 q2, q8, #54 // (y_6; y_7) b_i for t^10
+ vshl.u64 q14, q9, #54 // (y_4; y_5) b_i for t^10
veor q0, q0, q1 // mix the contributions together
veor q12, q12, q13
veor q0, q0, q2
// And then shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_7; y'_6)
- // q9 = // (y'_5; y'_4)
- // q10 = // (y'_3; y'_2)
- // q11 = // (y'_1; y'_0)
- vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2
- vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2
- vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5
- vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5
- vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10
- vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10
+ // q8 = // (y'_6; y'_7)
+ // q9 = // (y'_4; y'_5)
+ // q10 = // (y'_2; y'_3)
+ // q11 = // (y'_0; y'_1)
+ vshr.u64 q0, q8, #2 // (y_6; y_7) a_i for t^2
+ vshr.u64 q12, q9, #2 // (y'_4; y_5) a_i for t^2
+ vshr.u64 q1, q8, #5 // (y_6; y_7) a_i for t^5
+ vshr.u64 q13, q9, #5 // (y_4; y_5) a_i for t^5
+ vshr.u64 q2, q8, #10 // (y_6; y_7) a_i for t^10
+ vshr.u64 q14, q9, #10 // (y_4; y_5) a_i for t^10
veor q8, q8, q0 // mix the contributions together
veor q1, q1, q2
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // v0 = // (u_0; u_1)
- // v1/v2 = // (v_0; v_1)
+ // v0 = // (u_1; u_0)
+ // v1/v2 = // (v_1; v_0)
pmull2 v3.1q, v0.2d, v1.2d // u_1 v_0
pmull v4.1q, v0.1d, v2.1d // u_0 v_1
- pmull2 v5.1q, v0.2d, v2.2d // (t_1; x_3) = u_1 v_1
- pmull v6.1q, v0.1d, v1.1d // (x_0; t_0) = u_0 v_0
+ pmull2 v5.1q, v0.2d, v2.2d // (x_3; t_1) = u_1 v_1
+ pmull v6.1q, v0.1d, v1.1d // (t_0; x_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- eor v3.16b, v3.16b, v4.16b // (m_0; m_1) = u_0 v_1 + u_1 v_0
- vshr128 v4, v3, 64 // (m_1; 0)
- vshl128 v3, v3, 64 // (0; m_0)
- eor v1.16b, v5.16b, v4.16b // (x_2; x_3)
- eor v0.16b, v6.16b, v3.16b // (x_0; x_1)
+ eor v3.16b, v3.16b, v4.16b // (m_1; m_0) = u_0 v_1 + u_1 v_0
+ vshr128 v4, v3, 64 // (0; m_1)
+ vshl128 v3, v3, 64 // (m_0; 0)
+ eor v1.16b, v5.16b, v4.16b // (x_3; x_2)
+ eor v0.16b, v6.16b, v3.16b // (x_1; x_0)
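// Aside (not part of the patch): what the five instructions above
// compute.  Over GF(2)[t], with u = u_1 t^64 + u_0 and v = v_1 t^64 + v_0,
//
//	u v = u_1 v_1 t^128 + (u_0 v_1 + u_1 v_0) t^64 + u_0 v_0,
//
// so the cross term m is split into halves m_1 and m_0, which are XORed
// into the bottom of the high product and the top of the low product
// respectively.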
// And now the only remaining difficulty is that the result needs to
// be reduced modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128
// leave with z = u v in x2. Clobbers x2--x4.
// The multiplication is thankfully easy.
- // v0 = // (u; ?)
- // v1 = // (v; ?)
+ // v0 = // (?; u)
+ // v1 = // (?; v)
pmull v0.1q, v0.1d, v1.1d // u v
// Now we must reduce. This is essentially the same as the 128-bit
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
- // v0 = // (u_0 + u_1 t^32; u_2)
+ // v0 = // (u_2; u_0 + u_1 t^32)
// v1 = // (v_0 + v_1 t^32; v_0 + v_1 t^32)
// v2 = // (v_2; v_2)
pmull2 v5.1q, v0.2d, v1.2d // u_2 (v_0 + v_1 t^32) t^32 = e_0
pmull v4.1q, v0.1d, v2.1d // v_2 (u_0 + u_1 t^32) t^32 = e_1
- pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (d; 0)
+ pmull2 v6.1q, v0.2d, v2.2d // u_2 v_2 = d = (0; d)
pmull v3.1q, v0.1d, v1.1d // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
// Clobbers v16--v25.
// Start multiplying and accumulating pieces of product.
- // v0 = // (u_0; u_1)
- // v1 = // (u_2; ?)
+ // v0 = // (u_1; u_0)
+ // v1 = // (?; u_2)
// v2 = // (v_0; v_0)
// v3 = // (v_1; v_1)
// v4 = // (v_2; v_2)
eor v20.16b, v20.16b, v24.16b // d = u_1 v_2 + u_2 v_1
// Piece the product together.
- // v16 = // (a_0; a_1)
- // v19 = // (b_0; b_1)
- // v17 = // (c_0; c_1)
- // v20 = // (d_0; d_1)
- // v18 = // (e_0; e_1)
- vshl128 v21, v19, 64 // (0; b_0)
- ext v22.16b, v19.16b, v20.16b, #8 // (b_1; d_0)
- vshr128 v23, v20, 64 // (d_1; 0)
- eor v16.16b, v16.16b, v21.16b // (x_0; x_1)
- eor v17.16b, v17.16b, v22.16b // (x_2; x_3)
- eor v18.16b, v18.16b, v23.16b // (x_4; x_5)
+ // v16 = // (a_1; a_0)
+ // v19 = // (b_1; b_0)
+ // v17 = // (c_1; c_0)
+ // v20 = // (d_1; d_0)
+ // v18 = // (e_1; e_0)
+ vshl128 v21, v19, 64 // (b_0; 0)
+ ext v22.16b, v19.16b, v20.16b, #8 // (d_0; b_1)
+ vshr128 v23, v20, 64 // (0; d_1)
+ eor v16.16b, v16.16b, v21.16b // (x_1; x_0)
+ eor v17.16b, v17.16b, v22.16b // (x_3; x_2)
+ eor v18.16b, v18.16b, v23.16b // (x_5; x_4)
// Next, the reduction. Our polynomial this time is p(x) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // v16 = // (y_0; y_1)
- // v17 = // (y_2; y_3)
- // v18 = // (y_4; y_5)
- mov v19.d[0], v17.d[1] // (y_3; ?)
+ // v16 = // (y_1; y_0)
+ // v17 = // (y_3; y_2)
+ // v18 = // (y_5; y_4)
+ mov v19.d[0], v17.d[1] // (?; y_3)
ushr v23.2d, v18.2d, #63 // hi b_i for t
ushr d20, d19, #63 // lo b_i for t
// Permute the high pieces while we fold in the b_i.
eor v17.16b, v17.16b, v23.16b
vshl128 v20, v20, 64
- mov v19.d[0], v18.d[1] // (y_5; ?)
- ext v18.16b, v17.16b, v18.16b, #8 // (y_3; y_4)
+ mov v19.d[0], v18.d[1] // (?; y_5)
+ ext v18.16b, v17.16b, v18.16b, #8 // (y_4; y_3)
eor v16.16b, v16.16b, v20.16b
// And finally shift the low bits up.
- // v16 = // (y'_0; y'_1)
- // v17 = // (y'_2; ?)
- // v18 = // (y'_3; y'_4)
- // v19 = // (y'_5; ?)
+ // v16 = // (y'_1; y'_0)
+ // v17 = // (?; y'_2)
+ // v18 = // (y'_4; y'_3)
+ // v19 = // (?; y'_5)
shl v20.2d, v18.2d, #1
shl d23, d19, #1
shl v21.2d, v18.2d, #2
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
- // v0 = // u_0 = (u_00; u_01)
- // v1 = // u_1 = (u_10; u_11)
+ // v0 = // u_0 = (u_01; u_00)
+ // v1 = // u_1 = (u_11; u_10)
// v2 = // (v_00; v_00)
// v3 = // (v_01; v_01)
// v4 = // (v_10; v_10)
// v5 = // (v_11; v_11)
- eor v28.16b, v0.16b, v1.16b // u_* = (u_00 + u_10; u_01 + u_11)
+ eor v28.16b, v0.16b, v1.16b // u_* = (u_01 + u_11; u_00 + u_10)
eor v29.16b, v2.16b, v4.16b // v_*0 = v_00 + v_10
eor v30.16b, v3.16b, v5.16b // v_*1 = v_01 + v_11
// Now we must reduce. This is essentially the same as the 192-bit
// case above, but more complicated because everything is bigger.
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
- // v16 = // (y_0; y_1)
- // v17 = // (y_2; y_3)
- // v18 = // (y_4; y_5)
- // v19 = // (y_6; y_7)
- ushr v24.2d, v18.2d, #62 // (y_4; y_5) b_i for t^2
- ushr v25.2d, v19.2d, #62 // (y_6; y_7) b_i for t^2
- ushr v26.2d, v18.2d, #59 // (y_4; y_5) b_i for t^5
- ushr v27.2d, v19.2d, #59 // (y_6; y_7) b_i for t^5
- ushr v28.2d, v18.2d, #54 // (y_4; y_5) b_i for t^10
- ushr v29.2d, v19.2d, #54 // (y_6; y_7) b_i for t^10
+ // v16 = // (y_1; y_0)
+ // v17 = // (y_3; y_2)
+ // v18 = // (y_5; y_4)
+ // v19 = // (y_7; y_6)
+ ushr v24.2d, v18.2d, #62 // (y_5; y_4) b_i for t^2
+ ushr v25.2d, v19.2d, #62 // (y_7; y_6) b_i for t^2
+ ushr v26.2d, v18.2d, #59 // (y_5; y_4) b_i for t^5
+ ushr v27.2d, v19.2d, #59 // (y_7; y_6) b_i for t^5
+ ushr v28.2d, v18.2d, #54 // (y_5; y_4) b_i for t^10
+ ushr v29.2d, v19.2d, #54 // (y_7; y_6) b_i for t^10
eor v24.16b, v24.16b, v26.16b // mix the contributions together
eor v25.16b, v25.16b, v27.16b
eor v24.16b, v24.16b, v28.16b
eor v16.16b, v16.16b, v24.16b
// And then shift the low bits up.
- // v16 = // (y'_0; y'_1)
- // v17 = // (y'_2; y'_3)
- // v18 = // (y'_4; y'_5)
- // v19 = // (y'_6; y'_7)
- shl v24.2d, v18.2d, #2 // (y'_4; y_5) a_i for t^2
- shl v25.2d, v19.2d, #2 // (y_6; y_7) a_i for t^2
- shl v26.2d, v18.2d, #5 // (y'_4; y_5) a_i for t^5
- shl v27.2d, v19.2d, #5 // (y_6; y_7) a_i for t^5
- shl v28.2d, v18.2d, #10 // (y'_4; y_5) a_i for t^10
- shl v29.2d, v19.2d, #10 // (y_6; y_7) a_i for t^10
+ // v16 = // (y'_1; y'_0)
+ // v17 = // (y'_3; y'_2)
+ // v18 = // (y'_5; y'_4)
+ // v19 = // (y'_7; y'_6)
+ shl v24.2d, v18.2d, #2 // (y_5; y'_4) a_i for t^2
+ shl v25.2d, v19.2d, #2 // (y_7; y_6) a_i for t^2
+ shl v26.2d, v18.2d, #5 // (y_5; y'_4) a_i for t^5
+ shl v27.2d, v19.2d, #5 // (y_7; y_6) a_i for t^5
+ shl v28.2d, v18.2d, #10 // (y_5; y'_4) a_i for t^10
+ shl v29.2d, v19.2d, #10 // (y_7; y_6) a_i for t^10
eor v18.16b, v18.16b, v24.16b // mix the contributions together
eor v19.16b, v19.16b, v25.16b
eor v26.16b, v26.16b, v28.16b
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // xmm0 = // (u_1; u_0)
- // xmm1 = // (v_1; v_0)
- movdqa xmm2, xmm1 // (v_1; v_0) again
- movdqa xmm3, xmm0 // (u_1; u_0) again
- movdqa xmm4, xmm0 // (u_1; u_0) yet again
+ // xmm0 = // (u_0; u_1)
+ // xmm1 = // (v_0; v_1)
+ movdqa xmm2, xmm1 // (v_0; v_1) again
+ movdqa xmm3, xmm0 // (u_0; u_1) again
+ movdqa xmm4, xmm0 // (u_0; u_1) yet again
pclmulhqlqdq xmm2, xmm0 // u_1 v_0
pclmullqlqdq xmm0, xmm1 // u_1 v_1
pclmulhqlqdq xmm3, xmm1 // u_0 v_1
pclmulhqhqdq xmm4, xmm1 // u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- pxor xmm2, xmm3 // (m_1; m_0) = u_1 v_0 + u_0 v_1
- movdqa xmm1, xmm2 // (m_1; m_0) again
- pslldq xmm2, 8 // (0; m_1)
- psrldq xmm1, 8 // (m_0; 0)
+ pxor xmm2, xmm3 // (m_0; m_1) = u_1 v_0 + u_0 v_1
+ movdqa xmm1, xmm2 // (m_0; m_1) again
+ pslldq xmm2, 8 // (m_1; 0)
+ psrldq xmm1, 8 // (0; m_0)
pxor xmm0, xmm2 // z_1 = u_1 v_1 + m_1
pxor xmm1, xmm4 // z_0 = u_0 v_0 + t^64 m_0
// word together, and then the low bits, everything will be fine.
// First, shift the high bits down.
- movdqa xmm2, xmm0 // (x_7, x_6; x_5, x_4) again
- movdqa xmm3, xmm0 // (x_7, x_6; x_5, x_4) yet again
- movdqa xmm4, xmm0 // (x_7, x_6; x_5, x_4) again again
+ movdqa xmm2, xmm0 // (x_4, x_5; x_6, x_7) again
+ movdqa xmm3, xmm0 // (x_4, x_5; x_6, x_7) yet again
+ movdqa xmm4, xmm0 // (x_4, x_5; x_6, x_7) again again
pslld xmm2, 31 // the b_i for t
pslld xmm3, 30 // the b_i for t^2
pslld xmm4, 25 // the b_i for t^7
// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
// First, we must detach the top (`low'!) half of the result.
- movdqa xmm0, xmm1 // (x_3, x_2; x_1, x_0) again
- psrldq xmm1, 8 // (x_1, x_0; 0, 0)
+ movdqa xmm0, xmm1 // (x_0, x_1; x_2, x_3) again
+ psrldq xmm1, 8 // (0, 0; x_0, x_1)
// Next, shift the high bits down.
- movdqa xmm2, xmm0 // (x_3, x_2; ?, ?) again
- movdqa xmm3, xmm0 // (x_3, x_2; ?, ?) yet again
- movdqa xmm4, xmm0 // (x_3, x_2; ?, ?) again again
+ movdqa xmm2, xmm0 // (?, ?; x_2, x_3) again
+ movdqa xmm3, xmm0 // (?, ?; x_2, x_3) yet again
+ movdqa xmm4, xmm0 // (?, ?; x_2, x_3) again again
pslld xmm2, 31 // b_i for t
pslld xmm3, 29 // b_i for t^3
pslld xmm4, 28 // b_i for t^4
// shift both of them up by four bytes before we start. This will
// mean that the high 64 bits of the result (from GCM's viewpoint)
// will be zero.
- // xmm0 = // (0, u_2; u_1, u_0)
- // xmm1 = // (0, v_2; v_1, v_0)
- movdqa xmm2, xmm1 // (0, v_2; v_1, v_0) again
- movdqa xmm3, xmm0 // (0, u_2; u_1, u_0) again
- movdqa xmm4, xmm0 // (0, u_2; u_1, u_0) yet again
+ // xmm0 = // (u_0, u_1; u_2, 0)
+ // xmm1 = // (v_0, v_1; v_2, 0)
+ movdqa xmm2, xmm1 // (v_0, v_1; v_2, 0) again
+ movdqa xmm3, xmm0 // (u_0, u_1; u_2, 0) again
+ movdqa xmm4, xmm0 // (u_0, u_1; u_2, 0) yet again
pclmulhqlqdq xmm2, xmm0 // u_2 (v_1 t^32 + v_0) = e_0
pclmullqlqdq xmm0, xmm1 // u_2 v_2 = d = (d; 0)
pclmulhqlqdq xmm3, xmm1 // v_2 (u_1 t^32 + u_0) = e_1
// registers. The answer we want is d t^128 + e t^64 + f, where e =
// e_0 + e_1.
//
- // The place values for the two halves are (t^160, t^128; t^96, ?)
- // and (?, t^64; t^32, 1). But we also want to shift the high part
+ // The place values for the two halves are (?, t^96; t^128, t^160)
+ // and (1, t^32; t^64, ?). But we also want to shift the high part
// left by a word, for symmetry's sake.
- psrldq xmm0, 8 // (d; 0) = d t^128
+ psrldq xmm0, 8 // (0; d) = d t^128
pxor xmm2, xmm3 // e = (e_0 + e_1)
movdqa xmm1, xmm4 // f again
pxor xmm0, xmm2 // d t^128 + e t^64
// are unimportant. Clobbers xmm2--xmm7.
// Start multiplying and accumulating pieces of product.
- // xmm0 = // (u_2; u_1)
- // xmm1 = // (u_0; ?)
- // xmm2 = // (v_2; v_1)
- // xmm3 = // (v_0; ?)
- movdqa xmm4, xmm0 // (u_2; u_1) again
- movdqa xmm5, xmm0 // (u_2; u_1) yet again
- movdqa xmm6, xmm0 // (u_2; u_1) again again
- movdqa xmm7, xmm3 // (v_0; ?) again
- punpcklqdq xmm3, xmm1 // (v_0; u_0)
+ // xmm0 = // (u_1; u_2)
+ // xmm1 = // (?; u_0)
+ // xmm2 = // (v_1; v_2)
+ // xmm3 = // (?; v_0)
+ movdqa xmm4, xmm0 // (u_1; u_2) again
+ movdqa xmm5, xmm0 // (u_1; u_2) yet again
+ movdqa xmm6, xmm0 // (u_1; u_2) again again
+ movdqa xmm7, xmm3 // (?; v_0) again
+ punpcklqdq xmm3, xmm1 // (u_0; v_0)
pclmulhqhqdq xmm4, xmm2 // u_1 v_1
pclmullqlqdq xmm1, xmm2 // u_0 v_2
pclmullqhqdq xmm5, xmm2 // u_2 v_1
pxor xmm1, xmm4 // u_0 v_2 + u_1 v_1
pclmullqlqdq xmm7, xmm0 // u_2 v_0
pxor xmm5, xmm6 // b = u_2 v_1 + u_1 v_2
- movdqa xmm6, xmm0 // (u_2; u_1) like a bad penny
+ movdqa xmm6, xmm0 // (u_1; u_2) like a bad penny
pxor xmm1, xmm7 // c = u_0 v_2 + u_1 v_1 + u_2 v_0
pclmullqlqdq xmm0, xmm2 // a = u_2 v_2
pclmulhqlqdq xmm6, xmm3 // u_1 v_0
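
Underneath the register juggling this is plain schoolbook multiplication of
three 64-bit limbs by three: nine carry-less products, grouped by output
position into a, ..., e.  A hedged intrinsics sketch, taking each limb in
the low half of its own register (names are mine; the real code keeps two
limbs per register):

	#include <wmmintrin.h>

	struct prod3 { __m128i a, b, c, d, e; };

	static struct prod3 mul3x3(__m128i u0, __m128i u1, __m128i u2,
				   __m128i v0, __m128i v1, __m128i v2)
	{
		struct prod3 p;
		p.a = _mm_clmulepi64_si128(u2, v2, 0x00);
		p.b = _mm_xor_si128(_mm_clmulepi64_si128(u2, v1, 0x00),
				    _mm_clmulepi64_si128(u1, v2, 0x00));
		p.c = _mm_xor_si128(_mm_clmulepi64_si128(u2, v0, 0x00),
			_mm_xor_si128(_mm_clmulepi64_si128(u1, v1, 0x00),
				      _mm_clmulepi64_si128(u0, v2, 0x00)));
		p.d = _mm_xor_si128(_mm_clmulepi64_si128(u1, v0, 0x00),
				    _mm_clmulepi64_si128(u0, v1, 0x00));
		p.e = _mm_clmulepi64_si128(u0, v0, 0x00);
		return p;
	}
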
// Next, the piecing together of the product. There's significant
// work here to leave the completed pieces in sensible registers.
- // xmm0 = // (a_1; a_0) = a = u_2 v_2
- // xmm5 = // (b_1; b_0) = b = u_1 v_2 + u_2 v_1
- // xmm1 = // (c_1; c_0) = c = u_0 v_2 +
+ // xmm0 = // (a_0; a_1) = a = u_2 v_2
+ // xmm5 = // (b_0; b_1) = b = u_1 v_2 + u_2 v_1
+ // xmm1 = // (c_0; c_1) = c = u_0 v_2 +
- // xmm6 = // (d_1; d_0) = d = u_0 v_1 + u_1 v_0
- // xmm3 = // (e_1; e_0) = e = u_0 v_0
+ // xmm6 = // (d_0; d_1) = d = u_0 v_1 + u_1 v_0
+ // xmm3 = // (e_0; e_1) = e = u_0 v_0
// xmm2, xmm4, xmm7 spare
- movdqa xmm2, xmm6 // (d_1; d_0) again
- movdqa xmm4, xmm5 // (b_1; b_0) again
- pslldq xmm6, 8 // (0; d_1)
- psrldq xmm5, 8 // (b_0; 0)
- psrldq xmm2, 8 // (d_0; 0)
- pslldq xmm4, 8 // (0; b_1)
- pxor xmm5, xmm6 // (b_0; d_1)
- pxor xmm0, xmm4 // (x_5; x_4) = (a_1; a_0 + b_1)
- pxor xmm2, xmm3 // (x_1; x_0) = (e_1 + d_0; e_0)
- pxor xmm1, xmm5 // (x_3; x_2) = (b_0 + c_1; c_0 + d_1)
+ movdqa xmm2, xmm6 // (d_0; d_1) again
+ movdqa xmm4, xmm5 // (b_0; b_1) again
+ pslldq xmm6, 8 // (d_1; 0)
+ psrldq xmm5, 8 // (0; b_0)
+ psrldq xmm2, 8 // (0; d_0)
+ pslldq xmm4, 8 // (b_1; 0)
+ pxor xmm5, xmm6 // (d_1; b_0)
+ pxor xmm0, xmm4 // (x_4; x_5) = (a_0 + b_1; a_1)
+ pxor xmm2, xmm3 // (x_0; x_1) = (e_0; e_1 + d_0)
+ pxor xmm1, xmm5 // (x_2; x_3) = (c_0 + d_1; b_0 + c_1)
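
Each group is a 128-bit value straddling two 64-bit words of the result, and
adjacent groups overlap by one word, so the six output words are just XORs
of half-products.  The same combination in scalar form, with each product P
given as the pair (P_0; P_1) of the comments:

	#include <stdint.h>

	/* Hedged sketch of the recombination above; a..e are the five
	 * grouped products, each as the pair (P_0; P_1). */
	static void combine(uint64_t x[6],
			    const uint64_t a[2], const uint64_t b[2],
			    const uint64_t c[2], const uint64_t d[2],
			    const uint64_t e[2])
	{
		x[5] = a[1];
		x[4] = a[0] ^ b[1];
		x[3] = b[0] ^ c[1];
		x[2] = c[0] ^ d[1];
		x[1] = d[0] ^ e[1];
		x[0] = e[0];
	}
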
// Next, the reduction.  Our polynomial this time is p(t) = t^192 +
// t^7 + t^2 + t + 1.  Yes, the magic numbers are the same as the
// 128-bit case: the shift counts depend only on the low-degree terms
// t^7 + t^2 + t + 1, and those are common to both polynomials.
// First, shift the high bits down.
- // xmm0 = // (x_5; x_4)
- // xmm1 = // (x_3; x_2)
- // xmm2 = // (x_1; x_0)
+ // xmm0 = // (x_4; x_5)
+ // xmm1 = // (x_2; x_3)
+ // xmm2 = // (x_0; x_1)
- movdqa xmm3, xmm0 // (x_5; x_4) copy
- movdqa xmm4, xmm0 // (x_5; x_4) copy
- movdqa xmm5, xmm0 // (x_5; x_4) copy
- pslld xmm3, 31 // (x_5; x_4) b_i for t
- pslld xmm4, 30 // (x_5; x_4) b_i for t^2
- pslld xmm5, 25 // (x_5; x_4) b_i for t^7
- movq xmm6, xmm1 // (x_3; 0) copy
+ movdqa xmm3, xmm0 // (x_4; x_5) copy
+ movdqa xmm4, xmm0 // (x_4; x_5) copy
+ movdqa xmm5, xmm0 // (x_4; x_5) copy
+ pslld xmm3, 31 // (x_4; x_5) b_i for t
+ pslld xmm4, 30 // (x_4; x_5) b_i for t^2
+ pslld xmm5, 25 // (x_4; x_5) b_i for t^7
+ movq xmm6, xmm1 // (0; x_3) copy
- movq xmm7, xmm1 // (x_3; 0) copy
+ movq xmm7, xmm1 // (0; x_3) copy
- movq xmm5, xmm1 // (x_3; 0) copy
- movdqa xmm4, xmm3 // (x_5; x_4) b_i combined
- pslld xmm6, 31 // (x_3; 0) b_i for t
- pslld xmm7, 30 // (x_3; 0) b_i for t^2
- pslld xmm5, 25 // (x_3; 0) b_i for t^7
- psrldq xmm3, 12 // (x_5; x_4) low contrib
- pslldq xmm4, 4 // (x_5; x_4) high contrib
+ movq xmm5, xmm1 // (0; x_3) copy
+ movdqa xmm4, xmm3 // (x_4; x_5) b_i combined
+ pslld xmm6, 31 // (0; x_3) b_i for t
+ pslld xmm7, 30 // (0; x_3) b_i for t^2
+ pslld xmm5, 25 // (0; x_3) b_i for t^7
+ psrldq xmm3, 12 // (x_4; x_5) low contrib
+ pslldq xmm4, 4 // (x_4; x_5) high contrib
pxor xmm6, xmm7
pxor xmm2, xmm3
pxor xmm6, xmm5
// And finally shift the low bits up. Unfortunately, we also have to
// split the low bits out.
- // xmm0 = // (x'_5; x'_4)
- // xmm1 = // (x'_3; x'_2)
- // xmm2 = // (x'_1; x'_0)
- movdqa xmm5, xmm1 // copies of (x'_3; x'_2)
+ // xmm0 = // (x'_4; x'_5)
+ // xmm1 = // (x'_2; x'_3)
+ // xmm2 = // (x'_0; x'_1)
+ movdqa xmm5, xmm1 // copies of (x'_2; x'_3)
movdqa xmm6, xmm1
movdqa xmm7, xmm1
- psrldq xmm1, 8 // bring down (x'_2; ?)
- movdqa xmm3, xmm0 // copies of (x'_5; x'_4)
+ psrldq xmm1, 8 // bring down (?; x'_2)
+ movdqa xmm3, xmm0 // copies of (x'_4; x'_5)
- punpcklqdq xmm1, xmm2 // (x'_2; x'_1)
- psrldq xmm2, 8 // (x'_0; ?)
+ punpcklqdq xmm1, xmm2 // (x'_1; x'_2)
+ psrldq xmm2, 8 // (?; x'_0)
pxor xmm2, xmm5 // low half and unit contrib
pxor xmm1, xmm0
psrld xmm5, 1
pxor xmm0, xmm4
pxor xmm5, xmm2 // mix everything together
pxor xmm0, xmm1
- movq xmm1, xmm5 // shunt (z_0; ?) into proper place
+ movq xmm1, xmm5 // shunt (?; z_0) into proper place
// On x86, there aren't quite enough registers, so spill one for a
// bit. On AMD64, we can keep on going, so it's all good.
- // xmm0 = // u_1 = (u_11; u_10)
- // xmm1 = // u_0 = (u_01; u_00)
- // xmm2 = // v_1 = (v_11; v_10)
- // xmm3 = // v_0 = (v_01; v_00)
+ // xmm0 = // u_1 = (u_10; u_11)
+ // xmm1 = // u_0 = (u_00; u_01)
+ // xmm2 = // v_1 = (v_10; v_11)
+ // xmm3 = // v_0 = (v_00; v_01)
movdqa xmm4, xmm0 // u_1 again
#if CPUFAM_X86
movdqa [SP + 0], xmm3
movdqa xmm8, xmm3
# define V0 xmm8
#endif
- pxor xmm4, xmm1 // u_* = (u_01 + u_11; u_00 + u_10)
- pxor xmm3, xmm2 // v_* = (v_01 + v_11; v_00 + v_10)
+ pxor xmm4, xmm1 // u_* = (u_00 + u_10; u_01 + u_11)
+ pxor xmm3, xmm2 // v_* = (v_00 + v_10; v_01 + v_11)
// Start by building the cross product, q = u_* v_*.
movdqa xmm7, xmm4 // more copies of u_*
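
This is Karatsuba's trick over GF(2)[t]: since u_1 v_0 + u_0 v_1 =
u_* v_* + u_0 v_0 + u_1 v_1, the cross product costs one multiplication
rather than two, and with XOR as addition there are no carries to spoil it.
A hedged sketch of the same idea one level down, on a 128-bit operand split
into 64-bit halves so that a single `pclmulqdq' does each half-multiply:

	#include <wmmintrin.h>

	/* One Karatsuba layer: three carry-less multiplications instead
	 * of four.  The caller still has to XOR the three 128-bit parts
	 * together at 64-bit offsets. */
	static void kmul(__m128i u, __m128i v,
			 __m128i *lo, __m128i *mid, __m128i *hi)
	{
		__m128i us = _mm_xor_si128(u, _mm_srli_si128(u, 8)); /* u_0 + u_1 */
		__m128i vs = _mm_xor_si128(v, _mm_srli_si128(v, 8)); /* v_0 + v_1 */
		*lo = _mm_clmulepi64_si128(u, v, 0x00);		     /* u_0 v_0 */
		*hi = _mm_clmulepi64_si128(u, v, 0x11);		     /* u_1 v_1 */
		*mid = _mm_xor_si128(_mm_clmulepi64_si128(us, vs, 0x00),
				     _mm_xor_si128(*lo, *hi));	     /* cross */
	}
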
// the /last/ byte in the block. If the block size is not a multiple of
// 16 bytes, then there must be padding. 96-bit blocks are weird: the
// padding is inserted at the /least/ significant end, so the register
-// holds (0, x_0; x_1, x_2); otherwise, the padding goes at the most
+// holds (x_2, x_1; x_0, 0); otherwise, the padding goes at the most
// significant end.
//
// * The `words' format consists of a sequence of bytes, as in the
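
To make the 96-bit `blocks' layout just described concrete: the three 32-bit
pieces sit in the upper cells and the padding in the least significant one.
A hedged illustration (x0, x1, x2 stand for the block's three words, in
whatever byte order has already been arranged):

	#include <emmintrin.h>
	#include <stdint.h>

	/* _mm_set_epi32 lists its arguments most significant first, so
	 * this matches the (x_2, x_1; x_0, 0) diagram above. */
	static __m128i blk96(uint32_t x0, uint32_t x1, uint32_t x2)
		{ return _mm_set_epi32(x2, x1, x0, 0); }
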
endprologue
movdqu xmm0, [A]
movdqu xmm1, [K]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
movdqu [A], xmm0
ret
ENDFUNC
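
A pleasant side effect of the new argument order: SHUF(d, c, b, a) now
encodes exactly the same immediate as Intel's own _MM_SHUFFLE(d, c, b, a),
which also lists indices in decreasing significance.  The word reversal used
by these byte-swapping entry points, SHUF(0, 1, 2, 3) = 0x1b, looks like
this with intrinsics:

	#include <emmintrin.h>

	/* Reverse the order of the four 32-bit words in x. */
	static __m128i rev_words(__m128i x)
		{ return _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)); }
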
endprologue
movq xmm0, [A]
movq xmm1, [K]
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
- pshufd xmm0, xmm0, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(3, 3, 0, 1)
movq [A], xmm0
ret
ENDFUNC
movd xmm2, [A + 8]
movdqu xmm1, [K]
punpcklqdq xmm0, xmm2
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
- pshufd xmm1, xmm0, SHUF(3, 2, 1, 0)
+ pshufd xmm1, xmm0, SHUF(0, 1, 2, 3)
psrldq xmm0, 4
movq [A + 0], xmm1
movd [A + 8], xmm0
movq xmm1, [A + 0]
movdqu xmm2, [K + 0]
movq xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(1, 0, 3, 3)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(3, 3, 0, 1)
movdqu [A + 8], xmm0
movq [A + 0], xmm1
#if CPUFAM_AMD64 && ABI_WIN
movdqu xmm1, [A + 0]
movdqu xmm2, [K + 0]
movdqu xmm3, [K + 16]
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
- pshufd xmm0, xmm0, SHUF(3, 2, 1, 0)
- pshufd xmm1, xmm1, SHUF(3, 2, 1, 0)
+ pshufd xmm0, xmm0, SHUF(0, 1, 2, 3)
+ pshufd xmm1, xmm1, SHUF(0, 1, 2, 3)
movdqu [A + 16], xmm0
movdqu [A + 0], xmm1
#if CPUFAM_X86
// Fourth word of the cycle, and seven or eight words of key. Do a
// byte substitution.
movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(3, 0, 1, 2)
+ pshufd xmm0, xmm0, SHUF(2, 1, 0, 3)
aeskeygenassist xmm1, xmm0, 0
movd eax, xmm1
jmp 2f
// First word of the cycle. This is the complicated piece.
1: movd xmm0, eax
- pshufd xmm0, xmm0, SHUF(1, 2, 3, 0)
+ pshufd xmm0, xmm0, SHUF(0, 3, 2, 1)
aeskeygenassist xmm1, xmm0, 0
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movd eax, xmm1
xor al, [RCON]
inc RCON
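
Here `aeskeygenassist' is being used purely for its S-box: the word
rotations are done by hand with `pshufd', and the round constant is applied
separately from the table at RCON.  For contrast, a hedged sketch of the
usual whole-register formulation of one AES-128 key-schedule round (helper
name mine):

	#include <emmintrin.h>
	#include <wmmintrin.h>

	/* Call as key = expand128(key, _mm_aeskeygenassist_si128(key, 0x01));
	 * the round constant must be a compile-time immediate. */
	static __m128i expand128(__m128i key, __m128i assist)
	{
		/* Broadcast RotWord(SubWord(w3)) ^ rcon to every lane. */
		assist = _mm_shuffle_epi32(assist, _MM_SHUFFLE(3, 3, 3, 3));
		/* Running XOR of the four key words. */
		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
		key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
		return _mm_xor_si128(key, assist);
	}
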
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
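
SSE2 has no vector rotate, so each <<< becomes a left shift, a right shift,
and a combine (OR or XOR serve equally, since the two shifted parts share no
set bits).  The step above, as a hedged intrinsics sketch:

	#include <emmintrin.h>

	/* d ^= (c + b) <<< 13, lane-wise across four 32-bit words. */
	static __m128i mix13(__m128i d, __m128i c, __m128i b)
	{
		__m128i t = _mm_add_epi32(c, b);
		return _mm_xor_si128(d,
			_mm_xor_si128(_mm_slli_epi32(t, 13),
				      _mm_srli_epi32(t, 19)));
	}
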
// a ^= (d + c) <<< 18
movdqa xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// d ^= (c + b) <<< 13
movdqa xmm4, xmm2
paddd xmm4, xmm3
- pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
+ pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
movdqa xmm5, xmm4
pslld xmm4, 13
psrld xmm5, 19
// a ^= (d + c) <<< 18
movdqa xmm4, xmm1
- pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
+ pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
movdqa xmm5, xmm4
pslld xmm4, 18
psrld xmm5, 14
// input. This can be done by juggling values in registers, with the
// following fancy footwork: some row rotations, a transpose, and
// some more rotations.
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 3, 4, 9, 14
- pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) // 2, 7, 8, 13
- pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) // 1, 6, 11, 12
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14
+ pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13
+ pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12
movdqa xmm4, xmm0
movdqa xmm5, xmm3
punpckhdq xmm1, xmm3 // 5, 6, 7, 4
punpckhdq xmm2, xmm5 // 15, 12, 13, 14
- pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7
- pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11
- pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7
+ pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11
+ pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15
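
The transpose in the middle of this footwork is the standard unpack dance:
interleave rows pairwise at 32 bits, then again at 64 bits.  A hedged
sketch, leaving the surrounding row rotations to the caller:

	#include <emmintrin.h>

	/* Transpose a 4x4 matrix of 32-bit words held in r[0..3]. */
	static void transpose4(__m128i r[4])
	{
		__m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); /* a0 b0 a1 b1 */
		__m128i t1 = _mm_unpacklo_epi32(r[2], r[3]); /* c0 d0 c1 d1 */
		__m128i t2 = _mm_unpackhi_epi32(r[0], r[1]); /* a2 b2 a3 b3 */
		__m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); /* c2 d2 c3 d3 */
		r[0] = _mm_unpacklo_epi64(t0, t1);	     /* a0 b0 c0 d0 */
		r[1] = _mm_unpackhi_epi64(t0, t1);	     /* a1 b1 c1 d1 */
		r[2] = _mm_unpacklo_epi64(t2, t3);	     /* a2 b2 c2 d2 */
		r[3] = _mm_unpackhi_epi64(t2, t3);	     /* a3 b3 c3 d3 */
	}
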
// Finally we have to write out the result.
movdqu [OUT + 0], xmm0