X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e297526c6cfe427a9d70204966745651eac50fdb:/symm/salsa20-x86-sse2.S..0f23f75ff53acadf80e9d3dfd2dfd14cb526074f:/symm/salsa20-x86ish-sse2.S

diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86ish-sse2.S
similarity index 63%
rename from symm/salsa20-x86-sse2.S
rename to symm/salsa20-x86ish-sse2.S
index 7a5bd2a3..a168d79a 100644
--- a/symm/salsa20-x86-sse2.S
+++ b/symm/salsa20-x86ish-sse2.S
@@ -44,20 +44,76 @@
 	.arch	pentium4
 	.section .text

-FUNC(salsa20_core_x86_sse2)
+FUNC(salsa20_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work, but
+	// we'll run out of XMM registers and will need some properly aligned
+	// space which we'll steal from the stack.  I don't trust the stack
+	// pointer's alignment, so I'll have to mask the stack pointer, which
+	// in turn means I'll need to keep track of the old value.  Hence I'm
+	// making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+# define NR ecx
+# define IN eax
+# define OUT edx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [esp + 0]
+# define SAVE3 [esp + 16]

-	// Initial state.  We have three arguments:
-	//	[ebp + 8] is the number of rounds to do
-	//	[ebp + 12] points to the input matrix
-	//	[ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 32
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
-
-	// Prepare for the main loop.
-	mov	ecx, [ebp + 8]
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+# define NR edi
+# define IN rsi
+# define OUT rdx
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 xmm8
+# define SAVE3 xmm9
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  In
+	// two places we only need to save a copy of the input for the
+	// feedforward at the end; but the other two we want for the final
+	// permutation, so save the old values on the stack.  (We need an
+	// extra 8 bytes to align the stack.)
+
+# define NR ecx
+# define IN rdx
+# define OUT r8
+# define SAVE0 xmm6
+# define SAVE1 xmm7
+# define SAVE2 [rsp + 32]
+# define SAVE3 [rsp + 48]
+
+	sub	rsp, 64 + 8
+	movdqa	[rsp + 0], xmm6
+	movdqa	[rsp + 16], xmm7
+#endif

 	// First job is to slurp the matrix into XMM registers.  The words
 	// have already been permuted conveniently to make them line up
@@ -85,19 +141,18 @@ FUNC(salsa20_core_x86_sse2)
 	// [ 4  5  6  7] --> [ 4  9 14  3] (b, xmm1)
 	// [ 8  9 10 11]     [ 8 13  2  7] (c, xmm2)
 	// [12 13 14 15]     [12  1  6 11] (d, xmm3)
-	movdqu	xmm0, [edx + 0]
-	movdqu	xmm1, [edx + 16]
-	movdqu	xmm2, [edx + 32]
-	movdqu	xmm3, [edx + 48]
+	movdqu	xmm0, [IN + 0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]

-	// Take a copy for later.
-	movdqa	[esp + 0], xmm0
-	movdqa	[esp + 16], xmm1
-	movdqa	xmm6, xmm2
-	movdqa	xmm7, xmm3
+	// Take a copy for later.
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3

 loop:
-
 	// Apply a column quarterround to each of the columns simultaneously.
 	// Alas, there doesn't seem to be a packed doubleword rotate, so we
 	// have to synthesize it.
@@ -147,9 +202,9 @@ loop:
 	// involve any movement of elements between rows.
 	//
 	// [ 0  5 10 15]     [ 0  5 10 15] (a, xmm0)
-	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
-	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
-	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
+	// [ 4  9 14  3] --> [ 1  6 11 12] (b, xmm3)
+	// [ 8 13  2  7]     [ 2  7  8 13] (c, xmm2)
+	// [12  1  6 11]     [ 3  4  9 14] (d, xmm1)
 	//
 	// The shuffles have quite high latency, so they've been pushed
 	// backwards into the main instruction list.
@@ -200,7 +255,7 @@ loop:
 	// back the shuffles because they take a long time coming through.
 	// Decrement the loop counter and see if we should go round again.
 	// Later processors fuse this pair into a single uop.
-	sub	ecx, 2
+	sub	NR, 2
 	ja	loop

 	// Almost there.  Firstly, the feedforward addition, and then we have
 	// to write out the result, undoing the permutation
 	// which was already applied to the input.  Shuffling has quite high
 	// latency, so arrange to start a new shuffle into a temporary as
 	// soon as we've written out the old value.
-	mov	edx, [ebp + 16]
-
-	paddd	xmm0, [esp + 0]
-	pshufd	xmm4, xmm0, ROTR
-	movd	[edx + 0], xmm0
+	paddd	xmm0, SAVE0
+	pshufd	xmm4, xmm0, ROTR
+	movd	[OUT + 0], xmm0

-	paddd	xmm1, [esp + 16]
+	paddd	xmm1, SAVE1
 	pshufd	xmm5, xmm1, ROTL
-	movd	[edx + 16], xmm1
+	movd	[OUT + 16], xmm1

-	paddd	xmm2, xmm6
+	paddd	xmm2, SAVE2
 	pshufd	xmm6, xmm2, ROT2
-	movd	[edx + 32], xmm2
+	movd	[OUT + 32], xmm2

-	paddd	xmm3, xmm7
+	paddd	xmm3, SAVE3
 	pshufd	xmm7, xmm3, ROTR
-	movd	[edx + 48], xmm3
+	movd	[OUT + 48], xmm3

-	movd	[edx + 4], xmm7
+	movd	[OUT + 4], xmm7
 	pshufd	xmm7, xmm3, ROT2
-	movd	[edx + 24], xmm7
+	movd	[OUT + 24], xmm7
 	pshufd	xmm3, xmm3, ROTL
-	movd	[edx + 44], xmm3
+	movd	[OUT + 44], xmm3

-	movd	[edx + 8], xmm6
+	movd	[OUT + 8], xmm6
 	pshufd	xmm6, xmm2, ROTL
-	movd	[edx + 28], xmm6
+	movd	[OUT + 28], xmm6
 	pshufd	xmm2, xmm2, ROTR
-	movd	[edx + 52], xmm2
+	movd	[OUT + 52], xmm2

-	movd	[edx + 12], xmm5
+	movd	[OUT + 12], xmm5
 	pshufd	xmm5, xmm1, ROTR
-	movd	[edx + 36], xmm5
+	movd	[OUT + 36], xmm5
 	pshufd	xmm1, xmm1, ROT2
-	movd	[edx + 56], xmm1
+	movd	[OUT + 56], xmm1

-	movd	[edx + 20], xmm4
+	movd	[OUT + 20], xmm4
 	pshufd	xmm4, xmm0, ROT2
-	movd	[edx + 40], xmm4
+	movd	[OUT + 40], xmm4
 	pshufd	xmm0, xmm0, ROTL
-	movd	[edx + 60], xmm0
+	movd	[OUT + 60], xmm0

 	// Tidy things up.
+
+#if CPUFAM_X86
 	mov	esp, ebp
 	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	movdqa	xmm6, [rsp + 0]
+	movdqa	xmm7, [rsp + 16]
+	add	rsp, 64 + 8
+#endif

 	// And with that, we're done.
 	ret

+#undef NR
+#undef IN
+#undef OUT
+#undef SAVE0
+#undef SAVE1
+#undef SAVE2
+#undef SAVE3
+
 ENDFUNC

///----- That's all, folks --------------------------------------------------
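
A few notes on the techniques in this change, with sketches in C.  The i386 path masks the stack pointer to get the 16-byte alignment that movdqa scratch space needs, keeping the old value in ebp so it can be restored on exit.  In C terms, the masking step performed by `and esp, ~15' is just rounding an address down to a boundary; a minimal sketch (the function name is illustrative, not Catacomb's):

	#include <stdint.h>

	/* Round an address down to a 16-byte boundary, as `and esp, ~15'
	 * does to the stack pointer. */
	static void *align16_down(void *p)
	{
		return (void *)((uintptr_t)p & ~(uintptr_t)15);
	}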
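The main loop leans on two SSE2 idioms.  SSE2 has no packed doubleword rotate, so the code synthesizes one from two shifts and an OR (pslld/psrld/por); and because the matrix diagonals have been gathered into whole registers, a single add/rotate/XOR sequence performs four quarterrounds at once, one per lane.  A minimal sketch with SSE2 intrinsics, assuming the diagonal layout described in the comments above; the names ROTL32x4 and salsa20_qround4 are mine, not Catacomb's:

	#include <emmintrin.h>

	/* Rotate each 32-bit lane of X left by B bits.  SSE2 has no packed
	 * doubleword rotate, so build it from two shifts and an OR, just as
	 * the loop does with pslld/psrld/por. */
	#define ROTL32x4(x, b)						\
		_mm_or_si128(_mm_slli_epi32((x), (b)),			\
			     _mm_srli_epi32((x), 32 - (b)))

	/* Four simultaneous Salsa20 quarterrounds on the diagonal layout:
	 * a = [0 5 10 15], b = [4 9 14 3], c = [8 13 2 7], d = [12 1 6 11]. */
	static void salsa20_qround4(__m128i *a, __m128i *b, __m128i *c,
				    __m128i *d)
	{
		*b = _mm_xor_si128(*b, ROTL32x4(_mm_add_epi32(*a, *d), 7));
		*c = _mm_xor_si128(*c, ROTL32x4(_mm_add_epi32(*b, *a), 9));
		*d = _mm_xor_si128(*d, ROTL32x4(_mm_add_epi32(*c, *b), 13));
		*a = _mm_xor_si128(*a, ROTL32x4(_mm_add_epi32(*d, *c), 18));
	}

The column round and the row round are both instances of this step; between them, the b, c and d registers are rotated lane-wise so that the next set of diagonals lines up, which is what the pshufd instructions in the loop are doing.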
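The ROTL, ROT2 and ROTR immediates given to pshufd are defined earlier in the file, outside this diff.  Assuming the conventional values ROTL = 0x93, ROT2 = 0x4e and ROTR = 0x39, each is a lane-wise rotation of a 4 x 32-bit register, which is how the final write-out undoes the input permutation.  An equivalent intrinsics sketch (function names illustrative):

	#include <emmintrin.h>

	/* Viewing a register as lanes [3 2 1 0]: ROTR moves each lane one
	 * place right, ROT2 swaps the two halves, ROTL moves one place
	 * left. */
	static __m128i rotr1(__m128i x) { return _mm_shuffle_epi32(x, 0x39); }
	static __m128i rot2 (__m128i x) { return _mm_shuffle_epi32(x, 0x4e); }
	static __m128i rotl1(__m128i x) { return _mm_shuffle_epi32(x, 0x93); }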
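Finally, for checking the routine against something simpler, here is a portable reference for the Salsa20 core in the standard word order.  Note the assembler above instead expects its input pre-permuted as described in its comments and inverts that permutation on output; the feedforward loop at the end corresponds to the paddd-from-SAVEn instructions.  The name salsa20_core_ref is illustrative, not Catacomb's API:

	#include <stdint.h>

	/* Rotate a uint32_t left by B bits, 0 < B < 32. */
	#define R(x, b) (((x) << (b)) | ((x) >> (32 - (b))))

	/* The quarterround from the Salsa20 specification. */
	#define QR(a, b, c, d) do {					\
		(b) ^= R((a) + (d), 7);  (c) ^= R((b) + (a), 9);	\
		(d) ^= R((c) + (b), 13); (a) ^= R((d) + (c), 18);	\
	} while (0)

	/* NR is the round count (e.g., 20); each iteration below is one
	 * doubleround, matching the `sub NR, 2; ja loop' control flow. */
	static void salsa20_core_ref(unsigned nr, const uint32_t in[16],
				     uint32_t out[16])
	{
		uint32_t x[16];
		unsigned i;

		for (i = 0; i < 16; i++) x[i] = in[i];
		for (i = 0; i < nr; i += 2) {
			/* Column round... */
			QR(x[ 0], x[ 4], x[ 8], x[12]);
			QR(x[ 5], x[ 9], x[13], x[ 1]);
			QR(x[10], x[14], x[ 2], x[ 6]);
			QR(x[15], x[ 3], x[ 7], x[11]);
			/* ...then row round. */
			QR(x[ 0], x[ 1], x[ 2], x[ 3]);
			QR(x[ 5], x[ 6], x[ 7], x[ 4]);
			QR(x[10], x[11], x[ 8], x[ 9]);
			QR(x[15], x[12], x[13], x[14]);
		}
		/* The feedforward: add the original input back in. */
		for (i = 0; i < 16; i++) out[i] = x[i] + in[i];
	}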