X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e297526c6cfe427a9d70204966745651eac50fdb:/symm/chacha-x86-sse2.S..0f23f75ff53acadf80e9d3dfd2dfd14cb526074f:/symm/chacha-x86ish-sse2.S

diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86ish-sse2.S
similarity index 65%
rename from symm/chacha-x86-sse2.S
rename to symm/chacha-x86ish-sse2.S
index ccdfa538..f36bf90f 100644
--- a/symm/chacha-x86-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -44,17 +44,73 @@
 	.arch	pentium4
 	.section .text
 
-FUNC(chacha_core_x86_sse2)
+FUNC(chacha_core_x86ish_sse2)
+
+	// Initial setup.
+
+#if CPUFAM_X86
+	// Arguments come in on the stack, and will need to be collected.  We
+	// can get away with just the scratch registers for integer work,
+	// but we'll run out of XMM registers and will need some properly
+	// aligned space which we'll steal from the stack.  I don't trust the
+	// stack pointer's alignment, so I'll have to mask the stack pointer,
+	// which in turn means I'll need to keep track of the old value.
+	// Hence I'm making a full i386-style stack frame here.
+	//
+	// The Windows and SysV ABIs are sufficiently similar that we don't
+	// need to worry about the differences here.
+
+#  define NR ecx
+#  define IN eax
+#  define OUT edx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 [esp]
 
-	// Initial state.  We have three arguments:
-	// [ebp +  8] is the number of rounds to do
-	// [ebp + 12] points to the input matrix
-	// [ebp + 16] points to the output matrix
 	push	ebp
 	mov	ebp, esp
 	sub	esp, 16
-	mov	edx, [ebp + 12]
+	mov	IN, [ebp + 12]
+	mov	OUT, [ebp + 16]
 	and	esp, ~15
+	mov	NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+	// This is nice.  We have plenty of XMM registers, and the arguments
+	// are in useful places.  There's no need to spill anything and we
+	// can just get on with the code.
+
+#  define NR edi
+#  define IN rsi
+#  define OUT rdx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+	// Arguments come in registers, but they're different between Windows
+	// and everyone else (and everyone else is saner).
+	//
+	// The Windows ABI insists that we preserve some of the XMM
+	// registers, but we want more than we can use as scratch space.  We
+	// only need to save a copy of the input for the feedforward at the
+	// end, so we might as well use memory rather than spill extra
+	// registers.  (We need an extra 8 bytes to align the stack.)
+
+#  define NR ecx
+#  define IN rdx
+#  define OUT r8
+#  define SAVE0 xmm5
+#  define SAVE1 [rsp +  0]
+#  define SAVE2 [rsp + 16]
+#  define SAVE3 [rsp + 32]
+
+	sub	rsp, 48 + 8
+#endif
 
 	// First job is to slurp the matrix into XMM registers.  Be careful:
 	// the input matrix isn't likely to be properly aligned.
@@ -63,20 +119,17 @@ FUNC(chacha_core_x86_sse2)
 	//
 	//	[ 0  1  2  3] (a, xmm0)
 	//	[ 4  5  6  7] (b, xmm1)
 	//	[ 8  9 10 11] (c, xmm2)
 	//	[12 13 14 15] (d, xmm3)
-	movdqu	xmm0, [edx +  0]
-	movdqu	xmm1, [edx + 16]
-	movdqu	xmm2, [edx + 32]
-	movdqu	xmm3, [edx + 48]
-
-	// Prepare for the main loop.
-	mov	ecx, [ebp + 8]
+	movdqu	xmm0, [IN +  0]
+	movdqu	xmm1, [IN + 16]
+	movdqu	xmm2, [IN + 32]
+	movdqu	xmm3, [IN + 48]
 
 	// Take a copy for later.  This one is aligned properly, by
 	// construction.
-	movdqa	[esp], xmm0
-	movdqa	xmm5, xmm1
-	movdqa	xmm6, xmm2
-	movdqa	xmm7, xmm3
+	movdqa	SAVE0, xmm0
+	movdqa	SAVE1, xmm1
+	movdqa	SAVE2, xmm2
+	movdqa	SAVE3, xmm3
 
 loop:
 	// Apply a column quarterround to each of the columns simultaneously.
@@ -174,26 +227,30 @@ loop:
 	pshufd	xmm1, xmm1, ROTL
 
 	// Decrement the loop counter and see if we should go round again.
-	sub	ecx, 2
+	sub	NR, 2
 	ja	loop
 
 	// Almost there.  Firstly, the feedforward addition.
-	mov	edx, [ebp + 16]
-	paddd	xmm0, [esp]
-	paddd	xmm1, xmm5
-	paddd	xmm2, xmm6
-	paddd	xmm3, xmm7
+	paddd	xmm0, SAVE0
+	paddd	xmm1, SAVE1
+	paddd	xmm2, SAVE2
+	paddd	xmm3, SAVE3
 
 	// And now we write out the result.  This one won't be aligned
 	// either.
-	movdqu	[edx +  0], xmm0
-	movdqu	[edx + 16], xmm1
-	movdqu	[edx + 32], xmm2
-	movdqu	[edx + 48], xmm3
+	movdqu	[OUT +  0], xmm0
+	movdqu	[OUT + 16], xmm1
+	movdqu	[OUT + 32], xmm2
+	movdqu	[OUT + 48], xmm3
 
 	// Tidy things up.
+#if CPUFAM_X86
 	mov	esp, ebp
 	pop	ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+	add	rsp, 48 + 8
+#endif
 
 	// And with that, we're done.
 	ret
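
A note on the interface, for readers comparing the three ABI blocks above: the
i386 comment that this patch deletes documented the arguments as the round
count, a pointer to the input matrix, and a pointer to the output matrix, and
the NR/IN/OUT macros simply pin those three arguments to the right register
for each calling convention.  In C terms the function looks roughly like the
sketch below; the argument order comes from the deleted comment, but the exact
types are an assumption here (Catacomb declares the real prototype with its
own matrix type):

    /* Illustrative prototype only: the argument order (rounds, input,
     * output) is taken from the deleted i386 comment; uint32_t[16]
     * stands in for Catacomb's own matrix type. */
    #include <stdint.h>

    extern void chacha_core_x86ish_sse2(unsigned nr, const uint32_t in[16],
                                        uint32_t out[16]);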
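
The arithmetic itself is unchanged by this patch, but it helps to see what
the XMM code computes.  Below is a minimal portable C sketch of the standard
ChaCha core, with illustrative names rather than Catacomb's: the copy the
assembler keeps in SAVE0..SAVE3 corresponds to keeping the input around for
the final addition, the `sub NR, 2'/`ja loop' pair corresponds to consuming
two rounds per trip round the loop, and the four paddd instructions at the
end are the feedforward.

    #include <stdint.h>
    #include <string.h>

    static uint32_t rotl32(uint32_t x, int n)
        { return (x << n) | (x >> (32 - n)); }

    /* One quarterround; the XMM code runs four of these in parallel,
     * one per 32-bit lane. */
    static void qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
        *a += *b; *d = rotl32(*d ^ *a, 16);
        *c += *d; *b = rotl32(*b ^ *c, 12);
        *a += *b; *d = rotl32(*d ^ *a,  8);
        *c += *d; *b = rotl32(*b ^ *c,  7);
    }

    static void chacha_core_ref(int nr, const uint32_t in[16],
                                uint32_t out[16])
    {
        uint32_t x[16];
        int i;

        memcpy(x, in, sizeof(x));   /* the copy kept in SAVE0..SAVE3 */
        for (; nr > 0; nr -= 2) {   /* two rounds per trip: `sub NR, 2' */
            /* Column round: quarterround each column... */
            qr(&x[0], &x[4], &x[ 8], &x[12]);
            qr(&x[1], &x[5], &x[ 9], &x[13]);
            qr(&x[2], &x[6], &x[10], &x[14]);
            qr(&x[3], &x[7], &x[11], &x[15]);
            /* ...then a diagonal round. */
            qr(&x[0], &x[5], &x[10], &x[15]);
            qr(&x[1], &x[6], &x[11], &x[12]);
            qr(&x[2], &x[7], &x[ 8], &x[13]);
            qr(&x[3], &x[4], &x[ 9], &x[14]);
        }
        /* Feedforward: the four paddd instructions at the end. */
        for (i = 0; i < 16; i++) out[i] = x[i] + in[i];
    }

The column/diagonal pairing is why the assembler only needs pshufd shuffles
between the two half-rounds (the `pshufd xmm1, xmm1, ROTL' visible above):
rotating the b, c and d rows by one, two and three lanes turns the diagonals
into columns, so the same column code serves for both half-rounds.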
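
Finally, the two stack adjustments deserve a word, since they look arbitrary.
movdqa, used for the SAVE copies, faults on addresses that aren't 16-byte
aligned, while movdqu tolerates misalignment; that's why the input and output
matrices go through movdqu but the spill slots must be aligned.  On i386 the
code gets an aligned slot by masking esp, with ebp keeping the old value for
the epilogue.  The same rounding-down trick, expressed in C (a hypothetical
helper, for illustration only):

    #include <stdint.h>

    /* The `and esp, ~15' trick: round an address down to a 16-byte
     * boundary, as movdqa requires. */
    static inline void *align_down_16(void *p)
        { return (void *)((uintptr_t)p & ~(uintptr_t)15); }

On Windows AMD64 the arithmetic in `sub rsp, 48 + 8' is: 48 bytes for the
three 16-byte SAVE1..SAVE3 slots, plus 8 more because rsp is only congruent
to 8 (mod 16) at function entry, the call having pushed an 8-byte return
address, so the extra word restores 16-byte alignment for the slots.  And
xmm6-xmm15 are callee-saved in that ABI, which is why SAVE0 can live in the
volatile xmm5 while the other three copies go to memory rather than to more
registers.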