X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e297526c6cfe427a9d70204966745651eac50fdb..0f23f75ff53acadf80e9d3dfd2dfd14cb526074f:/symm/chacha-x86ish-sse2.S

diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
new file mode 100644
index 00000000..f36bf90f
--- /dev/null
+++ b/symm/chacha-x86ish-sse2.S
@@ -0,0 +1,260 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of ChaCha
+///
+/// (c) 2015 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+        .arch pentium4
+        .section .text
+
+FUNC(chacha_core_x86ish_sse2)
+
+        // Initial setup.
+
+#if CPUFAM_X86
+        // Arguments come in on the stack, and will need to be collected.
+        // We can get away with just the scratch registers for integer
+        // work, but we'll run out of XMM registers and will need some
+        // properly aligned space which we'll steal from the stack.  I
+        // don't trust the stack pointer's alignment, so I'll have to mask
+        // the stack pointer, which in turn means I'll need to keep track
+        // of the old value.  Hence I'm making a full i386-style stack
+        // frame here.
+        //
+        // The Windows and SysV ABIs are sufficiently similar that we don't
+        // need to worry about the differences here.
+
+#  define NR ecx
+#  define IN eax
+#  define OUT edx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 [esp]
+
+        push    ebp
+        mov     ebp, esp
+        sub     esp, 16
+        mov     IN, [ebp + 12]
+        mov     OUT, [ebp + 16]
+        and     esp, ~15
+        mov     NR, [ebp + 8]
+#endif
+
+#if CPUFAM_AMD64 && ABI_SYSV
+        // This is nice.  We have plenty of XMM registers, and the
+        // arguments are in useful places.  There's no need to spill
+        // anything and we can just get on with the code.
+
+#  define NR edi
+#  define IN rsi
+#  define OUT rdx
+#  define SAVE0 xmm5
+#  define SAVE1 xmm6
+#  define SAVE2 xmm7
+#  define SAVE3 xmm8
+#endif
+
+#if CPUFAM_AMD64 && ABI_WIN
+        // Arguments come in registers, but they're different between
+        // Windows and everyone else (and everyone else is saner).
+        //
+        // The Windows ABI insists that we preserve some of the XMM
+        // registers, but we want more than we can use as scratch space.
+        // We only need to save a copy of the input for the feedforward at
+        // the end, so we might as well use memory rather than spill extra
+        // registers.  (We need an extra 8 bytes to align the stack.)
+
+#  define NR ecx
+#  define IN rdx
+#  define OUT r8
+#  define SAVE0 xmm5
+#  define SAVE1 [rsp +  0]
+#  define SAVE2 [rsp + 16]
+#  define SAVE3 [rsp + 32]
+
+        sub     rsp, 48 + 8
+#endif
+
+        // First job is to slurp the matrix into XMM registers.  Be
+        // careful: the input matrix isn't likely to be properly aligned.
+        //
+        //      [ 0  1  2  3]           (a, xmm0)
+        //      [ 4  5  6  7]           (b, xmm1)
+        //      [ 8  9 10 11]           (c, xmm2)
+        //      [12 13 14 15]           (d, xmm3)
+        movdqu  xmm0, [IN +  0]
+        movdqu  xmm1, [IN + 16]
+        movdqu  xmm2, [IN + 32]
+        movdqu  xmm3, [IN + 48]
+
+        // Take a copy for later.  This one is aligned properly, by
+        // construction.
+        movdqa  SAVE0, xmm0
+        movdqa  SAVE1, xmm1
+        movdqa  SAVE2, xmm2
+        movdqa  SAVE3, xmm3
+
+loop:
+        // Apply a column quarterround to each of the columns
+        // simultaneously.  Alas, there doesn't seem to be a packed
+        // doubleword rotate, so we have to synthesize it.
+
+        // a += b; d ^= a; d <<<= 16
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm4, xmm3
+        pslld   xmm3, 16
+        psrld   xmm4, 16
+        por     xmm3, xmm4
+
+        // c += d; b ^= c; b <<<= 12
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 12
+        psrld   xmm4, 20
+        por     xmm1, xmm4
+
+        // a += b; d ^= a; d <<<= 8
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm4, xmm3
+        pslld   xmm3, 8
+        psrld   xmm4, 24
+        por     xmm3, xmm4
+
+        // c += d; b ^= c; b <<<= 7
+        paddd   xmm2, xmm3
+        pshufd  xmm3, xmm3, ROTL
+        pxor    xmm1, xmm2
+        pshufd  xmm2, xmm2, ROT2
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 7
+        psrld   xmm4, 25
+        por     xmm1, xmm4
+
+        // The not-quite-transpose conveniently only involves reordering
+        // elements of individual rows, which can be done quite easily.  It
+        // doesn't involve any movement of elements between rows, or even
+        // renaming of the rows.
+        //
+        //      [ 0  1  2  3]           [ 0  1  2  3]   (a, xmm0)
+        //      [ 4  5  6  7]    -->    [ 5  6  7  4]   (b, xmm1)
+        //      [ 8  9 10 11]           [10 11  8  9]   (c, xmm2)
+        //      [12 13 14 15]           [15 12 13 14]   (d, xmm3)
+        //
+        // The shuffles have quite high latency, so they've mostly been
+        // pushed upwards.  The remaining one can't be moved, though.
+        pshufd  xmm1, xmm1, ROTR
+
+        // Apply the diagonal quarterround to each of the columns
+        // simultaneously.
+
+        // a += b; d ^= a; d <<<= 16
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm4, xmm3
+        pslld   xmm3, 16
+        psrld   xmm4, 16
+        por     xmm3, xmm4
+
+        // c += d; b ^= c; b <<<= 12
+        paddd   xmm2, xmm3
+        pxor    xmm1, xmm2
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 12
+        psrld   xmm4, 20
+        por     xmm1, xmm4
+
+        // a += b; d ^= a; d <<<= 8
+        paddd   xmm0, xmm1
+        pxor    xmm3, xmm0
+        movdqa  xmm4, xmm3
+        pslld   xmm3, 8
+        psrld   xmm4, 24
+        por     xmm3, xmm4
+
+        // c += d; b ^= c; b <<<= 7
+        paddd   xmm2, xmm3
+        pshufd  xmm3, xmm3, ROTR
+        pxor    xmm1, xmm2
+        pshufd  xmm2, xmm2, ROT2
+        movdqa  xmm4, xmm1
+        pslld   xmm1, 7
+        psrld   xmm4, 25
+        por     xmm1, xmm4
+
+        // Finally, finish off undoing the transpose, and we're done for
+        // this doubleround.  Again, most of this was done above so we
+        // don't have to wait for the shuffles.
+        pshufd  xmm1, xmm1, ROTL
+
+        // Decrement the loop counter and see if we should go round again.
+        sub     NR, 2
+        ja      loop
+
+        // Almost there.  Firstly, the feedforward addition.
+        paddd   xmm0, SAVE0
+        paddd   xmm1, SAVE1
+        paddd   xmm2, SAVE2
+        paddd   xmm3, SAVE3
+
+        // And now we write out the result.  This one won't be aligned
+        // either.
+        movdqu  [OUT +  0], xmm0
+        movdqu  [OUT + 16], xmm1
+        movdqu  [OUT + 32], xmm2
+        movdqu  [OUT + 48], xmm3
+
+        // Tidy things up.
+#if CPUFAM_X86
+        mov     esp, ebp
+        pop     ebp
+#endif
+#if CPUFAM_AMD64 && ABI_WIN
+        add     rsp, 48 + 8
+#endif
+
+        // And with that, we're done.
+        ret
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
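
For comparison with something simpler: the routine above applies the ChaCha core permutation, plus the final feedforward addition, to a 4x4 matrix of 32-bit words, taking the round count as its first argument and (possibly unaligned) input and output pointers as the second and third.  The portable C sketch below shows the same computation in scalar form.  It is an illustrative reference only: the names chacha_core_ref, quarterround and rotl32 are this sketch's own, not declarations from Catacomb.  Where the SSE2 code reaches the diagonal quarterrounds by shuffling rows b, c and d with pshufd and re-running the column-round instruction sequence, the C version simply indexes the diagonals directly.

#include <stdint.h>
#include <string.h>

/* Rotate a 32-bit word left by N bits (0 < N < 32). */
static uint32_t rotl32(uint32_t x, int n)
  { return (x << n) | (x >> (32 - n)); }

/* One ChaCha quarterround on words A, B, C, D of the state X. */
static void quarterround(uint32_t x[16], int a, int b, int c, int d)
{
  x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d], 16);
  x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b], 12);
  x[a] += x[b]; x[d] ^= x[a]; x[d] = rotl32(x[d],  8);
  x[c] += x[d]; x[b] ^= x[c]; x[b] = rotl32(x[b],  7);
}

/* Scalar reference for the vector routine: run NROUNDS rounds (an even
 * number, e.g. 20 for ChaCha20) over the 16-word matrix at SRC, add the
 * original input back in, and write the result to DST.
 */
static void chacha_core_ref(unsigned nrounds,
                            const uint32_t src[16], uint32_t dst[16])
{
  uint32_t x[16];
  unsigned i;

  memcpy(x, src, sizeof(x));
  for (i = 0; i < nrounds; i += 2) {
    /* Column round: quarterrounds down the four columns. */
    quarterround(x, 0, 4,  8, 12);
    quarterround(x, 1, 5,  9, 13);
    quarterround(x, 2, 6, 10, 14);
    quarterround(x, 3, 7, 11, 15);
    /* Diagonal round: quarterrounds along the four diagonals. */
    quarterround(x, 0, 5, 10, 15);
    quarterround(x, 1, 6, 11, 12);
    quarterround(x, 2, 7,  8, 13);
    quarterround(x, 3, 4,  9, 14);
  }
  for (i = 0; i < 16; i++) dst[i] = x[i] + src[i];
}

A call such as chacha_core_ref(20, src, dst) would then correspond to one ChaCha20 core invocation, matching what chacha_core_x86ish_sse2 computes when given a round count of 20.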