/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///
///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// External definitions.

#include "config.h"
#include "asm-common.h"

///--------------------------------------------------------------------------
/// Main code.

	.arch	pentium4
	.text

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.
	// We can get away with just the scratch registers for integer work,
	// but we'll run out of XMM registers and will need some properly
	// aligned space which we'll steal from the stack.  I don't trust
	// the stack pointer's alignment, so I'll have to mask the stack
	// pointer, which in turn means I'll need to keep track of the old
	// value.  Hence I'm making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

#  define NR ecx
#  define IN eax
#  define OUT edx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [esp + 0]
#  define SAVE3 [esp + 16]

	push	ebp
	mov	ebp, esp
	sub	esp, 32
	mov	IN, [ebp + 12]
	mov	OUT, [ebp + 16]
	and	esp, ~15
	mov	NR, [ebp + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

#  define NR edi
#  define IN rsi
#  define OUT rdx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 xmm8
#  define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.
	// For two of them, we only need a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old register values on the stack.  (We
	// need an extra 8 bytes to align the stack.)

#  define NR ecx
#  define IN rdx
#  define OUT r8
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [rsp + 32]
#  define SAVE3 [rsp + 48]

	sub	rsp, 64 + 8
	  .seh_stackalloc 64 + 8
	movdqa	[rsp +  0], xmm6
	  .seh_savexmm xmm6, 0
	movdqa	[rsp + 16], xmm7
	  .seh_savexmm xmm7, 16
	.seh_endprologue
#endif
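	// For orientation, here is a sketch of the C-level contract this
	// function satisfies, matching the argument registers and stack
	// slots gathered above.  The `salsa20_matrix' typedef is assumed
	// here to be an array of sixteen 32-bit words; see the C headers
	// for the real definition.
	//
	//	typedef uint32_t salsa20_matrix[16];	   // assumed layout
	//	void salsa20_core_x86ish_sse2(unsigned nr, // NR: round count
	//			const salsa20_matrix src,  // IN
	//			salsa20_matrix dst);	   // OUT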
	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)

	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns
	// simultaneously.  Alas, there doesn't seem to be a packed
	// doubleword rotate, so we have to synthesize it from a pair of
	// shifts and an OR.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again,
	// push back the shuffles because they take a long time coming
	// through.

	// Decrement the loop counter and see if we should go round again.
	// Later processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b
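	// For reference, here is a plain-C sketch of the quarterround which
	// the loop above vectorizes, acting on one column (a, b, c, d) of
	// the permuted matrix; `ROL32' is a hypothetical rotate-left helper
	// standing in for the shift/shift/OR synthesis used above.
	//
	//	#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
	//
	//	b ^= ROL32(a + d,  7);
	//	c ^= ROL32(b + a,  9);
	//	d ^= ROL32(c + b, 13);
	//	a ^= ROL32(d + c, 18);
	//
	// Each trip round the loop does this for all four columns at once,
	// and then again for all four rows, i.e., one Salsa20 double round;
	// hence NR counts down by two each time.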
	// Almost there.  Firstly, the feedforward addition, and then we
	// have to write out the result.  Here we have to undo the
	// permutation which was already applied to the input.  Shuffling
	// has quite high latency, so arrange to start a new shuffle into a
	// temporary as soon as we've written out the old value.
	paddd	xmm0, SAVE0
	pshufd	xmm4, xmm0, SHUF(0, 3, 2, 1)
	movd	[OUT +  0], xmm0

	paddd	xmm1, SAVE1
	pshufd	xmm5, xmm1, SHUF(2, 1, 0, 3)
	movd	[OUT + 16], xmm1

	paddd	xmm2, SAVE2
	pshufd	xmm6, xmm2, SHUF(1, 0, 3, 2)
	movd	[OUT + 32], xmm2

	paddd	xmm3, SAVE3
	pshufd	xmm7, xmm3, SHUF(0, 3, 2, 1)
	movd	[OUT + 48], xmm3

	movd	[OUT +  4], xmm7
	pshufd	xmm7, xmm3, SHUF(1, 0, 3, 2)
	movd	[OUT + 24], xmm7
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movd	[OUT + 44], xmm3

	movd	[OUT +  8], xmm6
	pshufd	xmm6, xmm2, SHUF(2, 1, 0, 3)
	movd	[OUT + 28], xmm6
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)
	movd	[OUT + 52], xmm2

	movd	[OUT + 12], xmm5
	pshufd	xmm5, xmm1, SHUF(0, 3, 2, 1)
	movd	[OUT + 36], xmm5
	pshufd	xmm1, xmm1, SHUF(1, 0, 3, 2)
	movd	[OUT + 56], xmm1

	movd	[OUT + 20], xmm4
	pshufd	xmm4, xmm0, SHUF(1, 0, 3, 2)
	movd	[OUT + 40], xmm4
	pshufd	xmm0, xmm0, SHUF(2, 1, 0, 3)
	movd	[OUT + 60], xmm0

	// Tidy things up.
#if CPUFAM_X86
	mov	esp, ebp
	pop	ebp
#endif
#if CPUFAM_AMD64 && ABI_WIN
	movdqa	xmm6, [rsp +  0]
	movdqa	xmm7, [rsp + 16]
	add	rsp, 64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC
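// For the curious, a plain-C sketch of the feedforward and output
// unpermutation performed by the stores above.  If `x[16]' is the working
// state and `in[16]' the caller's input, both in the permuted word order,
// then the code computes the following, where `perm' (the name is
// illustrative only) maps permuted positions back to standard matrix
// indices:
//
//	static const unsigned char perm[16] =
//		{ 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11 };
//
//	for (unsigned i = 0; i < 16; i++) out[perm[i]] = x[i] + in[i];
//
// so OUT receives the result in the standard (unpermuted) Salsa20 matrix
// order.

///----- That's all, folks --------------------------------------------------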