/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of Salsa20
///
/// (c) 2015 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(salsa20_core_x86ish_avx)
	.arch	.avx
	vzeroupper
	endprologue
	// drop through...
ENDFUNC

	.arch	pentium4

FUNC(salsa20_core_x86ish_sse2)

	// Initial setup.

#if CPUFAM_X86
	// Arguments come in on the stack, and will need to be collected.  We
	// can get away with just the scratch registers for integer work, but
	// we'll run out of XMM registers and will need some properly aligned
	// space which we'll steal from the stack.  I don't trust the stack
	// pointer's alignment, so I'll have to mask the stack pointer, which
	// in turn means I'll need to keep track of the old value.  Hence I'm
	// making a full i386-style stack frame here.
	//
	// The Windows and SysV ABIs are sufficiently similar that we don't
	// need to worry about the differences here.

#  define NR ecx
#  define IN eax
#  define OUT edx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [SP + 0]
#  define SAVE3 [SP + 16]

	pushreg	BP
	setfp
	stalloc	32
	mov	IN, [BP + 12]
	mov	OUT, [BP + 16]
	and	SP, ~15
	mov	NR, [BP + 8]
#endif

#if CPUFAM_AMD64 && ABI_SYSV
	// This is nice.  We have plenty of XMM registers, and the arguments
	// are in useful places.  There's no need to spill anything and we
	// can just get on with the code.

#  define NR edi
#  define IN rsi
#  define OUT rdx
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 xmm8
#  define SAVE3 xmm9
#endif

#if CPUFAM_AMD64 && ABI_WIN
	// Arguments come in registers, but they're different between Windows
	// and everyone else (and everyone else is saner).
	//
	// The Windows ABI insists that we preserve some of the XMM
	// registers, but we want more than we can use as scratch space.  In
	// two places we only need to save a copy of the input for the
	// feedforward at the end; but the other two we want for the final
	// permutation, so save the old values on the stack.  (We need an
	// extra 8 bytes to align the stack.)

#  define NR ecx
#  define IN rdx
#  define OUT r8
#  define SAVE0 xmm6
#  define SAVE1 xmm7
#  define SAVE2 [SP + 32]
#  define SAVE3 [SP + 48]

	stalloc	64 + 8
	savexmm	xmm6, 0
	savexmm	xmm7, 16
#endif

	endprologue
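	// For reference, the round code below computes four copies of the
	// Salsa20 quarterround in parallel.  A rough C sketch of one
	// quarterround (illustrative only -- `rotl32' and `quarterround'
	// are not part of Catacomb's interface; needs <stdint.h>):
	//
	//	static uint32_t rotl32(uint32_t x, int n)
	//		{ return (x << n) | (x >> (32 - n)); }
	//
	//	static void quarterround(uint32_t *a, uint32_t *b,
	//				 uint32_t *c, uint32_t *d)
	//	{
	//		*b ^= rotl32(*a + *d,  7);
	//		*c ^= rotl32(*b + *a,  9);
	//		*d ^= rotl32(*c + *b, 13);
	//		*a ^= rotl32(*d + *c, 18);
	//	}
	//
	// The pslld/psrld/por triples below synthesize exactly this rotate,
	// because SSE2 has no packed doubleword rotate instruction.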
	// First job is to slurp the matrix into XMM registers.  The words
	// have already been permuted conveniently to make them line up
	// better for SIMD processing.
	//
	// The textbook arrangement of the matrix is this.
	//
	//	[C K K K]
	//	[K C N N]
	//	[T T C K]
	//	[K K K C]
	//
	// But we've rotated the columns up so that the main diagonal with
	// the constants on it ends up in the first row, giving something
	// more like
	//
	//	[C C C C]
	//	[K T K K]
	//	[T K K N]
	//	[K K N K]
	//
	// so the transformation looks like this:
	//
	//	[ 0  1  2  3]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  5  6  7]    -->	[ 4  9 14  3] (b, xmm1)
	//	[ 8  9 10 11]		[ 8 13  2  7] (c, xmm2)
	//	[12 13 14 15]		[12  1  6 11] (d, xmm3)

	movdqu	xmm0, [IN +  0]
	movdqu	xmm1, [IN + 16]
	movdqu	xmm2, [IN + 32]
	movdqu	xmm3, [IN + 48]

	// Take a copy for later.
	movdqa	SAVE0, xmm0
	movdqa	SAVE1, xmm1
	movdqa	SAVE2, xmm2
	movdqa	SAVE3, xmm3

0:
	// Apply a column quarterround to each of the columns
	// simultaneously.  Alas, there doesn't seem to be a packed
	// doubleword rotate, so we have to synthesize it.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm3
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm1
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// The transpose conveniently only involves reordering elements of
	// individual rows, which can be done quite easily, and reordering
	// the rows themselves, which is a trivial renaming.  It doesn't
	// involve any movement of elements between rows.
	//
	//	[ 0  5 10 15]		[ 0  5 10 15] (a, xmm0)
	//	[ 4  9 14  3]    -->	[ 1  6 11 12] (b, xmm3)
	//	[ 8 13  2  7]		[ 2  7  8 13] (c, xmm2)
	//	[12  1  6 11]		[ 3  4  9 14] (d, xmm1)
	//
	// The shuffles have quite high latency, so they've been pushed
	// backwards into the main instruction list.

	// Apply the row quarterround to each of the columns (yes!)
	// simultaneously.

	// b ^= (a + d) <<< 7
	movdqa	xmm4, xmm0
	paddd	xmm4, xmm1
	movdqa	xmm5, xmm4
	pslld	xmm4, 7
	psrld	xmm5, 25
	por	xmm4, xmm5
	pxor	xmm3, xmm4

	// c ^= (b + a) <<< 9
	movdqa	xmm4, xmm3
	paddd	xmm4, xmm0
	movdqa	xmm5, xmm4
	pslld	xmm4, 9
	psrld	xmm5, 23
	por	xmm4, xmm5
	pxor	xmm2, xmm4

	// d ^= (c + b) <<< 13
	movdqa	xmm4, xmm2
	paddd	xmm4, xmm3
	pshufd	xmm3, xmm3, SHUF(2, 1, 0, 3)
	movdqa	xmm5, xmm4
	pslld	xmm4, 13
	psrld	xmm5, 19
	por	xmm4, xmm5
	pxor	xmm1, xmm4

	// a ^= (d + c) <<< 18
	movdqa	xmm4, xmm1
	pshufd	xmm1, xmm1, SHUF(0, 3, 2, 1)
	paddd	xmm4, xmm2
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)
	movdqa	xmm5, xmm4
	pslld	xmm4, 18
	psrld	xmm5, 14
	por	xmm4, xmm5
	pxor	xmm0, xmm4

	// We had to undo the transpose ready for the next loop.  Again,
	// the shuffles have been pushed backwards into the main instruction
	// list because they take a long time coming through.

	// Decrement the loop counter and see if we should go round again.
	// Recent processors fuse this pair into a single uop.
	sub	NR, 2
	ja	0b

	// Almost there.  Firstly, the feedforward addition.
	paddd	xmm0, SAVE0			//  0,  5, 10, 15
	paddd	xmm1, SAVE1			//  4,  9, 14,  3
	paddd	xmm2, SAVE2			//  8, 13,  2,  7
	paddd	xmm3, SAVE3			// 12,  1,  6, 11

	// Next we must undo the permutation which was already applied to
	// the input.  This can be done by juggling values in registers,
	// with the following fancy footwork: some row rotations, a
	// transpose, and some more rotations.
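	// A quick reminder of the SSE2 semantics doing the work below (a
	// sketch, not Catacomb code): writing x[0], ..., x[3] for the
	// 32-bit lanes of a register, low lane first,
	//
	//	punpckldq x, y		// x <- { x[0], y[0], x[1], y[1] }
	//	punpckhdq x, y		// x <- { x[2], y[2], x[3], y[3] }
	//
	// and, assuming SHUF packs its arguments high lane first (as the
	// lane annotations in this file suggest),
	//
	//	pshufd x, y, SHUF(p, q, r, s)	// x <- { y[s], y[r], y[q], y[p] }
	//
	// Two interleaving passes, bracketed by the pshufd row rotations,
	// put the words back in their natural order 0, 1, ..., 15; the
	// per-line annotations trace the lane contents at each step.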
	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  3,  4,  9, 14
	pshufd	xmm2, xmm2, SHUF(1, 0, 3, 2)	//  2,  7,  8, 13
	pshufd	xmm3, xmm3, SHUF(0, 3, 2, 1)	//  1,  6, 11, 12

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm3
	punpckldq xmm0, xmm2			//  0,  2,  5,  7
	punpckldq xmm3, xmm1			//  1,  3,  6,  4
	punpckhdq xmm4, xmm2			// 10,  8, 15, 13
	punpckhdq xmm5, xmm1			// 11,  9, 12, 14

	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm4
	punpckldq xmm0, xmm3			//  0,  1,  2,  3
	punpckldq xmm4, xmm5			// 10, 11,  8,  9
	punpckhdq xmm1, xmm3			//  5,  6,  7,  4
	punpckhdq xmm2, xmm5			// 15, 12, 13, 14

	pshufd	xmm1, xmm1, SHUF(2, 1, 0, 3)	//  4,  5,  6,  7
	pshufd	xmm4, xmm4, SHUF(1, 0, 3, 2)	//  8,  9, 10, 11
	pshufd	xmm2, xmm2, SHUF(0, 3, 2, 1)	// 12, 13, 14, 15

	// Finally we have to write out the result.
	movdqu	[OUT +  0], xmm0
	movdqu	[OUT + 16], xmm1
	movdqu	[OUT + 32], xmm4
	movdqu	[OUT + 48], xmm2

	// Tidy things up.
#if CPUFAM_X86
	dropfp
	popreg	BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
	rstrxmm	xmm6, 0
	rstrxmm	xmm7, 16
	stfree	64 + 8
#endif

	// And with that, we're done.
	ret

#undef NR
#undef IN
#undef OUT
#undef SAVE0
#undef SAVE1
#undef SAVE2
#undef SAVE3

ENDFUNC

///----- That's all, folks --------------------------------------------------