X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/1da1ed6a5815deef6c33d74f1eb3c856793df3e5..refs/heads/mdw/chacha-multi:/symm/chacha-x86ish-sse2.S

diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index f36bf90f..d21a6245 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -25,24 +25,24 @@
 /// MA 02111-1307, USA.
 ///--------------------------------------------------------------------------
 
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-///--------------------------------------------------------------------------
-/// Local utilities.
-
-// Magic constants for shuffling.
-#define ROTL 0x93
-#define ROT2 0x4e
-#define ROTR 0x39
+	.text
 
 ///--------------------------------------------------------------------------
 /// Main code.
 
-	.arch pentium4
-	.section .text
+FUNC(chacha_core_x86ish_avx)
+	.arch	.avx
+	vzeroupper
+	endprologue
+	// drop through...
+ENDFUNC
+
+	.arch	pentium4
 
 FUNC(chacha_core_x86ish_sse2)
 
@@ -50,12 +50,12 @@ FUNC(chacha_core_x86ish_sse2)
 
 #if CPUFAM_X86
 	// Arguments come in on the stack, and will need to be collected.  We
-	// we can get away with just the scratch registers for integer work,
-	// but we'll run out of XMM registers and will need some properly
-	// aligned space which we'll steal from the stack.  I don't trust the
-	// stack pointer's alignment, so I'll have to mask the stack pointer,
-	// which in turn means I'll need to keep track of the old value.
-	// Hence I'm making a full i386-style stack frame here.
+	// can get away with just the scratch registers for integer work, but
+	// we'll run out of XMM registers and will need some properly aligned
+	// space which we'll steal from the stack.  I don't trust the stack
+	// pointer's alignment, so I'll have to mask the stack pointer, which
+	// in turn means I'll need to keep track of the old value.  Hence I'm
+	// making a full i386-style stack frame here.
 	//
 	// The Windows and SysV ABIs are sufficiently similar that we don't
 	// need to worry about the differences here.
@@ -66,15 +66,15 @@ FUNC(chacha_core_x86ish_sse2)
 #  define SAVE0 xmm5
 #  define SAVE1 xmm6
 #  define SAVE2 xmm7
-#  define SAVE3 [esp]
-
-	push	ebp
-	mov	ebp, esp
-	sub	esp, 16
-	mov	IN, [ebp + 12]
-	mov	OUT, [ebp + 16]
-	and	esp, ~15
-	mov	NR, [ebp + 8]
+#  define SAVE3 [SP]
+
+	pushreg	BP
+	setfp
+	stalloc	16
+	mov	IN, [BP + 12]
+	mov	OUT, [BP + 16]
+	and	SP, ~15
+	mov	NR, [BP + 8]
 #endif
 
 #if CPUFAM_AMD64 && ABI_SYSV
@@ -105,13 +105,15 @@ FUNC(chacha_core_x86ish_sse2)
 #  define IN rdx
 #  define OUT r8
 #  define SAVE0 xmm5
-#  define SAVE1 [rsp + 0]
-#  define SAVE2 [rsp + 16]
-#  define SAVE3 [rsp + 32]
+#  define SAVE1 [SP + 0]
+#  define SAVE2 [SP + 16]
+#  define SAVE3 [SP + 32]
 
-	sub	rsp, 48 + 8
+	stalloc	48 + 8
 #endif
 
+	endprologue
+
 	// First job is to slurp the matrix into XMM registers.  Be careful:
 	// the input matrix isn't likely to be properly aligned.
 	//
@@ -131,7 +133,7 @@ FUNC(chacha_core_x86ish_sse2)
 	movdqa	SAVE2, xmm2
 	movdqa	SAVE3, xmm3
 
-loop:
+0:
 	// Apply a column quarterround to each of the columns simultaneously.
 	// Alas, there doesn't seem to be a packed doubleword rotate, so we
 	// have to synthesize it.
@@ -162,9 +164,9 @@ loop:
 
 	// c += d; b ^= c; b <<<= 7
 	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, ROTL
+	pshufd	xmm3, xmm3, SHUF(3, 0, 1, 2)
 	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, ROT2
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
 	movdqa	xmm4, xmm1
 	pslld	xmm1, 7
 	psrld	xmm4, 25
@@ -182,7 +184,7 @@ loop:
 	//
 	// The shuffles have quite high latency, so they've mostly been
 	// pushed upwards.  The remaining one can't be moved, though.
-	pshufd	xmm1, xmm1, ROTR
+	pshufd	xmm1, xmm1, SHUF(1, 2, 3, 0)
 
 	// Apply the diagonal quarterround to each of the columns
 	// simultaneously.
@@ -213,9 +215,9 @@ loop:
 
 	// c += d; b ^= c; b <<<= 7
 	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, ROTR
+	pshufd	xmm3, xmm3, SHUF(1, 2, 3, 0)
 	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, ROT2
+	pshufd	xmm2, xmm2, SHUF(2, 3, 0, 1)
 	movdqa	xmm4, xmm1
 	pslld	xmm1, 7
 	psrld	xmm4, 25
@@ -224,11 +226,11 @@ loop:
 	// Finally, finish off undoing the transpose, and we're done for this
 	// doubleround.  Again, most of this was done above so we don't have
 	// to wait for the shuffles.
-	pshufd	xmm1, xmm1, ROTL
+	pshufd	xmm1, xmm1, SHUF(3, 0, 1, 2)
 
 	// Decrement the loop counter and see if we should go round again.
 	sub	NR, 2
-	ja	loop
+	ja	0b
 
 	// Almost there.  Firstly, the feedforward addition.
 	paddd	xmm0, SAVE0
@@ -245,11 +247,11 @@ loop:
 
 	// Tidy things up.
 #if CPUFAM_X86
-	mov	esp, ebp
-	pop	ebp
+	dropfp
+	popreg	BP
 #endif
 #if CPUFAM_AMD64 && ABI_WIN
-	add	rsp, 48 + 8
+	stfree	48 + 8
 #endif
 
 	// And with that, we're done.
@@ -257,4 +259,138 @@ loop:
 
 ENDFUNC
 
+FUNC(chacha_multi_i386_sse2)
+	// Arguments are on the stack:
+	//
+	//	[sp +  4]	pointer to state
+	//	[sp +  8]	input pointer (or null)
+	//	[sp + 12]	output pointer
+	//	[sp + 16]	number of blocks to process
+	//	[sp + 20]	number of rounds per block
+
+	pushreg	SI
+	pushreg	DI
+	pushreg	BX
+	stalloc	4*64
+	endprologue
+
+	// Load the arguments.
+	mov	BX, [SP + 272]		// = state pointer
+	mov	SI, [SP + 276]		// = source pointer
+	mov	DI, [SP + 280]		// = destination pointer
+	mov	CX, [SP + 284]		// = block count
+	mov	DX, [SP + 288]		// = (initial) round count
+
+	// Do chunks of four blocks at a time.
+	sub	CX, 4
+	jb	8f
+
+	// Inhale the initial state.
+	movdqu	xmm1, [BX + 0]
+	movdqu	xmm3, [BX + 16]
+	movdqu	xmm5, [BX + 32]
+	movdqu	xmm0, [BX + 48]
+
+	// Set the counters and initialize the working blocks.
+	pxor	xmm2, xmm2
+	pxor	xmm4, xmm4
+	pxor	xmm6, xmm6
+	pxor	xmm7, xmm7
+
+	xor	eax, eax
+	mov	al, 1
+	pinsrw	xmm2, eax, 4
+	mov	al, 2
+	pinsrw	xmm4, eax, 4
+	mov	al, 3
+	pinsrw	xmm6, eax, 4
+	mov	al, 4
+	pinsrw	xmm7, eax, 4
+
+	movdqa	[SP + 16], xmm3
+	movdqa	[SP + 32], xmm5
+	movdqa	[SP + 48], xmm0
+
+	paddq	xmm2, xmm0
+	paddq	xmm4, xmm0
+	paddq	xmm6, xmm0
+	paddq	xmm7, xmm0
+
+	movdqu	[BX + 48], xmm7
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm1, xmm3		// a += b
+
+	movdqa	[SP + 0], xmm1
+
+	pxor	xmm0, xmm1		// d ^= a
+	pxor	xmm2, xmm1
+	pxor	xmm4, xmm1
+	pxor	xmm6, xmm1
+
+	movdqa	xmm1, xmm0
+	movdqa	xmm3, xmm2
+	movdqa	xmm5, xmm4
+	movdqa	xmm7, xmm6
+
+	pslld	xmm0, 16		// d << 16
+	pslld	xmm2, 16
+	pslld	xmm4, 16
+	pslld	xmm6, 16
+
+	psrld	xmm1, 16		// d >> 16
+	psrld	xmm3, 16
+	psrld	xmm5, 16
+	psrld	xmm7, 16
+
+	por	xmm0, xmm1		// d <<<= 16
+	movdqa	xmm1, [SP + 32]
+	por	xmm2, xmm3
+	movdqa	xmm3, [SP + 16]
+	por	xmm4, xmm5
+	por	xmm6, xmm7
+
+	movdqa	[SP + 48], xmm0
+	movdqa	[SP + 112], xmm2
+	movdqa	[SP + 176], xmm4
+	movdqa	[SP + 240], xmm6
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm0, xmm1		// c += d
+	paddd	xmm2, xmm1
+	paddd	xmm4, xmm1
+	paddd	xmm6, xmm1
+
+	movdqa	[SP + 32], xmm0
+	movdqa	[SP + 96], xmm2
+	movdqa	[SP + 160], xmm4
+	movdqa	[SP + 224], xmm6
+
+	pxor	xmm0, xmm3		// b ^= c
+	pxor	xmm2, xmm3
+	pxor	xmm4, xmm3
+	pxor	xmm6, xmm3
+
+	movdqa	xmm1, xmm0
+	movdqa	xmm3, xmm2
+	movdqa	xmm5, xmm4
+	movdqa	xmm7, xmm6
+
+	pslld	xmm0, 12		// b << 12
+	pslld	xmm2, 12
+	pslld	xmm4, 12
+	pslld	xmm6, 12
+
+	psrld	xmm1, 20		// b >> 20
+	psrld	xmm3, 20
+	psrld	xmm5, 20
+	psrld	xmm7, 20
+
+	por	xmm0, xmm1		// b <<<= 12
+	por	xmm2, xmm3
+	por	xmm4, xmm5
+	por	xmm6, xmm7
+
+ENDFUNC
+
 ///----- That's all, folks --------------------------------------------------
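
Note on the rotate synthesis.  The comments in the core routine above point out that SSE2 has no packed doubleword rotate, so each "<<<" is built from a left shift, a right shift by the complementary amount, and an OR; and that one quarterround is applied to all four columns at once, because each XMM register holds an entire row of the state.  A rough C rendering of that idea using SSE2 intrinsics (an illustration only, not Catacomb's code):

#include <emmintrin.h>

/* Rotate each 32-bit lane of X left by R: there is no packed rotate in
 * SSE2, so shift left by R, shift right by 32 - R, and OR the halves. */
#define ROTL32(x, r) \
	_mm_or_si128(_mm_slli_epi32((x), (r)), _mm_srli_epi32((x), 32 - (r)))

/* One ChaCha column round.  A, B, C, D each hold a whole row of the
 * state, so the four column quarterrounds run in parallel. */
static void chacha_colround(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
	*a = _mm_add_epi32(*a, *b); *d = ROTL32(_mm_xor_si128(*d, *a), 16);
	*c = _mm_add_epi32(*c, *d); *b = ROTL32(_mm_xor_si128(*b, *c), 12);
	*a = _mm_add_epi32(*a, *b); *d = ROTL32(_mm_xor_si128(*d, *a),  8);
	*c = _mm_add_epi32(*c, *d); *b = ROTL32(_mm_xor_si128(*b, *c),  7);
}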
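
Note on the SHUF constants.  The patch retires the magic pshufd control bytes ROTL, ROT2 and ROTR in favour of a SHUF(...) macro from asm-common.h.  Judging purely by the constants being replaced, SHUF's first argument lands in the low two bits of the control byte, i.e. argument i names the source lane for destination lane i.  A small self-contained check (the macro below is a stand-in for illustration, not the definition in asm-common.h):

#include <stdio.h>

/* Stand-in for the asm-common.h macro: destination lane i of the pshufd
 * result comes from source lane x_i, and x0 occupies the low two bits. */
#define SHUF(x0, x1, x2, x3) ((x0) | ((x1) << 2) | ((x2) << 4) | ((x3) << 6))

int main(void)
{
	/* These should print 0x93, 0x4e and 0x39: the old ROTL, ROT2 and
	 * ROTR constants, which rotate a row by one, two and three lanes
	 * when diagonalizing the state and undoing it again. */
	printf("%#x %#x %#x\n",
	       SHUF(3, 0, 1, 2), SHUF(2, 3, 0, 1), SHUF(1, 2, 3, 0));
	return 0;
}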
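
Note on the four-block batch.  The new chacha_multi_i386_sse2 routine, still unfinished on this branch, works on four blocks per pass: it keeps four working copies of the state, gives copy i the block counter value ctr + i, and writes ctr + 4 back to the caller's state so the next call carries on where this one stopped (the movdqu store to [BX + 48] above).  A scalar C sketch of that setup, assuming the conventional ChaCha layout with a 64-bit block counter in state words 12 and 13; the helper below is hypothetical and not part of Catacomb's API:

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: expand one 16-word ChaCha state into four working
 * copies for a 4-way batch, and step the caller's counter by 4. */
static void chacha_setup_batch4(uint32_t state[16], uint32_t blk[4][16])
{
	uint64_t ctr = state[12] | ((uint64_t)state[13] << 32);

	for (int i = 0; i < 4; i++) {
		memcpy(blk[i], state, 16*sizeof(uint32_t));
		blk[i][12] = (uint32_t)(ctr + i);	  /* counter low */
		blk[i][13] = (uint32_t)((ctr + i) >> 32); /* counter high */
	}

	/* Write the stepped counter back so the next batch continues. */
	state[12] = (uint32_t)(ctr + 4);
	state[13] = (uint32_t)((ctr + 4) >> 32);
}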