From: Mark Wooding
Date: Tue, 26 Jul 2022 10:26:58 +0000 (+0100)
Subject: @@@ i386 wip
X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/commitdiff_plain/22bace226c338f994b99ac8cdcdde60b805e7b05

@@@ i386 wip
---

diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S
index 974ec5b5..d21a6245 100644
--- a/symm/chacha-x86ish-sse2.S
+++ b/symm/chacha-x86ish-sse2.S
@@ -259,4 +259,155 @@ FUNC(chacha_core_x86ish_sse2)
 
 ENDFUNC
 
+FUNC(chacha_multi_i386_sse2)
+	// Multi-block ChaCha for i386 with SSE2 (work in progress).
+	//
+	// Arguments are on the stack:
+	//
+	//	[sp +  4]	pointer to state
+	//	[sp +  8]	input pointer (or null)
+	//	[sp + 12]	output pointer
+	//	[sp + 16]	number of blocks to process
+	//	[sp + 20]	number of rounds per block
+	//
+	// The frame holds four 64-byte slots, one per block: block k lives at
+	// [SP + 64*k], with rows a, b, c, d at offsets 0, 16, 32, 48.
+	//
+	// NOTE(review): unfinished.  Only the first half of one quarter-round
+	// exists so far: there is no round loop, no output stage, no `8:'
+	// label for the branch below, and no epilogue or return.
+	//
+	// NOTE(review): the movdqa frame accesses need SP to be 16-byte
+	// aligned after the prologue; nothing here enforces that -- confirm.
+
+	pushreg	SI
+	pushreg	DI
+	pushreg	BX
+	stalloc	4*64
+	endprologue
+
+	// Load the arguments.  The offsets assume three 4-byte saved
+	// registers plus the 256-byte frame below the return address.
+	mov	BX, [SP + 272]		// = state pointer
+	mov	SI, [SP + 276]		// = source pointer
+	mov	DI, [SP + 280]		// = destination pointer
+	mov	CX, [SP + 284]		// = block count
+	mov	DX, [SP + 288]		// = (initial) round count
+
+	// Do chunks of four blocks at a time.
+	sub	CX, 4
+	jb	8f
+
+	// Inhale the initial state.
+	movdqu	xmm1, [BX +  0]		// a
+	movdqu	xmm3, [BX + 16]		// b
+	movdqu	xmm5, [BX + 32]		// c
+	movdqu	xmm0, [BX + 48]		// d
+
+	// Set the counters and initialize the working blocks.  Word 4 is the
+	// low 16 bits of the upper 64-bit lane, so these are the per-block
+	// increments 1, 2, 3 and 4 for that lane.
+	pxor	xmm2, xmm2
+	pxor	xmm4, xmm4
+	pxor	xmm6, xmm6
+	pxor	xmm7, xmm7
+
+	xor	eax, eax
+	mov	al, 1
+	pinsrw	xmm2, eax, 4
+	mov	al, 2
+	pinsrw	xmm4, eax, 4
+	mov	al, 3
+	pinsrw	xmm6, eax, 4
+	mov	al, 4
+	pinsrw	xmm7, eax, 4
+
+	movdqa	[SP + 16], xmm3		// save b
+	movdqa	[SP + 32], xmm5		// save c
+	movdqa	[SP + 48], xmm0		// save d
+
+	// Offset row d (xmm0), not row b: the sums are used as d below, and
+	// xmm7 is written back over the state's d row.
+	paddq	xmm2, xmm0		// d for block 1
+	paddq	xmm4, xmm0		// d for block 2
+	paddq	xmm6, xmm0		// d for block 3
+	paddq	xmm7, xmm0		// d + 4, stored back to the state
+
+	movdqu	[BX + 48], xmm7
+
+	// a += b; d ^= a; d <<<= 16
+	paddd	xmm1, xmm3		// a += b
+
+	movdqa	[SP + 0], xmm1
+
+	pxor	xmm0, xmm1		// d ^= a
+	pxor	xmm2, xmm1
+	pxor	xmm4, xmm1
+	pxor	xmm6, xmm1
+
+	movdqa	xmm1, xmm0
+	movdqa	xmm3, xmm2
+	movdqa	xmm5, xmm4
+	movdqa	xmm7, xmm6
+
+	pslld	xmm0, 16		// d << 16
+	pslld	xmm2, 16
+	pslld	xmm4, 16
+	pslld	xmm6, 16
+
+	psrld	xmm1, 16		// d >> 16
+	psrld	xmm3, 16
+	psrld	xmm5, 16
+	psrld	xmm7, 16
+
+	por	xmm0, xmm1		// d <<<= 16
+	movdqa	xmm1, [SP + 32]		// = c
+	por	xmm2, xmm3
+	movdqa	xmm3, [SP + 16]		// = b
+	por	xmm4, xmm5
+	por	xmm6, xmm7
+
+	movdqa	[SP + 48], xmm0
+	movdqa	[SP + 112], xmm2
+	movdqa	[SP + 176], xmm4
+	movdqa	[SP + 240], xmm6
+
+	// c += d; b ^= c; b <<<= 12
+	paddd	xmm0, xmm1		// c += d
+	paddd	xmm2, xmm1
+	paddd	xmm4, xmm1
+	paddd	xmm6, xmm1
+
+	movdqa	[SP + 32], xmm0		// each block keeps its own c
+	movdqa	[SP + 96], xmm2
+	movdqa	[SP + 160], xmm4
+	movdqa	[SP + 224], xmm6
+
+	pxor	xmm0, xmm3		// b ^= c
+	pxor	xmm2, xmm3
+	pxor	xmm4, xmm3
+	pxor	xmm6, xmm3
+
+	movdqa	xmm1, xmm0
+	movdqa	xmm3, xmm2
+	movdqa	xmm5, xmm4
+	movdqa	xmm7, xmm6
+
+	pslld	xmm0, 12		// b << 12
+	pslld	xmm2, 12
+	pslld	xmm4, 12
+	pslld	xmm6, 12
+
+	psrld	xmm1, 20		// b >> 20
+	psrld	xmm3, 20
+	psrld	xmm5, 20
+	psrld	xmm7, 20
+
+	por	xmm0, xmm1		// b <<<= 12
+	por	xmm2, xmm3
+	por	xmm4, xmm5
+	por	xmm6, xmm7
+
+ENDFUNC
+
 ///----- That's all, folks --------------------------------------------------