X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/609affae0305784d87f2357488fba35699a04098..aac19f0d10e3e1a41ddb5ca3fdbdaab97d2d439d:/symm/chacha-x86ish-sse2.S diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S index 2dab283b..77047ebe 100644 --- a/symm/chacha-x86ish-sse2.S +++ b/symm/chacha-x86ish-sse2.S @@ -33,9 +33,17 @@ ///-------------------------------------------------------------------------- /// Main code. - .arch pentium4 .text +FUNC(chacha_core_x86ish_avx) + .arch .avx + vzeroupper + endprologue + // drop through... +ENDFUNC + + .arch pentium4 + FUNC(chacha_core_x86ish_sse2) // Initial setup. @@ -156,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) + pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -176,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2) // // The shuffles have quite high latency, so they've mostly been // pushed upwards. The remaining one can't be moved, though. - pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) + pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) // Apply the diagonal quarterround to each of the columns // simultaneously. @@ -207,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) + pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -218,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2) // Finally, finish off undoing the transpose, and we're done for this // doubleround. Again, most of this was done above so we don't have // to wait for the shuffles. - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // Decrement the loop counter and see if we should go round again. sub NR, 2