X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/b9b279b4105524d5d4e5dcd389141645d904aa0c..6d2bd7f11dcd292461bbd66e487e4367f05f9fe8:/symm/chacha-x86ish-sse2.S diff --git a/symm/chacha-x86ish-sse2.S b/symm/chacha-x86ish-sse2.S index b8f72d53..974ec5b5 100644 --- a/symm/chacha-x86ish-sse2.S +++ b/symm/chacha-x86ish-sse2.S @@ -25,16 +25,16 @@ /// MA 02111-1307, USA. ///-------------------------------------------------------------------------- -/// External definitions. +/// Preliminaries. #include "config.h" #include "asm-common.h" + .text + ///-------------------------------------------------------------------------- /// Main code. - .text - FUNC(chacha_core_x86ish_avx) .arch .avx vzeroupper @@ -66,15 +66,15 @@ FUNC(chacha_core_x86ish_sse2) # define SAVE0 xmm5 # define SAVE1 xmm6 # define SAVE2 xmm7 -# define SAVE3 [esp] - - pushreg ebp - setfp ebp - sub esp, 16 - mov IN, [ebp + 12] - mov OUT, [ebp + 16] - and esp, ~15 - mov NR, [ebp + 8] +# define SAVE3 [SP] + + pushreg BP + setfp + stalloc 16 + mov IN, [BP + 12] + mov OUT, [BP + 16] + and SP, ~15 + mov NR, [BP + 8] #endif #if CPUFAM_AMD64 && ABI_SYSV @@ -105,9 +105,9 @@ FUNC(chacha_core_x86ish_sse2) # define IN rdx # define OUT r8 # define SAVE0 xmm5 -# define SAVE1 [rsp + 0] -# define SAVE2 [rsp + 16] -# define SAVE3 [rsp + 32] +# define SAVE1 [SP + 0] +# define SAVE2 [SP + 16] +# define SAVE3 [SP + 32] stalloc 48 + 8 #endif @@ -164,9 +164,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) + pshufd xmm3, xmm3, SHUF(3, 0, 1, 2) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -184,7 +184,7 @@ FUNC(chacha_core_x86ish_sse2) // // The shuffles have quite high latency, so they've mostly been // pushed upwards. The remaining one can't be moved, though. - pshufd xmm1, xmm1, SHUF(0, 3, 2, 1) + pshufd xmm1, xmm1, SHUF(1, 2, 3, 0) // Apply the diagonal quarterround to each of the columns // simultaneously. @@ -215,9 +215,9 @@ FUNC(chacha_core_x86ish_sse2) // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) + pshufd xmm3, xmm3, SHUF(1, 2, 3, 0) pxor xmm1, xmm2 - pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) + pshufd xmm2, xmm2, SHUF(2, 3, 0, 1) movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -226,7 +226,7 @@ FUNC(chacha_core_x86ish_sse2) // Finally, finish off undoing the transpose, and we're done for this // doubleround. Again, most of this was done above so we don't have // to wait for the shuffles. - pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) + pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // Decrement the loop counter and see if we should go round again. sub NR, 2 @@ -248,7 +248,7 @@ FUNC(chacha_core_x86ish_sse2) // Tidy things up. #if CPUFAM_X86 dropfp - popreg ebp + popreg BP #endif #if CPUFAM_AMD64 && ABI_WIN stfree 48 + 8