/// MA 02111-1307, USA.
///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
#include "config.h"
#include "asm-common.h"
+ .text
+
///--------------------------------------------------------------------------
/// Main code.
- .arch pentium4
- .text
+FUNC(chacha_core_x86ish_avx)
+ .arch .avx
+ vzeroupper
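+ // (`vzeroupper' clears the upper halves of the ymm registers, so
+ // that the SSE2 code we fall through to doesn't suffer AVX-to-SSE
+ // transition stalls.)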
+ endprologue
+ // drop through...
+ENDFUNC
+
+ .arch pentium4
FUNC(chacha_core_x86ish_sse2)
# define SAVE0 xmm5
# define SAVE1 xmm6
# define SAVE2 xmm7
-# define SAVE3 [esp]
-
- pushreg ebp
- setfp ebp
- sub esp, 16
- mov IN, [ebp + 12]
- mov OUT, [ebp + 16]
- and esp, ~15
- mov NR, [ebp + 8]
+# define SAVE3 [SP]
+
+ pushreg BP
+ setfp
+ stalloc 16
+ mov IN, [BP + 12]
+ mov OUT, [BP + 16]
+ and SP, ~15
+ mov NR, [BP + 8]
#endif
#if CPUFAM_AMD64 && ABI_WIN
# define IN rdx
# define OUT r8
# define SAVE0 xmm5
-# define SAVE1 [rsp + 0]
-# define SAVE2 [rsp + 16]
-# define SAVE3 [rsp + 32]
+# define SAVE1 [SP + 0]
+# define SAVE2 [SP + 16]
+# define SAVE3 [SP + 32]
stalloc 48 + 8
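+ // (The 48 bytes hold the three spilled save slots SAVE1--SAVE3;
+ // the extra 8, together with the return address already pushed,
+ // restores 16-byte stack alignment so aligned stores to the save
+ // slots are safe.)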
#endif
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(2, 1, 0, 3)
+ pshufd xmm3, xmm3, SHUF(3, 0, 1, 2)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
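+ // (SSE2 has no vector-rotate instruction, so `b <<<= 7' must be
+ // synthesized from two shifts and an OR. In full, the idiom is:
+ //
+ // movdqa xmm4, xmm1 // copy b
+ // pslld xmm1, 7 // b << 7
+ // psrld xmm4, 25 // b >> 25
+ // por xmm1, xmm4 // b <<<= 7
+ //
+ // with the final `por', elided from this excerpt, combining the
+ // two halves.)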
//
// The shuffles have quite high latency, so they've mostly been
// pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, SHUF(0, 3, 2, 1)
+ pshufd xmm1, xmm1, SHUF(1, 2, 3, 0)
// Apply the diagonal quarterround to each of the columns
// simultaneously.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, SHUF(0, 3, 2, 1)
+ pshufd xmm3, xmm3, SHUF(1, 2, 3, 0)
pxor xmm1, xmm2
- pshufd xmm2, xmm2, SHUF(1, 0, 3, 2)
+ pshufd xmm2, xmm2, SHUF(2, 3, 0, 1)
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
// Finally, finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// to wait for the shuffles.
- pshufd xmm1, xmm1, SHUF(2, 1, 0, 3)
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2)
// Decrement the loop counter and see if we should go round again.
sub NR, 2
// Tidy things up.
#if CPUFAM_X86
dropfp
- popreg ebp
+ popreg BP
#endif
#if CPUFAM_AMD64 && ABI_WIN
stfree 48 + 8
ENDFUNC
+FUNC(chacha_multi_i386_sse2)
+ // Arguments are on the stack:
+ //
+ // [sp + 4] pointer to state
+ // [sp + 8] input pointer (or null)
+ // [sp + 12] output pointer
+ // [sp + 16] number of blocks to process
+ // [sp + 20] number of rounds per block
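+ //
+ // As a rough C-level sketch (for orientation only; the types and
+ // names are illustrative, not taken from any header):
+ //
+ // void chacha_multi_i386_sse2(uint32 *state, const void *src,
+ // void *dst, size_t nblock, unsigned nr);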
+
+ pushreg SI
+ pushreg DI
+ pushreg BX
+ stalloc 4*64
+ endprologue
+
+ // Load the arguments.
+ mov BX, [SP + 272] // = state pointer
+ mov SI, [SP + 276] // = source pointer
+ mov DI, [SP + 280] // = destination pointer
+ mov CX, [SP + 284] // = block count
+ mov DX, [SP + 288] // = (initial) round count
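+
+ // (SP now sits 268 bytes below its value at entry: 12 for the
+ // three saved registers plus 256 for the frame; so what was
+ // [sp + 4] at entry is [SP + 272] here, and so on.)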
+
+ // Do chunks of four blocks at a time.
+ sub CX, 4 // at least four blocks left?
+ jb 8f // no: skip the four-way code
+
+ // Inhale the initial state.
+ movdqu xmm1, [BX + 0] // row a: the constants
+ movdqu xmm3, [BX + 16] // row b: first key half
+ movdqu xmm5, [BX + 32] // row c: second key half
+ movdqu xmm0, [BX + 48] // row d: counter and nonce
+
+ // Set the counters and initialize the working blocks.
+ pxor xmm2, xmm2
+ pxor xmm4, xmm4
+ pxor xmm6, xmm6
+ pxor xmm7, xmm7
+
+ xor eax, eax
+ mov al, 1
+ pinsrw xmm2, eax, 0 // block 1: counter increment 1
+ mov al, 2
+ pinsrw xmm4, eax, 0 // block 2: counter increment 2
+ mov al, 3
+ pinsrw xmm6, eax, 0 // block 3: counter increment 3
+ mov al, 4
+ pinsrw xmm7, eax, 0 // next chunk's starting counter
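+
+ // (Each pinsrw plants its increment in the low word of an
+ // otherwise-zero register, so the paddq instructions below add it
+ // to the 64-bit counter in the low qword of row d, carrying into
+ // the counter's high dword.)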
+
+ movdqa [SP + 16], xmm3 // save the shared b row
+ movdqa [SP + 32], xmm5 // save the shared c row
+ movdqa [SP + 48], xmm0 // save block 0's d row
+
+ paddq xmm2, xmm0 // d + 1 for block 1
+ paddq xmm4, xmm0 // d + 2 for block 2
+ paddq xmm6, xmm0 // d + 3 for block 3
+ paddq xmm7, xmm0 // d + 4: next starting counter
+
+ movdqu [BX + 48], xmm7 // save the advanced counter
+
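+ // The four blocks start out identical apart from their counters,
+ // so rows a, b, and c are processed once while they remain shared;
+ // as each row diverges (d immediately, c after `c += d', b after
+ // `b ^= c'), per-block copies are kept in xmm registers or spilled
+ // into the four 64-byte frames on the stack.
+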
+ // a += b; d ^= a; d <<<= 16
+ paddd xmm1, xmm3 // a += b
+
+ movdqa [SP + 0], xmm1 // save the (still shared) new a row
+
+ pxor xmm0, xmm1 // d ^= a
+ pxor xmm2, xmm1
+ pxor xmm4, xmm1
+ pxor xmm6, xmm1
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ movdqa xmm7, xmm6
+
+ pslld xmm0, 16 // d << 16
+ pslld xmm2, 16
+ pslld xmm4, 16
+ pslld xmm6, 16
+
+ psrld xmm1, 16 // d >> 16
+ psrld xmm3, 16
+ psrld xmm5, 16
+ psrld xmm7, 16
+
+ por xmm0, xmm1 // d <<<= 16
+ movdqa xmm1, [SP + 32] // reload the shared c row
+ por xmm2, xmm3
+ movdqa xmm3, [SP + 16] // reload the shared b row
+ por xmm4, xmm5
+ por xmm6, xmm7
+
+ movdqa [SP + 48], xmm0 // stash the new d rows, one per
+ movdqa [SP + 112], xmm2 // 64-byte block frame
+ movdqa [SP + 176], xmm4
+ movdqa [SP + 240], xmm6
+
+ // c += d; b ^= c; b <<<= 12
+ paddd xmm0, xmm1 // c += d
+ paddd xmm2, xmm1
+ paddd xmm4, xmm1
+ paddd xmm6, xmm1
+
+ movdqa [SP + 32], xmm0 // stash the new c rows likewise
+ movdqa [SP + 96], xmm2
+ movdqa [SP + 160], xmm4
+ movdqa [SP + 224], xmm6
+
+ pxor xmm0, xmm3 // b ^= c
+ pxor xmm2, xmm3
+ pxor xmm4, xmm3
+ pxor xmm6, xmm3
+
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ movdqa xmm5, xmm4
+ movdqa xmm7, xmm6
+
+ pslld xmm0, 12 // b << 12
+ pslld xmm2, 12
+ pslld xmm4, 12
+ pslld xmm6, 12
+
+ psrld xmm1, 20 // b >> 20
+ psrld xmm3, 20
+ psrld xmm5, 20
+ psrld xmm7, 20
+
+ por xmm0, xmm1 // b <<<= 12
+ por xmm2, xmm3
+ por xmm4, xmm5
+ por xmm6, xmm7
+
+ENDFUNC
+
///----- That's all, folks --------------------------------------------------