From 47103664f22c3c2d4b51578d0bec226f778329af Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Wed, 18 May 2016 10:29:03 +0100 Subject: [PATCH] symm/*.S: Symbolic names for shuffles. The magic constants for the various shuffles (actually, all rotations) have irritated me. Replace them with names, now we have a preprocessor. --- symm/chacha-x86-sse2.S | 20 ++++++++++++++------ symm/rijndael-x86-aesni.S | 14 +++++++++++--- symm/salsa20-x86-sse2.S | 44 ++++++++++++++++++++++++++------------------ 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86-sse2.S index 87fb0965..ccdfa538 100644 --- a/symm/chacha-x86-sse2.S +++ b/symm/chacha-x86-sse2.S @@ -31,6 +31,14 @@ #include "asm-common.h" ///-------------------------------------------------------------------------- +/// Local utilities. + +// Magic constants for shuffling. +#define ROTL 0x93 +#define ROT2 0x4e +#define ROTR 0x39 + +///-------------------------------------------------------------------------- /// Main code. .arch pentium4 @@ -101,9 +109,9 @@ loop: // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, 0x93 + pshufd xmm3, xmm3, ROTL pxor xmm1, xmm2 - pshufd xmm2, xmm2, 0x4e + pshufd xmm2, xmm2, ROT2 movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -121,7 +129,7 @@ loop: // // The shuffles have quite high latency, so they've mostly been // pushed upwards. The remaining one can't be moved, though. - pshufd xmm1, xmm1, 0x39 + pshufd xmm1, xmm1, ROTR // Apply the diagonal quarterround to each of the columns // simultaneously. @@ -152,9 +160,9 @@ loop: // c += d; b ^= c; b <<<= 7 paddd xmm2, xmm3 - pshufd xmm3, xmm3, 0x39 + pshufd xmm3, xmm3, ROTR pxor xmm1, xmm2 - pshufd xmm2, xmm2, 0x4e + pshufd xmm2, xmm2, ROT2 movdqa xmm4, xmm1 pslld xmm1, 7 psrld xmm4, 25 @@ -163,7 +171,7 @@ loop: // Finally, finish off undoing the transpose, and we're done for this // doubleround. Again, most of this was done above so we don't have // to wait for the shuffles. - pshufd xmm1, xmm1, 0x93 + pshufd xmm1, xmm1, ROTL // Decrement the loop counter and see if we should go round again. sub ecx, 2 diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86-aesni.S index d9aa9dc9..eba7b058 100644 --- a/symm/rijndael-x86-aesni.S +++ b/symm/rijndael-x86-aesni.S @@ -37,6 +37,14 @@ .globl F(rijndael_rcon) ///-------------------------------------------------------------------------- +/// Local utilities. + +// Magic constants for shuffling. +#define ROTL 0x93 +#define ROT2 0x4e +#define ROTR 0x39 + +///-------------------------------------------------------------------------- /// Main code. .arch .aes @@ -119,9 +127,9 @@ FUNC(rijndael_setup_x86_aesni) // open-coding the whole thing. It's much easier to leave that as // zero and XOR in the round constant by hand. 9: movd xmm0, eax - pshufd xmm0, xmm0, 0x39 + pshufd xmm0, xmm0, ROTR aeskeygenassist xmm1, xmm0, 0 - pshufd xmm1, xmm1, 0x93 + pshufd xmm1, xmm1, ROTL movd eax, xmm1 xor eax, [esi] xor al, [ecx] @@ -159,7 +167,7 @@ FUNC(rijndael_setup_x86_aesni) cmp ebx, 7 jb 0f movd xmm0, eax - pshufd xmm0, xmm0, 0x93 + pshufd xmm0, xmm0, ROTL aeskeygenassist xmm1, xmm0, 0 movd eax, xmm1 0: xor eax, [esi] diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86-sse2.S index 5a13fd49..7a5bd2a3 100644 --- a/symm/salsa20-x86-sse2.S +++ b/symm/salsa20-x86-sse2.S @@ -31,6 +31,14 @@ #include "asm-common.h" ///-------------------------------------------------------------------------- +/// Local utilities. + +// Magic constants for shuffling. +#define ROTL 0x93 +#define ROT2 0x4e +#define ROTR 0x39 + +///-------------------------------------------------------------------------- /// Main code. .arch pentium4 @@ -115,7 +123,7 @@ loop: // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm1 - pshufd xmm1, xmm1, 0x93 + pshufd xmm1, xmm1, ROTL movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -124,9 +132,9 @@ loop: // a ^= (d + c) <<< 18 movdqa xmm4, xmm3 - pshufd xmm3, xmm3, 0x39 + pshufd xmm3, xmm3, ROTR paddd xmm4, xmm2 - pshufd xmm2, xmm2, 0x4e + pshufd xmm2, xmm2, ROT2 movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -170,7 +178,7 @@ loop: // d ^= (c + b) <<< 13 movdqa xmm4, xmm2 paddd xmm4, xmm3 - pshufd xmm3, xmm3, 0x93 + pshufd xmm3, xmm3, ROTL movdqa xmm5, xmm4 pslld xmm4, 13 psrld xmm5, 19 @@ -179,9 +187,9 @@ loop: // a ^= (d + c) <<< 18 movdqa xmm4, xmm1 - pshufd xmm1, xmm1, 0x39 + pshufd xmm1, xmm1, ROTR paddd xmm4, xmm2 - pshufd xmm2, xmm2, 0x4e + pshufd xmm2, xmm2, ROT2 movdqa xmm5, xmm4 pslld xmm4, 18 psrld xmm5, 14 @@ -203,43 +211,43 @@ loop: mov edx, [ebp + 16] paddd xmm0, [esp + 0] - pshufd xmm4, xmm0, 0x39 + pshufd xmm4, xmm0, ROTR movd [edx + 0], xmm0 paddd xmm1, [esp + 16] - pshufd xmm5, xmm1, 0x93 + pshufd xmm5, xmm1, ROTL movd [edx + 16], xmm1 paddd xmm2, xmm6 - pshufd xmm6, xmm2, 0x4e + pshufd xmm6, xmm2, ROT2 movd [edx + 32], xmm2 paddd xmm3, xmm7 - pshufd xmm7, xmm3, 0x39 + pshufd xmm7, xmm3, ROTR movd [edx + 48], xmm3 movd [edx + 4], xmm7 - pshufd xmm7, xmm3, 0x4e + pshufd xmm7, xmm3, ROT2 movd [edx + 24], xmm7 - pshufd xmm3, xmm3, 0x93 + pshufd xmm3, xmm3, ROTL movd [edx + 44], xmm3 movd [edx + 8], xmm6 - pshufd xmm6, xmm2, 0x93 + pshufd xmm6, xmm2, ROTL movd [edx + 28], xmm6 - pshufd xmm2, xmm2, 0x39 + pshufd xmm2, xmm2, ROTR movd [edx + 52], xmm2 movd [edx + 12], xmm5 - pshufd xmm5, xmm1, 0x39 + pshufd xmm5, xmm1, ROTR movd [edx + 36], xmm5 - pshufd xmm1, xmm1, 0x4e + pshufd xmm1, xmm1, ROT2 movd [edx + 56], xmm1 movd [edx + 20], xmm4 - pshufd xmm4, xmm0, 0x4e + pshufd xmm4, xmm0, ROT2 movd [edx + 40], xmm4 - pshufd xmm0, xmm0, 0x93 + pshufd xmm0, xmm0, ROTL movd [edx + 60], xmm0 // Tidy things up. -- 2.11.0