From 47103664f22c3c2d4b51578d0bec226f778329af Mon Sep 17 00:00:00 2001
From: Mark Wooding <mdw@distorted.org.uk>
Date: Wed, 18 May 2016 10:29:03 +0100
Subject: [PATCH] symm/*.S: Symbolic names for shuffles.

The magic constants for the various shuffles (actually, all rotations)
have irritated me.  Replace them with names, now that we have a preprocessor.
---
 symm/chacha-x86-sse2.S    | 20 ++++++++++++++------
 symm/rijndael-x86-aesni.S | 14 +++++++++++---
 symm/salsa20-x86-sse2.S   | 44 ++++++++++++++++++++++++++------------------
 3 files changed, 51 insertions(+), 27 deletions(-)

diff --git a/symm/chacha-x86-sse2.S b/symm/chacha-x86-sse2.S
index 87fb0965..ccdfa538 100644
--- a/symm/chacha-x86-sse2.S
+++ b/symm/chacha-x86-sse2.S
@@ -31,6 +31,14 @@
 #include "asm-common.h"
 
 ///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Immediates for pshufd which rotate the four 32-bit lanes of a vector.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
 /// Main code.
 
 	.arch pentium4
@@ -101,9 +109,9 @@ loop:
 
 	// c += d; b ^= c; b <<<=  7
 	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, 0x93
+	pshufd	xmm3, xmm3, ROTL
 	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, 0x4e
+	pshufd	xmm2, xmm2, ROT2
 	movdqa	xmm4, xmm1
 	pslld	xmm1, 7
 	psrld	xmm4, 25
@@ -121,7 +129,7 @@ loop:
 	//
 	// The shuffles have quite high latency, so they've mostly been
 	// pushed upwards.  The remaining one can't be moved, though.
-	pshufd	xmm1, xmm1, 0x39
+	pshufd	xmm1, xmm1, ROTR
 
 	// Apply the diagonal quarterround to each of the columns
 	// simultaneously.
@@ -152,9 +160,9 @@ loop:
 
 	// c += d; b ^= c; b <<<=  7
 	paddd	xmm2, xmm3
-	pshufd	xmm3, xmm3, 0x39
+	pshufd	xmm3, xmm3, ROTR
 	pxor	xmm1, xmm2
-	pshufd	xmm2, xmm2, 0x4e
+	pshufd	xmm2, xmm2, ROT2
 	movdqa	xmm4, xmm1
 	pslld	xmm1, 7
 	psrld	xmm4, 25
@@ -163,7 +171,7 @@ loop:
 	// Finally, finish off undoing the transpose, and we're done for this
 	// doubleround.  Again, most of this was done above so we don't have
 	// to wait for the shuffles.
-	pshufd	xmm1, xmm1, 0x93
+	pshufd	xmm1, xmm1, ROTL
 
 	// Decrement the loop counter and see if we should go round again.
 	sub	ecx, 2
diff --git a/symm/rijndael-x86-aesni.S b/symm/rijndael-x86-aesni.S
index d9aa9dc9..eba7b058 100644
--- a/symm/rijndael-x86-aesni.S
+++ b/symm/rijndael-x86-aesni.S
@@ -37,6 +37,14 @@
 	.globl	F(rijndael_rcon)
 
 ///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Immediates for pshufd which rotate the four 32-bit lanes of a vector.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
 /// Main code.
 
 	.arch	.aes
@@ -119,9 +127,9 @@ FUNC(rijndael_setup_x86_aesni)
 	// open-coding the whole thing.  It's much easier to leave that as
 	// zero and XOR in the round constant by hand.
 9:	movd	xmm0, eax
-	pshufd	xmm0, xmm0, 0x39
+	pshufd	xmm0, xmm0, ROTR
 	aeskeygenassist xmm1, xmm0, 0
-	pshufd	xmm1, xmm1, 0x93
+	pshufd	xmm1, xmm1, ROTL
 	movd	eax, xmm1
 	xor	eax, [esi]
 	xor	al, [ecx]
@@ -159,7 +167,7 @@ FUNC(rijndael_setup_x86_aesni)
 	cmp	ebx, 7
 	jb	0f
 	movd	xmm0, eax
-	pshufd	xmm0, xmm0, 0x93
+	pshufd	xmm0, xmm0, ROTL
 	aeskeygenassist xmm1, xmm0, 0
 	movd	eax, xmm1
 0:	xor	eax, [esi]
diff --git a/symm/salsa20-x86-sse2.S b/symm/salsa20-x86-sse2.S
index 5a13fd49..7a5bd2a3 100644
--- a/symm/salsa20-x86-sse2.S
+++ b/symm/salsa20-x86-sse2.S
@@ -31,6 +31,14 @@
 #include "asm-common.h"
 
 ///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Immediates for pshufd which rotate the four 32-bit lanes of a vector.
+#define ROTL 0x93
+#define ROT2 0x4e
+#define ROTR 0x39
+
+///--------------------------------------------------------------------------
 /// Main code.
 
 	.arch pentium4
@@ -115,7 +123,7 @@ loop:
 	// d ^= (c + b) <<< 13
 	movdqa	xmm4, xmm2
 	paddd	xmm4, xmm1
-	pshufd	xmm1, xmm1, 0x93
+	pshufd	xmm1, xmm1, ROTL
 	movdqa	xmm5, xmm4
 	pslld	xmm4, 13
 	psrld	xmm5, 19
@@ -124,9 +132,9 @@ loop:
 
 	// a ^= (d + c) <<< 18
 	movdqa	xmm4, xmm3
-	pshufd	xmm3, xmm3, 0x39
+	pshufd	xmm3, xmm3, ROTR
 	paddd	xmm4, xmm2
-	pshufd	xmm2, xmm2, 0x4e
+	pshufd	xmm2, xmm2, ROT2
 	movdqa	xmm5, xmm4
 	pslld	xmm4, 18
 	psrld	xmm5, 14
@@ -170,7 +178,7 @@ loop:
 	// d ^= (c + b) <<< 13
 	movdqa	xmm4, xmm2
 	paddd	xmm4, xmm3
-	pshufd	xmm3, xmm3, 0x93
+	pshufd	xmm3, xmm3, ROTL
 	movdqa	xmm5, xmm4
 	pslld	xmm4, 13
 	psrld	xmm5, 19
@@ -179,9 +187,9 @@ loop:
 
 	// a ^= (d + c) <<< 18
 	movdqa	xmm4, xmm1
-	pshufd	xmm1, xmm1, 0x39
+	pshufd	xmm1, xmm1, ROTR
 	paddd	xmm4, xmm2
-	pshufd	xmm2, xmm2, 0x4e
+	pshufd	xmm2, xmm2, ROT2
 	movdqa	xmm5, xmm4
 	pslld	xmm4, 18
 	psrld	xmm5, 14
@@ -203,43 +211,43 @@ loop:
 	mov	edx, [ebp + 16]
 
 	paddd	xmm0, [esp +  0]
-	pshufd	xmm4, xmm0, 0x39
+	pshufd	xmm4, xmm0, ROTR
 	movd	[edx +  0], xmm0
 
 	paddd	xmm1, [esp + 16]
-	pshufd	xmm5, xmm1, 0x93
+	pshufd	xmm5, xmm1, ROTL
 	movd	[edx + 16], xmm1
 
 	paddd	xmm2, xmm6
-	pshufd	xmm6, xmm2, 0x4e
+	pshufd	xmm6, xmm2, ROT2
 	movd	[edx + 32], xmm2
 
 	paddd	xmm3, xmm7
-	pshufd	xmm7, xmm3, 0x39
+	pshufd	xmm7, xmm3, ROTR
 	movd	[edx + 48], xmm3
 
 	movd	[edx +  4], xmm7
-	pshufd	xmm7, xmm3, 0x4e
+	pshufd	xmm7, xmm3, ROT2
 	movd	[edx + 24], xmm7
-	pshufd	xmm3, xmm3, 0x93
+	pshufd	xmm3, xmm3, ROTL
 	movd	[edx + 44], xmm3
 
 	movd	[edx +  8], xmm6
-	pshufd	xmm6, xmm2, 0x93
+	pshufd	xmm6, xmm2, ROTL
 	movd	[edx + 28], xmm6
-	pshufd	xmm2, xmm2, 0x39
+	pshufd	xmm2, xmm2, ROTR
 	movd	[edx + 52], xmm2
 
 	movd	[edx + 12], xmm5
-	pshufd	xmm5, xmm1, 0x39
+	pshufd	xmm5, xmm1, ROTR
 	movd	[edx + 36], xmm5
-	pshufd	xmm1, xmm1, 0x4e
+	pshufd	xmm1, xmm1, ROT2
 	movd	[edx + 56], xmm1
 
 	movd	[edx + 20], xmm4
-	pshufd	xmm4, xmm0, 0x4e
+	pshufd	xmm4, xmm0, ROT2
 	movd	[edx + 40], xmm4
-	pshufd	xmm0, xmm0, 0x93
+	pshufd	xmm0, xmm0, ROTL
 	movd	[edx + 60], xmm0
 
 	// Tidy things up.
-- 
2.11.0