#include "asm-common.h"
///--------------------------------------------------------------------------
+/// Local utilities.
+
+// Magic constants for shuffling: `pshufd' immediates; each 2-bit field, low bits first, picks the source lane for destination lanes 0..3.
+#define ROTL 0x93 // sources (3, 0, 1, 2): every element moves up one lane, lane 3 wraps to lane 0
+#define ROT2 0x4e // sources (2, 3, 0, 1): rotate by two lanes, i.e. swap the low and high 64-bit halves
+#define ROTR 0x39 // sources (1, 2, 3, 0): every element moves down one lane, lane 0 wraps to lane 3
+
+///--------------------------------------------------------------------------
/// Main code.
.arch pentium4
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, 0x93
+ pshufd xmm3, xmm3, ROTL
pxor xmm1, xmm2
- pshufd xmm2, xmm2, 0x4e
+ pshufd xmm2, xmm2, ROT2
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
//
// The shuffles have quite high latency, so they've mostly been
// pushed upwards. The remaining one can't be moved, though.
- pshufd xmm1, xmm1, 0x39
+ pshufd xmm1, xmm1, ROTR
// Apply the diagonal quarterround to each of the columns
// simultaneously.
// c += d; b ^= c; b <<<= 7
paddd xmm2, xmm3
- pshufd xmm3, xmm3, 0x39
+ pshufd xmm3, xmm3, ROTR
pxor xmm1, xmm2
- pshufd xmm2, xmm2, 0x4e
+ pshufd xmm2, xmm2, ROT2
movdqa xmm4, xmm1
pslld xmm1, 7
psrld xmm4, 25
// Finally, finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// to wait for the shuffles.
- pshufd xmm1, xmm1, 0x93
+ pshufd xmm1, xmm1, ROTL
// Decrement the loop counter and see if we should go round again.
sub ecx, 2