+ movdqa xmm4, xmm0
+ movdqa xmm5, xmm3
+ punpckldq xmm0, xmm2 // 0, 2, 5, 7
+ punpckldq xmm3, xmm1 // 1, 3, 6, 4
+ punpckhdq xmm4, xmm2 // 10, 8, 15, 13
+ punpckhdq xmm5, xmm1 // 11, 9, 12, 14
+
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm4
+ punpckldq xmm0, xmm3 // 0, 1, 2, 3
+ punpckldq xmm4, xmm5 // 10, 11, 8, 9
+ punpckhdq xmm1, xmm3 // 5, 6, 7, 4
+ punpckhdq xmm2, xmm5 // 15, 12, 13, 14
+
+ pshufd xmm1, xmm1, SHUF(3, 0, 1, 2) // 4, 5, 6, 7
+ pshufd xmm4, xmm4, SHUF(2, 3, 0, 1) // 8, 9, 10, 11
+ pshufd xmm2, xmm2, SHUF(1, 2, 3, 0) // 12, 13, 14, 15
+
+ // Finally we have to write out the result.
+ movdqu [OUT + 0], xmm0
+ movdqu [OUT + 16], xmm1
+ movdqu [OUT + 32], xmm4
+ movdqu [OUT + 48], xmm2
+
+ // Tidy things up.