movdqa SAVE2, xmm2
movdqa SAVE3, xmm3
-loop:
+0:
// Apply a column quarterround to each of the columns simultaneously.
// Alas, there doesn't seem to be a packed doubleword rotate, so we
// have to synthesize it.
// Decrement the loop counter and see if we should go round again.
// Later processors fuse the following SUB/JA pair into a single uop.
sub NR, 2
- ja loop
+ ja 0b
// Almost there. Firstly, the feedforward addition, and then we have
// to write out the result. Here we have to undo the permutation