X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/0f23f75ff53acadf80e9d3dfd2dfd14cb526074f..729a52ce3a79c1edd4d72334660a5dd80998b3ef:/symm/salsa20-x86ish-sse2.S diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S index a168d79a..47401b7a 100644 --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@ -42,7 +42,7 @@ /// Main code. .arch pentium4 - .section .text + .text FUNC(salsa20_core_x86ish_sse2) @@ -146,13 +146,13 @@ FUNC(salsa20_core_x86ish_sse2) movdqu xmm2, [IN + 32] movdqu xmm3, [IN + 48] - ## Take a copy for later. + // Take a copy for later. movdqa SAVE0, xmm0 movdqa SAVE1, xmm1 movdqa SAVE2, xmm2 movdqa SAVE3, xmm3 -loop: +0: // Apply a column quarterround to each of the columns simultaneously. // Alas, there doesn't seem to be a packed doubleword rotate, so we // have to synthesize it. @@ -256,7 +256,7 @@ loop: // Decrement the loop counter and see if we should go round again. // Later processors fuse this pair into a single uop. sub NR, 2 - ja loop + ja 0b // Almost there. Firstly, the feedforward addition, and then we have // to write out the result. Here we have to undo the permutation