From 3cb47d2759de69a9e4fdf0030f518ca513b59c7b Mon Sep 17 00:00:00 2001 From: Mark Wooding Date: Tue, 1 Nov 2016 22:38:41 +0000 Subject: [PATCH] symm/salsa20-*.S: Optimize the output permutations. A little analysis, and a lot of trial and error, reveals that the state permutation can be decomposed into some rotations of the rows, a matrix transpose, and another rotation of the rows. These steps can be done moderately efficiently using the Intel and ARM SIMD instructions. --- symm/salsa20-arm-neon.S | 55 +++++++++++++++---------------- symm/salsa20-x86ish-sse2.S | 81 +++++++++++++++++++++------------------------- 2 files changed, 65 insertions(+), 71 deletions(-) diff --git a/symm/salsa20-arm-neon.S b/symm/salsa20-arm-neon.S index f212f2fa..9cb40472 100644 --- a/symm/salsa20-arm-neon.S +++ b/symm/salsa20-arm-neon.S @@ -205,35 +205,36 @@ FUNC(salsa20_core_arm_neon) b 0b - // Almost there. Firstly the feedfoward addition, and then we have - // to write out the result. Here we have to undo the permutation - // which was already applied to the input. -9: vadd.u32 q8, q8, q12 - vadd.u32 q9, q9, q13 - vadd.u32 q10, q10, q14 - vadd.u32 q11, q11, q15 - - vst1.32 {d16[0]}, [r2 :32]! - vst1.32 {d22[1]}, [r2 :32]! - vst1.32 {d21[0]}, [r2 :32]! - vst1.32 {d19[1]}, [r2 :32]! - - vst1.32 {d18[0]}, [r2 :32]! - vst1.32 {d16[1]}, [r2 :32]! - vst1.32 {d23[0]}, [r2 :32]! - vst1.32 {d21[1]}, [r2 :32]! - - vst1.32 {d20[0]}, [r2 :32]! - vst1.32 {d18[1]}, [r2 :32]! - vst1.32 {d17[0]}, [r2 :32]! - vst1.32 {d23[1]}, [r2 :32]! - - vst1.32 {d22[0]}, [r2 :32]! - vst1.32 {d20[1]}, [r2 :32]! - vst1.32 {d19[0]}, [r2 :32]! - vst1.32 {d17[1]}, [r2 :32]! + // Almost there. Firstly the feedforward addition. +9: vadd.u32 q0, q8, q12 // 0, 5, 10, 15 + vadd.u32 q9, q9, q13 // 4, 9, 14, 3 + vadd.u32 q10, q10, q14 // 8, 13, 2, 7 + vadd.u32 q11, q11, q15 // 12, 1, 6, 11 + + // Next we must undo the permutation which was already applied to the + // input. 
This can be done by juggling values in registers, with the + // following fancy footwork: some row rotations, a transpose, and + // some more rotations. + vext.32 q9, q9, q9, #3 // 3, 4, 9, 14 + vext.32 q10, q10, q10, #2 // 2, 7, 8, 13 + vext.32 q11, q11, q11, #1 // 1, 6, 11, 12 + + vzip.32 q0, q10 // 0, 2, 5, 7 + // 10, 8, 15, 13 + vzip.32 q11, q9 // 1, 3, 6, 4 + // 11, 9, 12, 14 + + vzip.32 q0, q11 // 0, 1, 2, 3 + // 5, 6, 7, 4 + vzip.32 q10, q9 // 10, 11, 8, 9 + // 15, 12, 13, 14 + + vext.32 q1, q11, q11, #3 // 4, 5, 6, 7 + vext.32 q2, q10, q10, #2 // 8, 9, 10, 11 + vext.32 q3, q9, q9, #1 // 12, 13, 14, 15 // And with that, we're done. + vstmia r2, {d0-d7} bx r14 ENDFUNC diff --git a/symm/salsa20-x86ish-sse2.S b/symm/salsa20-x86ish-sse2.S index 5fa5b151..a05cb4e4 100644 --- a/symm/salsa20-x86ish-sse2.S +++ b/symm/salsa20-x86ish-sse2.S @@ -254,50 +254,43 @@ FUNC(salsa20_core_x86ish_sse2) sub NR, 2 ja 0b - // Almost there. Firstly, the feedforward addition, and then we have - // to write out the result. Here we have to undo the permutation - // which was already applied to the input. Shuffling has quite high - // latency, so arrange to start a new shuffle into a temporary as - // soon as we've written out the old value. 
- paddd xmm0, SAVE0 - pshufd xmm4, xmm0, 0x39 - movd [OUT + 0], xmm0 - - paddd xmm1, SAVE1 - pshufd xmm5, xmm1, SHUF(2, 1, 0, 3) - movd [OUT + 16], xmm1 - - paddd xmm2, SAVE2 - pshufd xmm6, xmm2, SHUF(1, 0, 3, 2) - movd [OUT + 32], xmm2 - - paddd xmm3, SAVE3 - pshufd xmm7, xmm3, SHUF(0, 3, 2, 1) - movd [OUT + 48], xmm3 - - movd [OUT + 4], xmm7 - pshufd xmm7, xmm3, SHUF(1, 0, 3, 2) - movd [OUT + 24], xmm7 - pshufd xmm3, xmm3, SHUF(2, 1, 0, 3) - movd [OUT + 44], xmm3 - - movd [OUT + 8], xmm6 - pshufd xmm6, xmm2, SHUF(2, 1, 0, 3) - movd [OUT + 28], xmm6 - pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) - movd [OUT + 52], xmm2 - - movd [OUT + 12], xmm5 - pshufd xmm5, xmm1, SHUF(0, 3, 2, 1) - movd [OUT + 36], xmm5 - pshufd xmm1, xmm1, SHUF(1, 0, 3, 2) - movd [OUT + 56], xmm1 - - movd [OUT + 20], xmm4 - pshufd xmm4, xmm0, SHUF(1, 0, 3, 2) - movd [OUT + 40], xmm4 - pshufd xmm0, xmm0, SHUF(2, 1, 0, 3) - movd [OUT + 60], xmm0 + // Almost there. Firstly, the feedforward addition. + paddd xmm0, SAVE0 // 0, 5, 10, 15 + paddd xmm1, SAVE1 // 4, 9, 14, 3 + paddd xmm2, SAVE2 // 8, 13, 2, 7 + paddd xmm3, SAVE3 // 12, 1, 6, 11 + + // Next we must undo the permutation which was already applied to the + // input. This can be done by juggling values in registers, with the + // following fancy footwork: some row rotations, a transpose, and + // some more rotations. 
+ pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 3, 4, 9, 14 + pshufd xmm2, xmm2, SHUF(1, 0, 3, 2) // 2, 7, 8, 13 + pshufd xmm3, xmm3, SHUF(0, 3, 2, 1) // 1, 6, 11, 12 + + movdqa xmm4, xmm0 + movdqa xmm5, xmm3 + punpckldq xmm0, xmm2 // 0, 2, 5, 7 + punpckldq xmm3, xmm1 // 1, 3, 6, 4 + punpckhdq xmm4, xmm2 // 10, 8, 15, 13 + punpckhdq xmm5, xmm1 // 11, 9, 12, 14 + + movdqa xmm1, xmm0 + movdqa xmm2, xmm4 + punpckldq xmm0, xmm3 // 0, 1, 2, 3 + punpckldq xmm4, xmm5 // 10, 11, 8, 9 + punpckhdq xmm1, xmm3 // 5, 6, 7, 4 + punpckhdq xmm2, xmm5 // 15, 12, 13, 14 + + pshufd xmm1, xmm1, SHUF(2, 1, 0, 3) // 4, 5, 6, 7 + pshufd xmm4, xmm4, SHUF(1, 0, 3, 2) // 8, 9, 10, 11 + pshufd xmm2, xmm2, SHUF(0, 3, 2, 1) // 12, 13, 14, 15 + + // Finally we have to write out the result. + movdqu [OUT + 0], xmm0 + movdqu [OUT + 16], xmm1 + movdqu [OUT + 32], xmm4 + movdqu [OUT + 48], xmm2 // Tidy things up. #if CPUFAM_X86 -- 2.11.0