From 704d59c80bb5cc3a56016e6733eb0a16fbff0bab Mon Sep 17 00:00:00 2001
From: Mark Wooding
Date: Wed, 18 May 2016 10:29:03 +0100
Subject: [PATCH] symm/: Add ARM NEON implementations of ChaCha and Salsa20.

---
 symm/Makefile.am        |   6 ++
 symm/chacha-arm-neon.S  | 183 ++++++++++++++++++++++++++++++++++++
 symm/chacha.c           |   8 ++
 symm/salsa20-arm-neon.S | 241 ++++++++++++++++++++++++++++++++++++++++++++++++
 symm/salsa20.c          |   8 ++
 5 files changed, 446 insertions(+)
 create mode 100644 symm/chacha-arm-neon.S
 create mode 100644 symm/salsa20-arm-neon.S

diff --git a/symm/Makefile.am b/symm/Makefile.am
index 1d3374f5..e56b2a43 100644
--- a/symm/Makefile.am
+++ b/symm/Makefile.am
@@ -400,6 +400,9 @@ endif
 if CPUFAM_AMD64
 libsymm_la_SOURCES += salsa20-x86ish-sse2.S
 endif
+if CPUFAM_ARMEL
+libsymm_la_SOURCES += salsa20-arm-neon.S
+endif
 TESTS += salsa20.t$(EXEEXT)
 ALL_CIPHERS += salsa20 salsa2012 salsa208
 ALL_CIPHERS += xsalsa20 xsalsa2012 xsalsa208
@@ -432,6 +435,9 @@ endif
 if CPUFAM_AMD64
 libsymm_la_SOURCES += chacha-x86ish-sse2.S
 endif
+if CPUFAM_ARMEL
+libsymm_la_SOURCES += chacha-arm-neon.S
+endif
 TESTS += chacha.t$(EXEEXT)
 EXTRA_DIST += t/chacha
 ALL_CIPHERS += chacha20 chacha12 chacha8
diff --git a/symm/chacha-arm-neon.S b/symm/chacha-arm-neon.S
new file mode 100644
index 00000000..5fb0073d
--- /dev/null
+++ b/symm/chacha-arm-neon.S
@@ -0,0 +1,183 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of ChaCha for ARM
+///
+/// (c) 2016 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+        .arch armv7-a
+        .fpu neon
+        .section .text
+
+FUNC(chacha_core_arm_neon)
+
+        // Arguments are in registers.
+        // r0 is the number of rounds to perform
+        // r1 points to the input matrix
+        // r2 points to the output matrix
+
+        // First job is to slurp the matrix into the SIMD registers. vldm
+        // and vstm work on word-aligned data, so this is fine.
+        //
+        //      [ 0  1  2  3] (a, q8)
+        //      [ 4  5  6  7] (b, q9)
+        //      [ 8  9 10 11] (c, q10)
+        //      [12 13 14 15] (d, q11)
+        //
+        // We need a copy for later. Rather than waste time copying them by
+        // hand, we'll use the three-address nature of the instruction set.
+        // But this means that the main loop is offset by a bit.
+        vldmia r1, {d24-d31}
+
+        // a += b; d ^= a; d <<<= 16
+        vadd.u32 q8, q12, q13
+        veor q11, q15, q8
+        vshl.u32 q0, q11, #16
+        vshr.u32 q11, q11, #16
+        vorr q11, q11, q0
+
+        // c += d; b ^= c; b <<<= 12
+        vadd.u32 q10, q14, q11
+        veor q9, q13, q10
+        vshl.u32 q0, q9, #12
+        vshr.u32 q9, q9, #20
+        vorr q9, q9, q0
+
+0:
+        // Apply (the rest of) a column quarterround to each of the columns
+        // simultaneously. Alas, there doesn't seem to be a packed word
+        // rotate, so we have to synthesize it.
+
+        // a += b; d ^= a; d <<<= 8
+        vadd.u32 q8, q8, q9
+        veor q11, q11, q8
+        vshl.u32 q0, q11, #8
+        vshr.u32 q11, q11, #24
+        vorr q11, q11, q0
+
+        // c += d; b ^= c; b <<<= 7
+        vadd.u32 q10, q10, q11
+        vext.32 q11, q11, q11, #3
+        veor q9, q9, q10
+        vext.32 q10, q10, q10, #2
+        vshl.u32 q0, q9, #7
+        vshr.u32 q9, q9, #25
+        vorr q9, q9, q0
+
+        // The not-quite-transpose conveniently only involves reordering
+        // elements of individual rows, which can be done quite easily. It
+        // doesn't involve any movement of elements between rows, or even
+        // renaming of the rows.
+        //
+        //      [ 0  1  2  3]          [ 0  1  2  3] (a, q8)
+        //      [ 4  5  6  7]   -->    [ 5  6  7  4] (b, q9)
+        //      [ 8  9 10 11]          [10 11  8  9] (c, q10)
+        //      [12 13 14 15]          [15 12 13 14] (d, q11)
+        //
+        // The reorderings have for the most part been pushed upwards to
+        // reduce delays.
+        vext.32 q9, q9, q9, #1
+
+        // Apply the diagonal quarterround to each of the columns
+        // simultaneously.
+
+        // a += b; d ^= a; d <<<= 16
+        vadd.u32 q8, q8, q9
+        veor q11, q11, q8
+        vshl.u32 q0, q11, #16
+        vshr.u32 q11, q11, #16
+        vorr q11, q11, q0
+
+        // c += d; b ^= c; b <<<= 12
+        vadd.u32 q10, q10, q11
+        veor q9, q9, q10
+        vshl.u32 q0, q9, #12
+        vshr.u32 q9, q9, #20
+        vorr q9, q9, q0
+
+        // a += b; d ^= a; d <<<= 8
+        vadd.u32 q8, q8, q9
+        veor q11, q11, q8
+        vshl.u32 q0, q11, #8
+        vshr.u32 q11, q11, #24
+        vorr q11, q11, q0
+
+        // c += d; b ^= c; b <<<= 7
+        vadd.u32 q10, q10, q11
+        vext.32 q11, q11, q11, #1
+        veor q9, q9, q10
+        vext.32 q10, q10, q10, #2
+        vshl.u32 q0, q9, #7
+        vshr.u32 q9, q9, #25
+        vorr q9, q9, q0
+
+        // Finally finish off undoing the transpose, and we're done for this
+        // doubleround. Again, most of this was done above so we don't have
+        // to wait for the reorderings.
+        vext.32 q9, q9, q9, #3
+
+        // Decrement the loop counter and see if we should go round again.
+        subs r0, r0, #2
+        bls 9f
+
+        // Do the first part of the next round because this loop is offset.
+
+        // a += b; d ^= a; d <<<= 16
+        vadd.u32 q8, q8, q9
+        veor q11, q11, q8
+        vshl.u32 q0, q11, #16
+        vshr.u32 q11, q11, #16
+        vorr q11, q11, q0
+
+        // c += d; b ^= c; b <<<= 12
+        vadd.u32 q10, q10, q11
+        veor q9, q9, q10
+        vshl.u32 q0, q9, #12
+        vshr.u32 q9, q9, #20
+        vorr q9, q9, q0
+
+        b 0b
+
+        // Almost there. Firstly the feedforward addition.
+9:      vadd.u32 q8, q8, q12
+        vadd.u32 q9, q9, q13
+        vadd.u32 q10, q10, q14
+        vadd.u32 q11, q11, q15
+
+        // And now we write out the result.
+        vstmia r2, {d16-d23}
+
+        // And with that, we're done.
+        bx r14
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
diff --git a/symm/chacha.c b/symm/chacha.c
index 0c8aa003..cb879e3c 100644
--- a/symm/chacha.c
+++ b/symm/chacha.c
@@ -76,12 +76,20 @@ static void simple_core(unsigned r, const chacha_matrix src,
 extern core__functype chacha_core_x86ish_sse2;
 #endif
 
+#if CPUFAM_ARMEL
+extern core__functype chacha_core_arm_neon;
+#endif
+
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
   DISPATCH_PICK_COND(chacha_core, chacha_core_x86ish_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(chacha_core, chacha_core_arm_neon,
+                     cpu_feature_p(CPUFEAT_ARM_NEON));
+#endif
   DISPATCH_PICK_FALLBACK(chacha_core, simple_core);
 }
 
diff --git a/symm/salsa20-arm-neon.S b/symm/salsa20-arm-neon.S
new file mode 100644
index 00000000..15585e47
--- /dev/null
+++ b/symm/salsa20-arm-neon.S
@@ -0,0 +1,241 @@
+/// -*- mode: asm; asm-comment-char: ?/ -*-
+///
+/// Fancy SIMD implementation of Salsa20 for ARM
+///
+/// (c) 2016 Straylight/Edgeware
+///
+
+///----- Licensing notice ---------------------------------------------------
+///
+/// This file is part of Catacomb.
+///
+/// Catacomb is free software; you can redistribute it and/or modify
+/// it under the terms of the GNU Library General Public License as
+/// published by the Free Software Foundation; either version 2 of the
+/// License, or (at your option) any later version.
+///
+/// Catacomb is distributed in the hope that it will be useful,
+/// but WITHOUT ANY WARRANTY; without even the implied warranty of
+/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+/// GNU Library General Public License for more details.
+///
+/// You should have received a copy of the GNU Library General Public
+/// License along with Catacomb; if not, write to the Free
+/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
+/// MA 02111-1307, USA.
+
+///--------------------------------------------------------------------------
+/// External definitions.
+
+#include "config.h"
+#include "asm-common.h"
+
+///--------------------------------------------------------------------------
+/// Main code.
+
+        .arch armv7-a
+        .fpu neon
+        .section .text
+
+FUNC(salsa20_core_arm_neon)
+
+        // Arguments are in registers.
+        // r0 is the number of rounds to perform
+        // r1 points to the input matrix
+        // r2 points to the output matrix
+
+        // First job is to slurp the matrix into the SIMD registers. The
+        // words have already been permuted conveniently to make them line up
+        // better for SIMD processing.
+        //
+        // The textbook arrangement of the matrix is this.
+        //
+        //      [C K K K]
+        //      [K C N N]
+        //      [T T C K]
+        //      [K K K C]
+        //
+        // But we've rotated the columns up so that the main diagonal with
+        // the constants on it ends up in the first row, giving something
+        // more like
+        //
+        //      [C C C C]
+        //      [K T K K]
+        //      [T K K N]
+        //      [K K N K]
+        //
+        // so the transformation looks like this:
+        //
+        //      [ 0  1  2  3]          [ 0  5 10 15] (a, q8)
+        //      [ 4  5  6  7]   -->    [ 4  9 14  3] (b, q9)
+        //      [ 8  9 10 11]          [ 8 13  2  7] (c, q10)
+        //      [12 13 14 15]          [12  1  6 11] (d, q11)
+        //
+        // We need a copy for later. Rather than waste time copying them by
+        // hand, we'll use the three-address nature of the instruction set.
+        // But this means that the main loop is offset by a bit.
+        vldmia r1, {d24-d31}
+
+        // Apply a column quarterround to each of the columns simultaneously,
+        // moving the results to their working registers. Alas, there
+        // doesn't seem to be a packed word rotate, so we have to synthesize
+        // it.
+
+        // b ^= (a + d) <<< 7
+        vadd.u32 q0, q12, q15
+        vshl.u32 q1, q0, #7
+        vshr.u32 q0, q0, #25
+        vorr q0, q0, q1
+        veor q9, q13, q0
+
+        // c ^= (b + a) <<< 9
+        vadd.u32 q0, q9, q12
+        vshl.u32 q1, q0, #9
+        vshr.u32 q0, q0, #23
+        vorr q0, q0, q1
+        veor q10, q14, q0
+
+        // d ^= (c + b) <<< 13
+        vadd.u32 q0, q10, q9
+        vext.32 q9, q9, q9, #3
+        vshl.u32 q1, q0, #13
+        vshr.u32 q0, q0, #19
+        vorr q0, q0, q1
+        veor q11, q15, q0
+
+        // a ^= (d + c) <<< 18
+        vadd.u32 q0, q11, q10
+        vext.32 q10, q10, q10, #2
+        vext.32 q11, q11, q11, #1
+        vshl.u32 q1, q0, #18
+        vshr.u32 q0, q0, #14
+        vorr q0, q0, q1
+        veor q8, q12, q0
+
+0:
+        // The transpose conveniently only involves reordering elements of
+        // individual rows, which can be done quite easily, and reordering
+        // the rows themselves, which is a trivial renaming. It doesn't
+        // involve any movement of elements between rows.
+        //
+        //      [ 0  5 10 15]          [ 0  5 10 15] (a, q8)
+        //      [ 4  9 14  3]   -->    [ 1  6 11 12] (b, q11)
+        //      [ 8 13  2  7]          [ 2  7  8 13] (c, q10)
+        //      [12  1  6 11]          [ 3  4  9 14] (d, q9)
+        //
+        // The reorderings have been pushed upwards to reduce delays.
+
+        // Apply the row quarterround to each of the columns (yes!)
+        // simultaneously.
+
+        // b ^= (a + d) <<< 7
+        vadd.u32 q0, q8, q9
+        vshl.u32 q1, q0, #7
+        vshr.u32 q0, q0, #25
+        vorr q0, q0, q1
+        veor q11, q11, q0
+
+        // c ^= (b + a) <<< 9
+        vadd.u32 q0, q11, q8
+        vshl.u32 q1, q0, #9
+        vshr.u32 q0, q0, #23
+        vorr q0, q0, q1
+        veor q10, q10, q0
+
+        // d ^= (c + b) <<< 13
+        vadd.u32 q0, q10, q11
+        vext.32 q11, q11, q11, #3
+        vshl.u32 q1, q0, #13
+        vshr.u32 q0, q0, #19
+        vorr q0, q0, q1
+        veor q9, q9, q0
+
+        // a ^= (d + c) <<< 18
+        vadd.u32 q0, q9, q10
+        vext.32 q10, q10, q10, #2
+        vext.32 q9, q9, q9, #1
+        vshl.u32 q1, q0, #18
+        vshr.u32 q0, q0, #14
+        vorr q0, q0, q1
+        veor q8, q8, q0
+
+        // We have to undo the transpose ready for the next loop. Again, the
+        // reorderings have been pushed upwards to reduce latency. Decrement
+        // the loop counter and see if we should go round again.
+        subs r0, r0, #2
+        bls 9f
+
+        // Do the first half of the next round because this loop is offset.
+
+        // b ^= (a + d) <<< 7
+        vadd.u32 q0, q8, q11
+        vshl.u32 q1, q0, #7
+        vshr.u32 q0, q0, #25
+        vorr q0, q0, q1
+        veor q9, q9, q0
+
+        // c ^= (b + a) <<< 9
+        vadd.u32 q0, q9, q8
+        vshl.u32 q1, q0, #9
+        vshr.u32 q0, q0, #23
+        vorr q0, q0, q1
+        veor q10, q10, q0
+
+        // d ^= (c + b) <<< 13
+        vadd.u32 q0, q10, q9
+        vext.32 q9, q9, q9, #3
+        vshl.u32 q1, q0, #13
+        vshr.u32 q0, q0, #19
+        vorr q0, q0, q1
+        veor q11, q11, q0
+
+        // a ^= (d + c) <<< 18
+        vadd.u32 q0, q11, q10
+        vext.32 q10, q10, q10, #2
+        vext.32 q11, q11, q11, #1
+        vshl.u32 q1, q0, #18
+        vshr.u32 q0, q0, #14
+        vorr q0, q0, q1
+        veor q8, q8, q0
+
+        b 0b
+
+        // Almost there. Firstly the feedforward addition, and then we have
+        // to write out the result. Here we have to undo the permutation
+        // which was already applied to the input.
+9:      vadd.u32 q8, q8, q12
+        vadd.u32 q9, q9, q13
+        vadd.u32 q10, q10, q14
+        vadd.u32 q11, q11, q15
+
+        vst1.32 {d16[0]}, [r2]!
+        vst1.32 {d22[1]}, [r2]!
+        vst1.32 {d21[0]}, [r2]!
+        vst1.32 {d19[1]}, [r2]!
+
+        vst1.32 {d18[0]}, [r2]!
+        vst1.32 {d16[1]}, [r2]!
+        vst1.32 {d23[0]}, [r2]!
+        vst1.32 {d21[1]}, [r2]!
+
+        vst1.32 {d20[0]}, [r2]!
+        vst1.32 {d18[1]}, [r2]!
+        vst1.32 {d17[0]}, [r2]!
+        vst1.32 {d23[1]}, [r2]!
+
+        vst1.32 {d22[0]}, [r2]!
+        vst1.32 {d20[1]}, [r2]!
+        vst1.32 {d19[0]}, [r2]!
+        vst1.32 {d17[1]}, [r2]!
+
+        // And with that, we're done.
+        bx r14
+
+ENDFUNC
+
+///----- That's all, folks --------------------------------------------------
diff --git a/symm/salsa20.c b/symm/salsa20.c
index 40f28fc0..0afad2cb 100644
--- a/symm/salsa20.c
+++ b/symm/salsa20.c
@@ -56,12 +56,20 @@ static void simple_core(unsigned r, const salsa20_matrix src,
 extern core__functype salsa20_core_x86ish_sse2;
 #endif
 
+#if CPUFAM_ARMEL
+extern core__functype salsa20_core_arm_neon;
+#endif
+
 static core__functype *pick_core(void)
 {
 #if CPUFAM_X86 || CPUFAM_AMD64
   DISPATCH_PICK_COND(salsa20_core, salsa20_core_x86ish_sse2,
                      cpu_feature_p(CPUFEAT_X86_SSE2));
 #endif
+#if CPUFAM_ARMEL
+  DISPATCH_PICK_COND(salsa20_core, salsa20_core_arm_neon,
+                     cpu_feature_p(CPUFEAT_ARM_NEON));
+#endif
   DISPATCH_PICK_FALLBACK(salsa20_core, simple_core);
 }
 
-- 
2.11.0
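
For reference, each block of NEON instructions in chacha-arm-neon.S computes, lane by lane, the standard ChaCha quarterround. Below is a minimal scalar sketch in C; the names rol32 and chacha_quarterround are illustrative only and are not Catacomb identifiers. The rotate is synthesized from a shift pair and an OR, exactly as the assembly does with vshl.u32, vshr.u32 and vorr, since NEON has no packed word-rotate instruction.

#include <stdint.h>

/* Rotate left by n bits (0 < n < 32), built from a shift pair and an OR,
 * mirroring the vshl.u32/vshr.u32/vorr sequence in the assembly. */
static inline uint32_t rol32(uint32_t x, unsigned n)
  { return (x << n) | (x >> (32 - n)); }

/* One ChaCha quarterround on four words.  The NEON code applies this to
 * whole rows q8..q11 at once, so each pass does four quarterrounds: first
 * down the columns, then down the diagonals selected by the vext row
 * rotations. */
static void chacha_quarterround(uint32_t *a, uint32_t *b,
                                uint32_t *c, uint32_t *d)
{
  *a += *b; *d ^= *a; *d = rol32(*d, 16);
  *c += *d; *b ^= *c; *b = rol32(*b, 12);
  *a += *b; *d ^= *a; *d = rol32(*d,  8);
  *c += *d; *b ^= *c; *b = rol32(*b,  7);
}

A column pass followed by a diagonal pass is one doubleround, which is why the loop counter r0 is decremented by 2 on each trip round the loop.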
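
The permuted Salsa20 input layout described in the comments of salsa20-arm-neon.S can also be written down as an index table. This is a sketch for illustration only; the array salsa20_perm below is hypothetical and does not exist in the library.

/* Indices of the textbook Salsa20 matrix as they appear, row by row, in
 * the permuted input that the NEON code loads with vldmia: each column
 * has been rotated upwards so that the constant diagonal lands in the
 * first row.  (Hypothetical table, for illustration only.) */
static const unsigned char salsa20_perm[16] = {
   0,  5, 10, 15,              /* a (q8): the constant diagonal */
   4,  9, 14,  3,              /* b (q9)  */
   8, 13,  2,  7,              /* c (q10) */
  12,  1,  6, 11,              /* d (q11) */
};

The scattered vst1.32 lane stores at the end of the function step the output pointer through the inverse of this permutation, so, as the comment before the feedforward addition says, the result is written out in ordinary word order.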
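
Finally, the Salsa20 quarterround itself, again as a scalar C sketch with illustrative names (salsa20_quarterround and rol32 are not Catacomb identifiers). Each step XORs into one word a rotated sum of the two most recently updated words, matching the `b ^= (a + d) <<< 7' style comments in the assembly.

#include <stdint.h>

/* Rotate left by n bits (0 < n < 32). */
static inline uint32_t rol32(uint32_t x, unsigned n)
  { return (x << n) | (x >> (32 - n)); }

/* One Salsa20 quarterround.  The NEON code applies this to four word
 * quadruples at once: down the columns of the permuted matrix, and then,
 * after the vext reorderings, along its rows, which the reordering turns
 * back into column-wise SIMD operations. */
static void salsa20_quarterround(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
  *b ^= rol32(*a + *d,  7);
  *c ^= rol32(*b + *a,  9);
  *d ^= rol32(*c + *b, 13);
  *a ^= rol32(*d + *c, 18);
}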