/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha for ARM
///
/// (c) 2016 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv7-a
	.fpu	neon

	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_arm_neon)

	// Arguments are in registers.
	// r0 is the number of rounds to perform
	// r1 points to the input matrix
	// r2 points to the output matrix

	// First job is to slurp the matrix into the SIMD registers.  vldm
	// and vstm work on word-aligned data, so this is fine.
	//
	// [ 0  1  2  3] (a, q8)
	// [ 4  5  6  7] (b, q9)
	// [ 8  9 10 11] (c, q10)
	// [12 13 14 15] (d, q11)
	//
	// We need a copy for later.  Rather than waste time copying them by
	// hand, we'll use the three-address nature of the instruction set.
	// But this means that the main loop is offset by a bit.
	vldmia	r1, {QQ(q12, q15)}

	// a += b; d ^= a; d <<<= 16
	vadd.u32 q8, q12, q13
	veor	q11, q15, q8
	vrev32.16 q11, q11

	// c += d; b ^= c; b <<<= 12
	vadd.u32 q10, q14, q11
	veor	q0, q13, q10
	vshl.u32 q9, q0, #12
	vsri.u32 q9, q0, #20

0:	// Apply (the rest of) a column quarterround to each of the columns
	// simultaneously.  Alas, there doesn't seem to be a packed word
	// rotate, so we have to synthesize it.

	// a += b; d ^= a; d <<<= 8
	vadd.u32 q8, q8, q9
	veor	q0, q11, q8
	vshl.u32 q11, q0, #8
	vsri.u32 q11, q0, #24

	// c += d; b ^= c; b <<<= 7
	vadd.u32 q10, q10, q11
	vext.32	q11, q11, q11, #3
	veor	q0, q9, q10
	vext.32	q10, q10, q10, #2
	vshl.u32 q9, q0, #7
	vsri.u32 q9, q0, #25

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily.  It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	// [ 0  1  2  3]     [ 0  1  2  3] (a, q8)
	// [ 4  5  6  7] --> [ 5  6  7  4] (b, q9)
	// [ 8  9 10 11]     [10 11  8  9] (c, q10)
	// [12 13 14 15]     [15 12 13 14] (d, q11)
	//
	// The reorderings have for the most part been pushed upwards to
	// reduce delays.
	vext.32	q9, q9, q9, #1

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.
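	//
	// For reference, each four-instruction group above and below
	// implements one step of the standard ChaCha quarterround.  As a
	// C-like sketch (ROTL32 being an assumed rotate-left helper, not
	// anything defined in this file), one whole quarterround on words
	// a, b, c, d is:
	//
	//	a += b; d ^= a; d = ROTL32(d, 16);
	//	c += d; b ^= c; b = ROTL32(b, 12);
	//	a += b; d ^= a; d = ROTL32(d,  8);
	//	c += d; b ^= c; b = ROTL32(b,  7);
	//
	// Here each of a, b, c, d is a whole row held in a q register, so
	// the four quarterrounds of a round run in parallel.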
	// a += b; d ^= a; d <<<= 16
	vadd.u32 q8, q8, q9
	veor	q11, q11, q8
	vrev32.16 q11, q11

	// c += d; b ^= c; b <<<= 12
	vadd.u32 q10, q10, q11
	veor	q0, q9, q10
	vshl.u32 q9, q0, #12
	vsri.u32 q9, q0, #20

	// a += b; d ^= a; d <<<= 8
	vadd.u32 q8, q8, q9
	veor	q0, q11, q8
	vshl.u32 q11, q0, #8
	vsri.u32 q11, q0, #24

	// c += d; b ^= c; b <<<= 7
	vadd.u32 q10, q10, q11
	vext.32	q11, q11, q11, #1
	veor	q0, q9, q10
	vext.32	q10, q10, q10, #2
	vshl.u32 q9, q0, #7
	vsri.u32 q9, q0, #25

	// Finally finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the reorderings.
	vext.32	q9, q9, q9, #3

	// Decrement the loop counter and see if we should go round again.
	subs	r0, r0, #2
	bls	9f

	// Do the first part of the next round because this loop is offset.

	// a += b; d ^= a; d <<<= 16
	vadd.u32 q8, q8, q9
	veor	q11, q11, q8
	vrev32.16 q11, q11

	// c += d; b ^= c; b <<<= 12
	vadd.u32 q10, q10, q11
	veor	q0, q9, q10
	vshl.u32 q9, q0, #12
	vsri.u32 q9, q0, #20

	b	0b

	// Almost there.  Firstly the feedforward addition.
9:	vadd.u32 q8, q8, q12
	vadd.u32 q9, q9, q13
	vadd.u32 q10, q10, q14
	vadd.u32 q11, q11, q15

	// And now we write out the result.
	vstmia	r2, {QQ(q8, q11)}

	// And with that, we're done.
	bx	r14

ENDFUNC

///----- That's all, folks --------------------------------------------------
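
///----- Usage note (illustrative) -------------------------------------------
///
/// A minimal sketch of driving this routine from C, assuming the prototype
/// implied by the register comments above (the authoritative declaration
/// lives in Catacomb's headers, not here):
///
///	extern void chacha_core_arm_neon(unsigned nr, const uint32_t *in,
///					 uint32_t *out);
///
///	uint32_t in[16], out[16];
///	/* ... fill in[] with constants, key, counter, and nonce ... */
///	chacha_core_arm_neon(20, in, out);	/* 20 rounds for ChaCha20 */
///
/// Pass the total round count in r0 (e.g., 20); the loop above consumes two
/// rounds per iteration.  Both matrices must be word-aligned, since vldmia
/// and vstmia are used to load and store them.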