/// -*- mode: asm; asm-comment-char: ?/ -*-
///
/// Fancy SIMD implementation of ChaCha for AArch64
///
/// (c) 2018 Straylight/Edgeware
///

///----- Licensing notice ---------------------------------------------------
///
/// This file is part of Catacomb.
///
/// Catacomb is free software; you can redistribute it and/or modify
/// it under the terms of the GNU Library General Public License as
/// published by the Free Software Foundation; either version 2 of the
/// License, or (at your option) any later version.
///
/// Catacomb is distributed in the hope that it will be useful,
/// but WITHOUT ANY WARRANTY; without even the implied warranty of
/// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
/// GNU Library General Public License for more details.
///
/// You should have received a copy of the GNU Library General Public
/// License along with Catacomb; if not, write to the Free
/// Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
/// MA 02111-1307, USA.

///--------------------------------------------------------------------------
/// Preliminaries.

#include "config.h"
#include "asm-common.h"

	.arch	armv8-a

	.text

///--------------------------------------------------------------------------
/// Main code.

FUNC(chacha_core_arm64)

	// Arguments are in registers.
	// w0 is the number of rounds to perform
	// x1 points to the input matrix
	// x2 points to the output matrix

	// First job is to slurp the matrix into the SIMD registers.
	//
	//	[ 0  1  2  3] (a, v4)
	//	[ 4  5  6  7] (b, v5)
	//	[ 8  9 10 11] (c, v6)
	//	[12 13 14 15] (d, v7)
	//
	// We need a copy for later.  Rather than waste time copying them by
	// hand, we'll use the three-address nature of the instruction set.
	// But this means that the main loop is offset by a bit.
	ld1	{v0.4s-v3.4s}, [x1]

	// a += b; d ^= a; d <<<= 16
	add	v4.4s, v0.4s, v1.4s
	eor	v7.16b, v3.16b, v4.16b
	shl	v16.4s, v7.4s, #16
	ushr	v7.4s, v7.4s, #16
	orr	v7.16b, v7.16b, v16.16b

	// c += d; b ^= c; b <<<= 12
	add	v6.4s, v2.4s, v7.4s
	eor	v5.16b, v1.16b, v6.16b
	shl	v16.4s, v5.4s, #12
	ushr	v5.4s, v5.4s, #20
	orr	v5.16b, v5.16b, v16.16b

0:	// Apply (the rest of) a column quarterround to each of the columns
	// simultaneously.  Alas, there doesn't seem to be a packed word
	// rotate, so we have to synthesize it.

	// a += b; d ^= a; d <<<= 8
	add	v4.4s, v4.4s, v5.4s
	eor	v7.16b, v7.16b, v4.16b
	shl	v16.4s, v7.4s, #8
	ushr	v7.4s, v7.4s, #24
	orr	v7.16b, v7.16b, v16.16b

	// c += d; b ^= c; b <<<= 7
	add	v6.4s, v6.4s, v7.4s
	ext	v7.16b, v7.16b, v7.16b, #12
	eor	v5.16b, v5.16b, v6.16b
	ext	v6.16b, v6.16b, v6.16b, #8
	shl	v16.4s, v5.4s, #7
	ushr	v5.4s, v5.4s, #25
	orr	v5.16b, v5.16b, v16.16b

	// The not-quite-transpose conveniently only involves reordering
	// elements of individual rows, which can be done quite easily.  It
	// doesn't involve any movement of elements between rows, or even
	// renaming of the rows.
	//
	//	[ 0  1  2  3]		[ 0  1  2  3] (a, v4)
	//	[ 4  5  6  7]    -->	[ 5  6  7  4] (b, v5)
	//	[ 8  9 10 11]		[10 11  8  9] (c, v6)
	//	[12 13 14 15]		[15 12 13 14] (d, v7)
	//
	// The reorderings have for the most part been pushed upwards to
	// reduce delays.
	ext	v5.16b, v5.16b, v5.16b, #4
	sub	w0, w0, #2

	// Apply the diagonal quarterround to each of the columns
	// simultaneously.
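	//
	// (Because rows b, c and d were rotated left by one, two and three
	// elements respectively, lane 0 of (v4, v5, v6, v7) now holds matrix
	// elements (0, 5, 10, 15), lane 1 holds (1, 6, 11, 12), and so on:
	// precisely the diagonals.  So exactly the same lane-wise code as
	// above does the job.)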
	// a += b; d ^= a; d <<<= 16
	add	v4.4s, v4.4s, v5.4s
	eor	v7.16b, v7.16b, v4.16b
	shl	v16.4s, v7.4s, #16
	ushr	v7.4s, v7.4s, #16
	orr	v7.16b, v7.16b, v16.16b

	// c += d; b ^= c; b <<<= 12
	add	v6.4s, v6.4s, v7.4s
	eor	v5.16b, v5.16b, v6.16b
	shl	v16.4s, v5.4s, #12
	ushr	v5.4s, v5.4s, #20
	orr	v5.16b, v5.16b, v16.16b

	// a += b; d ^= a; d <<<= 8
	add	v4.4s, v4.4s, v5.4s
	eor	v7.16b, v7.16b, v4.16b
	shl	v16.4s, v7.4s, #8
	ushr	v7.4s, v7.4s, #24
	orr	v7.16b, v7.16b, v16.16b

	// c += d; b ^= c; b <<<= 7
	add	v6.4s, v6.4s, v7.4s
	ext	v7.16b, v7.16b, v7.16b, #4
	eor	v5.16b, v5.16b, v6.16b
	ext	v6.16b, v6.16b, v6.16b, #8
	shl	v16.4s, v5.4s, #7
	ushr	v5.4s, v5.4s, #25
	orr	v5.16b, v5.16b, v16.16b

	// Finally finish off undoing the transpose, and we're done for this
	// doubleround.  Again, most of this was done above so we don't have
	// to wait for the reorderings.
	ext	v5.16b, v5.16b, v5.16b, #12

	// The loop counter was decremented up above; see if we should go
	// round again.
	cbz	w0, 9f

	// Do the first part of the next round because this loop is offset.

	// a += b; d ^= a; d <<<= 16
	add	v4.4s, v4.4s, v5.4s
	eor	v7.16b, v7.16b, v4.16b
	shl	v16.4s, v7.4s, #16
	ushr	v7.4s, v7.4s, #16
	orr	v7.16b, v7.16b, v16.16b

	// c += d; b ^= c; b <<<= 12
	add	v6.4s, v6.4s, v7.4s
	eor	v5.16b, v5.16b, v6.16b
	shl	v16.4s, v5.4s, #12
	ushr	v5.4s, v5.4s, #20
	orr	v5.16b, v5.16b, v16.16b

	b	0b

	// Almost there.  Firstly the feedforward addition.
9:	add	v0.4s, v0.4s, v4.4s
	add	v1.4s, v1.4s, v5.4s
	add	v2.4s, v2.4s, v6.4s
	add	v3.4s, v3.4s, v7.4s

	// And now we write out the result.
	st1	{v0.4s-v3.4s}, [x2]

	// And with that, we're done.
	ret

ENDFUNC

///----- That's all, folks --------------------------------------------------
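
///----- Appendix: reference sketch ------------------------------------------
///
/// For cross-checking only, this is roughly the scalar routine that the
/// code above computes.  Nothing here is assembled or compiled, and the
/// names chacha_core_ref and qr, and the prototype, are ad-hoc
/// illustrations rather than Catacomb's actual interface.  As with w0
/// above, nrounds counts single rounds and is assumed even and nonzero
/// (8, 12 or 20 in practice).
///
///	#include <stdint.h>
///	#include <string.h>
///
///	#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
///
///	/* One ChaCha quarterround, on four words of the state. */
///	static void qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
///	{
///		*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
///		*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
///		*a += *b; *d ^= *a; *d = ROTL32(*d,  8);
///		*c += *d; *b ^= *c; *b = ROTL32(*b,  7);
///	}
///
///	static void chacha_core_ref(unsigned nrounds,
///				    const uint32_t in[16], uint32_t out[16])
///	{
///		uint32_t x[16];
///		unsigned i;
///
///		memcpy(x, in, sizeof(x));
///		for (i = 0; i < nrounds; i += 2) {
///			/* Column quarterrounds. */
///			qr(&x[0], &x[4], &x[ 8], &x[12]);
///			qr(&x[1], &x[5], &x[ 9], &x[13]);
///			qr(&x[2], &x[6], &x[10], &x[14]);
///			qr(&x[3], &x[7], &x[11], &x[15]);
///			/* Diagonal quarterrounds. */
///			qr(&x[0], &x[5], &x[10], &x[15]);
///			qr(&x[1], &x[6], &x[11], &x[12]);
///			qr(&x[2], &x[7], &x[ 8], &x[13]);
///			qr(&x[3], &x[4], &x[ 9], &x[14]);
///		}
///
///		/* Feedforward: add the original input back into the state. */
///		for (i = 0; i < 16; i++) out[i] = in[i] + x[i];
///	}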