X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e492db887af6561dd33aa18e3887efaeb219fd16..HEAD:/symm/chacha-arm64.S

diff --git a/symm/chacha-arm64.S b/symm/chacha-arm64.S
index a423e9e5..00957e60 100644
--- a/symm/chacha-arm64.S
+++ b/symm/chacha-arm64.S
@@ -25,17 +25,18 @@
 /// MA 02111-1307, USA.
 
 ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-///--------------------------------------------------------------------------
-/// Main.code.
-
        .arch   armv8-a
+
        .text
 
+///--------------------------------------------------------------------------
+/// Main code.
+
 FUNC(chacha_core_arm64)
 
        // Arguments are in registers.
@@ -58,16 +59,13 @@ FUNC(chacha_core_arm64)
        // a += b; d ^= a; d <<<= 16
        add     v4.4s, v0.4s, v1.4s
        eor     v7.16b, v3.16b, v4.16b
-       shl     v16.4s, v7.4s, #16
-       ushr    v7.4s, v7.4s, #16
-       orr     v7.16b, v7.16b, v16.16b
+       rev32   v7.8h, v7.8h
 
        // c += d; b ^= c; b <<<= 12
        add     v6.4s, v2.4s, v7.4s
-       eor     v5.16b, v1.16b, v6.16b
-       shl     v16.4s, v5.4s, #12
-       ushr    v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v16.16b
+       eor     v16.16b, v1.16b, v6.16b
+       shl     v5.4s, v16.4s, #12
+       sri     v5.4s, v16.4s, #20
 
 0:
        // Apply (the rest of) a column quarterround to each of the columns
@@ -76,19 +74,17 @@ FUNC(chacha_core_arm64)
 
        // a += b; d ^= a; d <<<= 8
        add     v4.4s, v4.4s, v5.4s
-       eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #8
-       ushr    v7.4s, v7.4s, #24
-       orr     v7.16b, v7.16b, v16.16b
+       eor     v16.16b, v7.16b, v4.16b
+       shl     v7.4s, v16.4s, #8
+       sri     v7.4s, v16.4s, #24
 
        // c += d; b ^= c; b <<<= 7
        add     v6.4s, v6.4s, v7.4s
        ext     v7.16b, v7.16b, v7.16b, #12
-       eor     v5.16b, v5.16b, v6.16b
+       eor     v16.16b, v5.16b, v6.16b
        ext     v6.16b, v6.16b, v6.16b, #8
-       shl     v16.4s, v5.4s, #7
-       ushr    v5.4s, v5.4s, #25
-       orr     v5.16b, v5.16b, v16.16b
+       shl     v5.4s, v16.4s, #7
+       sri     v5.4s, v16.4s, #25
 
        // The not-quite-transpose conveniently only involves reordering
        // elements of individual rows, which can be done quite easily.  It
@@ -111,32 +107,27 @@ FUNC(chacha_core_arm64)
        // a += b; d ^= a; d <<<= 16
        add     v4.4s, v4.4s, v5.4s
        eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #16
-       ushr    v7.4s, v7.4s, #16
-       orr     v7.16b, v7.16b, v16.16b
+       rev32   v7.8h, v7.8h
 
        // c += d; b ^= c; b <<<= 12
        add     v6.4s, v6.4s, v7.4s
-       eor     v5.16b, v5.16b, v6.16b
-       shl     v16.4s, v5.4s, #12
-       ushr    v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v16.16b
+       eor     v16.16b, v5.16b, v6.16b
+       shl     v5.4s, v16.4s, #12
+       sri     v5.4s, v16.4s, #20
 
        // a += b; d ^= a; d <<<= 8
        add     v4.4s, v4.4s, v5.4s
-       eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #8
-       ushr    v7.4s, v7.4s, #24
-       orr     v7.16b, v7.16b, v16.16b
+       eor     v16.16b, v7.16b, v4.16b
+       shl     v7.4s, v16.4s, #8
+       sri     v7.4s, v16.4s, #24
 
        // c += d; b ^= c; b <<<= 7
        add     v6.4s, v6.4s, v7.4s
        ext     v7.16b, v7.16b, v7.16b, #4
-       eor     v5.16b, v5.16b, v6.16b
+       eor     v16.16b, v5.16b, v6.16b
        ext     v6.16b, v6.16b, v6.16b, #8
-       shl     v16.4s, v5.4s, #7
-       ushr    v5.4s, v5.4s, #25
-       orr     v5.16b, v5.16b, v16.16b
+       shl     v5.4s, v16.4s, #7
+       sri     v5.4s, v16.4s, #25
 
        // Finally finish off undoing the transpose, and we're done for this
        // doubleround.  Again, most of this was done above so we don't have
@@ -151,16 +142,13 @@ FUNC(chacha_core_arm64)
        // a += b; d ^= a; d <<<= 16
        add     v4.4s, v4.4s, v5.4s
        eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #16
-       ushr    v7.4s, v7.4s, #16
-       orr     v7.16b, v7.16b, v16.16b
+       rev32   v7.8h, v7.8h
 
        // c += d; b ^= c; b <<<= 12
        add     v6.4s, v6.4s, v7.4s
-       eor     v5.16b, v5.16b, v6.16b
-       shl     v16.4s, v5.4s, #12
-       ushr    v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v16.16b
+       eor     v16.16b, v5.16b, v6.16b
+       shl     v5.4s, v16.4s, #12
+       sri     v5.4s, v16.4s, #20
 
        b       0b
 
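
The point of the patch above is to synthesize the ChaCha quarterround rotations more cheaply: rotating each 32-bit lane left by 16 is just a swap of its two halfwords, which `rev32 v7.8h, v7.8h` does in a single instruction, while the rotates by 12, 8 and 7 become a left shift (`shl`) followed by a shift-right-and-insert (`sri`) that deposits the wrapped-around low bits without a separate `ushr`/`orr` pair. The C sketch below is not part of the patch; it merely checks those two identities against a reference rotate, and the helper names in it are invented for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Reference 32-bit rotate left, as used by the ChaCha quarterround. */
static uint32_t rotl32(uint32_t x, unsigned n)
  { return (x << n) | (x >> (32 - n)); }

/* Rotate by 16 as a halfword swap: per 32-bit lane, this is what
 * `rev32 vd.8h, vn.8h` computes. */
static uint32_t rot16_by_halfword_swap(uint32_t x)
  { return (x >> 16) | (x << 16); }

/* Rotate by n as shift-then-insert: `shl vd, vn, #n` leaves zeros in the
 * low n bits of each lane, and `sri vd, vn, #(32 - n)` overwrites exactly
 * those low n bits with x >> (32 - n), keeping the rest of vd intact. */
static uint32_t rot_by_shl_sri(uint32_t x, unsigned n)
{
  uint32_t lane = x << n;                       /* shl  vd, vn, #n */
  uint32_t lomask = (1u << n) - 1;              /* bits sri may write */
  lane = (lane & ~lomask) | ((x >> (32 - n)) & lomask); /* sri vd, vn, #(32 - n) */
  return lane;
}

int main(void)
{
  static const uint32_t tests[] =
    { 0x00000000, 0xffffffff, 0x01234567, 0x89abcdef, 0xdeadbeef };
  unsigned i;

  for (i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) {
    uint32_t x = tests[i];
    assert(rot16_by_halfword_swap(x) == rotl32(x, 16));
    assert(rot_by_shl_sri(x, 12) == rotl32(x, 12));
    assert(rot_by_shl_sri(x,  8) == rotl32(x,  8));
    assert(rot_by_shl_sri(x,  7) == rotl32(x,  7));
  }
  printf("rotation identities hold\n");
  return 0;
}

In the assembler this is one instruction fewer for each rotate by 12, 8 or 7, and two fewer for each rotate by 16; the v16 scratch register now only holds the `eor` result rather than a partial rotate.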