.text
///--------------------------------------------------------------------------
-/// Main.code.
+/// Main code.
FUNC(chacha_core_arm64)
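+ // The ChaCha state rows are assumed to have been loaded into
+ // v0 (a), v1 (b), v2 (c) and v3 (d) by this point; the round loop
+ // below works on v4 (a), v5 (b), v6 (c) and v7 (d), with v16 as
+ // scratch for the rotates.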
// a += b; d ^= a; d <<<= 16
add v4.4s, v0.4s, v1.4s
eor v7.16b, v3.16b, v4.16b
- shl v16.4s, v7.4s, #16
- ushr v7.4s, v7.4s, #16
- orr v7.16b, v7.16b, v16.16b
+ rev32 v7.8h, v7.8h
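+ // Rotating a 32-bit word left by 16 just swaps its two 16-bit
+ // halves, so REV32 on halfword elements does the whole rotate in a
+ // single instruction, with no temporary register needed.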
// c += d; b ^= c; b <<<= 12
add v6.4s, v2.4s, v7.4s
- eor v5.16b, v1.16b, v6.16b
- shl v16.4s, v5.4s, #12
- ushr v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v16.16b
+ eor v16.16b, v1.16b, v6.16b
+ shl v5.4s, v16.4s, #12
+ sri v5.4s, v16.4s, #20
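+ // SRI shifts each lane of the source right and inserts the result
+ // into the destination, leaving the destination's high bits alone,
+ // so SHL followed by SRI makes a rotate in two instructions rather
+ // than the three for SHL/USHR/ORR. The price is that the unrotated
+ // input must stay live in v16, since SHL would otherwise clobber
+ // it.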
0:
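+ // The first two quarterround steps were peeled off above the loop,
+ // so each trip round here finishes the current doubleround and then
+ // makes a start on the next one, which lets the row reorderings
+ // overlap with useful arithmetic.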
// Apply (the rest of) a column quarterround to each of the columns
// a += b; d ^= a; d <<<= 8
add v4.4s, v4.4s, v5.4s
- eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #8
- ushr v7.4s, v7.4s, #24
- orr v7.16b, v7.16b, v16.16b
+ eor v16.16b, v7.16b, v4.16b
+ shl v7.4s, v16.4s, #8
+ sri v7.4s, v16.4s, #24
// c += d; b ^= c; b <<<= 7
add v6.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #12
- eor v5.16b, v5.16b, v6.16b
+ eor v16.16b, v5.16b, v6.16b
ext v6.16b, v6.16b, v6.16b, #8
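+ // EXT with both source operands the same rotates a vector's lanes:
+ // the #12 above moves each element of d round by three lanes, and
+ // the #8 moves c round by two, lining the diagonals up as columns.
+ // Each EXT is slotted between arithmetic instructions that don't
+ // depend on its result.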
- shl v16.4s, v5.4s, #7
- ushr v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v16.16b
+ shl v5.4s, v16.4s, #7
+ sri v5.4s, v16.4s, #25
// The not-quite-transpose conveniently only involves reordering
// elements of individual rows, which can be done quite easily. It
// was mostly done above, interleaved with the arithmetic; all that's
// left is to move b round by one lane.
ext v5.16b, v5.16b, v5.16b, #4
// a += b; d ^= a; d <<<= 16
add v4.4s, v4.4s, v5.4s
eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #16
- ushr v7.4s, v7.4s, #16
- orr v7.16b, v7.16b, v16.16b
+ rev32 v7.8h, v7.8h
// c += d; b ^= c; b <<<= 12
add v6.4s, v6.4s, v7.4s
- eor v5.16b, v5.16b, v6.16b
- shl v16.4s, v5.4s, #12
- ushr v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v16.16b
+ eor v16.16b, v5.16b, v6.16b
+ shl v5.4s, v16.4s, #12
+ sri v5.4s, v16.4s, #20
// a += b; d ^= a; d <<<= 8
add v4.4s, v4.4s, v5.4s
- eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #8
- ushr v7.4s, v7.4s, #24
- orr v7.16b, v7.16b, v16.16b
+ eor v16.16b, v7.16b, v4.16b
+ shl v7.4s, v16.4s, #8
+ sri v7.4s, v16.4s, #24
// c += d; b ^= c; b <<<= 7
add v6.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #4
- eor v5.16b, v5.16b, v6.16b
+ eor v16.16b, v5.16b, v6.16b
ext v6.16b, v6.16b, v6.16b, #8
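+ // Undoing the reordering: d has now gone round by 3 + 1 = 4 lanes
+ // in total and c by 2 + 2 = 4, so both rows are back in their
+ // original order; b follows below once its new value is ready.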
- shl v16.4s, v5.4s, #7
- ushr v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v16.16b
+ shl v5.4s, v16.4s, #7
+ sri v5.4s, v16.4s, #25
// Finally finish off undoing the transpose, and we're done for this
// doubleround. Again, most of this was done above so we don't have
// much to do here: just move b back round by the remaining three
// lanes.
ext v5.16b, v5.16b, v5.16b, #12
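+ // One doubleround down: stop before starting the next one if that
+ // was the last. (That the round count lives in w0 and the epilogue
+ // sits at label 9 below are assumptions here.)
subs w0, w0, #2
b.eq 9f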
// a += b; d ^= a; d <<<= 16
add v4.4s, v4.4s, v5.4s
eor v7.16b, v7.16b, v4.16b
- shl v16.4s, v7.4s, #16
- ushr v7.4s, v7.4s, #16
- orr v7.16b, v7.16b, v16.16b
+ rev32 v7.8h, v7.8h
// c += d; b ^= c; b <<<= 12
add v6.4s, v6.4s, v7.4s
- eor v5.16b, v5.16b, v6.16b
- shl v16.4s, v5.4s, #12
- ushr v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v16.16b
+ eor v16.16b, v5.16b, v6.16b
+ shl v5.4s, v16.4s, #12
+ sri v5.4s, v16.4s, #20
b 0b