symm/{chacha,salsa20}-{arm64,arm-neon}.S: Improve rotation code.
diff --git a/symm/chacha-arm64.S b/symm/chacha-arm64.S
index 61ac51a..00957e6 100644
--- a/symm/chacha-arm64.S
+++ b/symm/chacha-arm64.S
@@ -35,7 +35,7 @@
 	.text
 
 ///--------------------------------------------------------------------------
-/// Main. code.
+/// Main code.
 
 FUNC(chacha_core_arm64)
@@ -59,16 +59,13 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 16
 	add	v4.4s, v0.4s, v1.4s
 	eor	v7.16b, v3.16b, v4.16b
-	shl	v16.4s, v7.4s, #16
-	ushr	v7.4s, v7.4s, #16
-	orr	v7.16b, v7.16b, v16.16b
+	rev32	v7.8h, v7.8h
 
 	// c += d; b ^= c; b <<<= 12
 	add	v6.4s, v2.4s, v7.4s
-	eor	v5.16b, v1.16b, v6.16b
-	shl	v16.4s, v5.4s, #12
-	ushr	v5.4s, v5.4s, #20
-	orr	v5.16b, v5.16b, v16.16b
+	eor	v16.16b, v1.16b, v6.16b
+	shl	v5.4s, v16.4s, #12
+	sri	v5.4s, v16.4s, #20
 
 0:
 	// Apply (the rest of) a column quarterround to each of the columns
@@ -77,19 +74,17 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 8
 	add	v4.4s, v4.4s, v5.4s
-	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #8
-	ushr	v7.4s, v7.4s, #24
-	orr	v7.16b, v7.16b, v16.16b
+	eor	v16.16b, v7.16b, v4.16b
+	shl	v7.4s, v16.4s, #8
+	sri	v7.4s, v16.4s, #24
 
 	// c += d; b ^= c; b <<<= 7
 	add	v6.4s, v6.4s, v7.4s
 	ext	v7.16b, v7.16b, v7.16b, #12
-	eor	v5.16b, v5.16b, v6.16b
+	eor	v16.16b, v5.16b, v6.16b
 	ext	v6.16b, v6.16b, v6.16b, #8
-	shl	v16.4s, v5.4s, #7
-	ushr	v5.4s, v5.4s, #25
-	orr	v5.16b, v5.16b, v16.16b
+	shl	v5.4s, v16.4s, #7
+	sri	v5.4s, v16.4s, #25
 
 	// The not-quite-transpose conveniently only involves reordering
 	// elements of individual rows, which can be done quite easily. It
@@ -112,32 +107,27 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 16
 	add	v4.4s, v4.4s, v5.4s
 	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #16
-	ushr	v7.4s, v7.4s, #16
-	orr	v7.16b, v7.16b, v16.16b
+	rev32	v7.8h, v7.8h
 
 	// c += d; b ^= c; b <<<= 12
 	add	v6.4s, v6.4s, v7.4s
-	eor	v5.16b, v5.16b, v6.16b
-	shl	v16.4s, v5.4s, #12
-	ushr	v5.4s, v5.4s, #20
-	orr	v5.16b, v5.16b, v16.16b
+	eor	v16.16b, v5.16b, v6.16b
+	shl	v5.4s, v16.4s, #12
+	sri	v5.4s, v16.4s, #20
 
 	// a += b; d ^= a; d <<<= 8
 	add	v4.4s, v4.4s, v5.4s
-	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #8
-	ushr	v7.4s, v7.4s, #24
-	orr	v7.16b, v7.16b, v16.16b
+	eor	v16.16b, v7.16b, v4.16b
+	shl	v7.4s, v16.4s, #8
+	sri	v7.4s, v16.4s, #24
 
 	// c += d; b ^= c; b <<<= 7
 	add	v6.4s, v6.4s, v7.4s
 	ext	v7.16b, v7.16b, v7.16b, #4
-	eor	v5.16b, v5.16b, v6.16b
+	eor	v16.16b, v5.16b, v6.16b
 	ext	v6.16b, v6.16b, v6.16b, #8
-	shl	v16.4s, v5.4s, #7
-	ushr	v5.4s, v5.4s, #25
-	orr	v5.16b, v5.16b, v16.16b
+	shl	v5.4s, v16.4s, #7
+	sri	v5.4s, v16.4s, #25
 
 	// Finally finish off undoing the transpose, and we're done for this
 	// doubleround. Again, most of this was done above so we don't have
@@ -152,16 +142,13 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 16
 	add	v4.4s, v4.4s, v5.4s
 	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #16
-	ushr	v7.4s, v7.4s, #16
-	orr	v7.16b, v7.16b, v16.16b
+	rev32	v7.8h, v7.8h
 
 	// c += d; b ^= c; b <<<= 12
 	add	v6.4s, v6.4s, v7.4s
-	eor	v5.16b, v5.16b, v6.16b
-	shl	v16.4s, v5.4s, #12
-	ushr	v5.4s, v5.4s, #20
-	orr	v5.16b, v5.16b, v16.16b
+	eor	v16.16b, v5.16b, v6.16b
+	shl	v5.4s, v16.4s, #12
+	sri	v5.4s, v16.4s, #20
 
 	b	0b
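
The pattern in the diff: rotating each 32-bit lane left by 16 is just a swap of its 16-bit halves, so the three-instruction shl/ushr/orr sequence collapses to a single rev32; for the other rotation amounts, the eor result is kept in the scratch register v16 so that shl followed by sri (shift right and insert) can build the rotation in two instructions, folding away the separate orr. Below is a minimal sketch of the same two tricks written with ARM NEON intrinsics in C, purely for illustration; the function names and test values are invented here and are not part of catacomb.

/* Illustrative sketch only; not part of the patch. */
#include <arm_neon.h>
#include <stdio.h>

/* Rotate each 32-bit lane left by 16: swapping the 16-bit halves (rev32)
 * replaces the old shl + ushr + orr sequence. */
static uint32x4_t rotl16(uint32x4_t x)
{
	return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}

/* Rotate each 32-bit lane left by 12: shift left into a fresh register,
 * then sri shifts the original right and inserts the low bits into the
 * result, saving the separate orr.  The unshifted value must live in a
 * register other than the destination, which is why the assembly now
 * directs the eor output into the scratch register v16. */
static uint32x4_t rotl12(uint32x4_t x)
{
	return vsriq_n_u32(vshlq_n_u32(x, 12), x, 20);
}

int main(void)
{
	uint32_t out[4];
	vst1q_u32(out, rotl16(vdupq_n_u32(0x01234567)));
	printf("rotl16: %08x (expect 45670123)\n", out[0]);
	vst1q_u32(out, rotl12(vdupq_n_u32(0x01234567)));
	printf("rotl12: %08x (expect 34567012)\n", out[0]);
	return 0;
}

The same shl/sri pairing covers the <<<= 12, <<<= 8 and <<<= 7 rotations in the hunks above; only the <<<= 16 case gets the one-instruction rev32 shortcut.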