~mdw
/
catacomb
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
symm/{chacha,salsa20}-{arm64,arm-neon}.S: Improve rotation code.
[catacomb]
/
symm
/
salsa20-arm64.S
diff --git
a/symm/salsa20-arm64.S
b/symm/salsa20-arm64.S
index
a3bce4e
..
bd50514
100644
(file)
--- a/
symm/salsa20-arm64.S
+++ b/
symm/salsa20-arm64.S
@@
-84,33
+84,29
@@
FUNC(salsa20_core_arm64)
// b ^= (a + d) <<< 7
add v16.4s, v0.4s, v3.4s
shl v17.4s, v16.4s, #7
// b ^= (a + d) <<< 7
add v16.4s, v0.4s, v3.4s
shl v17.4s, v16.4s, #7
- ushr v16.4s, v16.4s, #25
- orr v16.16b, v16.16b, v17.16b
- eor v5.16b, v1.16b, v16.16b
+ sri v17.4s, v16.4s, #25
+ eor v5.16b, v1.16b, v17.16b
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v0.4s
shl v17.4s, v16.4s, #9
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v0.4s
shl v17.4s, v16.4s, #9
- ushr v16.4s, v16.4s, #23
- orr v16.16b, v16.16b, v17.16b
- eor v6.16b, v2.16b, v16.16b
+ sri v17.4s, v16.4s, #23
+ eor v6.16b, v2.16b, v17.16b
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
- ushr v16.4s, v16.4s, #19
- orr v16.16b, v16.16b, v17.16b
- eor v7.16b, v3.16b, v16.16b
+ sri v17.4s, v16.4s, #19
+ eor v7.16b, v3.16b, v17.16b
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
- ushr v16.4s, v16.4s, #14
- orr v16.16b, v16.16b, v17.16b
- eor v4.16b, v0.16b, v16.16b
+ sri v17.4s, v16.4s, #14
+ eor v4.16b, v0.16b, v17.16b
0:
// The transpose conveniently only involves reordering elements of
0:
// The transpose conveniently only involves reordering elements of
@@
-132,33
+128,29
@@
FUNC(salsa20_core_arm64)
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v5.4s
shl v17.4s, v16.4s, #7
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v5.4s
shl v17.4s, v16.4s, #7
- ushr v16.4s, v16.4s, #25
- orr v16.16b, v16.16b, v17.16b
- eor v7.16b, v7.16b, v16.16b
+ sri v17.4s, v16.4s, #25
+ eor v7.16b, v7.16b, v17.16b
// c ^= (b + a) <<< 9
add v16.4s, v7.4s, v4.4s
shl v17.4s, v16.4s, #9
// c ^= (b + a) <<< 9
add v16.4s, v7.4s, v4.4s
shl v17.4s, v16.4s, #9
- ushr v16.4s, v16.4s, #23
- orr v16.16b, v16.16b, v17.16b
- eor v6.16b, v6.16b, v16.16b
+ sri v17.4s, v16.4s, #23
+ eor v6.16b, v6.16b, v17.16b
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #12
shl v17.4s, v16.4s, #13
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #12
shl v17.4s, v16.4s, #13
- ushr v16.4s, v16.4s, #19
- orr v16.16b, v16.16b, v17.16b
- eor v5.16b, v5.16b, v16.16b
+ sri v17.4s, v16.4s, #19
+ eor v5.16b, v5.16b, v17.16b
// a ^= (d + c) <<< 18
add v16.4s, v5.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v5.16b, v5.16b, v5.16b, #4
shl v17.4s, v16.4s, #18
// a ^= (d + c) <<< 18
add v16.4s, v5.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v5.16b, v5.16b, v5.16b, #4
shl v17.4s, v16.4s, #18
- ushr v16.4s, v16.4s, #14
- orr v16.16b, v16.16b, v17.16b
- eor v4.16b, v4.16b, v16.16b
+ sri v17.4s, v16.4s, #14
+ eor v4.16b, v4.16b, v17.16b
// We had to undo the transpose ready for the next loop. Again, push
// back the reorderings to reduce latency. Decrement the loop
// We had to undo the transpose ready for the next loop. Again, push
// back the reorderings to reduce latency. Decrement the loop
@@
-170,33
+162,29
@@
FUNC(salsa20_core_arm64)
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v7.4s
shl v17.4s, v16.4s, #7
// b ^= (a + d) <<< 7
add v16.4s, v4.4s, v7.4s
shl v17.4s, v16.4s, #7
- ushr v16.4s, v16.4s, #25
- orr v16.16b, v16.16b, v17.16b
- eor v5.16b, v5.16b, v16.16b
+ sri v17.4s, v16.4s, #25
+ eor v5.16b, v5.16b, v17.16b
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v4.4s
shl v17.4s, v16.4s, #9
// c ^= (b + a) <<< 9
add v16.4s, v5.4s, v4.4s
shl v17.4s, v16.4s, #9
- ushr v16.4s, v16.4s, #23
- orr v16.16b, v16.16b, v17.16b
- eor v6.16b, v6.16b, v16.16b
+ sri v17.4s, v16.4s, #23
+ eor v6.16b, v6.16b, v17.16b
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
// d ^= (c + b) <<< 13
add v16.4s, v6.4s, v5.4s
ext v5.16b, v5.16b, v5.16b, #12
shl v17.4s, v16.4s, #13
- ushr v16.4s, v16.4s, #19
- orr v16.16b, v16.16b, v17.16b
- eor v7.16b, v7.16b, v16.16b
+ sri v17.4s, v16.4s, #19
+ eor v7.16b, v7.16b, v17.16b
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
// a ^= (d + c) <<< 18
add v16.4s, v7.4s, v6.4s
ext v6.16b, v6.16b, v6.16b, #8
ext v7.16b, v7.16b, v7.16b, #4
shl v17.4s, v16.4s, #18
- ushr v16.4s, v16.4s, #14
- orr v16.16b, v16.16b, v17.16b
- eor v4.16b, v4.16b, v16.16b
+ sri v17.4s, v16.4s, #14
+ eor v4.16b, v4.16b, v17.16b
b 0b
b 0b