X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/e492db887af6561dd33aa18e3887efaeb219fd16..HEAD:/symm/chacha-arm64.S

diff --git a/symm/chacha-arm64.S b/symm/chacha-arm64.S
index a423e9e5..00957e60 100644
--- a/symm/chacha-arm64.S
+++ b/symm/chacha-arm64.S
@@ -25,17 +25,18 @@
 /// MA 02111-1307, USA.
 
 ///--------------------------------------------------------------------------
-/// External definitions.
+/// Preliminaries.
 
 #include "config.h"
 #include "asm-common.h"
 
-///--------------------------------------------------------------------------
-/// Main.code.
-
        .arch   armv8-a
+
        .text
 
+///--------------------------------------------------------------------------
+/// Main code.
+
 FUNC(chacha_core_arm64)
 
        // Arguments are in registers.
@@ -58,16 +59,13 @@ FUNC(chacha_core_arm64)
        // a += b; d ^= a; d <<<= 16
        add     v4.4s, v0.4s, v1.4s
        eor     v7.16b, v3.16b, v4.16b
-       shl     v16.4s, v7.4s, #16
-       ushr    v7.4s, v7.4s, #16
-       orr     v7.16b, v7.16b, v16.16b
+       rev32   v7.8h, v7.8h
 
        // c += d; b ^= c; b <<<= 12
        add     v6.4s, v2.4s, v7.4s
-       eor     v5.16b, v1.16b, v6.16b
-       shl     v16.4s, v5.4s, #12
-       ushr    v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v16.16b
+       eor     v16.16b, v1.16b, v6.16b
+       shl     v5.4s, v16.4s, #12
+       sri     v5.4s, v16.4s, #20
 
 0:
        // Apply (the rest of) a column quarterround to each of the columns
@@ -76,19 +74,17 @@ FUNC(chacha_core_arm64)
 
        // a += b; d ^= a; d <<<= 8
        add     v4.4s, v4.4s, v5.4s
-       eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #8
-       ushr    v7.4s, v7.4s, #24
-       orr     v7.16b, v7.16b, v16.16b
+       eor     v16.16b, v7.16b, v4.16b
+       shl     v7.4s, v16.4s, #8
+       sri     v7.4s, v16.4s, #24
 
        // c += d; b ^= c; b <<<= 7
        add     v6.4s, v6.4s, v7.4s
        ext     v7.16b, v7.16b, v7.16b, #12
-       eor     v5.16b, v5.16b, v6.16b
+       eor     v16.16b, v5.16b, v6.16b
        ext     v6.16b, v6.16b, v6.16b, #8
-       shl     v16.4s, v5.4s, #7
-       ushr    v5.4s, v5.4s, #25
-       orr     v5.16b, v5.16b, v16.16b
+       shl     v5.4s, v16.4s, #7
+       sri     v5.4s, v16.4s, #25
 
        // The not-quite-transpose conveniently only involves reordering
        // elements of individual rows, which can be done quite easily.  It
@@ -111,32 +107,27 @@ FUNC(chacha_core_arm64)
        // a += b; d ^= a; d <<<= 16
        add     v4.4s, v4.4s, v5.4s
        eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #16
-       ushr    v7.4s, v7.4s, #16
-       orr     v7.16b, v7.16b, v16.16b
+       rev32   v7.8h, v7.8h
 
        // c += d; b ^= c; b <<<= 12
        add     v6.4s, v6.4s, v7.4s
-       eor     v5.16b, v5.16b, v6.16b
-       shl     v16.4s, v5.4s, #12
-       ushr    v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v16.16b
+       eor     v16.16b, v5.16b, v6.16b
+       shl     v5.4s, v16.4s, #12
+       sri     v5.4s, v16.4s, #20
 
        // a += b; d ^= a; d <<<= 8
        add     v4.4s, v4.4s, v5.4s
-       eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #8
-       ushr    v7.4s, v7.4s, #24
-       orr     v7.16b, v7.16b, v16.16b
+       eor     v16.16b, v7.16b, v4.16b
+       shl     v7.4s, v16.4s, #8
+       sri     v7.4s, v16.4s, #24
 
        // c += d; b ^= c; b <<<= 7
        add     v6.4s, v6.4s, v7.4s
        ext     v7.16b, v7.16b, v7.16b, #4
-       eor     v5.16b, v5.16b, v6.16b
+       eor     v16.16b, v5.16b, v6.16b
        ext     v6.16b, v6.16b, v6.16b, #8
-       shl     v16.4s, v5.4s, #7
-       ushr    v5.4s, v5.4s, #25
-       orr     v5.16b, v5.16b, v16.16b
+       shl     v5.4s, v16.4s, #7
+       sri     v5.4s, v16.4s, #25
 
        // Finally finish off undoing the transpose, and we're done for this
        // doubleround.  Again, most of this was done above so we don't have
@@ -151,16 +142,13 @@ FUNC(chacha_core_arm64)
        // a += b; d ^= a; d <<<= 16
        add     v4.4s, v4.4s, v5.4s
        eor     v7.16b, v7.16b, v4.16b
-       shl     v16.4s, v7.4s, #16
-       ushr    v7.4s, v7.4s, #16
-       orr     v7.16b, v7.16b, v16.16b
+       rev32   v7.8h, v7.8h
 
        // c += d; b ^= c; b <<<= 12
        add     v6.4s, v6.4s, v7.4s
-       eor     v5.16b, v5.16b, v6.16b
-       shl     v16.4s, v5.4s, #12
-       ushr    v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v16.16b
+       eor     v16.16b, v5.16b, v6.16b
+       shl     v5.4s, v16.4s, #12
+       sri     v5.4s, v16.4s, #20
 
        b       0b
 
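
The point of the patch above is to synthesize the ChaCha quarterround rotations more cheaply: rotating each 32-bit lane left by 16 is just a swap of its two halfwords, which `rev32 v7.8h, v7.8h` does in a single instruction, while the rotates by 12, 8 and 7 become a left shift (`shl`) followed by a shift-right-and-insert (`sri`) that deposits the wrapped-around low bits without a separate `ushr`/`orr` pair. The C sketch below is not part of the patch; it merely checks those two identities against a reference rotate, and the helper names in it are invented for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Reference 32-bit rotate left, as used by the ChaCha quarterround. */
static uint32_t rotl32(uint32_t x, unsigned n)
  { return (x << n) | (x >> (32 - n)); }

/* Rotate by 16 as a halfword swap: per 32-bit lane, this is what
 * `rev32 vd.8h, vn.8h` computes. */
static uint32_t rot16_by_halfword_swap(uint32_t x)
  { return (x >> 16) | (x << 16); }

/* Rotate by n as shift-then-insert: `shl vd, vn, #n` leaves zeros in the
 * low n bits of each lane, and `sri vd, vn, #(32 - n)` overwrites exactly
 * those low n bits with x >> (32 - n), keeping the rest of vd intact. */
static uint32_t rot_by_shl_sri(uint32_t x, unsigned n)
{
  uint32_t lane = x << n;                       /* shl  vd, vn, #n */
  uint32_t lomask = (1u << n) - 1;              /* bits sri may write */
  lane = (lane & ~lomask) | ((x >> (32 - n)) & lomask); /* sri vd, vn, #(32 - n) */
  return lane;
}

int main(void)
{
  static const uint32_t tests[] =
    { 0x00000000, 0xffffffff, 0x01234567, 0x89abcdef, 0xdeadbeef };
  unsigned i;

  for (i = 0; i < sizeof(tests)/sizeof(tests[0]); i++) {
    uint32_t x = tests[i];
    assert(rot16_by_halfword_swap(x) == rotl32(x, 16));
    assert(rot_by_shl_sri(x, 12) == rotl32(x, 12));
    assert(rot_by_shl_sri(x,  8) == rotl32(x,  8));
    assert(rot_by_shl_sri(x,  7) == rotl32(x,  7));
  }
  printf("rotation identities hold\n");
  return 0;
}

In the assembler this is one instruction fewer for each rotate by 12, 8 or 7, and two fewer for each rotate by 16; the v16 scratch register now only holds the `eor` result rather than a partial rotate.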