X-Git-Url: https://git.distorted.org.uk/~mdw/catacomb/blobdiff_plain/429bb008068e94288da5328132b35bcfa20771ee..57e7040b318f0ffc5ab43c3fb62df9a2bef42ac7:/symm/chacha-arm-neon.S

diff --git a/symm/chacha-arm-neon.S b/symm/chacha-arm-neon.S
index 5fb0073d..af53cfd3 100644
--- a/symm/chacha-arm-neon.S
+++ b/symm/chacha-arm-neon.S
@@ -35,7 +35,7 @@
 
 	.arch	armv7-a
 	.fpu	neon
-	.section .text
+	.text
 
 FUNC(chacha_core_arm_neon)
 
@@ -55,7 +55,7 @@ FUNC(chacha_core_arm_neon)
 	// We need a copy for later.  Rather than waste time copying them by
 	// hand, we'll use the three-address nature of the instruction set.
 	// But this means that the main loop is offset by a bit.
-	vldmia	r1, {d24-d31}
+	vldmia	r1, {QQ(q12, q15)}
 
 	// a += b; d ^= a; d <<<= 16
 	vadd.u32 q8, q12, q13
@@ -85,9 +85,9 @@ FUNC(chacha_core_arm_neon)
 
 	// c += d; b ^= c; b <<<=  7
 	vadd.u32 q10, q10, q11
-	vext.32	q11, q11, q11, #3
+	 vext.32 q11, q11, q11, #3
 	veor	q9, q9, q10
-	vext.32	q10, q10, q10, #2
+	 vext.32 q10, q10, q10, #2
 	vshl.u32 q0, q9, #7
 	vshr.u32 q9, q9, #25
 	vorr	q9, q9, q0
@@ -132,9 +132,9 @@ FUNC(chacha_core_arm_neon)
 
 	// c += d; b ^= c; b <<<=  7
 	vadd.u32 q10, q10, q11
-	vext.32	q11, q11, q11, #1
+	 vext.32 q11, q11, q11, #1
 	veor	q9, q9, q10
-	vext.32	q10, q10, q10, #2
+	 vext.32 q10, q10, q10, #2
 	vshl.u32 q0, q9, #7
 	vshr.u32 q9, q9, #25
 	vorr	q9, q9, q0
@@ -173,7 +173,7 @@ FUNC(chacha_core_arm_neon)
 	vadd.u32 q11, q11, q15
 
 	// And now we write out the result.
-	vstmia	r2, {d16-d23}
+	vstmia	r2, {QQ(q8, q11)}
 
 	// And with that, we're done.
 	bx	r14