symm/{chacha,salsa20}-{arm64,arm-neon}.S: Improve rotation code.
diff --git a/symm/chacha-arm64.S b/symm/chacha-arm64.S
index 61ac51a..00957e6 100644
--- a/symm/chacha-arm64.S
+++ b/symm/chacha-arm64.S
@@ -35,7 +35,7 @@
 	.text
 
 ///--------------------------------------------------------------------------
-/// Main. code.
+/// Main code.
 
 FUNC(chacha_core_arm64)
@@ -59,16 +59,13 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 16
 	add	v4.4s, v0.4s, v1.4s
 	eor	v7.16b, v3.16b, v4.16b
-	shl	v16.4s, v7.4s, #16
-	ushr	v7.4s, v7.4s, #16
-	orr	v7.16b, v7.16b, v16.16b
+	rev32	v7.8h, v7.8h
 
 	// c += d; b ^= c; b <<<= 12
 	add	v6.4s, v2.4s, v7.4s
-	eor	v5.16b, v1.16b, v6.16b
-	shl	v16.4s, v5.4s, #12
-	ushr	v5.4s, v5.4s, #20
-	orr	v5.16b, v5.16b, v16.16b
+	eor	v16.16b, v1.16b, v6.16b
+	shl	v5.4s, v16.4s, #12
+	sri	v5.4s, v16.4s, #20
 
 0:
 	// Apply (the rest of) a column quarterround to each of the columns
@@ -77,19 +74,17 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 8
 	add	v4.4s, v4.4s, v5.4s
-	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #8
-	ushr	v7.4s, v7.4s, #24
-	orr	v7.16b, v7.16b, v16.16b
+	eor	v16.16b, v7.16b, v4.16b
+	shl	v7.4s, v16.4s, #8
+	sri	v7.4s, v16.4s, #24
 
 	// c += d; b ^= c; b <<<= 7
 	add	v6.4s, v6.4s, v7.4s
 	ext	v7.16b, v7.16b, v7.16b, #12
-	eor	v5.16b, v5.16b, v6.16b
+	eor	v16.16b, v5.16b, v6.16b
 	ext	v6.16b, v6.16b, v6.16b, #8
-	shl	v16.4s, v5.4s, #7
-	ushr	v5.4s, v5.4s, #25
-	orr	v5.16b, v5.16b, v16.16b
+	shl	v5.4s, v16.4s, #7
+	sri	v5.4s, v16.4s, #25
 
 	// The not-quite-transpose conveniently only involves reordering
 	// elements of individual rows, which can be done quite easily. It
@@ -112,32 +107,27 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 16
 	add	v4.4s, v4.4s, v5.4s
 	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #16
-	ushr	v7.4s, v7.4s, #16
-	orr	v7.16b, v7.16b, v16.16b
+	rev32	v7.8h, v7.8h
 
 	// c += d; b ^= c; b <<<= 12
 	add	v6.4s, v6.4s, v7.4s
-	eor	v5.16b, v5.16b, v6.16b
-	shl	v16.4s, v5.4s, #12
-	ushr	v5.4s, v5.4s, #20
-	orr	v5.16b, v5.16b, v16.16b
+	eor	v16.16b, v5.16b, v6.16b
+	shl	v5.4s, v16.4s, #12
+	sri	v5.4s, v16.4s, #20
 
 	// a += b; d ^= a; d <<<= 8
 	add	v4.4s, v4.4s, v5.4s
-	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #8
-	ushr	v7.4s, v7.4s, #24
-	orr	v7.16b, v7.16b, v16.16b
+	eor	v16.16b, v7.16b, v4.16b
+	shl	v7.4s, v16.4s, #8
+	sri	v7.4s, v16.4s, #24
 
 	// c += d; b ^= c; b <<<= 7
 	add	v6.4s, v6.4s, v7.4s
 	ext	v7.16b, v7.16b, v7.16b, #4
-	eor	v5.16b, v5.16b, v6.16b
+	eor	v16.16b, v5.16b, v6.16b
 	ext	v6.16b, v6.16b, v6.16b, #8
-	shl	v16.4s, v5.4s, #7
-	ushr	v5.4s, v5.4s, #25
-	orr	v5.16b, v5.16b, v16.16b
+	shl	v5.4s, v16.4s, #7
+	sri	v5.4s, v16.4s, #25
 
 	// Finally finish off undoing the transpose, and we're done for this
 	// doubleround. Again, most of this was done above so we don't have
@@ -152,16 +142,13 @@ FUNC(chacha_core_arm64)
 	// a += b; d ^= a; d <<<= 16
 	add	v4.4s, v4.4s, v5.4s
 	eor	v7.16b, v7.16b, v4.16b
-	shl	v16.4s, v7.4s, #16
-	ushr	v7.4s, v7.4s, #16
-	orr	v7.16b, v7.16b, v16.16b
+	rev32	v7.8h, v7.8h
 
 	// c += d; b ^= c; b <<<= 12
 	add	v6.4s, v6.4s, v7.4s
-	eor	v5.16b, v5.16b, v6.16b
-	shl	v16.4s, v5.4s, #12
-	ushr	v5.4s, v5.4s, #20
-	orr	v5.16b, v5.16b, v16.16b
+	eor	v16.16b, v5.16b, v6.16b
+	shl	v5.4s, v16.4s, #12
+	sri	v5.4s, v16.4s, #20
 
 	b	0b
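
The pattern in the diff: rotating each 32-bit lane left by 16 is just a swap of its 16-bit halves, so the three-instruction shl/ushr/orr sequence collapses to a single rev32; for the other rotation amounts, the eor result is kept in the scratch register v16 so that shl followed by sri (shift right and insert) can build the rotation in two instructions, folding away the separate orr. Below is a minimal sketch of the same two tricks written with ARM NEON intrinsics in C, purely for illustration; the function names and test values are invented here and are not part of catacomb.

/* Illustrative sketch only; not part of the patch. */
#include <arm_neon.h>
#include <stdio.h>

/* Rotate each 32-bit lane left by 16: swapping the 16-bit halves (rev32)
 * replaces the old shl + ushr + orr sequence. */
static uint32x4_t rotl16(uint32x4_t x)
{
	return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}

/* Rotate each 32-bit lane left by 12: shift left into a fresh register,
 * then sri shifts the original right and inserts the low bits into the
 * result, saving the separate orr.  The unshifted value must live in a
 * register other than the destination, which is why the assembly now
 * directs the eor output into the scratch register v16. */
static uint32x4_t rotl12(uint32x4_t x)
{
	return vsriq_n_u32(vshlq_n_u32(x, 12), x, 20);
}

int main(void)
{
	uint32_t out[4];
	vst1q_u32(out, rotl16(vdupq_n_u32(0x01234567)));
	printf("rotl16: %08x (expect 45670123)\n", out[0]);
	vst1q_u32(out, rotl12(vdupq_n_u32(0x01234567)));
	printf("rotl12: %08x (expect 34567012)\n", out[0]);
	return 0;
}

The same shl/sri pairing covers the <<<= 12, <<<= 8 and <<<= 7 rotations in the hunks above; only the <<<= 16 case gets the one-instruction rev32 shortcut.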