// = SUM_{0<=i,j<n} u_i v_j t^{2n-2-(i+j)}
//
// which is almost the bit-reversal of u v, only it's shifted right
- // by one place. Oh, well: we'll have to shift it back later.
+ // by one place. Putting this another way, what we have is actually
+ // the bit reversal of the product u v t. We could get the correct
+ // answer (modulo p(t)) if we'd sneakily divided one of the operands
+ // by t before we started. Conveniently, v is actually the secret
+ // value k set up by the GCM `mktable' function, so we can arrange to
+ // actually store k/t (mod p(t)) and then the product will come out
+ // correct (modulo p(t)) and we won't have anything more to worry
+ // about here.
//
// That was important to think about, but there's not a great deal to
// do about it yet other than to convert what we've got from the
// use Karatsuba's identity here, but I suspect that loses more in
// the shifting, bit-twiddling, and dependency chains that it gains
// in saving a multiplication which otherwise pipelines well.
- // q0 = // (u_0; u_1)
- // q1 = // (v_0; v_1)
+ // q0 = // (u_1; u_0)
+ // q1 = // (v_1; v_0)
vmull.p64 q2, d1, d2 // u_1 v_0
vmull.p64 q3, d0, d3 // u_0 v_1
- vmull.p64 q8, d1, d3 // (x_3; t_1) = u_1 v_1
- vmull.p64 q9, d0, d2 // (t_0; x_0) = u_0 v_0
+ vmull.p64 q8, d1, d3 // (t_1; x_3) = u_1 v_1
+ vmull.p64 q9, d0, d2 // (x_0; t_0) = u_0 v_0
// Arrange the pieces to form a double-precision polynomial.
- veor q2, q2, q3 // (m_1; m_0) = u_0 v_1 + u_1 v_0
+ veor q2, q2, q3 // (m_0; m_1) = u_0 v_1 + u_1 v_0
veor d17, d17, d4 // x_2 = t_1 + m_1
veor d18, d18, d5 // x_1 = t_0 + m_0
- // q8 = // (x_3; x_2)
- // q9 = // (x_1; x_0)
+ // q8 = // (x_2; x_3)
+ // q9 = // (x_0; x_1)
- // Two-and-a-half problems remain. The first is that this product is
- // shifted left by one place, which is annoying. Let's take care of
- // that now. Once this is done, we'll be properly in GCM's backwards
- // bit-ordering.
+ // One-and-a-half problems remain.
//
- // The half a problem is that the result wants to have its 64-bit
- // halves switched. Here turns out to be the best place to arrange
- // for that.
+ // The full-size problem is that the result needs to be reduced
+ // modulo p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 +
+ // t^2 + t + 1 in our field. So far, we've calculated z_0 and z_1
+ // such that z_0 + z_1 R = u v using the identity R = t^128: now we
+ // must collapse the two halves of x together using the other
+ // identity R = t^7 + t^2 + t + 1.
//
- // q9 q8
- // ,-------------.-------------. ,-------------.-------------.
- // | 0 x_0-x_62 | x_63-x_126 | | x_127-x_190 | x_191-x_254 |
- // `-------------^-------------' `-------------^-------------'
- // d19 d18 d17 d16
- //
- // We start by shifting each 32-bit lane right (from GCM's point of
- // view -- physically, left) by one place, which gives us this:
- //
- // low (q9) high (q8)
- // ,-------------.-------------. ,-------------.-------------.
- // | x_0-x_62 0 |x_64-x_126 0 | |x_128-x_190 0|x_192-x_254 0|
- // `-------------^-------------' `-------------^-------------'
- // d19 d18 d17 d16
- //
- // but we've lost a bunch of bits. We separately shift each lane
- // left by 31 places to give us the bits we lost.
- //
- // low (q3) high (q2)
- // ,-------------.-------------. ,-------------.-------------.
- // | 0...0 | 0...0 x_63 | | 0...0 x_127 | 0...0 x_191 |
- // `-------------^-------------' `-------------^-------------'
- // d6 d5 d4
- //
- // Since we can address each of these pieces individually, putting
- // them together is relatively straightforward.
-
-
- vshr.u64 d6, d18, #63 // shifted left; just the carries
- vshl.u64 q9, q9, #1 // shifted right, but dropped carries
- vshr.u64 q2, q8, #63
- vshl.u64 q8, q8, #1
- vorr d0, d19, d6 // y_0
- vorr d1, d18, d5 // y_1
- vorr d2, d17, d4 // y_2
- vmov d3, d16 // y_3
-
- // And the other one is that the result needs to be reduced modulo
- // p(t) = t^128 + t^7 + t^2 + t + 1. Let R = t^128 = t^7 + t^2 + t +
- // 1 in our field. So far, we've calculated z_0 and z_1 such that
- // z_0 + z_1 R = u v using the identity R = t^128: now we must
- // collapse the two halves of y together using the other identity R =
- // t^7 + t^2 + t + 1.
- //
- // We do this by working on y_2 and y_3 separately, so consider y_i
- // for i = 2 or 3. Certainly, y_i t^{64i} = y_i R t^{64(i-2) =
- // (t^7 + t^2 + t + 1) y_i t^{64(i-2)}, but we can't use that
+ // We do this by working on x_2 and x_3 separately, so consider x_i
+ // for i = 2 or 3. Certainly, x_i t^{64i} = x_i R t^{64(i-2)} =
+ // (t^7 + t^2 + t + 1) x_i t^{64(i-2)}, but we can't use that
// directly without breaking up the 64-bit word structure. Instead,
- // we start by considering just y_i t^7 t^{64(i-2)}, which again
- // looks tricky. Now, split y_i = a_i + t^57 b_i, with deg a_i < 57;
+ // we start by considering just x_i t^7 t^{64(i-2)}, which again
+ // looks tricky. Now, split x_i = a_i + t^57 b_i, with deg a_i < 57;
// then
//
- // y_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
+ // x_i t^7 t^{64(i-2)} = a_i t^7 t^{64(i-2)} + b_i t^{64(i-1)}
//
- // We can similarly decompose y_i t^2 and y_i t into a pair of 64-bit
+ // We can similarly decompose x_i t^2 and x_i t into a pair of 64-bit
// contributions to the t^{64(i-2)} and t^{64(i-1)} words, but the
// splits are different. This is lovely, with one small snag: when
- // we do this to y_3, we end up with a contribution back into the
+ // we do this to x_3, we end up with a contribution back into the
// t^128 coefficient word. But notice that only the low seven bits
// of this word are affected, so there's no knock-on contribution
// into the t^64 word. Therefore, if we handle the high bits of each
// word together, and then the low bits, everything will be fine.
// First, shift the high bits down.
- vshl.u64 q2, q1, #63 // the b_i for t
- vshl.u64 q3, q1, #62 // the b_i for t^2
- vshl.u64 q8, q1, #57 // the b_i for t^7
+ vshl.u64 q2, q8, #63 // the b_i for t
+ vshl.u64 q3, q8, #62 // the b_i for t^2
+ vshl.u64 q0, q8, #57 // the b_i for t^7
veor q2, q2, q3 // add them all together
- veor q2, q2, q8
- veor d2, d2, d5 // contribution into low half
- veor d1, d1, d4 // and high half
+ veor q2, q2, q0
+ veor d18, d18, d5 // contribution into low half
+ veor d17, d17, d4 // and high half
// And then shift the low bits up.
- vshr.u64 q2, q1, #1
- vshr.u64 q3, q1, #2
- vshr.u64 q8, q1, #7
- veor q0, q0, q1 // mix in the unit contribution
+ vshr.u64 q2, q8, #1
+ vshr.u64 q3, q8, #2
+ vshr.u64 q1, q8, #7
+ veor q8, q8, q9 // mix in the unit contribution
veor q2, q2, q3 // t and t^2 contribs
- veor q0, q0, q8 // low, unit, and t^7 contribs
- veor q0, q0, q2 // mix them together and we're done
+ veor q1, q1, q8 // low, unit, and t^7 contribs
+ veor d1, d2, d4 // mix them together and swap halves
+ veor d0, d3, d5
.endm
.macro mul64
// The multiplication is thankfully easy.
vmull.p64 q0, d0, d1 // u v
- // Shift the product up by one place, and swap the two halves. After
- // this, we're in GCM bizarro-world.
- vshr.u64 d2, d0, #63 // shifted left; just the carries
- vshl.u64 d3, d1, #1 // low half right
- vshl.u64 d1, d0, #1 // high half shifted right
- vorr d0, d3, d2 // mix in the carries
-
// Now we must reduce. This is essentially the same as the 128-bit
// case above, but mostly simpler because everything is smaller. The
// polynomial this time is p(t) = t^64 + t^4 + t^3 + t + 1.
// First, shift the high bits down.
- vshl.u64 d2, d1, #63 // b_i for t
- vshl.u64 d3, d1, #61 // b_i for t^3
- vshl.u64 d4, d1, #60 // b_i for t^4
+ vshl.u64 d2, d0, #63 // b_i for t
+ vshl.u64 d3, d0, #61 // b_i for t^3
+ vshl.u64 d4, d0, #60 // b_i for t^4
veor d2, d2, d3 // add them all together
veor d2, d2, d4
- veor d1, d1, d2 // contribution back into high half
+ veor d0, d0, d2 // contribution back into high half
// And then shift the low bits up.
- vshr.u64 d2, d1, #1
- vshr.u64 d3, d1, #3
- vshr.u64 d4, d1, #4
+ vshr.u64 d2, d0, #1
+ vshr.u64 d3, d0, #3
+ vshr.u64 d4, d0, #4
veor d0, d0, d1 // mix in the unit contribution
veor d2, d2, d3 // t and t^3 contribs
veor d0, d0, d4 // low, unit, and t^4
// Enter with u and v in the most-significant three words of q0 and
// q1 respectively, and zero in the low words, and zero in q15; leave
// with z = u v in the high three words of q0, and /junk/ in the low
- // word. Clobbers ???.
+ // word. Clobbers q1--q3, q8, q9.
// This is an inconvenient size. There's nothing for it but to do
- // four multiplications, as if for the 128-bit case. It's possible
- // that there's cruft in the top 32 bits of the input registers, so
- // shift both of them up by four bytes before we start. This will
- // mean that the high 64 bits of the result (from GCM's viewpoint)
- // will be zero.
- // q0 = // (u_0 + u_1 t^32; u_2)
- // q1 = // (v_0 + v_1 t^32; v_2)
+ // four multiplications, as if for the 128-bit case.
+ // q0 = // (u_2; u_0 + u_1 t^32)
+ // q1 = // (v_2; v_0 + v_1 t^32)
vmull.p64 q8, d1, d2 // u_2 (v_0 + v_1 t^32) = e_0
vmull.p64 q9, d0, d3 // v_2 (u_0 + u_1 t^32) = e_1
- vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (0; d)
+ vmull.p64 q3, d1, d3 // u_2 v_2 t^64 = d = (d; 0)
vmull.p64 q0, d0, d2 // u_0 v_0 + (u_0 v_1 + u_1 v_0) t^32
// + u_1 v_1 t^64 = f
veor d0, d0, d17 // q0 = bot([d t^128] + e t^64 + f)
veor d3, d3, d16 // q1 = top(d t^128 + e t^64 + f)
- // Shift the product right by one place (from GCM's point of view),
- // but, unusually, don't swap the halves, because we need to work on
- // the 32-bit pieces later. After this, we're in GCM bizarro-world.
- // q0 = // (?, x_2; x_1, x_0)
- // q1 = // (0, x_5; x_4, x_3)
- vshr.u64 d4, d0, #63 // carry from d0 to d1
- vshr.u64 d5, d2, #63 // carry from d2 to d3
- vshr.u32 d6, d3, #31 // carry from d3 to d0
- vshl.u64 q0, q0, #1 // shift low half
- vshl.u64 q1, q1, #1 // shift high half
- vorr d1, d1, d4
- vorr d0, d0, d6
- vorr d3, d3, d5
-
// Finally, the reduction. This is essentially the same as the
// 128-bit case, except that the polynomial is p(t) = t^96 + t^10 +
// t^9 + t^6 + 1. The degrees are larger but not enough to cause
veor q11, q11, q13 // b = u_1 v_2 + u_2 v_1
// Piece the product together.
- veor d17, d17, d22 // q8 = // (x_5; x_4)
+ veor d17, d17, d22 // q8 = // (x_4; x_5)
veor d18, d18, d23
- veor d19, d19, d24 // q9 = // (x_3; x_2)
- veor d20, d20, d25 // q10 = // (x_1; x_0)
-
- // Shift the product right by one place (from GCM's point of view).
- vshr.u64 q11, q8, #63 // carry from d16/d17 to d17/d18
- vshr.u64 q12, q9, #63 // carry from d18/d19 to d19/d20
- vshr.u64 d26, d20, #63 // carry from d20 to d21
- vshl.u64 q8, q8, #1 // shift everything down
- vshl.u64 q9, q9, #1
- vshl.u64 q10, q10, #1
- vorr d17, d17, d22 // and mix in the carries
- vorr d18, d18, d23
- vorr d19, d19, d24
- vorr d20, d20, d25
- vorr d21, d21, d26
+ veor d19, d19, d24 // q9 = // (x_2; x_3)
+ veor d20, d20, d25 // q10 = // (x_0; x_1)
// Next, the reduction. Our polynomial this time is p(t) = t^192 +
// t^7 + t^2 + t + 1. Yes, the magic numbers are the same as the
// 128-bit case. I don't know why.
// First, shift the high bits down.
- // q8 = // (y_5; y_4)
- // q9 = // (y_3; y_2)
- // q10 = // (y_1; y_0)
- vshl.u64 q11, q8, #63 // (y_5; y_4) b_i for t
+ // q8 = // (y_4; y_5)
+ // q9 = // (y_2; y_3)
+ // q10 = // (y_0; y_1)
+ vshl.u64 q11, q8, #63 // (y_4; y_5) b_i for t
vshl.u64 d28, d18, #63 // y_3 b_i for t
- vshl.u64 q12, q8, #62 // (y_5; y_4) b_i for t^2
+ vshl.u64 q12, q8, #62 // (y_4; y_5) b_i for t^2
vshl.u64 d29, d18, #62 // y_3 b_i for t^2
- vshl.u64 q13, q8, #57 // (y_5; y_4) b_i for t^7
+ vshl.u64 q13, q8, #57 // (y_4; y_5) b_i for t^7
vshl.u64 d30, d18, #57 // y_3 b_i for t^7
veor q11, q11, q12 // mix them all together
veor d28, d28, d29
// And finally shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_5; y'_4)
- // q9 = // (y'_3; y'_2)
- // q10 = // (y'_1; y'_0)
- vshr.u64 q11, q8, #1 // (y_5; y_4) a_i for t
+ // q8 = // (y'_4; y'_5)
+ // q9 = // (y'_2; y'_3)
+ // q10 = // (y'_0; y'_1)
+ vshr.u64 q11, q8, #1 // (y_4; y_5) a_i for t
vshr.u64 d28, d18, #1 // y'_3 a_i for t
- vshr.u64 q12, q8, #2 // (y_5; y_4) a_i for t^2
+ vshr.u64 q12, q8, #2 // (y_4; y_5) a_i for t^2
vshr.u64 d29, d18, #2 // y'_3 a_i for t^2
- vshr.u64 q13, q8, #7 // (y_5; y_4) a_i for t^7
+ vshr.u64 q13, q8, #7 // (y_4; y_5) a_i for t^7
vshr.u64 d30, d18, #7 // y'_3 a_i for t^7
veor q8, q8, q11
veor d18, d18, d28
//
// q = r s = (u_0 + u_1) (v_0 + v_1)
// = (u_0 v_0) + (u_1 v_1) + (u_0 v_1 + u_1 v_0)
- // = a + d + c
+ // = a + c + b
//
// The first two terms we've already calculated; the last is the
// remaining one we want. We'll set B = t^128. We know how to do
// 128-bit multiplications already, and Karatsuba is too annoying
// there, so there'll be 12 multiplications altogether, rather than
// the 16 we'd have if we did this the naïve way.
- // q0 = // u_0 = (u_00; u_01)
- // q1 = // u_1 = (u_10; u_11)
- // q2 = // v_0 = (v_00; v_01)
- // q3 = // v_1 = (v_10; v_11)
+ // q0 = // u_0 = (u_01; u_00)
+ // q1 = // u_1 = (u_11; u_10)
+ // q2 = // v_0 = (v_01; v_00)
+ // q3 = // v_1 = (v_11; v_10)
- veor q8, q0, q1 // u_* = (u_00 + u_10; u_01 + u_11)
- veor q9, q2, q3 // v_* = (v_00 + v_10; v_01 + v_11)
+ veor q8, q0, q1 // u_* = (u_01 + u_11; u_00 + u_10)
+ veor q9, q2, q3 // v_* = (v_01 + v_11; v_00 + v_10)
// Start by building the cross product, q = u_* v_*.
vmull.p64 q14, d16, d19 // u_*0 v_*1
veor q9, q9, q12
veor q10, q10, q13
- // Shift the product right by one place (from GCM's point of view).
- vshr.u64 q0, q8, #63 // carry from d16/d17 to d17/d18
- vshr.u64 q1, q9, #63 // carry from d18/d19 to d19/d20
- vshr.u64 q2, q10, #63 // carry from d20/d21 to d21/d22
- vshr.u64 d6, d22, #63 // carry from d22 to d23
- vshl.u64 q8, q8, #1 // shift everyting down
- vshl.u64 q9, q9, #1
- vshl.u64 q10, q10, #1
- vshl.u64 q11, q11, #1
- vorr d17, d17, d0
- vorr d18, d18, d1
- vorr d19, d19, d2
- vorr d20, d20, d3
- vorr d21, d21, d4
- vorr d22, d22, d5
- vorr d23, d23, d6
-
// Now we must reduce. This is essentially the same as the 192-bit
// case above, but more complicated because everything is bigger.
// The polynomial this time is p(t) = t^256 + t^10 + t^5 + t^2 + 1.
// First, shift the high bits down.
- // q8 = // (y_7; y_6)
- // q9 = // (y_5; y_4)
- // q10 = // (y_3; y_2)
- // q11 = // (y_1; y_0)
- vshl.u64 q0, q8, #62 // (y_7; y_6) b_i for t^2
- vshl.u64 q12, q9, #62 // (y_5; y_4) b_i for t^2
- vshl.u64 q1, q8, #59 // (y_7; y_6) b_i for t^5
- vshl.u64 q13, q9, #59 // (y_5; y_4) b_i for t^5
- vshl.u64 q2, q8, #54 // (y_7; y_6) b_i for t^10
- vshl.u64 q14, q9, #54 // (y_5; y_4) b_i for t^10
+ // q8 = // (y_6; y_7)
+ // q9 = // (y_4; y_5)
+ // q10 = // (y_2; y_3)
+ // q11 = // (y_0; y_1)
+ vshl.u64 q0, q8, #62 // (y_6; y_7) b_i for t^2
+ vshl.u64 q12, q9, #62 // (y_4; y_5) b_i for t^2
+ vshl.u64 q1, q8, #59 // (y_6; y_7) b_i for t^5
+ vshl.u64 q13, q9, #59 // (y_4; y_5) b_i for t^5
+ vshl.u64 q2, q8, #54 // (y_6; y_7) b_i for t^10
+ vshl.u64 q14, q9, #54 // (y_4; y_5) b_i for t^10
veor q0, q0, q1 // mix the contributions together
veor q12, q12, q13
veor q0, q0, q2
// And then shift the low bits up. Also, switch the order of the
// pieces for output.
- // q8 = // (y'_7; y'_6)
- // q9 = // (y'_5; y'_4)
- // q10 = // (y'_3; y'_2)
- // q11 = // (y'_1; y'_0)
- vshr.u64 q0, q8, #2 // (y_7; y_6) a_i for t^2
- vshr.u64 q12, q9, #2 // (y_5; y'_4) a_i for t^2
- vshr.u64 q1, q8, #5 // (y_7; y_6) a_i for t^5
- vshr.u64 q13, q9, #5 // (y_5; y_4) a_i for t^5
- vshr.u64 q2, q8, #10 // (y_7; y_6) a_i for t^10
- vshr.u64 q14, q9, #10 // (y_5; y_4) a_i for t^10
+ // q8 = // (y'_6; y'_7)
+ // q9 = // (y'_4; y'_5)
+ // q10 = // (y'_2; y'_3)
+ // q11 = // (y'_0; y'_1)
+ vshr.u64 q0, q8, #2 // (y_6; y_7) a_i for t^2
+ vshr.u64 q12, q9, #2 // (y_4; y_5) a_i for t^2
+ vshr.u64 q1, q8, #5 // (y_6; y_7) a_i for t^5
+ vshr.u64 q13, q9, #5 // (y_4; y_5) a_i for t^5
+ vshr.u64 q2, q8, #10 // (y_6; y_7) a_i for t^10
+ vshr.u64 q14, q9, #10 // (y_4; y_5) a_i for t^10
veor q8, q8, q0 // mix the contributions together
veor q1, q1, q2