symm/gcm.c, symm/gcm-*.S, utils/gcm-ref: Replace one-bit shift with algebra.

[catacomb] / symm / gcm.c
diff --git a/symm/gcm.c b/symm/gcm.c

index b438415..25be748 100644 (file)
--- a/symm/gcm.c
+++ b/symm/gcm.c
@@ -121,7 +121,7 @@
  
  /*----- Low-level utilities -----------------------------------------------*/
  
-/* --- @mult@ --- *
+/* --- @mult@, @divt@ --- *
   *
   * Arguments:  @const gcm_params *p@ = pointer to the parameters
   *             @uint32 *z@ = where to write the result
@@ -129,11 +129,11 @@
   *
   * Returns:    ---
   *
- * Use:                Multiply the input field element by %$t$%, and write the
- *             product to @z@.  It's safe for @x@ and @z@ to be equal, but
- *             they should not otherwise overlap.  Both input and output are
- *             in big-endian form, i.e., with the lowest-degree coefficients
- *             in the most significant bits.
+ * Use:                Multiply or divide the input field element by %$t$%, and
+ *             write the product or quotient to @z@.  It's safe for @x@ and
+ *             @z@ to be equal, but they should not otherwise overlap.  Both
+ *             input and output are in big-endian form, i.e., with the
+ *             lowest-degree coefficients in the most significant bits.
   */
  
  static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
@@ -145,6 +145,18 @@ static void mult(const gcm_params *p, uint32 *z, const uint32 *x)
    for (i = 0; i < p->n; i++) { t = x[i]; z[i] = (t >> 1) ^ c; c = t << 31; }
  }
  
+#if CPUFAM_X86 || CPUFAM_AMD64 || CPUFAM_ARMEL
+static void divt(const gcm_params *p, uint32 *z, const uint32 *x)
+{
+  uint32 m, c, t;
+  unsigned i;
+
+  t = x[0]; m = -((t >> 31)&1u); c = m&1u;
+  for (i = p->n - 1; i; i--) { t = x[i]; z[i] = (t << 1) | c; c = t >> 31; }
+  t = x[0]; z[0] = ((t ^ (m&p->poly)) << 1) | c;
+}
+#endif
+
  /* --- @mul@ --- *
   *
   * Arguments:  @const gcm_params *p@ = pointer to the parameters
@@ -238,22 +250,26 @@ static void simple_mktable(const gcm_params *p,
  static void pclmul_mktable(const gcm_params *p,
                            uint32 *ktab, const uint32 *k)
  {
-  unsigned n = p->n;
+  unsigned i, n = p->n;
    unsigned nz;
-  uint32 *t;
+  uint32 k_over_t[GCM_NMAX], *t;
  
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves reordering the words, and,
-   * in the case of 96-bit blocks, padding with zeroes to fill out a 128-bit
-   * chunk.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves reordering the words, and, in the case of 96-bit blocks,
+   * padding with zeroes to fill out a 128-bit chunk.
     */
  
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
    if (n == 3) nz = 1;
    else nz = 0;
-  t = ktab + n + nz;
-
-  if (p->f&GCMF_SWAP) while (n--) { *--t = ENDSWAP32(*k); k++; }
-  else while (n--) *--t = *k++;
+  k = k_over_t; t = ktab + n + nz; while (n--) *--t = *k++;
    while (nz--) *--t = 0;
  }
  #endif
@@ -262,28 +278,27 @@ static void pclmul_mktable(const gcm_params *p,
  static void arm_crypto_mktable(const gcm_params *p,
                                uint32 *ktab, const uint32 *k)
  {
-  unsigned n = p->n;
-  uint32 *t;
+  unsigned i, n = p->n;
+  uint32 k_over_t[GCM_NMAX], *t;
  
-  /* We just need to store the value in a way which is convenient for the
-   * assembler code to read back.  That involves swapping the bytes in each
-   * 64-bit lane.
+  /* We need to divide the value by t (to compensate for the one-bit shift
+   * resulting from GCM's backwards bit ordering) and store the value in a
+   * way which is convenient for the assembler code to read back.  That
+   * involves swapping the bytes in each 64-bit lane.
     */
  
-  t = ktab;
-  if (p->f&GCMF_SWAP) {
-    while (n >= 2) {
-      t[1] = ENDSWAP32(k[0]); t[0] = ENDSWAP32(k[1]);
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = ENDSWAP32(k[0]); t[0] = 0; }
-  } else {
-    while (n >= 2) {
-      t[1] = k[0]; t[0] = k[1];
-      t += 2; k += 2; n -= 2;
-    }
-    if (n) { t[1] = k[0]; t[0] = 0; }
+  if (!(p->f&GCMF_SWAP)) divt(p, k_over_t, k);
+  else {
+    for (i = 0; i < n; i++) k_over_t[i] = ENDSWAP32(k[i]);
+    divt(p, k_over_t, k_over_t);
+  }
+
+  t = ktab; k = k_over_t;
+  while (n >= 2) {
+    t[1] = k[0]; t[0] = k[1];
+    t += 2; k += 2; n -= 2;
    }
+  if (n) { t[1] = k[0]; t[0] = 0; }
  }
  #endif
  
@@ -407,12 +422,14 @@ static void pclmul_recover_k(const gcm_params *p,
    const uint32 *t;
  
    /* The representation is already independent of the blockcipher endianness.
-   * We need to compensate for padding, and reorder the words.
+   * We need to compensate for padding, reorder the words, and multiply by t
+   * to compensate for the factor of t we divided out earlier.
     */
  
    if (n == 3) nz = 1; else nz = 0;
    t = ktab + n + nz;
    while (n--) *k++ = *--t;
+  mult(p, k - p->n, k - p->n);
  }
  #endif
  
@@ -424,12 +441,14 @@ static void arm_crypto_recover_k(const gcm_params *p,
    const uint32 *t;
  
    /* The representation is already independent of the blockcipher endianness.
-   * We only need to reorder the words.
+   * We only need to reorder the words, and multiply by t to compensate for
+   * the factor of t we divided out earlier.
     */
  
    t = ktab;
    while (n >= 2) { k[1] = t[0]; k[0] = t[1]; t += 2; k += 2; n -= 2; }
-  if (n) k[0] = t[1];
+  if (n) { k[0] = t[1]; k++; n--; }
+  mult(p, k - p->n, k - p->n);
  }
  #endif