mdw@git.distorted.org.uk Git - secnet/blob - fgoldi.c

   1 /* -*-c-*-
   2  *
   3  * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
   4  *
   5  * (c) 2017 Straylight/Edgeware
   6  */
   7
   8 /*----- Licensing notice --------------------------------------------------*
   9  *
  10  * This file is part of Catacomb.
  11  *
  12  * Catacomb is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU Library General Public License as
  14  * published by the Free Software Foundation; either version 2 of the
  15  * License, or (at your option) any later version.
  16  *
  17  * Catacomb is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU Library General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Library General Public
  23  * License along with Catacomb; if not, write to the Free
  24  * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
  25  * MA 02111-1307, USA.
  26  */
  27
  28 /*----- Header files ------------------------------------------------------*/
  29
  30 #include "config.h"
  31
  32 #include "fgoldi.h"
  33
  34 /*----- Basic setup -------------------------------------------------------*
  35  *
  36  * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
  37  * (hence the name).
  38  */
  39
  40 #if FGOLDI_IMPL == 28
  41 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
  42  * x = SUM_{0<=i<16} x_i 2^(28i).
  43  */
  44
  45 typedef  int32  piece; typedef  int64  dblpiece;
  46 typedef uint32 upiece; typedef uint64 udblpiece;
  47 #define PIECEWD(i) 28
  48 #define NPIECE 16
  49 #define P p28
  50
  51 #define B28 0x10000000u
  52 #define B27 0x08000000u
  53 #define M28 0x0fffffffu
  54 #define M27 0x07ffffffu
  55 #define M32 0xffffffffu
  56
  57 #elif FGOLDI_IMPL == 12
  58 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
  59  * SUM_{0<=i<40} x_i 2^ceil(224i/20).  Pieces i with i == 0 (mod 5) are 12
  60  * bits wide; the others are 11 bits wide, so they form eight groups of 56
  61  * bits.
  62  */
  63
  64 typedef  int16  piece; typedef  int32  dblpiece;
  65 typedef uint16 upiece; typedef uint32 udblpiece;
  66 #define PIECEWD(i) ((i)%5 ? 11 : 12)
  67 #define NPIECE 40
  68 #define P p12
  69
  70 #define B12 0x1000u
  71 #define B11 0x0800u
  72 #define B10 0x0400u
  73 #define M12 0xfffu
  74 #define M11 0x7ffu
  75 #define M10 0x3ffu
  76 #define M8 0xffu
  77
  78 #endif
  79
  80 /*----- Debugging machinery -----------------------------------------------*/
  81
  82 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
  83
  84 #include <stdio.h>
  85
  86 #include "mp.h"
  87 #include "mptext.h"
  88
  89 static mp *get_pgoldi(void)
  90 {
  91   mp *p = MP_NEW, *t = MP_NEW;
  92
  93   p = mp_setbit(p, MP_ZERO, 448);
  94   t = mp_setbit(t, MP_ZERO, 224);
  95   p = mp_sub(p, p, t);
  96   p = mp_sub(p, p, MP_ONE);
  97   mp_drop(t);
  98   return (p);
  99 }
 100
 101 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
 102
 103 #endif
 104
 105 /*----- Loading and storing -----------------------------------------------*/
 106
 107 /* --- @fgoldi_load@ --- *
 108  *
 109  * Arguments:   @fgoldi *z@ = where to store the result
 110  *              @const octet xv[56]@ = source to read
 111  *
 112  * Returns:     ---
 113  *
 114  * Use:         Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
 115  *              external representation from @xv@ and stores it in @z@.
 116  *
 117  *              External representation is little-endian base-256.  Some
 118  *              elements have multiple encodings, which are not produced by
 119  *              correct software; use of noncanonical encodings is not an
 120  *              error, and toleration of them is considered a performance
 121  *              feature.
 122  */
 123
 124 void fgoldi_load(fgoldi *z, const octet xv[56])
 125 {
 126 #if FGOLDI_IMPL == 28
 127
 128   unsigned i;
 129   uint32 xw[14];
 130   piece b, c;
 131
 132   /* First, read the input value as words. */
 133   for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
 134
 135   /* Extract unsigned 28-bit pieces from the words. */
 136   z->P[ 0] = (xw[ 0] >> 0)&M28;
 137   z->P[ 7] = (xw[ 6] >> 4)&M28;
 138   z->P[ 8] = (xw[ 7] >> 0)&M28;
 139   z->P[15] = (xw[13] >> 4)&M28;
 140   for (i = 1; i < 7; i++) {
 141     z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
 142     z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
 143   }
 144
 145   /* Convert the nonnegative pieces into a balanced signed representation, so
 146    * each piece ends up in the interval |z_i| <= 2^27.  For each piece, if
 147    * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
 148    * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
 149    * φ^2 = φ + 1.  We delay this carry until after all of the pieces have
 150    * been balanced.  If we don't do this, then we have to do a more expensive
 151    * test for nonzeroness to decide whether to lend a bit leftwards rather
 152    * than just testing a single bit.
 153    *
 154    * Note that we don't try for a canonical representation here: both upper
 155    * and lower bounds are achievable.
 156    */
 157   b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
 158   for (i = NPIECE - 1; i--; )
 159     { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
 160   z->P[0] += c; z->P[8] += c;
 161
 162 #elif FGOLDI_IMPL == 12
 163
 164   unsigned i, j, n, w, b;
 165   uint32 a;
 166   int c;
 167
 168   /* First, convert the bytes into nonnegative pieces. */
 169   for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
 170     a |= (uint32)xv[i] << n; n += 8;
 171     if (n >= w) {
 172       z->P[j++] = a&MASK(w);
 173       a >>= w; n -= w; w = PIECEWD(j);
 174     }
 175   }
 176
 177   /* Convert the nonnegative pieces into a balanced signed representation, so
 178    * each piece ends up in the interval |z_i| <= 2^11 + 1.
 179    */
 180   b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
 181   for (i = NPIECE - 1; i--; ) {
 182     w = PIECEWD(i) - 1;
 183     b = z->P[i]&BIT(w);
 184     z->P[i] -= b << 1;
 185     z->P[i + 1] += b >> w;
 186   }
 187   z->P[0] += c; z->P[20] += c;
 188
 189 #endif
 190 }
 191
 192 /* --- @fgoldi_store@ --- *
 193  *
 194  * Arguments:   @octet zv[56]@ = where to write the result
 195  *              @const fgoldi *x@ = the field element to write
 196  *
 197  * Returns:     ---
 198  *
 199  * Use:         Stores a field element in the given octet vector in external
 200  *              representation.  A canonical encoding is always stored.
 201  */
 202
 203 void fgoldi_store(octet zv[56], const fgoldi *x)
 204 {
 205 #if FGOLDI_IMPL == 28
 206
 207   piece y[NPIECE], yy[NPIECE], c, d;
 208   uint32 u, v;
 209   mask32 m;
 210   unsigned i;
 211
 212   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 213
 214   /* First, propagate the carries.  By the end of this, we'll have all of the
 215    * the pieces canonically sized and positive, and maybe there'll be
 216    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 217    * value will be in the half-open interval [0, φ^2).  The whole represented
 218    * value is then y + φ^2 c.
 219    *
 220    * Assume that we start out with |y_i| <= 2^30.  We start off by cutting
 221    * off and reducing the carry c_15 from the topmost piece, y_15.  This
 222    * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4.  We'll add this
 223    * onto y_0 and y_8, and propagate the carries.  It's very clear that we'll
 224    * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
 225    *
 226    * Here, the y_i are signed, so we must be cautious about bithacking them.
 227    */
 228   c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
 229   for (i = 0; i < NPIECE; i++)
 230     { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
 231
 232   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 233    * y >= p, then we should subtract p from the whole value; if c = -1 then
 234    * we should add p; and otherwise we should do nothing.
 235    *
 236    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 237    *
 238    * The first job is to sort out what we wanted to do.  If c = -1 then we
 239    * want to (a) invert the constant addend and (b) feed in a carry-in;
 240    * otherwise, we don't.
 241    */
 242   m = SIGN(c)&M28;
 243   d = m&1;
 244
 245   /* Now do the addition/subtraction.  Remember that all of the y_i are
 246    * nonnegative, so shifting and masking are safe and easy.
 247    */
 248       d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
 249   for (i = 1; i < 8; i++)
 250     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 251       d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
 252   for (i = 9; i < 16; i++)
 253     { d += y[i] +      m;  yy[i] = d&M28; d >>= 28; }
 254
 255   /* The final carry-out is in d; since we only did addition, and the y_i are
 256    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 257    * if (a) c /= 0 (in which case we know that the old value was
 258    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 259    * the subtraction didn't cause a borrow, so we must be in the case where
 260    * p <= y < φ^2.
 261    */
 262   m = NONZEROP(c) | ~NONZEROP(d - 1);
 263   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 264
 265   /* Extract 32-bit words from the value. */
 266   for (i = 0; i < 7; i++) {
 267     u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
 268     v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
 269     STORE32_L(zv + 4*i,      u);
 270     STORE32_L(zv + 4*i + 28, v);
 271   }
 272
 273 #elif FGOLDI_IMPL == 12
 274
 275   piece y[NPIECE], yy[NPIECE], c, d;
 276   uint32 a;
 277   mask32 m, mm;
 278   unsigned i, j, n, w;
 279
 280   for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
 281
 282   /* First, propagate the carries.  By the end of this, we'll have all of the
 283    * the pieces canonically sized and positive, and maybe there'll be
 284    * (signed) carry out.  The carry c is in { -1, 0, +1 }, and the remaining
 285    * value will be in the half-open interval [0, φ^2).  The whole represented
 286    * value is then y + φ^2 c.
 287    *
 288    * Assume that we start out with |y_i| <= 2^14.  We start off by cutting
 289    * off and reducing the carry c_39 from the topmost piece, y_39.  This
 290    * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16.  We'll add this
 291    * onto y_0 and y_20, and propagate the carries.  It's very clear that
 292    * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
 293    *
 294    * Here, the y_i are signed, so we must be cautious about bithacking them.
 295    */
 296   c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
 297   for (i = 0; i < NPIECE; i++) {
 298     w = PIECEWD(i); m = (1 << w) - 1;
 299     y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
 300   }
 301
 302   /* Now we have a slightly fiddly job to do.  If c = +1, or if c = 0 and
 303    * y >= p, then we should subtract p from the whole value; if c = -1 then
 304    * we should add p; and otherwise we should do nothing.
 305    *
 306    * But conditional behaviour is bad, m'kay.  So here's what we do instead.
 307    *
 308    * The first job is to sort out what we wanted to do.  If c = -1 then we
 309    * want to (a) invert the constant addend and (b) feed in a carry-in;
 310    * otherwise, we don't.
 311    */
 312   mm = SIGN(c);
 313   d = m&1;
 314
 315   /* Now do the addition/subtraction.  Remember that all of the y_i are
 316    * nonnegative, so shifting and masking are safe and easy.
 317    */
 318     d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
 319   for (i = 1; i < 20; i++) {
 320     w = PIECEWD(i); m = MASK(w);
 321     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 322   }
 323     d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
 324   for (i = 21; i < 40; i++) {
 325     w = PIECEWD(i); m = MASK(w);
 326     d += y[ i] +      (mm&m);    yy[ i] = d&m;   d >>= w;
 327   }
 328
 329   /* The final carry-out is in d; since we only did addition, and the y_i are
 330    * nonnegative, then d is in { 0, 1 }.  We want to keep y', rather than y,
 331    * if (a) c /= 0 (in which case we know that the old value was
 332    * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
 333    * the subtraction didn't cause a borrow, so we must be in the case where
 334    * p <= y < φ^2.
 335    */
 336   m = NONZEROP(c) | ~NONZEROP(d - 1);
 337   for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
 338
 339   /* Convert that back into octets. */
 340   for (i = j = a = n = 0; i < NPIECE; i++) {
 341     a |= (uint32)y[i] << n; n += PIECEWD(i);
 342     while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
 343   }
 344
 345 #endif
 346 }
 347
 348 /* --- @fgoldi_set@ --- *
 349  *
 350  * Arguments:   @fgoldi *z@ = where to write the result
 351  *              @int a@ = a small-ish constant
 352  *
 353  * Returns:     ---
 354  *
 355  * Use:         Sets @z@ to equal @a@.
 356  */
 357
 358 void fgoldi_set(fgoldi *x, int a)
 359 {
 360   unsigned i;
 361
 362   x->P[0] = a;
 363   for (i = 1; i < NPIECE; i++) x->P[i] = 0;
 364 }
 365
 366 /*----- Basic arithmetic --------------------------------------------------*/
 367
 368 /* --- @fgoldi_add@ --- *
 369  *
 370  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 371  *              @const fgoldi *x, *y@ = two operands
 372  *
 373  * Returns:     ---
 374  *
 375  * Use:         Set @z@ to the sum %$x + y$%.
 376  */
 377
 378 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
 379 {
 380   unsigned i;
 381   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
 382 }
 383
 384 /* --- @fgoldi_sub@ --- *
 385  *
 386  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 387  *              @const fgoldi *x, *y@ = two operands
 388  *
 389  * Returns:     ---
 390  *
 391  * Use:         Set @z@ to the difference %$x - y$%.
 392  */
 393
 394 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
 395 {
 396   unsigned i;
 397   for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
 398 }
 399
 400 /*----- Constant-time utilities -------------------------------------------*/
 401
 402 /* --- @fgoldi_condswap@ --- *
 403  *
 404  * Arguments:   @fgoldi *x, *y@ = two operands
 405  *              @uint32 m@ = a mask
 406  *
 407  * Returns:     ---
 408  *
 409  * Use:         If @m@ is zero, do nothing; if @m@ is all-bits-set, then
 410  *              exchange @x@ and @y@.  If @m@ has some other value, then
 411  *              scramble @x@ and @y@ in an unhelpful way.
 412  */
 413
 414 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
 415 {
 416   unsigned i;
 417   mask32 mm = FIX_MASK32(m);
 418
 419   for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
 420 }
 421
 422 /*----- Multiplication ----------------------------------------------------*/
 423
 424 #if FGOLDI_IMPL == 28
 425
 426 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
 427  * represented in a double-precision piece.  On entry, it must be the case
 428  * that |X_i| <= M <= B - 2^27 for some M.  If this is the case, then, on
 429  * exit, we will have |Z_i| <= 2^27 + M/2^27.
 430  */
 431 #define CARRY_REDUCE(z, x) do {                                         \
 432   dblpiece _t[NPIECE], _c;                                              \
 433   unsigned _i;                                                          \
 434                                                                         \
 435   /* Bias the input pieces.  This keeps the carries and so on centred   \
 436    * around zero rather than biased positive.                           \
 437    */                                                                   \
 438   for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27;               \
 439                                                                         \
 440   /* Calculate the reduced pieces.  Careful with the bithacking. */     \
 441   _c = ASR(dblpiece, _t[15], 28);                                       \
 442   (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c;                 \
 443   for (_i = 1; _i < NPIECE; _i++) {                                     \
 444     (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 +                 \
 445       ASR(dblpiece, _t[_i - 1], 28);                                    \
 446   }                                                                     \
 447   (z)[8] += _c;                                                         \
 448 } while (0)
 449
 450 #elif FGOLDI_IMPL == 12
 451
 452 static void carry_reduce(dblpiece x[NPIECE])
 453 {
 454   /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
 455
 456   unsigned i, j;
 457   dblpiece c;
 458
 459   /* The result is nearly canonical, because we do sequential carry
 460    * propagation, because smaller processors are more likely to prefer the
 461    * smaller working set than the instruction-level parallelism.
 462    *
 463    * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
 464    * Truncate x_38 to 10 bits, and add the carry onto x_39.  Truncate x_39 to
 465    * 10 bits, and add the carry onto x_0 and x_20.  And so on.
 466    *
 467    * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
 468    * The carry into x_37 is at most 2^21; so the carry out into x_38 has
 469    * magnitude at most 2^10.  In turn, |x_38| <= 2^10 before the carry, so is
 470    * now no more than 2^11 in magnitude, and the carry out into x_39 is at
 471    * most 1.  This leaves |x_39| <= 2^10 + 1 after carry propagation.
 472    *
 473    * Be careful with the bit hacking because the quantities involved are
 474    * signed.
 475    */
 476
 477   /* For each piece, we bias it so that floor division (as done by an
 478    * arithmetic right shift) and modulus (as done by bitwise-AND) does the
 479    * right thing.
 480    */
 481 #define CARRY(i, wd, b, m) do {                                         \
 482   x[i] += (b);                                                          \
 483   c = ASR(dblpiece, x[i], (wd));                                        \
 484   x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b);                         \
 485 } while (0)
 486
 487                              {             CARRY(37, 11, B10, M11);      }
 488                              { x[38] += c; CARRY(38, 11, B10, M11);      }
 489                              { x[39] += c; CARRY(39, 11, B10, M11);      }
 490                                x[20] += c;
 491   for (i = 0; i < 35; ) {
 492                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 493     for (j = i + 4; i < j; ) {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 494   }
 495                              {  x[i] += c; CARRY( i, 12, B11, M12); i++; }
 496   while (i < 39)             {  x[i] += c; CARRY( i, 11, B10, M11); i++; }
 497   x[39] += c;
 498 }
 499
 500 #endif
 501
 502 /* --- @fgoldi_mulconst@ --- *
 503  *
 504  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 505  *              @const fgoldi *x@ = an operand
 506  *              @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
 507  *
 508  * Returns:     ---
 509  *
 510  * Use:         Set @z@ to the product %$a x$%.
 511  */
 512
 513 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
 514 {
 515   unsigned i;
 516   dblpiece zz[NPIECE], aa = a;
 517
 518   for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
 519 #if FGOLDI_IMPL == 28
 520   CARRY_REDUCE(z->P, zz);
 521 #elif FGOLDI_IMPL == 12
 522   carry_reduce(zz);
 523   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 524 #endif
 525 }
 526
 527 /* --- @fgoldi_mul@ --- *
 528  *
 529  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 530  *              @const fgoldi *x, *y@ = two operands
 531  *
 532  * Returns:     ---
 533  *
 534  * Use:         Set @z@ to the product %$x y$%.
 535  */
 536
 537 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
 538 {
 539   dblpiece zz[NPIECE], u[NPIECE];
 540   piece ab[NPIECE/2], cd[NPIECE/2];
 541   const piece
 542     *a = x->P + NPIECE/2, *b = x->P,
 543     *c = y->P + NPIECE/2, *d = y->P;
 544   unsigned i, j;
 545
 546 #if FGOLDI_IMPL == 28
 547
 548 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 549
 550 #elif FGOLDI_IMPL == 12
 551
 552   static const unsigned short off[39] = {
 553       0,  12,  23,  34,  45,  56,  68,  79,  90, 101,
 554     112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
 555     224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
 556     336, 348, 359, 370, 381, 392, 404, 415, 426
 557   };
 558
 559 #define M(x,i, y,j)                                                     \
 560   (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
 561
 562 #endif
 563
 564   /* Behold the magic.
 565    *
 566    * Write x = a φ + b, and y = c φ + d.  Then x y = a c φ^2 +
 567    * (a d + b c) φ + b d.  Karatsuba and Ofman observed that a d + b c =
 568    * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
 569    * the prime p so that φ^2 = φ + 1.  So
 570    *
 571    *    x y = ((a + b) (c + d) - b d) φ + a c + b d
 572    */
 573
 574   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 575
 576   /* Our first job will be to calculate (1 - φ) b d, and write the result
 577    * into z.  As we do this, an interesting thing will happen.  Write
 578    * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
 579    * So, what we do is to write the product end-swapped and negated, and then
 580    * we'll subtract the (negated, remember) high half from the low half.
 581    */
 582   for (i = 0; i < NPIECE/2; i++) {
 583     for (j = 0; j < NPIECE/2 - i; j++)
 584       zz[i + j + NPIECE/2] -= M(b,i, d,j);
 585     for (; j < NPIECE/2; j++)
 586       zz[i + j - NPIECE/2] -= M(b,i, d,j);
 587   }
 588   for (i = 0; i < NPIECE/2; i++)
 589     zz[i] -= zz[i + NPIECE/2];
 590
 591   /* Next, we add on a c.  There are no surprises here. */
 592   for (i = 0; i < NPIECE/2; i++)
 593     for (j = 0; j < NPIECE/2; j++)
 594       zz[i + j] += M(a,i, c,j);
 595
 596   /* Now, calculate a + b and c + d. */
 597   for (i = 0; i < NPIECE/2; i++)
 598     { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
 599
 600   /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
 601    * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
 602    * v φ + (1 + φ) u.  We'll store u in a temporary place and add it on
 603    * twice.
 604    */
 605   for (i = 0; i < NPIECE; i++) u[i] = 0;
 606   for (i = 0; i < NPIECE/2; i++) {
 607     for (j = 0; j < NPIECE/2 - i; j++)
 608       zz[i + j + NPIECE/2] += M(ab,i, cd,j);
 609     for (; j < NPIECE/2; j++)
 610       u[i + j - NPIECE/2] += M(ab,i, cd,j);
 611   }
 612   for (i = 0; i < NPIECE/2; i++)
 613     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 614
 615 #undef M
 616
 617 #if FGOLDI_IMPL == 28
 618   /* That wraps it up for the multiplication.  Let's figure out some bounds.
 619    * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
 620    * end up the way they'd be if we'd done the thing the easy way, which
 621    * simplifies the analysis.  Suppose we started with |x_i|, |y_i| <= 9/5
 622    * 2^28.  The overheads in the result are given by the coefficients of
 623    *
 624    *    ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
 625    *
 626    * the greatest of which is 38.  So |z_i| <= 38*81/25*2^56 < 2^63.
 627    *
 628    * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
 629    * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
 630    */
 631   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 632   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 633 #elif FGOLDI_IMPL == 12
 634   carry_reduce(zz);
 635   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 636 #endif
 637 }
 638
 639 /* --- @fgoldi_sqr@ --- *
 640  *
 641  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
 642  *              @const fgoldi *x@ = an operand
 643  *
 644  * Returns:     ---
 645  *
 646  * Use:         Set @z@ to the square %$x^2$%.
 647  */
 648
 649 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
 650 {
 651 #if FGOLDI_IMPL == 28
 652
 653   dblpiece zz[NPIECE], u[NPIECE];
 654   piece ab[NPIECE];
 655   const piece *a = x->P + NPIECE/2, *b = x->P;
 656   unsigned i, j;
 657
 658 #  define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
 659
 660   /* The magic is basically the same as `fgoldi_mul' above.  We write
 661    * x = a φ + b and use Karatsuba and the special prime shape.  This time,
 662    * we have
 663    *
 664    *    x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
 665    */
 666
 667   for (i = 0; i < NPIECE; i++) zz[i] = 0;
 668
 669   /* Our first job will be to calculate (1 - φ) b^2, and write the result
 670    * into z.  Again, this interacts pleasantly with the prime shape.
 671    */
 672   for (i = 0; i < NPIECE/4; i++) {
 673     zz[2*i + NPIECE/2] -= M(b,i, b,i);
 674     for (j = i + 1; j < NPIECE/2 - i; j++)
 675       zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
 676     for (; j < NPIECE/2; j++)
 677       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 678   }
 679   for (; i < NPIECE/2; i++) {
 680     zz[2*i - NPIECE/2] -= M(b,i, b,i);
 681     for (j = i + 1; j < NPIECE/2; j++)
 682       zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
 683   }
 684   for (i = 0; i < NPIECE/2; i++)
 685     zz[i] -= zz[i + NPIECE/2];
 686
 687   /* Next, we add on a^2.  There are no surprises here. */
 688   for (i = 0; i < NPIECE/2; i++) {
 689     zz[2*i] += M(a,i, a,i);
 690     for (j = i + 1; j < NPIECE/2; j++)
 691       zz[i + j] += 2*M(a,i, a,j);
 692   }
 693
 694   /* Now, calculate a + b. */
 695   for (i = 0; i < NPIECE/2; i++)
 696     ab[i] = a[i] + b[i];
 697
 698   /* Finally (for the multiplication) we must add on (a + b)^2 φ.
 699    * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u.  We'll
 700    * store u in a temporary place and add it on twice.
 701    */
 702   for (i = 0; i < NPIECE; i++) u[i] = 0;
 703   for (i = 0; i < NPIECE/4; i++) {
 704     zz[2*i + NPIECE/2] += M(ab,i, ab,i);
 705     for (j = i + 1; j < NPIECE/2 - i; j++)
 706       zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
 707     for (; j < NPIECE/2; j++)
 708       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 709   }
 710   for (; i < NPIECE/2; i++) {
 711     u[2*i - NPIECE/2] += M(ab,i, ab,i);
 712     for (j = i + 1; j < NPIECE/2; j++)
 713       u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
 714   }
 715   for (i = 0; i < NPIECE/2; i++)
 716     { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
 717
 718 #undef M
 719
 720   /* Finally, carrying. */
 721   for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
 722   for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
 723
 724 #elif FGOLDI_IMPL == 12
 725   fgoldi_mul(z, x, x);
 726 #endif
 727 }
 728
 729 /*----- More advanced operations ------------------------------------------*/
 730
 731 /* --- @fgoldi_inv@ --- *
 732  *
 733  * Arguments:   @fgoldi *z@ = where to put the result (may alias @x@)
 734  *              @const fgoldi *x@ = an operand
 735  *
 736  * Returns:     ---
 737  *
 738  * Use:         Stores in @z@ the multiplicative inverse %$x^{-1}$%.  If
 739  *              %$x = 0$% then @z@ is set to zero.  This is considered a
 740  *              feature.
 741  */
 742
 743 void fgoldi_inv(fgoldi *z, const fgoldi *x)
 744 {
 745   fgoldi t, u;
 746   unsigned i;
 747
 748 #define SQRN(z, x, n) do {                                              \
 749   fgoldi_sqr((z), (x));                                                 \
 750   for (i = 1; i < (n); i++) fgoldi_sqr((z), (z));                       \
 751 } while (0)
 752
 753   /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
 754    * x = 0 as intended.  The addition chain is home-made.
 755    */                                   /* step | value */
 756   fgoldi_sqr(&u, x);                    /*    1 | 2 */
 757   fgoldi_mul(&t, &u, x);                /*    2 | 3 */
 758   SQRN(&u, &t, 2);                      /*    4 | 12 */
 759   fgoldi_mul(&t, &u, &t);               /*    5 | 15 */
 760   SQRN(&u, &t, 4);                      /*    9 | 240 */
 761   fgoldi_mul(&u, &u, &t);               /*   10 | 255 = 2^8 - 1 */
 762   SQRN(&u, &u, 4);                      /*   14 | 2^12 - 16 */
 763   fgoldi_mul(&t, &u, &t);               /*   15 | 2^12 - 1 */
 764   SQRN(&u, &t, 12);                     /*   27 | 2^24 - 2^12 */
 765   fgoldi_mul(&u, &u, &t);               /*   28 | 2^24 - 1 */
 766   SQRN(&u, &u, 12);                     /*   40 | 2^36 - 2^12 */
 767   fgoldi_mul(&t, &u, &t);               /*   41 | 2^36 - 1 */
 768   fgoldi_sqr(&t, &t);                   /*   42 | 2^37 - 2 */
 769   fgoldi_mul(&t, &t, x);                /*   43 | 2^37 - 1 */
 770   SQRN(&u, &t, 37);                     /*   80 | 2^74 - 2^37 */
 771   fgoldi_mul(&u, &u, &t);               /*   81 | 2^74 - 1 */
 772   SQRN(&u, &u, 37);                     /*  118 | 2^111 - 2^37 */
 773   fgoldi_mul(&t, &u, &t);               /*  119 | 2^111 - 1 */
 774   SQRN(&u, &t, 111);                    /*  230 | 2^222 - 2^111 */
 775   fgoldi_mul(&t, &u, &t);               /*  231 | 2^222 - 1 */
 776   fgoldi_sqr(&u, &t);                   /*  232 | 2^223 - 2 */
 777   fgoldi_mul(&u, &u, x);                /*  233 | 2^223 - 1 */
 778   SQRN(&u, &u, 223);                    /*  456 | 2^446 - 2^223 */
 779   fgoldi_mul(&t, &u, &t);               /*  457 | 2^446 - 2^222 - 1 */
 780   SQRN(&t, &t, 2);                      /*  459 | 2^448 - 2^224 - 4 */
 781   fgoldi_mul(z, &t, x);                 /*  460 | 2^448 - 2^224 - 3 */
 782
 783 #undef SQRN
 784 }
 785
 786 /*----- Test rig ----------------------------------------------------------*/
 787
 788 #ifdef TEST_RIG
 789
 790 #include <mLib/report.h>
 791 #include <mLib/str.h>
 792 #include <mLib/testrig.h>
 793
 794 static void fixdstr(dstr *d)
 795 {
 796   if (d->len > 56)
 797     die(1, "invalid length for fgoldi");
 798   else if (d->len < 56) {
 799     dstr_ensure(d, 56);
 800     memset(d->buf + d->len, 0, 56 - d->len);
 801     d->len = 56;
 802   }
 803 }
 804
 805 static void cvt_fgoldi(const char *buf, dstr *d)
 806 {
 807   dstr dd = DSTR_INIT;
 808
 809   type_hex.cvt(buf, &dd); fixdstr(&dd);
 810   dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
 811   fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
 812   dstr_destroy(&dd);
 813 }
 814
 815 static void dump_fgoldi(dstr *d, FILE *fp)
 816   { fdump(stderr, "???", (const piece *)d->buf); }
 817
 818 static void cvt_fgoldi_ref(const char *buf, dstr *d)
 819   { type_hex.cvt(buf, d); fixdstr(d); }
 820
 821 static void dump_fgoldi_ref(dstr *d, FILE *fp)
 822 {
 823   fgoldi x;
 824
 825   fgoldi_load(&x, (const octet *)d->buf);
 826   fdump(stderr, "???", x.P);
 827 }
 828
 829 static int eq(const fgoldi *x, dstr *d)
 830   { octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
 831
 832 static const test_type
 833   type_fgoldi = { cvt_fgoldi, dump_fgoldi },
 834   type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
 835
 836 #define TEST_UNOP(op)                                                   \
 837   static int vrf_##op(dstr dv[])                                        \
 838   {                                                                     \
 839     fgoldi *x = (fgoldi *)dv[0].buf;                                    \
 840     fgoldi z, zz;                                                       \
 841     int ok = 1;                                                         \
 842                                                                         \
 843     fgoldi_##op(&z, x);                                                 \
 844     if (!eq(&z, &dv[1])) {                                              \
 845       ok = 0;                                                           \
 846       fprintf(stderr, "failed!\n");                                     \
 847       fdump(stderr, "x", x->P);                                         \
 848       fdump(stderr, "calc", z.P);                                       \
 849       fgoldi_load(&zz, (const octet *)dv[1].buf);                       \
 850       fdump(stderr, "z", zz.P);                                         \
 851     }                                                                   \
 852                                                                         \
 853     return (ok);                                                        \
 854   }
 855
 856 TEST_UNOP(sqr)
 857 TEST_UNOP(inv)
 858
 859 #define TEST_BINOP(op)                                                  \
 860   static int vrf_##op(dstr dv[])                                        \
 861   {                                                                     \
 862     fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;          \
 863     fgoldi z, zz;                                                       \
 864     int ok = 1;                                                         \
 865                                                                         \
 866     fgoldi_##op(&z, x, y);                                              \
 867     if (!eq(&z, &dv[2])) {                                              \
 868       ok = 0;                                                           \
 869       fprintf(stderr, "failed!\n");                                     \
 870       fdump(stderr, "x", x->P);                                         \
 871       fdump(stderr, "y", y->P);                                         \
 872       fdump(stderr, "calc", z.P);                                       \
 873       fgoldi_load(&zz, (const octet *)dv[2].buf);                       \
 874       fdump(stderr, "z", zz.P);                                         \
 875     }                                                                   \
 876                                                                         \
 877     return (ok);                                                        \
 878   }
 879
 880 TEST_BINOP(add)
 881 TEST_BINOP(sub)
 882 TEST_BINOP(mul)
 883
 884 static int vrf_mulc(dstr dv[])
 885 {
 886   fgoldi *x = (fgoldi *)dv[0].buf;
 887   long a = *(const long *)dv[1].buf;
 888   fgoldi z, zz;
 889   int ok = 1;
 890
 891   fgoldi_mulconst(&z, x, a);
 892   if (!eq(&z, &dv[2])) {
 893     ok = 0;
 894     fprintf(stderr, "failed!\n");
 895     fdump(stderr, "x", x->P);
 896     fprintf(stderr, "a = %ld\n", a);
 897     fdump(stderr, "calc", z.P);
 898     fgoldi_load(&zz, (const octet *)dv[2].buf);
 899     fdump(stderr, "z", zz.P);
 900   }
 901
 902   return (ok);
 903 }
 904
 905 static int vrf_condswap(dstr dv[])
 906 {
 907   fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
 908   fgoldi xx = *x, yy = *y;
 909   uint32 m = *(uint32 *)dv[2].buf;
 910   int ok = 1;
 911
 912   fgoldi_condswap(&xx, &yy, m);
 913   if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
 914     ok = 0;
 915     fprintf(stderr, "failed!\n");
 916     fdump(stderr, "x", x->P);
 917     fdump(stderr, "y", y->P);
 918     fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
 919     fdump(stderr, "calc xx", xx.P);
 920     fdump(stderr, "calc yy", yy.P);
 921     fgoldi_load(&xx, (const octet *)dv[3].buf);
 922     fgoldi_load(&yy, (const octet *)dv[4].buf);
 923     fdump(stderr, "want xx", xx.P);
 924     fdump(stderr, "want yy", yy.P);
 925   }
 926
 927   return (ok);
 928 }
 929
 930 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
 931 {
 932   fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
 933     *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
 934     *y = (fgoldi *)dv[5].buf;
 935   long a = *(const long *)dv[2].buf;
 936   fgoldi umv, aumv, wpaumv, xmy, z, zz;
 937   int ok = 1;
 938
 939   fgoldi_sub(&umv, u, v);
 940   fgoldi_mulconst(&aumv, &umv, a);
 941   fgoldi_add(&wpaumv, w, &aumv);
 942   fgoldi_sub(&xmy, x, y);
 943   fgoldi_mul(&z, &wpaumv, &xmy);
 944
 945   if (!eq(&z, &dv[6])) {
 946     ok = 0;
 947     fprintf(stderr, "failed!\n");
 948     fdump(stderr, "u", u->P);
 949     fdump(stderr, "v", v->P);
 950     fdump(stderr, "u - v", umv.P);
 951     fprintf(stderr, "a = %ld\n", a);
 952     fdump(stderr, "a (u - v)", aumv.P);
 953     fdump(stderr, "w + a (u - v)", wpaumv.P);
 954     fdump(stderr, "x", x->P);
 955     fdump(stderr, "y", y->P);
 956     fdump(stderr, "x - y", xmy.P);
 957     fdump(stderr, "(x - y) (w + a (u - v))", z.P);
 958     fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
 959   }
 960
 961   return (ok);
 962 }
 963
 964 static test_chunk tests[] = {
 965   { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 966   { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 967   { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 968   { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
 969   { "condswap", vrf_condswap,
 970     { &type_fgoldi, &type_fgoldi, &type_uint32,
 971       &type_fgoldi_ref, &type_fgoldi_ref } },
 972   { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
 973   { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
 974   { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
 975     { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
 976       &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
 977   { 0, 0, { 0 } }
 978 };
 979
 980 int main(int argc, char *argv[])
 981 {
 982   test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
 983   return (0);
 984 }
 985
 986 #endif
 987
 988 /*----- That's all, folks -------------------------------------------------*/