math/f{25519,goldi}.[ch]: Export the piece type.
[catacomb] / math / fgoldi.c
1 /* -*-c-*-
2 *
3 * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8 /*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 * MA 02111-1307, USA.
26 */
27
28 /*----- Header files ------------------------------------------------------*/
29
30 #include "config.h"
31
32 #include "fgoldi.h"
33
34 /*----- Basic setup -------------------------------------------------------*
35 *
36 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
37 * (hence the name).
38 */
39
40 typedef fgoldi_piece piece;
41
42 #if FGOLDI_IMPL == 28
43 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
44 * x = SUM_{0<=i<16} x_i 2^(28i).
45 */
46
47 typedef int64 dblpiece;
48 typedef uint32 upiece; typedef uint64 udblpiece;
49 #define PIECEWD(i) 28
50 #define NPIECE 16
51 #define P p28
52
53 #define B28 0x10000000u
54 #define B27 0x08000000u
55 #define M28 0x0fffffffu
56 #define M27 0x07ffffffu
57 #define M32 0xffffffffu
58
59 #elif FGOLDI_IMPL == 12
60 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
61 * SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
62 * bits wide; the others are 11 bits wide, so they form eight groups of 56
63 * bits.
64 */
65
66 typedef int32 dblpiece;
67 typedef uint16 upiece; typedef uint32 udblpiece;
68 #define PIECEWD(i) ((i)%5 ? 11 : 12)
69 #define NPIECE 40
70 #define P p12
71
72 #define B12 0x1000u
73 #define B11 0x0800u
74 #define B10 0x0400u
75 #define M12 0xfffu
76 #define M11 0x7ffu
77 #define M10 0x3ffu
78 #define M8 0xffu
79
80 #endif
81
82 /*----- Debugging machinery -----------------------------------------------*/
83
84 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
85
86 #include <stdio.h>
87
88 #include "mp.h"
89 #include "mptext.h"
90
91 static mp *get_pgoldi(void)
92 {
93 mp *p = MP_NEW, *t = MP_NEW;
94
95 p = mp_setbit(p, MP_ZERO, 448);
96 t = mp_setbit(t, MP_ZERO, 224);
97 p = mp_sub(p, p, t);
98 p = mp_sub(p, p, MP_ONE);
99 mp_drop(t);
100 return (p);
101 }
102
103 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
104
105 #endif
106
107 /*----- Loading and storing -----------------------------------------------*/
108
109 /* --- @fgoldi_load@ --- *
110 *
111 * Arguments: @fgoldi *z@ = where to store the result
112 * @const octet xv[56]@ = source to read
113 *
114 * Returns: ---
115 *
116 * Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
117 * external representation from @xv@ and stores it in @z@.
118 *
119 * External representation is little-endian base-256. Some
120 * elements have multiple encodings, which are not produced by
121 * correct software; use of noncanonical encodings is not an
122 * error, and toleration of them is considered a performance
123 * feature.
124 */
125
126 void fgoldi_load(fgoldi *z, const octet xv[56])
127 {
128 #if FGOLDI_IMPL == 28
129
130 unsigned i;
131 uint32 xw[14];
132 piece b, c;
133
134 /* First, read the input value as words. */
135 for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
136
137 /* Extract unsigned 28-bit pieces from the words. */
138 z->P[ 0] = (xw[ 0] >> 0)&M28;
139 z->P[ 7] = (xw[ 6] >> 4)&M28;
140 z->P[ 8] = (xw[ 7] >> 0)&M28;
141 z->P[15] = (xw[13] >> 4)&M28;
142 for (i = 1; i < 7; i++) {
143 z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
144 z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
145 }
146
147 /* Convert the nonnegative pieces into a balanced signed representation, so
148 * each piece ends up in the interval |z_i| <= 2^27. For each piece, if
149 * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
150 * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
151 * φ^2 = φ + 1. We delay this carry until after all of the pieces have
152 * been balanced. If we don't do this, then we have to do a more expensive
153 * test for nonzeroness to decide whether to lend a bit leftwards rather
154 * than just testing a single bit.
155 *
156 * Note that we don't try for a canonical representation here: both upper
157 * and lower bounds are achievable.
158 */
159 b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
160 for (i = NPIECE - 1; i--; )
161 { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
162 z->P[0] += c; z->P[8] += c;
163
164 #elif FGOLDI_IMPL == 12
165
166 unsigned i, j, n, w, b;
167 uint32 a;
168 int c;
169
170 /* First, convert the bytes into nonnegative pieces. */
171 for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
172 a |= (uint32)xv[i] << n; n += 8;
173 if (n >= w) {
174 z->P[j++] = a&MASK(w);
175 a >>= w; n -= w; w = PIECEWD(j);
176 }
177 }
178
179 /* Convert the nonnegative pieces into a balanced signed representation, so
180 * each piece ends up in the interval |z_i| <= 2^11 + 1.
181 */
182 b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
183 for (i = NPIECE - 1; i--; ) {
184 w = PIECEWD(i) - 1;
185 b = z->P[i]&BIT(w);
186 z->P[i] -= b << 1;
187 z->P[i + 1] += b >> w;
188 }
189 z->P[0] += c; z->P[20] += c;
190
191 #endif
192 }
193
194 /* --- @fgoldi_store@ --- *
195 *
196 * Arguments: @octet zv[56]@ = where to write the result
197 * @const fgoldi *x@ = the field element to write
198 *
199 * Returns: ---
200 *
201 * Use: Stores a field element in the given octet vector in external
202 * representation. A canonical encoding is always stored.
203 */
204
205 void fgoldi_store(octet zv[56], const fgoldi *x)
206 {
207 #if FGOLDI_IMPL == 28
208
209 piece y[NPIECE], yy[NPIECE], c, d;
210 uint32 u, v;
211 mask32 m;
212 unsigned i;
213
214 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
215
216 /* First, propagate the carries. By the end of this, we'll have all of the
217 * the pieces canonically sized and positive, and maybe there'll be
218 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
219 * value will be in the half-open interval [0, φ^2). The whole represented
220 * value is then y + φ^2 c.
221 *
222 * Assume that we start out with |y_i| <= 2^30. We start off by cutting
223 * off and reducing the carry c_15 from the topmost piece, y_15. This
224 * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4. We'll add this
225 * onto y_0 and y_8, and propagate the carries. It's very clear that we'll
226 * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
227 *
228 * Here, the y_i are signed, so we must be cautious about bithacking them.
229 */
230 c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
231 for (i = 0; i < NPIECE; i++)
232 { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
233
234 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
235 * y >= p, then we should subtract p from the whole value; if c = -1 then
236 * we should add p; and otherwise we should do nothing.
237 *
238 * But conditional behaviour is bad, m'kay. So here's what we do instead.
239 *
240 * The first job is to sort out what we wanted to do. If c = -1 then we
241 * want to (a) invert the constant addend and (b) feed in a carry-in;
242 * otherwise, we don't.
243 */
244 m = SIGN(c)&M28;
245 d = m&1;
246
247 /* Now do the addition/subtraction. Remember that all of the y_i are
248 * nonnegative, so shifting and masking are safe and easy.
249 */
250 d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
251 for (i = 1; i < 8; i++)
252 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
253 d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
254 for (i = 9; i < 16; i++)
255 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
256
257 /* The final carry-out is in d; since we only did addition, and the y_i are
258 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
259 * if (a) c /= 0 (in which case we know that the old value was
260 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
261 * the subtraction didn't cause a borrow, so we must be in the case where
262 * p <= y < φ^2.
263 */
264 m = NONZEROP(c) | ~NONZEROP(d - 1);
265 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
266
267 /* Extract 32-bit words from the value. */
268 for (i = 0; i < 7; i++) {
269 u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
270 v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
271 STORE32_L(zv + 4*i, u);
272 STORE32_L(zv + 4*i + 28, v);
273 }
274
275 #elif FGOLDI_IMPL == 12
276
277 piece y[NPIECE], yy[NPIECE], c, d;
278 uint32 a;
279 mask32 m, mm;
280 unsigned i, j, n, w;
281
282 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
283
284 /* First, propagate the carries. By the end of this, we'll have all of the
285 * the pieces canonically sized and positive, and maybe there'll be
286 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
287 * value will be in the half-open interval [0, φ^2). The whole represented
288 * value is then y + φ^2 c.
289 *
290 * Assume that we start out with |y_i| <= 2^14. We start off by cutting
291 * off and reducing the carry c_39 from the topmost piece, y_39. This
292 * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16. We'll add this
293 * onto y_0 and y_20, and propagate the carries. It's very clear that
294 * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
295 *
296 * Here, the y_i are signed, so we must be cautious about bithacking them.
297 */
298 c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
299 for (i = 0; i < NPIECE; i++) {
300 w = PIECEWD(i); m = (1 << w) - 1;
301 y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
302 }
303
304 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
305 * y >= p, then we should subtract p from the whole value; if c = -1 then
306 * we should add p; and otherwise we should do nothing.
307 *
308 * But conditional behaviour is bad, m'kay. So here's what we do instead.
309 *
310 * The first job is to sort out what we wanted to do. If c = -1 then we
311 * want to (a) invert the constant addend and (b) feed in a carry-in;
312 * otherwise, we don't.
313 */
314 mm = SIGN(c);
315 d = m&1;
316
317 /* Now do the addition/subtraction. Remember that all of the y_i are
318 * nonnegative, so shifting and masking are safe and easy.
319 */
320 d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
321 for (i = 1; i < 20; i++) {
322 w = PIECEWD(i); m = MASK(w);
323 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
324 }
325 d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
326 for (i = 21; i < 40; i++) {
327 w = PIECEWD(i); m = MASK(w);
328 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
329 }
330
331 /* The final carry-out is in d; since we only did addition, and the y_i are
332 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
333 * if (a) c /= 0 (in which case we know that the old value was
334 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
335 * the subtraction didn't cause a borrow, so we must be in the case where
336 * p <= y < φ^2.
337 */
338 m = NONZEROP(c) | ~NONZEROP(d - 1);
339 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
340
341 /* Convert that back into octets. */
342 for (i = j = a = n = 0; i < NPIECE; i++) {
343 a |= (uint32)y[i] << n; n += PIECEWD(i);
344 while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
345 }
346
347 #endif
348 }
349
350 /* --- @fgoldi_set@ --- *
351 *
352 * Arguments: @fgoldi *z@ = where to write the result
353 * @int a@ = a small-ish constant
354 *
355 * Returns: ---
356 *
357 * Use: Sets @z@ to equal @a@.
358 */
359
360 void fgoldi_set(fgoldi *x, int a)
361 {
362 unsigned i;
363
364 x->P[0] = a;
365 for (i = 1; i < NPIECE; i++) x->P[i] = 0;
366 }
367
368 /*----- Basic arithmetic --------------------------------------------------*/
369
370 /* --- @fgoldi_add@ --- *
371 *
372 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
373 * @const fgoldi *x, *y@ = two operands
374 *
375 * Returns: ---
376 *
377 * Use: Set @z@ to the sum %$x + y$%.
378 */
379
380 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
381 {
382 unsigned i;
383 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
384 }
385
386 /* --- @fgoldi_sub@ --- *
387 *
388 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
389 * @const fgoldi *x, *y@ = two operands
390 *
391 * Returns: ---
392 *
393 * Use: Set @z@ to the difference %$x - y$%.
394 */
395
396 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
397 {
398 unsigned i;
399 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
400 }
401
402 /*----- Constant-time utilities -------------------------------------------*/
403
404 /* --- @fgoldi_condswap@ --- *
405 *
406 * Arguments: @fgoldi *x, *y@ = two operands
407 * @uint32 m@ = a mask
408 *
409 * Returns: ---
410 *
411 * Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
412 * exchange @x@ and @y@. If @m@ has some other value, then
413 * scramble @x@ and @y@ in an unhelpful way.
414 */
415
416 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
417 {
418 unsigned i;
419 mask32 mm = FIX_MASK32(m);
420
421 for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
422 }
423
424 /*----- Multiplication ----------------------------------------------------*/
425
426 #if FGOLDI_IMPL == 28
427
428 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
429 * represented in a double-precision piece. On entry, it must be the case
430 * that |X_i| <= M <= B - 2^27 for some M. If this is the case, then, on
431 * exit, we will have |Z_i| <= 2^27 + M/2^27.
432 */
433 #define CARRY_REDUCE(z, x) do { \
434 dblpiece _t[NPIECE], _c; \
435 unsigned _i; \
436 \
437 /* Bias the input pieces. This keeps the carries and so on centred \
438 * around zero rather than biased positive. \
439 */ \
440 for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
441 \
442 /* Calculate the reduced pieces. Careful with the bithacking. */ \
443 _c = ASR(dblpiece, _t[15], 28); \
444 (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
445 for (_i = 1; _i < NPIECE; _i++) { \
446 (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
447 ASR(dblpiece, _t[_i - 1], 28); \
448 } \
449 (z)[8] += _c; \
450 } while (0)
451
452 #elif FGOLDI_IMPL == 12
453
454 static void carry_reduce(dblpiece x[NPIECE])
455 {
456 /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
457
458 unsigned i, j;
459 dblpiece c;
460
461 /* The result is nearly canonical, because we do sequential carry
462 * propagation, because smaller processors are more likely to prefer the
463 * smaller working set than the instruction-level parallelism.
464 *
465 * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
466 * Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
467 * 10 bits, and add the carry onto x_0 and x_20. And so on.
468 *
469 * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
470 * The carry into x_37 is at most 2^21; so the carry out into x_38 has
471 * magnitude at most 2^10. In turn, |x_38| <= 2^10 before the carry, so is
472 * now no more than 2^11 in magnitude, and the carry out into x_39 is at
473 * most 1. This leaves |x_39| <= 2^10 + 1 after carry propagation.
474 *
475 * Be careful with the bit hacking because the quantities involved are
476 * signed.
477 */
478
479 /* For each piece, we bias it so that floor division (as done by an
480 * arithmetic right shift) and modulus (as done by bitwise-AND) does the
481 * right thing.
482 */
483 #define CARRY(i, wd, b, m) do { \
484 x[i] += (b); \
485 c = ASR(dblpiece, x[i], (wd)); \
486 x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
487 } while (0)
488
489 { CARRY(37, 11, B10, M11); }
490 { x[38] += c; CARRY(38, 11, B10, M11); }
491 { x[39] += c; CARRY(39, 11, B10, M11); }
492 x[20] += c;
493 for (i = 0; i < 35; ) {
494 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
495 for (j = i + 4; i < j; ) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
496 }
497 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
498 while (i < 39) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
499 x[39] += c;
500 }
501
502 #endif
503
504 /* --- @fgoldi_mulconst@ --- *
505 *
506 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
507 * @const fgoldi *x@ = an operand
508 * @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
509 *
510 * Returns: ---
511 *
512 * Use: Set @z@ to the product %$a x$%.
513 */
514
515 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
516 {
517 unsigned i;
518 dblpiece zz[NPIECE], aa = a;
519
520 for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
521 #if FGOLDI_IMPL == 28
522 CARRY_REDUCE(z->P, zz);
523 #elif FGOLDI_IMPL == 12
524 carry_reduce(zz);
525 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
526 #endif
527 }
528
529 /* --- @fgoldi_mul@ --- *
530 *
531 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
532 * @const fgoldi *x, *y@ = two operands
533 *
534 * Returns: ---
535 *
536 * Use: Set @z@ to the product %$x y$%.
537 */
538
539 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
540 {
541 dblpiece zz[NPIECE], u[NPIECE];
542 piece ab[NPIECE/2], cd[NPIECE/2];
543 const piece
544 *a = x->P + NPIECE/2, *b = x->P,
545 *c = y->P + NPIECE/2, *d = y->P;
546 unsigned i, j;
547
548 #if FGOLDI_IMPL == 28
549
550 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
551
552 #elif FGOLDI_IMPL == 12
553
554 static const unsigned short off[39] = {
555 0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
556 112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
557 224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
558 336, 348, 359, 370, 381, 392, 404, 415, 426
559 };
560
561 #define M(x,i, y,j) \
562 (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
563
564 #endif
565
566 /* Behold the magic.
567 *
568 * Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
569 * (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
570 * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
571 * the prime p so that φ^2 = φ + 1. So
572 *
573 * x y = ((a + b) (c + d) - b d) φ + a c + b d
574 */
575
576 for (i = 0; i < NPIECE; i++) zz[i] = 0;
577
578 /* Our first job will be to calculate (1 - φ) b d, and write the result
579 * into z. As we do this, an interesting thing will happen. Write
580 * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
581 * So, what we do is to write the product end-swapped and negated, and then
582 * we'll subtract the (negated, remember) high half from the low half.
583 */
584 for (i = 0; i < NPIECE/2; i++) {
585 for (j = 0; j < NPIECE/2 - i; j++)
586 zz[i + j + NPIECE/2] -= M(b,i, d,j);
587 for (; j < NPIECE/2; j++)
588 zz[i + j - NPIECE/2] -= M(b,i, d,j);
589 }
590 for (i = 0; i < NPIECE/2; i++)
591 zz[i] -= zz[i + NPIECE/2];
592
593 /* Next, we add on a c. There are no surprises here. */
594 for (i = 0; i < NPIECE/2; i++)
595 for (j = 0; j < NPIECE/2; j++)
596 zz[i + j] += M(a,i, c,j);
597
598 /* Now, calculate a + b and c + d. */
599 for (i = 0; i < NPIECE/2; i++)
600 { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
601
602 /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
603 * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
604 * v φ + (1 + φ) u. We'll store u in a temporary place and add it on
605 * twice.
606 */
607 for (i = 0; i < NPIECE; i++) u[i] = 0;
608 for (i = 0; i < NPIECE/2; i++) {
609 for (j = 0; j < NPIECE/2 - i; j++)
610 zz[i + j + NPIECE/2] += M(ab,i, cd,j);
611 for (; j < NPIECE/2; j++)
612 u[i + j - NPIECE/2] += M(ab,i, cd,j);
613 }
614 for (i = 0; i < NPIECE/2; i++)
615 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
616
617 #undef M
618
619 #if FGOLDI_IMPL == 28
620 /* That wraps it up for the multiplication. Let's figure out some bounds.
621 * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
622 * end up the way they'd be if we'd done the thing the easy way, which
623 * simplifies the analysis. Suppose we started with |x_i|, |y_i| <= 9/5
624 * 2^28. The overheads in the result are given by the coefficients of
625 *
626 * ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
627 *
628 * the greatest of which is 38. So |z_i| <= 38*81/25*2^56 < 2^63.
629 *
630 * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
631 * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
632 */
633 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
634 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
635 #elif FGOLDI_IMPL == 12
636 carry_reduce(zz);
637 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
638 #endif
639 }
640
641 /* --- @fgoldi_sqr@ --- *
642 *
643 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
644 * @const fgoldi *x@ = an operand
645 *
646 * Returns: ---
647 *
648 * Use: Set @z@ to the square %$x^2$%.
649 */
650
651 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
652 {
653 #if FGOLDI_IMPL == 28
654
655 dblpiece zz[NPIECE], u[NPIECE];
656 piece ab[NPIECE];
657 const piece *a = x->P + NPIECE/2, *b = x->P;
658 unsigned i, j;
659
660 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
661
662 /* The magic is basically the same as `fgoldi_mul' above. We write
663 * x = a φ + b and use Karatsuba and the special prime shape. This time,
664 * we have
665 *
666 * x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
667 */
668
669 for (i = 0; i < NPIECE; i++) zz[i] = 0;
670
671 /* Our first job will be to calculate (1 - φ) b^2, and write the result
672 * into z. Again, this interacts pleasantly with the prime shape.
673 */
674 for (i = 0; i < NPIECE/4; i++) {
675 zz[2*i + NPIECE/2] -= M(b,i, b,i);
676 for (j = i + 1; j < NPIECE/2 - i; j++)
677 zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
678 for (; j < NPIECE/2; j++)
679 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
680 }
681 for (; i < NPIECE/2; i++) {
682 zz[2*i - NPIECE/2] -= M(b,i, b,i);
683 for (j = i + 1; j < NPIECE/2; j++)
684 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
685 }
686 for (i = 0; i < NPIECE/2; i++)
687 zz[i] -= zz[i + NPIECE/2];
688
689 /* Next, we add on a^2. There are no surprises here. */
690 for (i = 0; i < NPIECE/2; i++) {
691 zz[2*i] += M(a,i, a,i);
692 for (j = i + 1; j < NPIECE/2; j++)
693 zz[i + j] += 2*M(a,i, a,j);
694 }
695
696 /* Now, calculate a + b. */
697 for (i = 0; i < NPIECE/2; i++)
698 ab[i] = a[i] + b[i];
699
700 /* Finally (for the multiplication) we must add on (a + b)^2 φ.
701 * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
702 * store u in a temporary place and add it on twice.
703 */
704 for (i = 0; i < NPIECE; i++) u[i] = 0;
705 for (i = 0; i < NPIECE/4; i++) {
706 zz[2*i + NPIECE/2] += M(ab,i, ab,i);
707 for (j = i + 1; j < NPIECE/2 - i; j++)
708 zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
709 for (; j < NPIECE/2; j++)
710 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
711 }
712 for (; i < NPIECE/2; i++) {
713 u[2*i - NPIECE/2] += M(ab,i, ab,i);
714 for (j = i + 1; j < NPIECE/2; j++)
715 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
716 }
717 for (i = 0; i < NPIECE/2; i++)
718 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
719
720 #undef M
721
722 /* Finally, carrying. */
723 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
724 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
725
726 #elif FGOLDI_IMPL == 12
727 fgoldi_mul(z, x, x);
728 #endif
729 }
730
731 /*----- More advanced operations ------------------------------------------*/
732
733 /* --- @fgoldi_inv@ --- *
734 *
735 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
736 * @const fgoldi *x@ = an operand
737 *
738 * Returns: ---
739 *
740 * Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
741 * %$x = 0$% then @z@ is set to zero. This is considered a
742 * feature.
743 */
744
745 void fgoldi_inv(fgoldi *z, const fgoldi *x)
746 {
747 fgoldi t, u;
748 unsigned i;
749
750 #define SQRN(z, x, n) do { \
751 fgoldi_sqr((z), (x)); \
752 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
753 } while (0)
754
755 /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
756 * x = 0 as intended. The addition chain is home-made.
757 */ /* step | value */
758 fgoldi_sqr(&u, x); /* 1 | 2 */
759 fgoldi_mul(&t, &u, x); /* 2 | 3 */
760 SQRN(&u, &t, 2); /* 4 | 12 */
761 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
762 SQRN(&u, &t, 4); /* 9 | 240 */
763 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
764 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
765 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
766 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
767 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
768 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
769 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
770 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
771 fgoldi_mul(&t, &t, x); /* 43 | 2^37 - 1 */
772 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
773 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
774 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
775 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
776 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
777 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
778 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
779 fgoldi_mul(&u, &u, x); /* 233 | 2^223 - 1 */
780 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
781 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
782 SQRN(&t, &t, 2); /* 459 | 2^448 - 2^224 - 4 */
783 fgoldi_mul(z, &t, x); /* 460 | 2^448 - 2^224 - 3 */
784
785 #undef SQRN
786 }
787
788 /*----- Test rig ----------------------------------------------------------*/
789
790 #ifdef TEST_RIG
791
792 #include <mLib/report.h>
793 #include <mLib/str.h>
794 #include <mLib/testrig.h>
795
796 static void fixdstr(dstr *d)
797 {
798 if (d->len > 56)
799 die(1, "invalid length for fgoldi");
800 else if (d->len < 56) {
801 dstr_ensure(d, 56);
802 memset(d->buf + d->len, 0, 56 - d->len);
803 d->len = 56;
804 }
805 }
806
807 static void cvt_fgoldi(const char *buf, dstr *d)
808 {
809 dstr dd = DSTR_INIT;
810
811 type_hex.cvt(buf, &dd); fixdstr(&dd);
812 dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
813 fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
814 dstr_destroy(&dd);
815 }
816
817 static void dump_fgoldi(dstr *d, FILE *fp)
818 { fdump(stderr, "???", (const piece *)d->buf); }
819
820 static void cvt_fgoldi_ref(const char *buf, dstr *d)
821 { type_hex.cvt(buf, d); fixdstr(d); }
822
823 static void dump_fgoldi_ref(dstr *d, FILE *fp)
824 {
825 fgoldi x;
826
827 fgoldi_load(&x, (const octet *)d->buf);
828 fdump(stderr, "???", x.P);
829 }
830
831 static int eq(const fgoldi *x, dstr *d)
832 { octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
833
834 static const test_type
835 type_fgoldi = { cvt_fgoldi, dump_fgoldi },
836 type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
837
838 #define TEST_UNOP(op) \
839 static int vrf_##op(dstr dv[]) \
840 { \
841 fgoldi *x = (fgoldi *)dv[0].buf; \
842 fgoldi z, zz; \
843 int ok = 1; \
844 \
845 fgoldi_##op(&z, x); \
846 if (!eq(&z, &dv[1])) { \
847 ok = 0; \
848 fprintf(stderr, "failed!\n"); \
849 fdump(stderr, "x", x->P); \
850 fdump(stderr, "calc", z.P); \
851 fgoldi_load(&zz, (const octet *)dv[1].buf); \
852 fdump(stderr, "z", zz.P); \
853 } \
854 \
855 return (ok); \
856 }
857
858 TEST_UNOP(sqr)
859 TEST_UNOP(inv)
860
861 #define TEST_BINOP(op) \
862 static int vrf_##op(dstr dv[]) \
863 { \
864 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf; \
865 fgoldi z, zz; \
866 int ok = 1; \
867 \
868 fgoldi_##op(&z, x, y); \
869 if (!eq(&z, &dv[2])) { \
870 ok = 0; \
871 fprintf(stderr, "failed!\n"); \
872 fdump(stderr, "x", x->P); \
873 fdump(stderr, "y", y->P); \
874 fdump(stderr, "calc", z.P); \
875 fgoldi_load(&zz, (const octet *)dv[2].buf); \
876 fdump(stderr, "z", zz.P); \
877 } \
878 \
879 return (ok); \
880 }
881
882 TEST_BINOP(add)
883 TEST_BINOP(sub)
884 TEST_BINOP(mul)
885
886 static int vrf_mulc(dstr dv[])
887 {
888 fgoldi *x = (fgoldi *)dv[0].buf;
889 long a = *(const long *)dv[1].buf;
890 fgoldi z, zz;
891 int ok = 1;
892
893 fgoldi_mulconst(&z, x, a);
894 if (!eq(&z, &dv[2])) {
895 ok = 0;
896 fprintf(stderr, "failed!\n");
897 fdump(stderr, "x", x->P);
898 fprintf(stderr, "a = %ld\n", a);
899 fdump(stderr, "calc", z.P);
900 fgoldi_load(&zz, (const octet *)dv[2].buf);
901 fdump(stderr, "z", zz.P);
902 }
903
904 return (ok);
905 }
906
907 static int vrf_condswap(dstr dv[])
908 {
909 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
910 fgoldi xx = *x, yy = *y;
911 uint32 m = *(uint32 *)dv[2].buf;
912 int ok = 1;
913
914 fgoldi_condswap(&xx, &yy, m);
915 if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
916 ok = 0;
917 fprintf(stderr, "failed!\n");
918 fdump(stderr, "x", x->P);
919 fdump(stderr, "y", y->P);
920 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
921 fdump(stderr, "calc xx", xx.P);
922 fdump(stderr, "calc yy", yy.P);
923 fgoldi_load(&xx, (const octet *)dv[3].buf);
924 fgoldi_load(&yy, (const octet *)dv[4].buf);
925 fdump(stderr, "want xx", xx.P);
926 fdump(stderr, "want yy", yy.P);
927 }
928
929 return (ok);
930 }
931
932 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
933 {
934 fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
935 *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
936 *y = (fgoldi *)dv[5].buf;
937 long a = *(const long *)dv[2].buf;
938 fgoldi umv, aumv, wpaumv, xmy, z, zz;
939 int ok = 1;
940
941 fgoldi_sub(&umv, u, v);
942 fgoldi_mulconst(&aumv, &umv, a);
943 fgoldi_add(&wpaumv, w, &aumv);
944 fgoldi_sub(&xmy, x, y);
945 fgoldi_mul(&z, &wpaumv, &xmy);
946
947 if (!eq(&z, &dv[6])) {
948 ok = 0;
949 fprintf(stderr, "failed!\n");
950 fdump(stderr, "u", u->P);
951 fdump(stderr, "v", v->P);
952 fdump(stderr, "u - v", umv.P);
953 fprintf(stderr, "a = %ld\n", a);
954 fdump(stderr, "a (u - v)", aumv.P);
955 fdump(stderr, "w + a (u - v)", wpaumv.P);
956 fdump(stderr, "x", x->P);
957 fdump(stderr, "y", y->P);
958 fdump(stderr, "x - y", xmy.P);
959 fdump(stderr, "(x - y) (w + a (u - v))", z.P);
960 fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
961 }
962
963 return (ok);
964 }
965
966 static test_chunk tests[] = {
967 { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
968 { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
969 { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
970 { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
971 { "condswap", vrf_condswap,
972 { &type_fgoldi, &type_fgoldi, &type_uint32,
973 &type_fgoldi_ref, &type_fgoldi_ref } },
974 { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
975 { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
976 { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
977 { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
978 &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
979 { 0, 0, { 0 } }
980 };
981
982 int main(int argc, char *argv[])
983 {
984 test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
985 return (0);
986 }
987
988 #endif
989
990 /*----- That's all, folks -------------------------------------------------*/