Import implementations of X25519 and X448 from Catacomb.
[secnet] / fgoldi.c
1 /* -*-c-*-
2 *
3 * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8 /*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 * MA 02111-1307, USA.
26 */
27
28 /*----- Header files ------------------------------------------------------*/
29
30 #include "config.h"
31
32 #include "fgoldi.h"
33
34 /*----- Basic setup -------------------------------------------------------*
35 *
36 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
37 * (hence the name).
38 */
39
40 #if FGOLDI_IMPL == 28
41 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
42 * x = SUM_{0<=i<16} x_i 2^(28i).
43 */
44
45 typedef int32 piece; typedef int64 dblpiece;
46 typedef uint32 upiece; typedef uint64 udblpiece;
47 #define PIECEWD(i) 28
48 #define NPIECE 16
49 #define P p28
50
51 #define B28 0x10000000u
52 #define B27 0x08000000u
53 #define M28 0x0fffffffu
54 #define M27 0x07ffffffu
55 #define M32 0xffffffffu
56
57 #elif FGOLDI_IMPL == 12
58 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
59 * SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
60 * bits wide; the others are 11 bits wide, so they form eight groups of 56
61 * bits.
62 */
63
64 typedef int16 piece; typedef int32 dblpiece;
65 typedef uint16 upiece; typedef uint32 udblpiece;
66 #define PIECEWD(i) ((i)%5 ? 11 : 12)
67 #define NPIECE 40
68 #define P p12
69
70 #define B12 0x1000u
71 #define B11 0x0800u
72 #define B10 0x0400u
73 #define M12 0xfffu
74 #define M11 0x7ffu
75 #define M10 0x3ffu
76 #define M8 0xffu
77
78 #endif
79
80 /*----- Debugging machinery -----------------------------------------------*/
81
82 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
83
84 #include <stdio.h>
85
86 #include "mp.h"
87 #include "mptext.h"
88
89 static mp *get_pgoldi(void)
90 {
91 mp *p = MP_NEW, *t = MP_NEW;
92
93 p = mp_setbit(p, MP_ZERO, 448);
94 t = mp_setbit(t, MP_ZERO, 224);
95 p = mp_sub(p, p, t);
96 p = mp_sub(p, p, MP_ONE);
97 mp_drop(t);
98 return (p);
99 }
100
101 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
102
103 #endif
104
105 /*----- Loading and storing -----------------------------------------------*/
106
107 /* --- @fgoldi_load@ --- *
108 *
109 * Arguments: @fgoldi *z@ = where to store the result
110 * @const octet xv[56]@ = source to read
111 *
112 * Returns: ---
113 *
114 * Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
115 * external representation from @xv@ and stores it in @z@.
116 *
117 * External representation is little-endian base-256. Some
118 * elements have multiple encodings, which are not produced by
119 * correct software; use of noncanonical encodings is not an
120 * error, and toleration of them is considered a performance
121 * feature.
122 */
123
124 void fgoldi_load(fgoldi *z, const octet xv[56])
125 {
126 #if FGOLDI_IMPL == 28
127
128 unsigned i;
129 uint32 xw[14];
130 piece b, c;
131
132 /* First, read the input value as words. */
133 for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
134
135 /* Extract unsigned 28-bit pieces from the words. */
136 z->P[ 0] = (xw[ 0] >> 0)&M28;
137 z->P[ 7] = (xw[ 6] >> 4)&M28;
138 z->P[ 8] = (xw[ 7] >> 0)&M28;
139 z->P[15] = (xw[13] >> 4)&M28;
140 for (i = 1; i < 7; i++) {
141 z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
142 z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
143 }
144
145 /* Convert the nonnegative pieces into a balanced signed representation, so
146 * each piece ends up in the interval |z_i| <= 2^27. For each piece, if
147 * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
148 * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
149 * φ^2 = φ + 1. We delay this carry until after all of the pieces have
150 * been balanced. If we don't do this, then we have to do a more expensive
151 * test for nonzeroness to decide whether to lend a bit leftwards rather
152 * than just testing a single bit.
153 *
154 * Note that we don't try for a canonical representation here: both upper
155 * and lower bounds are achievable.
156 */
157 b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
158 for (i = NPIECE - 1; i--; )
159 { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
160 z->P[0] += c; z->P[8] += c;
161
162 #elif FGOLDI_IMPL == 12
163
164 unsigned i, j, n, w, b;
165 uint32 a;
166 int c;
167
168 /* First, convert the bytes into nonnegative pieces. */
169 for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
170 a |= (uint32)xv[i] << n; n += 8;
171 if (n >= w) {
172 z->P[j++] = a&MASK(w);
173 a >>= w; n -= w; w = PIECEWD(j);
174 }
175 }
176
177 /* Convert the nonnegative pieces into a balanced signed representation, so
178 * each piece ends up in the interval |z_i| <= 2^11 + 1.
179 */
180 b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
181 for (i = NPIECE - 1; i--; ) {
182 w = PIECEWD(i) - 1;
183 b = z->P[i]&BIT(w);
184 z->P[i] -= b << 1;
185 z->P[i + 1] += b >> w;
186 }
187 z->P[0] += c; z->P[20] += c;
188
189 #endif
190 }
191
192 /* --- @fgoldi_store@ --- *
193 *
194 * Arguments: @octet zv[56]@ = where to write the result
195 * @const fgoldi *x@ = the field element to write
196 *
197 * Returns: ---
198 *
199 * Use: Stores a field element in the given octet vector in external
200 * representation. A canonical encoding is always stored.
201 */
202
203 void fgoldi_store(octet zv[56], const fgoldi *x)
204 {
205 #if FGOLDI_IMPL == 28
206
207 piece y[NPIECE], yy[NPIECE], c, d;
208 uint32 u, v;
209 mask32 m;
210 unsigned i;
211
212 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
213
214 /* First, propagate the carries. By the end of this, we'll have all of the
215 * the pieces canonically sized and positive, and maybe there'll be
216 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
217 * value will be in the half-open interval [0, φ^2). The whole represented
218 * value is then y + φ^2 c.
219 *
220 * Assume that we start out with |y_i| <= 2^30. We start off by cutting
221 * off and reducing the carry c_15 from the topmost piece, y_15. This
222 * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4. We'll add this
223 * onto y_0 and y_8, and propagate the carries. It's very clear that we'll
224 * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
225 *
226 * Here, the y_i are signed, so we must be cautious about bithacking them.
227 */
228 c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
229 for (i = 0; i < NPIECE; i++)
230 { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
231
232 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
233 * y >= p, then we should subtract p from the whole value; if c = -1 then
234 * we should add p; and otherwise we should do nothing.
235 *
236 * But conditional behaviour is bad, m'kay. So here's what we do instead.
237 *
238 * The first job is to sort out what we wanted to do. If c = -1 then we
239 * want to (a) invert the constant addend and (b) feed in a carry-in;
240 * otherwise, we don't.
241 */
242 m = SIGN(c)&M28;
243 d = m&1;
244
245 /* Now do the addition/subtraction. Remember that all of the y_i are
246 * nonnegative, so shifting and masking are safe and easy.
247 */
248 d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
249 for (i = 1; i < 8; i++)
250 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
251 d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
252 for (i = 9; i < 16; i++)
253 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
254
255 /* The final carry-out is in d; since we only did addition, and the y_i are
256 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
257 * if (a) c /= 0 (in which case we know that the old value was
258 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
259 * the subtraction didn't cause a borrow, so we must be in the case where
260 * p <= y < φ^2.
261 */
262 m = NONZEROP(c) | ~NONZEROP(d - 1);
263 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
264
265 /* Extract 32-bit words from the value. */
266 for (i = 0; i < 7; i++) {
267 u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
268 v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
269 STORE32_L(zv + 4*i, u);
270 STORE32_L(zv + 4*i + 28, v);
271 }
272
273 #elif FGOLDI_IMPL == 12
274
275 piece y[NPIECE], yy[NPIECE], c, d;
276 uint32 a;
277 mask32 m, mm;
278 unsigned i, j, n, w;
279
280 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
281
282 /* First, propagate the carries. By the end of this, we'll have all of the
283 * the pieces canonically sized and positive, and maybe there'll be
284 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
285 * value will be in the half-open interval [0, φ^2). The whole represented
286 * value is then y + φ^2 c.
287 *
288 * Assume that we start out with |y_i| <= 2^14. We start off by cutting
289 * off and reducing the carry c_39 from the topmost piece, y_39. This
290 * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16. We'll add this
291 * onto y_0 and y_20, and propagate the carries. It's very clear that
292 * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
293 *
294 * Here, the y_i are signed, so we must be cautious about bithacking them.
295 */
296 c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
297 for (i = 0; i < NPIECE; i++) {
298 w = PIECEWD(i); m = (1 << w) - 1;
299 y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
300 }
301
302 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
303 * y >= p, then we should subtract p from the whole value; if c = -1 then
304 * we should add p; and otherwise we should do nothing.
305 *
306 * But conditional behaviour is bad, m'kay. So here's what we do instead.
307 *
308 * The first job is to sort out what we wanted to do. If c = -1 then we
309 * want to (a) invert the constant addend and (b) feed in a carry-in;
310 * otherwise, we don't.
311 */
312 mm = SIGN(c);
313 d = m&1;
314
315 /* Now do the addition/subtraction. Remember that all of the y_i are
316 * nonnegative, so shifting and masking are safe and easy.
317 */
318 d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
319 for (i = 1; i < 20; i++) {
320 w = PIECEWD(i); m = MASK(w);
321 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
322 }
323 d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
324 for (i = 21; i < 40; i++) {
325 w = PIECEWD(i); m = MASK(w);
326 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
327 }
328
329 /* The final carry-out is in d; since we only did addition, and the y_i are
330 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
331 * if (a) c /= 0 (in which case we know that the old value was
332 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
333 * the subtraction didn't cause a borrow, so we must be in the case where
334 * p <= y < φ^2.
335 */
336 m = NONZEROP(c) | ~NONZEROP(d - 1);
337 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
338
339 /* Convert that back into octets. */
340 for (i = j = a = n = 0; i < NPIECE; i++) {
341 a |= (uint32)y[i] << n; n += PIECEWD(i);
342 while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
343 }
344
345 #endif
346 }
347
348 /* --- @fgoldi_set@ --- *
349 *
350 * Arguments: @fgoldi *z@ = where to write the result
351 * @int a@ = a small-ish constant
352 *
353 * Returns: ---
354 *
355 * Use: Sets @z@ to equal @a@.
356 */
357
358 void fgoldi_set(fgoldi *x, int a)
359 {
360 unsigned i;
361
362 x->P[0] = a;
363 for (i = 1; i < NPIECE; i++) x->P[i] = 0;
364 }
365
366 /*----- Basic arithmetic --------------------------------------------------*/
367
368 /* --- @fgoldi_add@ --- *
369 *
370 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
371 * @const fgoldi *x, *y@ = two operands
372 *
373 * Returns: ---
374 *
375 * Use: Set @z@ to the sum %$x + y$%.
376 */
377
378 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
379 {
380 unsigned i;
381 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
382 }
383
384 /* --- @fgoldi_sub@ --- *
385 *
386 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
387 * @const fgoldi *x, *y@ = two operands
388 *
389 * Returns: ---
390 *
391 * Use: Set @z@ to the difference %$x - y$%.
392 */
393
394 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
395 {
396 unsigned i;
397 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
398 }
399
400 /*----- Constant-time utilities -------------------------------------------*/
401
402 /* --- @fgoldi_condswap@ --- *
403 *
404 * Arguments: @fgoldi *x, *y@ = two operands
405 * @uint32 m@ = a mask
406 *
407 * Returns: ---
408 *
409 * Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
410 * exchange @x@ and @y@. If @m@ has some other value, then
411 * scramble @x@ and @y@ in an unhelpful way.
412 */
413
414 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
415 {
416 unsigned i;
417 mask32 mm = FIX_MASK32(m);
418
419 for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
420 }
421
422 /*----- Multiplication ----------------------------------------------------*/
423
424 #if FGOLDI_IMPL == 28
425
426 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
427 * represented in a double-precision piece. On entry, it must be the case
428 * that |X_i| <= M <= B - 2^27 for some M. If this is the case, then, on
429 * exit, we will have |Z_i| <= 2^27 + M/2^27.
430 */
431 #define CARRY_REDUCE(z, x) do { \
432 dblpiece _t[NPIECE], _c; \
433 unsigned _i; \
434 \
435 /* Bias the input pieces. This keeps the carries and so on centred \
436 * around zero rather than biased positive. \
437 */ \
438 for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
439 \
440 /* Calculate the reduced pieces. Careful with the bithacking. */ \
441 _c = ASR(dblpiece, _t[15], 28); \
442 (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
443 for (_i = 1; _i < NPIECE; _i++) { \
444 (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
445 ASR(dblpiece, _t[_i - 1], 28); \
446 } \
447 (z)[8] += _c; \
448 } while (0)
449
450 #elif FGOLDI_IMPL == 12
451
452 static void carry_reduce(dblpiece x[NPIECE])
453 {
454 /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
455
456 unsigned i, j;
457 dblpiece c;
458
459 /* The result is nearly canonical, because we do sequential carry
460 * propagation, because smaller processors are more likely to prefer the
461 * smaller working set than the instruction-level parallelism.
462 *
463 * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
464 * Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
465 * 10 bits, and add the carry onto x_0 and x_20. And so on.
466 *
467 * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
468 * The carry into x_37 is at most 2^21; so the carry out into x_38 has
469 * magnitude at most 2^10. In turn, |x_38| <= 2^10 before the carry, so is
470 * now no more than 2^11 in magnitude, and the carry out into x_39 is at
471 * most 1. This leaves |x_39| <= 2^10 + 1 after carry propagation.
472 *
473 * Be careful with the bit hacking because the quantities involved are
474 * signed.
475 */
476
477 /* For each piece, we bias it so that floor division (as done by an
478 * arithmetic right shift) and modulus (as done by bitwise-AND) does the
479 * right thing.
480 */
481 #define CARRY(i, wd, b, m) do { \
482 x[i] += (b); \
483 c = ASR(dblpiece, x[i], (wd)); \
484 x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
485 } while (0)
486
487 { CARRY(37, 11, B10, M11); }
488 { x[38] += c; CARRY(38, 11, B10, M11); }
489 { x[39] += c; CARRY(39, 11, B10, M11); }
490 x[20] += c;
491 for (i = 0; i < 35; ) {
492 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
493 for (j = i + 4; i < j; ) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
494 }
495 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
496 while (i < 39) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
497 x[39] += c;
498 }
499
500 #endif
501
502 /* --- @fgoldi_mulconst@ --- *
503 *
504 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
505 * @const fgoldi *x@ = an operand
506 * @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
507 *
508 * Returns: ---
509 *
510 * Use: Set @z@ to the product %$a x$%.
511 */
512
513 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
514 {
515 unsigned i;
516 dblpiece zz[NPIECE], aa = a;
517
518 for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
519 #if FGOLDI_IMPL == 28
520 CARRY_REDUCE(z->P, zz);
521 #elif FGOLDI_IMPL == 12
522 carry_reduce(zz);
523 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
524 #endif
525 }
526
527 /* --- @fgoldi_mul@ --- *
528 *
529 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
530 * @const fgoldi *x, *y@ = two operands
531 *
532 * Returns: ---
533 *
534 * Use: Set @z@ to the product %$x y$%.
535 */
536
537 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
538 {
539 dblpiece zz[NPIECE], u[NPIECE];
540 piece ab[NPIECE/2], cd[NPIECE/2];
541 const piece
542 *a = x->P + NPIECE/2, *b = x->P,
543 *c = y->P + NPIECE/2, *d = y->P;
544 unsigned i, j;
545
546 #if FGOLDI_IMPL == 28
547
548 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
549
550 #elif FGOLDI_IMPL == 12
551
552 static const unsigned short off[39] = {
553 0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
554 112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
555 224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
556 336, 348, 359, 370, 381, 392, 404, 415, 426
557 };
558
559 #define M(x,i, y,j) \
560 (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
561
562 #endif
563
564 /* Behold the magic.
565 *
566 * Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
567 * (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
568 * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
569 * the prime p so that φ^2 = φ + 1. So
570 *
571 * x y = ((a + b) (c + d) - b d) φ + a c + b d
572 */
573
574 for (i = 0; i < NPIECE; i++) zz[i] = 0;
575
576 /* Our first job will be to calculate (1 - φ) b d, and write the result
577 * into z. As we do this, an interesting thing will happen. Write
578 * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
579 * So, what we do is to write the product end-swapped and negated, and then
580 * we'll subtract the (negated, remember) high half from the low half.
581 */
582 for (i = 0; i < NPIECE/2; i++) {
583 for (j = 0; j < NPIECE/2 - i; j++)
584 zz[i + j + NPIECE/2] -= M(b,i, d,j);
585 for (; j < NPIECE/2; j++)
586 zz[i + j - NPIECE/2] -= M(b,i, d,j);
587 }
588 for (i = 0; i < NPIECE/2; i++)
589 zz[i] -= zz[i + NPIECE/2];
590
591 /* Next, we add on a c. There are no surprises here. */
592 for (i = 0; i < NPIECE/2; i++)
593 for (j = 0; j < NPIECE/2; j++)
594 zz[i + j] += M(a,i, c,j);
595
596 /* Now, calculate a + b and c + d. */
597 for (i = 0; i < NPIECE/2; i++)
598 { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
599
600 /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
601 * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
602 * v φ + (1 + φ) u. We'll store u in a temporary place and add it on
603 * twice.
604 */
605 for (i = 0; i < NPIECE; i++) u[i] = 0;
606 for (i = 0; i < NPIECE/2; i++) {
607 for (j = 0; j < NPIECE/2 - i; j++)
608 zz[i + j + NPIECE/2] += M(ab,i, cd,j);
609 for (; j < NPIECE/2; j++)
610 u[i + j - NPIECE/2] += M(ab,i, cd,j);
611 }
612 for (i = 0; i < NPIECE/2; i++)
613 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
614
615 #undef M
616
617 #if FGOLDI_IMPL == 28
618 /* That wraps it up for the multiplication. Let's figure out some bounds.
619 * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
620 * end up the way they'd be if we'd done the thing the easy way, which
621 * simplifies the analysis. Suppose we started with |x_i|, |y_i| <= 9/5
622 * 2^28. The overheads in the result are given by the coefficients of
623 *
624 * ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
625 *
626 * the greatest of which is 38. So |z_i| <= 38*81/25*2^56 < 2^63.
627 *
628 * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
629 * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
630 */
631 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
632 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
633 #elif FGOLDI_IMPL == 12
634 carry_reduce(zz);
635 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
636 #endif
637 }
638
639 /* --- @fgoldi_sqr@ --- *
640 *
641 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
642 * @const fgoldi *x@ = an operand
643 *
644 * Returns: ---
645 *
646 * Use: Set @z@ to the square %$x^2$%.
647 */
648
649 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
650 {
651 #if FGOLDI_IMPL == 28
652
653 dblpiece zz[NPIECE], u[NPIECE];
654 piece ab[NPIECE];
655 const piece *a = x->P + NPIECE/2, *b = x->P;
656 unsigned i, j;
657
658 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
659
660 /* The magic is basically the same as `fgoldi_mul' above. We write
661 * x = a φ + b and use Karatsuba and the special prime shape. This time,
662 * we have
663 *
664 * x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
665 */
666
667 for (i = 0; i < NPIECE; i++) zz[i] = 0;
668
669 /* Our first job will be to calculate (1 - φ) b^2, and write the result
670 * into z. Again, this interacts pleasantly with the prime shape.
671 */
672 for (i = 0; i < NPIECE/4; i++) {
673 zz[2*i + NPIECE/2] -= M(b,i, b,i);
674 for (j = i + 1; j < NPIECE/2 - i; j++)
675 zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
676 for (; j < NPIECE/2; j++)
677 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
678 }
679 for (; i < NPIECE/2; i++) {
680 zz[2*i - NPIECE/2] -= M(b,i, b,i);
681 for (j = i + 1; j < NPIECE/2; j++)
682 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
683 }
684 for (i = 0; i < NPIECE/2; i++)
685 zz[i] -= zz[i + NPIECE/2];
686
687 /* Next, we add on a^2. There are no surprises here. */
688 for (i = 0; i < NPIECE/2; i++) {
689 zz[2*i] += M(a,i, a,i);
690 for (j = i + 1; j < NPIECE/2; j++)
691 zz[i + j] += 2*M(a,i, a,j);
692 }
693
694 /* Now, calculate a + b. */
695 for (i = 0; i < NPIECE/2; i++)
696 ab[i] = a[i] + b[i];
697
698 /* Finally (for the multiplication) we must add on (a + b)^2 φ.
699 * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
700 * store u in a temporary place and add it on twice.
701 */
702 for (i = 0; i < NPIECE; i++) u[i] = 0;
703 for (i = 0; i < NPIECE/4; i++) {
704 zz[2*i + NPIECE/2] += M(ab,i, ab,i);
705 for (j = i + 1; j < NPIECE/2 - i; j++)
706 zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
707 for (; j < NPIECE/2; j++)
708 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
709 }
710 for (; i < NPIECE/2; i++) {
711 u[2*i - NPIECE/2] += M(ab,i, ab,i);
712 for (j = i + 1; j < NPIECE/2; j++)
713 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
714 }
715 for (i = 0; i < NPIECE/2; i++)
716 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
717
718 #undef M
719
720 /* Finally, carrying. */
721 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
722 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
723
724 #elif FGOLDI_IMPL == 12
725 fgoldi_mul(z, x, x);
726 #endif
727 }
728
729 /*----- More advanced operations ------------------------------------------*/
730
731 /* --- @fgoldi_inv@ --- *
732 *
733 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
734 * @const fgoldi *x@ = an operand
735 *
736 * Returns: ---
737 *
738 * Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
739 * %$x = 0$% then @z@ is set to zero. This is considered a
740 * feature.
741 */
742
743 void fgoldi_inv(fgoldi *z, const fgoldi *x)
744 {
745 fgoldi t, u;
746 unsigned i;
747
748 #define SQRN(z, x, n) do { \
749 fgoldi_sqr((z), (x)); \
750 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
751 } while (0)
752
753 /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
754 * x = 0 as intended. The addition chain is home-made.
755 */ /* step | value */
756 fgoldi_sqr(&u, x); /* 1 | 2 */
757 fgoldi_mul(&t, &u, x); /* 2 | 3 */
758 SQRN(&u, &t, 2); /* 4 | 12 */
759 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
760 SQRN(&u, &t, 4); /* 9 | 240 */
761 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
762 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
763 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
764 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
765 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
766 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
767 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
768 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
769 fgoldi_mul(&t, &t, x); /* 43 | 2^37 - 1 */
770 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
771 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
772 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
773 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
774 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
775 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
776 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
777 fgoldi_mul(&u, &u, x); /* 233 | 2^223 - 1 */
778 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
779 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
780 SQRN(&t, &t, 2); /* 459 | 2^448 - 2^224 - 4 */
781 fgoldi_mul(z, &t, x); /* 460 | 2^448 - 2^224 - 3 */
782
783 #undef SQRN
784 }
785
786 /*----- Test rig ----------------------------------------------------------*/
787
788 #ifdef TEST_RIG
789
790 #include <mLib/report.h>
791 #include <mLib/str.h>
792 #include <mLib/testrig.h>
793
794 static void fixdstr(dstr *d)
795 {
796 if (d->len > 56)
797 die(1, "invalid length for fgoldi");
798 else if (d->len < 56) {
799 dstr_ensure(d, 56);
800 memset(d->buf + d->len, 0, 56 - d->len);
801 d->len = 56;
802 }
803 }
804
805 static void cvt_fgoldi(const char *buf, dstr *d)
806 {
807 dstr dd = DSTR_INIT;
808
809 type_hex.cvt(buf, &dd); fixdstr(&dd);
810 dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
811 fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
812 dstr_destroy(&dd);
813 }
814
815 static void dump_fgoldi(dstr *d, FILE *fp)
816 { fdump(stderr, "???", (const piece *)d->buf); }
817
818 static void cvt_fgoldi_ref(const char *buf, dstr *d)
819 { type_hex.cvt(buf, d); fixdstr(d); }
820
821 static void dump_fgoldi_ref(dstr *d, FILE *fp)
822 {
823 fgoldi x;
824
825 fgoldi_load(&x, (const octet *)d->buf);
826 fdump(stderr, "???", x.P);
827 }
828
829 static int eq(const fgoldi *x, dstr *d)
830 { octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
831
832 static const test_type
833 type_fgoldi = { cvt_fgoldi, dump_fgoldi },
834 type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
835
836 #define TEST_UNOP(op) \
837 static int vrf_##op(dstr dv[]) \
838 { \
839 fgoldi *x = (fgoldi *)dv[0].buf; \
840 fgoldi z, zz; \
841 int ok = 1; \
842 \
843 fgoldi_##op(&z, x); \
844 if (!eq(&z, &dv[1])) { \
845 ok = 0; \
846 fprintf(stderr, "failed!\n"); \
847 fdump(stderr, "x", x->P); \
848 fdump(stderr, "calc", z.P); \
849 fgoldi_load(&zz, (const octet *)dv[1].buf); \
850 fdump(stderr, "z", zz.P); \
851 } \
852 \
853 return (ok); \
854 }
855
856 TEST_UNOP(sqr)
857 TEST_UNOP(inv)
858
859 #define TEST_BINOP(op) \
860 static int vrf_##op(dstr dv[]) \
861 { \
862 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf; \
863 fgoldi z, zz; \
864 int ok = 1; \
865 \
866 fgoldi_##op(&z, x, y); \
867 if (!eq(&z, &dv[2])) { \
868 ok = 0; \
869 fprintf(stderr, "failed!\n"); \
870 fdump(stderr, "x", x->P); \
871 fdump(stderr, "y", y->P); \
872 fdump(stderr, "calc", z.P); \
873 fgoldi_load(&zz, (const octet *)dv[2].buf); \
874 fdump(stderr, "z", zz.P); \
875 } \
876 \
877 return (ok); \
878 }
879
880 TEST_BINOP(add)
881 TEST_BINOP(sub)
882 TEST_BINOP(mul)
883
884 static int vrf_mulc(dstr dv[])
885 {
886 fgoldi *x = (fgoldi *)dv[0].buf;
887 long a = *(const long *)dv[1].buf;
888 fgoldi z, zz;
889 int ok = 1;
890
891 fgoldi_mulconst(&z, x, a);
892 if (!eq(&z, &dv[2])) {
893 ok = 0;
894 fprintf(stderr, "failed!\n");
895 fdump(stderr, "x", x->P);
896 fprintf(stderr, "a = %ld\n", a);
897 fdump(stderr, "calc", z.P);
898 fgoldi_load(&zz, (const octet *)dv[2].buf);
899 fdump(stderr, "z", zz.P);
900 }
901
902 return (ok);
903 }
904
905 static int vrf_condswap(dstr dv[])
906 {
907 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
908 fgoldi xx = *x, yy = *y;
909 uint32 m = *(uint32 *)dv[2].buf;
910 int ok = 1;
911
912 fgoldi_condswap(&xx, &yy, m);
913 if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
914 ok = 0;
915 fprintf(stderr, "failed!\n");
916 fdump(stderr, "x", x->P);
917 fdump(stderr, "y", y->P);
918 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
919 fdump(stderr, "calc xx", xx.P);
920 fdump(stderr, "calc yy", yy.P);
921 fgoldi_load(&xx, (const octet *)dv[3].buf);
922 fgoldi_load(&yy, (const octet *)dv[4].buf);
923 fdump(stderr, "want xx", xx.P);
924 fdump(stderr, "want yy", yy.P);
925 }
926
927 return (ok);
928 }
929
930 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
931 {
932 fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
933 *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
934 *y = (fgoldi *)dv[5].buf;
935 long a = *(const long *)dv[2].buf;
936 fgoldi umv, aumv, wpaumv, xmy, z, zz;
937 int ok = 1;
938
939 fgoldi_sub(&umv, u, v);
940 fgoldi_mulconst(&aumv, &umv, a);
941 fgoldi_add(&wpaumv, w, &aumv);
942 fgoldi_sub(&xmy, x, y);
943 fgoldi_mul(&z, &wpaumv, &xmy);
944
945 if (!eq(&z, &dv[6])) {
946 ok = 0;
947 fprintf(stderr, "failed!\n");
948 fdump(stderr, "u", u->P);
949 fdump(stderr, "v", v->P);
950 fdump(stderr, "u - v", umv.P);
951 fprintf(stderr, "a = %ld\n", a);
952 fdump(stderr, "a (u - v)", aumv.P);
953 fdump(stderr, "w + a (u - v)", wpaumv.P);
954 fdump(stderr, "x", x->P);
955 fdump(stderr, "y", y->P);
956 fdump(stderr, "x - y", xmy.P);
957 fdump(stderr, "(x - y) (w + a (u - v))", z.P);
958 fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
959 }
960
961 return (ok);
962 }
963
964 static test_chunk tests[] = {
965 { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
966 { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
967 { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
968 { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
969 { "condswap", vrf_condswap,
970 { &type_fgoldi, &type_fgoldi, &type_uint32,
971 &type_fgoldi_ref, &type_fgoldi_ref } },
972 { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
973 { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
974 { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
975 { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
976 &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
977 { 0, 0, { 0 } }
978 };
979
980 int main(int argc, char *argv[])
981 {
982 test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
983 return (0);
984 }
985
986 #endif
987
988 /*----- That's all, folks -------------------------------------------------*/