math/Makefile.am, symm/Makefile.am: Use `--no-install' on oddball tests.
[catacomb] / math / fgoldi.c
1 /* -*-c-*-
2 *
3 * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8 /*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 * MA 02111-1307, USA.
26 */
27
28 /*----- Header files ------------------------------------------------------*/
29
30 #include "config.h"
31
32 #include "ct.h"
33 #include "fgoldi.h"
34
35 /*----- Basic setup -------------------------------------------------------*
36 *
37 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
38 * (hence the name).
39 */
40
41 typedef fgoldi_piece piece;
42
43 #if FGOLDI_IMPL == 28
44 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
45 * x = SUM_{0<=i<16} x_i 2^(28i).
46 */
47
48 typedef int64 dblpiece;
49 typedef uint32 upiece; typedef uint64 udblpiece;
50 #define PIECEWD(i) 28
51 #define NPIECE 16
52 #define P p28
53
54 #define B28 0x10000000u
55 #define B27 0x08000000u
56 #define M28 0x0fffffffu
57 #define M27 0x07ffffffu
58 #define M32 0xffffffffu
59
60 #elif FGOLDI_IMPL == 12
61 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
62 * SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
63 * bits wide; the others are 11 bits wide, so they form eight groups of 56
64 * bits.
65 */
66
67 typedef int32 dblpiece;
68 typedef uint16 upiece; typedef uint32 udblpiece;
69 #define PIECEWD(i) ((i)%5 ? 11 : 12)
70 #define NPIECE 40
71 #define P p12
72
73 #define B12 0x1000u
74 #define B11 0x0800u
75 #define B10 0x0400u
76 #define M12 0xfffu
77 #define M11 0x7ffu
78 #define M10 0x3ffu
79 #define M8 0xffu
80
81 #endif
82
83 /*----- Debugging machinery -----------------------------------------------*/
84
85 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
86
87 #include <stdio.h>
88
89 #include "mp.h"
90 #include "mptext.h"
91
92 static mp *get_pgoldi(void)
93 {
94 mp *p = MP_NEW, *t = MP_NEW;
95
96 p = mp_setbit(p, MP_ZERO, 448);
97 t = mp_setbit(t, MP_ZERO, 224);
98 p = mp_sub(p, p, t);
99 p = mp_sub(p, p, MP_ONE);
100 mp_drop(t);
101 return (p);
102 }
103
104 DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
105
106 #endif
107
108 /*----- Loading and storing -----------------------------------------------*/
109
110 /* --- @fgoldi_load@ --- *
111 *
112 * Arguments: @fgoldi *z@ = where to store the result
113 * @const octet xv[56]@ = source to read
114 *
115 * Returns: ---
116 *
117 * Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
118 * external representation from @xv@ and stores it in @z@.
119 *
120 * External representation is little-endian base-256. Some
121 * elements have multiple encodings, which are not produced by
122 * correct software; use of noncanonical encodings is not an
123 * error, and toleration of them is considered a performance
124 * feature.
125 */
126
127 void fgoldi_load(fgoldi *z, const octet xv[56])
128 {
129 #if FGOLDI_IMPL == 28
130
131 unsigned i;
132 uint32 xw[14];
133 piece b, c;
134
135 /* First, read the input value as words. */
136 for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
137
138 /* Extract unsigned 28-bit pieces from the words. */
139 z->P[ 0] = (xw[ 0] >> 0)&M28;
140 z->P[ 7] = (xw[ 6] >> 4)&M28;
141 z->P[ 8] = (xw[ 7] >> 0)&M28;
142 z->P[15] = (xw[13] >> 4)&M28;
143 for (i = 1; i < 7; i++) {
144 z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
145 z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
146 }
147
148 /* Convert the nonnegative pieces into a balanced signed representation, so
149 * each piece ends up in the interval |z_i| <= 2^27. For each piece, if
150 * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
151 * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
152 * φ^2 = φ + 1. We delay this carry until after all of the pieces have
153 * been balanced. If we don't do this, then we have to do a more expensive
154 * test for nonzeroness to decide whether to lend a bit leftwards rather
155 * than just testing a single bit.
156 *
157 * Note that we don't try for a canonical representation here: both upper
158 * and lower bounds are achievable.
159 */
160 b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
161 for (i = NPIECE - 1; i--; )
162 { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
163 z->P[0] += c; z->P[8] += c;
164
165 #elif FGOLDI_IMPL == 12
166
167 unsigned i, j, n, w, b;
168 uint32 a;
169 int c;
170
171 /* First, convert the bytes into nonnegative pieces. */
172 for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
173 a |= (uint32)xv[i] << n; n += 8;
174 if (n >= w) {
175 z->P[j++] = a&MASK(w);
176 a >>= w; n -= w; w = PIECEWD(j);
177 }
178 }
179
180 /* Convert the nonnegative pieces into a balanced signed representation, so
181 * each piece ends up in the interval |z_i| <= 2^11 + 1.
182 */
183 b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
184 for (i = NPIECE - 1; i--; ) {
185 w = PIECEWD(i) - 1;
186 b = z->P[i]&BIT(w);
187 z->P[i] -= b << 1;
188 z->P[i + 1] += b >> w;
189 }
190 z->P[0] += c; z->P[20] += c;
191
192 #endif
193 }
194
195 /* --- @fgoldi_store@ --- *
196 *
197 * Arguments: @octet zv[56]@ = where to write the result
198 * @const fgoldi *x@ = the field element to write
199 *
200 * Returns: ---
201 *
202 * Use: Stores a field element in the given octet vector in external
203 * representation. A canonical encoding is always stored.
204 */
205
206 void fgoldi_store(octet zv[56], const fgoldi *x)
207 {
208 #if FGOLDI_IMPL == 28
209
210 piece y[NPIECE], yy[NPIECE], c, d;
211 uint32 u, v;
212 mask32 m;
213 unsigned i;
214
215 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
216
217 /* First, propagate the carries. By the end of this, we'll have all of the
218 * the pieces canonically sized and positive, and maybe there'll be
219 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
220 * value will be in the half-open interval [0, φ^2). The whole represented
221 * value is then y + φ^2 c.
222 *
223 * Assume that we start out with |y_i| <= 2^30. We start off by cutting
224 * off and reducing the carry c_15 from the topmost piece, y_15. This
225 * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4. We'll add this
226 * onto y_0 and y_8, and propagate the carries. It's very clear that we'll
227 * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
228 *
229 * Here, the y_i are signed, so we must be cautious about bithacking them.
230 */
231 c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
232 for (i = 0; i < NPIECE; i++)
233 { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
234
235 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
236 * y >= p, then we should subtract p from the whole value; if c = -1 then
237 * we should add p; and otherwise we should do nothing.
238 *
239 * But conditional behaviour is bad, m'kay. So here's what we do instead.
240 *
241 * The first job is to sort out what we wanted to do. If c = -1 then we
242 * want to (a) invert the constant addend and (b) feed in a carry-in;
243 * otherwise, we don't.
244 */
245 m = SIGN(c)&M28;
246 d = m&1;
247
248 /* Now do the addition/subtraction. Remember that all of the y_i are
249 * nonnegative, so shifting and masking are safe and easy.
250 */
251 d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
252 for (i = 1; i < 8; i++)
253 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
254 d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
255 for (i = 9; i < 16; i++)
256 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
257
258 /* The final carry-out is in d; since we only did addition, and the y_i are
259 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
260 * if (a) c /= 0 (in which case we know that the old value was
261 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
262 * the subtraction didn't cause a borrow, so we must be in the case where
263 * p <= y < φ^2.
264 */
265 m = NONZEROP(c) | ~NONZEROP(d - 1);
266 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
267
268 /* Extract 32-bit words from the value. */
269 for (i = 0; i < 7; i++) {
270 u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
271 v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
272 STORE32_L(zv + 4*i, u);
273 STORE32_L(zv + 4*i + 28, v);
274 }
275
276 #elif FGOLDI_IMPL == 12
277
278 piece y[NPIECE], yy[NPIECE], c, d;
279 uint32 a;
280 mask32 m, mm;
281 unsigned i, j, n, w;
282
283 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
284
285 /* First, propagate the carries. By the end of this, we'll have all of the
286 * the pieces canonically sized and positive, and maybe there'll be
287 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
288 * value will be in the half-open interval [0, φ^2). The whole represented
289 * value is then y + φ^2 c.
290 *
291 * Assume that we start out with |y_i| <= 2^14. We start off by cutting
292 * off and reducing the carry c_39 from the topmost piece, y_39. This
293 * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16. We'll add this
294 * onto y_0 and y_20, and propagate the carries. It's very clear that
295 * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
296 *
297 * Here, the y_i are signed, so we must be cautious about bithacking them.
298 */
299 c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
300 for (i = 0; i < NPIECE; i++) {
301 w = PIECEWD(i); m = (1 << w) - 1;
302 y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
303 }
304
305 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
306 * y >= p, then we should subtract p from the whole value; if c = -1 then
307 * we should add p; and otherwise we should do nothing.
308 *
309 * But conditional behaviour is bad, m'kay. So here's what we do instead.
310 *
311 * The first job is to sort out what we wanted to do. If c = -1 then we
312 * want to (a) invert the constant addend and (b) feed in a carry-in;
313 * otherwise, we don't.
314 */
315 mm = SIGN(c);
316 d = m&1;
317
318 /* Now do the addition/subtraction. Remember that all of the y_i are
319 * nonnegative, so shifting and masking are safe and easy.
320 */
321 d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
322 for (i = 1; i < 20; i++) {
323 w = PIECEWD(i); m = MASK(w);
324 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
325 }
326 d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
327 for (i = 21; i < 40; i++) {
328 w = PIECEWD(i); m = MASK(w);
329 d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
330 }
331
332 /* The final carry-out is in d; since we only did addition, and the y_i are
333 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
334 * if (a) c /= 0 (in which case we know that the old value was
335 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
336 * the subtraction didn't cause a borrow, so we must be in the case where
337 * p <= y < φ^2.
338 */
339 m = NONZEROP(c) | ~NONZEROP(d - 1);
340 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
341
342 /* Convert that back into octets. */
343 for (i = j = a = n = 0; i < NPIECE; i++) {
344 a |= (uint32)y[i] << n; n += PIECEWD(i);
345 while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
346 }
347
348 #endif
349 }
350
351 /* --- @fgoldi_set@ --- *
352 *
353 * Arguments: @fgoldi *z@ = where to write the result
354 * @int a@ = a small-ish constant
355 *
356 * Returns: ---
357 *
358 * Use: Sets @z@ to equal @a@.
359 */
360
361 void fgoldi_set(fgoldi *x, int a)
362 {
363 unsigned i;
364
365 x->P[0] = a;
366 for (i = 1; i < NPIECE; i++) x->P[i] = 0;
367 }
368
369 /*----- Basic arithmetic --------------------------------------------------*/
370
371 /* --- @fgoldi_add@ --- *
372 *
373 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
374 * @const fgoldi *x, *y@ = two operands
375 *
376 * Returns: ---
377 *
378 * Use: Set @z@ to the sum %$x + y$%.
379 */
380
381 void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
382 {
383 unsigned i;
384 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
385 }
386
387 /* --- @fgoldi_sub@ --- *
388 *
389 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
390 * @const fgoldi *x, *y@ = two operands
391 *
392 * Returns: ---
393 *
394 * Use: Set @z@ to the difference %$x - y$%.
395 */
396
397 void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
398 {
399 unsigned i;
400 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
401 }
402
403 /* --- @fgoldi_neg@ --- *
404 *
405 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
406 * @const fgoldi *x@ = an operand
407 *
408 * Returns: ---
409 *
410 * Use: Set @z = -x@.
411 */
412
413 void fgoldi_neg(fgoldi *z, const fgoldi *x)
414 {
415 unsigned i;
416 for (i = 0; i < NPIECE; i++) z->P[i] = -x->P[i];
417 }
418
419 /*----- Constant-time utilities -------------------------------------------*/
420
421 /* --- @fgoldi_pick2@ --- *
422 *
423 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
424 * @const fgoldi *x, *y@ = two operands
425 * @uint32 m@ = a mask
426 *
427 * Returns: ---
428 *
429 * Use: If @m@ is zero, set @z = y@; if @m@ is all-bits-set, then set
430 * @z = x@. If @m@ has some other value, then scramble @z@ in
431 * an unhelpful way.
432 */
433
434 void fgoldi_pick2(fgoldi *z, const fgoldi *x, const fgoldi *y, uint32 m)
435 {
436 mask32 mm = FIX_MASK32(m);
437 unsigned i;
438 for (i = 0; i < NPIECE; i++) z->P[i] = PICK2(x->P[i], y->P[i], mm);
439 }
440
441 /* --- @fgoldi_pickn@ --- *
442 *
443 * Arguments: @fgoldi *z@ = where to put the result
444 * @const fgoldi *v@ = a table of entries
445 * @size_t n@ = the number of entries in @v@
446 * @size_t i@ = an index
447 *
448 * Returns: ---
449 *
450 * Use: If @0 <= i < n < 32@ then set @z = v[i]@. If @n >= 32@ then
451 * do something unhelpful; otherwise, if @i >= n@ then set @z@
452 * to zero.
453 */
454
455 void fgoldi_pickn(fgoldi *z, const fgoldi *v, size_t n, size_t i)
456 {
457 uint32 b = (uint32)1 << (31 - i);
458 mask32 m;
459 unsigned j;
460
461 for (j = 0; j < NPIECE; j++) z->P[j] = 0;
462 while (n--) {
463 m = SIGN(b);
464 for (j = 0; j < NPIECE; j++) CONDPICK(z->P[j], v->P[j], m);
465 v++; b <<= 1;
466 }
467 }
468
469 /* --- @fgoldi_condswap@ --- *
470 *
471 * Arguments: @fgoldi *x, *y@ = two operands
472 * @uint32 m@ = a mask
473 *
474 * Returns: ---
475 *
476 * Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
477 * exchange @x@ and @y@. If @m@ has some other value, then
478 * scramble @x@ and @y@ in an unhelpful way.
479 */
480
481 void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
482 {
483 unsigned i;
484 mask32 mm = FIX_MASK32(m);
485
486 for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
487 }
488
489 /* --- @fgoldi_condneg@ --- *
490 *
491 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
492 * @const fgoldi *x@ = an operand
493 * @uint32 m@ = a mask
494 *
495 * Returns: ---
496 *
497 * Use: If @m@ is zero, set @z = x@; if @m@ is all-bits-set, then set
498 * @z = -x@. If @m@ has some other value then scramble @z@ in
499 * an unhelpful way.
500 */
501
502 void fgoldi_condneg(fgoldi *z, const fgoldi *x, uint32 m)
503 {
504 #ifdef NEG_TWOC
505 mask32 m_xor = FIX_MASK32(m);
506 piece m_add = m&1;
507 # define CONDNEG(x) (((x) ^ m_xor) + m_add)
508 #else
509 int s = PICK2(-1, +1, m);
510 # define CONDNEG(x) (s*(x))
511 #endif
512
513 unsigned i;
514 for (i = 0; i < NPIECE; i++) z->P[i] = CONDNEG(x->P[i]);
515
516 #undef CONDNEG
517 }
518
519 /*----- Multiplication ----------------------------------------------------*/
520
521 #if FGOLDI_IMPL == 28
522
523 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
524 * represented in a double-precision piece. On entry, it must be the case
525 * that |X_i| <= M <= B - 2^27 for some M. If this is the case, then, on
526 * exit, we will have |Z_i| <= 2^27 + M/2^27.
527 */
528 #define CARRY_REDUCE(z, x) do { \
529 dblpiece _t[NPIECE], _c; \
530 unsigned _i; \
531 \
532 /* Bias the input pieces. This keeps the carries and so on centred \
533 * around zero rather than biased positive. \
534 */ \
535 for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
536 \
537 /* Calculate the reduced pieces. Careful with the bithacking. */ \
538 _c = ASR(dblpiece, _t[15], 28); \
539 (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
540 for (_i = 1; _i < NPIECE; _i++) { \
541 (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
542 ASR(dblpiece, _t[_i - 1], 28); \
543 } \
544 (z)[8] += _c; \
545 } while (0)
546
547 #elif FGOLDI_IMPL == 12
548
549 static void carry_reduce(dblpiece x[NPIECE])
550 {
551 /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
552
553 unsigned i, j;
554 dblpiece c;
555
556 /* The result is nearly canonical, because we do sequential carry
557 * propagation, because smaller processors are more likely to prefer the
558 * smaller working set than the instruction-level parallelism.
559 *
560 * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
561 * Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
562 * 10 bits, and add the carry onto x_0 and x_20. And so on.
563 *
564 * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
565 * The carry into x_37 is at most 2^21; so the carry out into x_38 has
566 * magnitude at most 2^10. In turn, |x_38| <= 2^10 before the carry, so is
567 * now no more than 2^11 in magnitude, and the carry out into x_39 is at
568 * most 1. This leaves |x_39| <= 2^10 + 1 after carry propagation.
569 *
570 * Be careful with the bit hacking because the quantities involved are
571 * signed.
572 */
573
574 /* For each piece, we bias it so that floor division (as done by an
575 * arithmetic right shift) and modulus (as done by bitwise-AND) does the
576 * right thing.
577 */
578 #define CARRY(i, wd, b, m) do { \
579 x[i] += (b); \
580 c = ASR(dblpiece, x[i], (wd)); \
581 x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
582 } while (0)
583
584 { CARRY(37, 11, B10, M11); }
585 { x[38] += c; CARRY(38, 11, B10, M11); }
586 { x[39] += c; CARRY(39, 11, B10, M11); }
587 x[20] += c;
588 for (i = 0; i < 35; ) {
589 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
590 for (j = i + 4; i < j; ) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
591 }
592 { x[i] += c; CARRY( i, 12, B11, M12); i++; }
593 while (i < 39) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
594 x[39] += c;
595 }
596
597 #endif
598
599 /* --- @fgoldi_mulconst@ --- *
600 *
601 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
602 * @const fgoldi *x@ = an operand
603 * @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
604 *
605 * Returns: ---
606 *
607 * Use: Set @z@ to the product %$a x$%.
608 */
609
610 void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
611 {
612 unsigned i;
613 dblpiece zz[NPIECE], aa = a;
614
615 for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
616 #if FGOLDI_IMPL == 28
617 CARRY_REDUCE(z->P, zz);
618 #elif FGOLDI_IMPL == 12
619 carry_reduce(zz);
620 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
621 #endif
622 }
623
624 /* --- @fgoldi_mul@ --- *
625 *
626 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
627 * @const fgoldi *x, *y@ = two operands
628 *
629 * Returns: ---
630 *
631 * Use: Set @z@ to the product %$x y$%.
632 */
633
634 void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
635 {
636 dblpiece zz[NPIECE], u[NPIECE];
637 piece ab[NPIECE/2], cd[NPIECE/2];
638 const piece
639 *a = x->P + NPIECE/2, *b = x->P,
640 *c = y->P + NPIECE/2, *d = y->P;
641 unsigned i, j;
642
643 #if FGOLDI_IMPL == 28
644
645 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
646
647 #elif FGOLDI_IMPL == 12
648
649 static const unsigned short off[39] = {
650 0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
651 112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
652 224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
653 336, 348, 359, 370, 381, 392, 404, 415, 426
654 };
655
656 #define M(x,i, y,j) \
657 (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
658
659 #endif
660
661 /* Behold the magic.
662 *
663 * Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
664 * (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
665 * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
666 * the prime p so that φ^2 = φ + 1. So
667 *
668 * x y = ((a + b) (c + d) - b d) φ + a c + b d
669 */
670
671 for (i = 0; i < NPIECE; i++) zz[i] = 0;
672
673 /* Our first job will be to calculate (1 - φ) b d, and write the result
674 * into z. As we do this, an interesting thing will happen. Write
675 * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
676 * So, what we do is to write the product end-swapped and negated, and then
677 * we'll subtract the (negated, remember) high half from the low half.
678 */
679 for (i = 0; i < NPIECE/2; i++) {
680 for (j = 0; j < NPIECE/2 - i; j++)
681 zz[i + j + NPIECE/2] -= M(b,i, d,j);
682 for (; j < NPIECE/2; j++)
683 zz[i + j - NPIECE/2] -= M(b,i, d,j);
684 }
685 for (i = 0; i < NPIECE/2; i++)
686 zz[i] -= zz[i + NPIECE/2];
687
688 /* Next, we add on a c. There are no surprises here. */
689 for (i = 0; i < NPIECE/2; i++)
690 for (j = 0; j < NPIECE/2; j++)
691 zz[i + j] += M(a,i, c,j);
692
693 /* Now, calculate a + b and c + d. */
694 for (i = 0; i < NPIECE/2; i++)
695 { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
696
697 /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
698 * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
699 * v φ + (1 + φ) u. We'll store u in a temporary place and add it on
700 * twice.
701 */
702 for (i = 0; i < NPIECE; i++) u[i] = 0;
703 for (i = 0; i < NPIECE/2; i++) {
704 for (j = 0; j < NPIECE/2 - i; j++)
705 zz[i + j + NPIECE/2] += M(ab,i, cd,j);
706 for (; j < NPIECE/2; j++)
707 u[i + j - NPIECE/2] += M(ab,i, cd,j);
708 }
709 for (i = 0; i < NPIECE/2; i++)
710 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
711
712 #undef M
713
714 #if FGOLDI_IMPL == 28
715 /* That wraps it up for the multiplication. Let's figure out some bounds.
716 * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
717 * end up the way they'd be if we'd done the thing the easy way, which
718 * simplifies the analysis. Suppose we started with |x_i|, |y_i| <= 9/5
719 * 2^28. The overheads in the result are given by the coefficients of
720 *
721 * ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
722 *
723 * the greatest of which is 38. So |z_i| <= 38*81/25*2^56 < 2^63.
724 *
725 * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
726 * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
727 */
728 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
729 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
730 #elif FGOLDI_IMPL == 12
731 carry_reduce(zz);
732 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
733 #endif
734 }
735
736 /* --- @fgoldi_sqr@ --- *
737 *
738 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
739 * @const fgoldi *x@ = an operand
740 *
741 * Returns: ---
742 *
743 * Use: Set @z@ to the square %$x^2$%.
744 */
745
746 void fgoldi_sqr(fgoldi *z, const fgoldi *x)
747 {
748 #if FGOLDI_IMPL == 28
749
750 dblpiece zz[NPIECE], u[NPIECE];
751 piece ab[NPIECE];
752 const piece *a = x->P + NPIECE/2, *b = x->P;
753 unsigned i, j;
754
755 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
756
757 /* The magic is basically the same as `fgoldi_mul' above. We write
758 * x = a φ + b and use Karatsuba and the special prime shape. This time,
759 * we have
760 *
761 * x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
762 */
763
764 for (i = 0; i < NPIECE; i++) zz[i] = 0;
765
766 /* Our first job will be to calculate (1 - φ) b^2, and write the result
767 * into z. Again, this interacts pleasantly with the prime shape.
768 */
769 for (i = 0; i < NPIECE/4; i++) {
770 zz[2*i + NPIECE/2] -= M(b,i, b,i);
771 for (j = i + 1; j < NPIECE/2 - i; j++)
772 zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
773 for (; j < NPIECE/2; j++)
774 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
775 }
776 for (; i < NPIECE/2; i++) {
777 zz[2*i - NPIECE/2] -= M(b,i, b,i);
778 for (j = i + 1; j < NPIECE/2; j++)
779 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
780 }
781 for (i = 0; i < NPIECE/2; i++)
782 zz[i] -= zz[i + NPIECE/2];
783
784 /* Next, we add on a^2. There are no surprises here. */
785 for (i = 0; i < NPIECE/2; i++) {
786 zz[2*i] += M(a,i, a,i);
787 for (j = i + 1; j < NPIECE/2; j++)
788 zz[i + j] += 2*M(a,i, a,j);
789 }
790
791 /* Now, calculate a + b. */
792 for (i = 0; i < NPIECE/2; i++)
793 ab[i] = a[i] + b[i];
794
795 /* Finally (for the multiplication) we must add on (a + b)^2 φ.
796 * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
797 * store u in a temporary place and add it on twice.
798 */
799 for (i = 0; i < NPIECE; i++) u[i] = 0;
800 for (i = 0; i < NPIECE/4; i++) {
801 zz[2*i + NPIECE/2] += M(ab,i, ab,i);
802 for (j = i + 1; j < NPIECE/2 - i; j++)
803 zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
804 for (; j < NPIECE/2; j++)
805 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
806 }
807 for (; i < NPIECE/2; i++) {
808 u[2*i - NPIECE/2] += M(ab,i, ab,i);
809 for (j = i + 1; j < NPIECE/2; j++)
810 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
811 }
812 for (i = 0; i < NPIECE/2; i++)
813 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
814
815 #undef M
816
817 /* Finally, carrying. */
818 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
819 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
820
821 #elif FGOLDI_IMPL == 12
822 fgoldi_mul(z, x, x);
823 #endif
824 }
825
826 /*----- More advanced operations ------------------------------------------*/
827
828 /* --- @fgoldi_inv@ --- *
829 *
830 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
831 * @const fgoldi *x@ = an operand
832 *
833 * Returns: ---
834 *
835 * Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
836 * %$x = 0$% then @z@ is set to zero. This is considered a
837 * feature.
838 */
839
840 void fgoldi_inv(fgoldi *z, const fgoldi *x)
841 {
842 fgoldi t, u;
843 unsigned i;
844
845 #define SQRN(z, x, n) do { \
846 fgoldi_sqr((z), (x)); \
847 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
848 } while (0)
849
850 /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
851 * x = 0 as intended. The addition chain is home-made.
852 */ /* step | value */
853 fgoldi_sqr(&u, x); /* 1 | 2 */
854 fgoldi_mul(&t, &u, x); /* 2 | 3 */
855 SQRN(&u, &t, 2); /* 4 | 12 */
856 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
857 SQRN(&u, &t, 4); /* 9 | 240 */
858 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
859 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
860 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
861 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
862 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
863 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
864 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
865 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
866 fgoldi_mul(&t, &t, x); /* 43 | 2^37 - 1 */
867 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
868 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
869 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
870 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
871 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
872 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
873 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
874 fgoldi_mul(&u, &u, x); /* 233 | 2^223 - 1 */
875 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
876 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
877 SQRN(&t, &t, 2); /* 459 | 2^448 - 2^224 - 4 */
878 fgoldi_mul(z, &t, x); /* 460 | 2^448 - 2^224 - 3 */
879
880 #undef SQRN
881 }
882
883 /* --- @fgoldi_quosqrt@ --- *
884 *
885 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
886 * @const fgoldi *x, *y@ = two operands
887 *
888 * Returns: Zero if successful, @-1@ if %$x/y$% is not a square.
889 *
890 * Use: Stores in @z@ the one of the square roots %$\pm\sqrt{x/y}$%.
891 * If %$x = y = 0% then the result is zero; if %$y = 0$% but %$x
892 * \ne 0$% then the operation fails. If you wanted a specific
893 * square root then you'll have to pick it yourself.
894 */
895
896 int fgoldi_quosqrt(fgoldi *z, const fgoldi *x, const fgoldi *y)
897 {
898 fgoldi t, u, v;
899 octet xb[56], b0[56];
900 int32 rc = -1;
901 mask32 m;
902 unsigned i;
903
904 #define SQRN(z, x, n) do { \
905 fgoldi_sqr((z), (x)); \
906 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
907 } while (0)
908
909 /* This is, fortunately, significantly easier than the equivalent problem
910 * in GF(2^255 - 19), since p == 3 (mod 4).
911 *
912 * If x/y is square, then so is v = y^2 x/y = x y, and therefore u has
913 * order r = (p - 1)/2. Let w = v^{(p-3)/4}. Then w^2 = v^{(p-3)/2} =
914 * u^{r-1} = 1/v = 1/x y. Clearly, then, (x w)^2 = x^2/x y = x/y, so x w
915 * is one of the square roots we seek.
916 *
917 * The addition chain, then, is a prefix of the previous one.
918 */
919 fgoldi_mul(&v, x, y);
920
921 fgoldi_sqr(&u, &v); /* 1 | 2 */
922 fgoldi_mul(&t, &u, &v); /* 2 | 3 */
923 SQRN(&u, &t, 2); /* 4 | 12 */
924 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
925 SQRN(&u, &t, 4); /* 9 | 240 */
926 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
927 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
928 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
929 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
930 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
931 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
932 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
933 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
934 fgoldi_mul(&t, &t, &v); /* 43 | 2^37 - 1 */
935 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
936 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
937 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
938 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
939 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
940 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
941 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
942 fgoldi_mul(&u, &u, &v); /* 233 | 2^223 - 1 */
943 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
944 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
945
946 #undef SQRN
947
948 /* Now we must decide whether the answer was right. We should have z^2 =
949 * x/y, so y z^2 = x.
950 *
951 * The easiest way to compare is to encode. This isn't as wasteful as it
952 * sounds: the hard part is normalizing the representations, which we have
953 * to do anyway.
954 */
955 fgoldi_mul(z, x, &t);
956 fgoldi_sqr(&t, z);
957 fgoldi_mul(&t, &t, y);
958 fgoldi_store(xb, x);
959 fgoldi_store(b0, &t);
960 m = -ct_memeq(xb, b0, 56);
961 rc = PICK2(0, rc, m);
962 return (rc);
963 }
964
965 /*----- Test rig ----------------------------------------------------------*/
966
967 #ifdef TEST_RIG
968
969 #include <mLib/report.h>
970 #include <mLib/str.h>
971 #include <mLib/testrig.h>
972
973 static void fixdstr(dstr *d)
974 {
975 if (d->len > 56)
976 die(1, "invalid length for fgoldi");
977 else if (d->len < 56) {
978 dstr_ensure(d, 56);
979 memset(d->buf + d->len, 0, 56 - d->len);
980 d->len = 56;
981 }
982 }
983
984 static void cvt_fgoldi(const char *buf, dstr *d)
985 {
986 dstr dd = DSTR_INIT;
987
988 type_hex.cvt(buf, &dd); fixdstr(&dd);
989 dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
990 fgoldi_load((fgoldi *)d->buf, (const octet *)dd.buf);
991 dstr_destroy(&dd);
992 }
993
994 static void dump_fgoldi(dstr *d, FILE *fp)
995 { fdump(stderr, "???", (const piece *)d->buf); }
996
997 static void cvt_fgoldi_ref(const char *buf, dstr *d)
998 { type_hex.cvt(buf, d); fixdstr(d); }
999
1000 static void dump_fgoldi_ref(dstr *d, FILE *fp)
1001 {
1002 fgoldi x;
1003
1004 fgoldi_load(&x, (const octet *)d->buf);
1005 fdump(stderr, "???", x.P);
1006 }
1007
1008 static int eq(const fgoldi *x, dstr *d)
1009 { octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
1010
1011 static const test_type
1012 type_fgoldi = { cvt_fgoldi, dump_fgoldi },
1013 type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
1014
1015 #define TEST_UNOP(op) \
1016 static int vrf_##op(dstr dv[]) \
1017 { \
1018 fgoldi *x = (fgoldi *)dv[0].buf; \
1019 fgoldi z, zz; \
1020 int ok = 1; \
1021 \
1022 fgoldi_##op(&z, x); \
1023 if (!eq(&z, &dv[1])) { \
1024 ok = 0; \
1025 fprintf(stderr, "failed!\n"); \
1026 fdump(stderr, "x", x->P); \
1027 fdump(stderr, "calc", z.P); \
1028 fgoldi_load(&zz, (const octet *)dv[1].buf); \
1029 fdump(stderr, "z", zz.P); \
1030 } \
1031 \
1032 return (ok); \
1033 }
1034
1035 TEST_UNOP(sqr)
1036 TEST_UNOP(inv)
1037 TEST_UNOP(neg)
1038
1039 #define TEST_BINOP(op) \
1040 static int vrf_##op(dstr dv[]) \
1041 { \
1042 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf; \
1043 fgoldi z, zz; \
1044 int ok = 1; \
1045 \
1046 fgoldi_##op(&z, x, y); \
1047 if (!eq(&z, &dv[2])) { \
1048 ok = 0; \
1049 fprintf(stderr, "failed!\n"); \
1050 fdump(stderr, "x", x->P); \
1051 fdump(stderr, "y", y->P); \
1052 fdump(stderr, "calc", z.P); \
1053 fgoldi_load(&zz, (const octet *)dv[2].buf); \
1054 fdump(stderr, "z", zz.P); \
1055 } \
1056 \
1057 return (ok); \
1058 }
1059
1060 TEST_BINOP(add)
1061 TEST_BINOP(sub)
1062 TEST_BINOP(mul)
1063
1064 static int vrf_mulc(dstr dv[])
1065 {
1066 fgoldi *x = (fgoldi *)dv[0].buf;
1067 long a = *(const long *)dv[1].buf;
1068 fgoldi z, zz;
1069 int ok = 1;
1070
1071 fgoldi_mulconst(&z, x, a);
1072 if (!eq(&z, &dv[2])) {
1073 ok = 0;
1074 fprintf(stderr, "failed!\n");
1075 fdump(stderr, "x", x->P);
1076 fprintf(stderr, "a = %ld\n", a);
1077 fdump(stderr, "calc", z.P);
1078 fgoldi_load(&zz, (const octet *)dv[2].buf);
1079 fdump(stderr, "z", zz.P);
1080 }
1081
1082 return (ok);
1083 }
1084
1085 static int vrf_condneg(dstr dv[])
1086 {
1087 fgoldi *x = (fgoldi *)dv[0].buf;
1088 uint32 m = *(uint32 *)dv[1].buf;
1089 fgoldi z;
1090 int ok = 1;
1091
1092 fgoldi_condneg(&z, x, m);
1093 if (!eq(&z, &dv[2])) {
1094 ok = 0;
1095 fprintf(stderr, "failed!\n");
1096 fdump(stderr, "x", x->P);
1097 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1098 fdump(stderr, "calc z", z.P);
1099 fgoldi_load(&z, (const octet *)dv[1].buf);
1100 fdump(stderr, "want z", z.P);
1101 }
1102
1103 return (ok);
1104 }
1105
1106 static int vrf_pick2(dstr dv[])
1107 {
1108 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1109 uint32 m = *(uint32 *)dv[2].buf;
1110 fgoldi z;
1111 int ok = 1;
1112
1113 fgoldi_pick2(&z, x, y, m);
1114 if (!eq(&z, &dv[3])) {
1115 ok = 0;
1116 fprintf(stderr, "failed!\n");
1117 fdump(stderr, "x", x->P);
1118 fdump(stderr, "y", y->P);
1119 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1120 fdump(stderr, "calc z", z.P);
1121 fgoldi_load(&z, (const octet *)dv[3].buf);
1122 fdump(stderr, "want z", z.P);
1123 }
1124
1125 return (ok);
1126 }
1127
1128 static int vrf_pickn(dstr dv[])
1129 {
1130 dstr d = DSTR_INIT;
1131 fgoldi v[32], z;
1132 size_t i = *(uint32 *)dv[1].buf, j, n;
1133 const char *p;
1134 char *q;
1135 int ok = 1;
1136
1137 for (q = dv[0].buf, n = 0; (p = str_qword(&q, 0)) != 0; n++)
1138 { cvt_fgoldi(p, &d); v[n] = *(fgoldi *)d.buf; }
1139
1140 fgoldi_pickn(&z, v, n, i);
1141 if (!eq(&z, &dv[2])) {
1142 ok = 0;
1143 fprintf(stderr, "failed!\n");
1144 for (j = 0; j < n; j++) {
1145 fprintf(stderr, "v[%2u]", (unsigned)j);
1146 fdump(stderr, "", v[j].P);
1147 }
1148 fprintf(stderr, "i = %u\n", (unsigned)i);
1149 fdump(stderr, "calc z", z.P);
1150 fgoldi_load(&z, (const octet *)dv[2].buf);
1151 fdump(stderr, "want z", z.P);
1152 }
1153
1154 dstr_destroy(&d);
1155 return (ok);
1156 }
1157
1158 static int vrf_condswap(dstr dv[])
1159 {
1160 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1161 fgoldi xx = *x, yy = *y;
1162 uint32 m = *(uint32 *)dv[2].buf;
1163 int ok = 1;
1164
1165 fgoldi_condswap(&xx, &yy, m);
1166 if (!eq(&xx, &dv[3]) || !eq(&yy, &dv[4])) {
1167 ok = 0;
1168 fprintf(stderr, "failed!\n");
1169 fdump(stderr, "x", x->P);
1170 fdump(stderr, "y", y->P);
1171 fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
1172 fdump(stderr, "calc xx", xx.P);
1173 fdump(stderr, "calc yy", yy.P);
1174 fgoldi_load(&xx, (const octet *)dv[3].buf);
1175 fgoldi_load(&yy, (const octet *)dv[4].buf);
1176 fdump(stderr, "want xx", xx.P);
1177 fdump(stderr, "want yy", yy.P);
1178 }
1179
1180 return (ok);
1181 }
1182
1183 static int vrf_quosqrt(dstr dv[])
1184 {
1185 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf;
1186 fgoldi z, zz;
1187 int rc;
1188 int ok = 1;
1189
1190 if (dv[2].len) { fixdstr(&dv[2]); fixdstr(&dv[3]); }
1191 rc = fgoldi_quosqrt(&z, x, y);
1192 if (!dv[2].len ? !rc : (rc || (!eq(&z, &dv[2]) && !eq(&z, &dv[3])))) {
1193 ok = 0;
1194 fprintf(stderr, "failed!\n");
1195 fdump(stderr, "x", x->P);
1196 fdump(stderr, "y", y->P);
1197 if (rc) fprintf(stderr, "calc: FAIL\n");
1198 else fdump(stderr, "calc", z.P);
1199 if (!dv[2].len)
1200 fprintf(stderr, "exp: FAIL\n");
1201 else {
1202 fgoldi_load(&zz, (const octet *)dv[2].buf);
1203 fdump(stderr, "z", zz.P);
1204 fgoldi_load(&zz, (const octet *)dv[3].buf);
1205 fdump(stderr, "z'", zz.P);
1206 }
1207 }
1208
1209 return (ok);
1210 }
1211
1212 static int vrf_sub_mulc_add_sub_mul(dstr dv[])
1213 {
1214 fgoldi *u = (fgoldi *)dv[0].buf, *v = (fgoldi *)dv[1].buf,
1215 *w = (fgoldi *)dv[3].buf, *x = (fgoldi *)dv[4].buf,
1216 *y = (fgoldi *)dv[5].buf;
1217 long a = *(const long *)dv[2].buf;
1218 fgoldi umv, aumv, wpaumv, xmy, z, zz;
1219 int ok = 1;
1220
1221 fgoldi_sub(&umv, u, v);
1222 fgoldi_mulconst(&aumv, &umv, a);
1223 fgoldi_add(&wpaumv, w, &aumv);
1224 fgoldi_sub(&xmy, x, y);
1225 fgoldi_mul(&z, &wpaumv, &xmy);
1226
1227 if (!eq(&z, &dv[6])) {
1228 ok = 0;
1229 fprintf(stderr, "failed!\n");
1230 fdump(stderr, "u", u->P);
1231 fdump(stderr, "v", v->P);
1232 fdump(stderr, "u - v", umv.P);
1233 fprintf(stderr, "a = %ld\n", a);
1234 fdump(stderr, "a (u - v)", aumv.P);
1235 fdump(stderr, "w + a (u - v)", wpaumv.P);
1236 fdump(stderr, "x", x->P);
1237 fdump(stderr, "y", y->P);
1238 fdump(stderr, "x - y", xmy.P);
1239 fdump(stderr, "(x - y) (w + a (u - v))", z.P);
1240 fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
1241 }
1242
1243 return (ok);
1244 }
1245
1246 static test_chunk tests[] = {
1247 { "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1248 { "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1249 { "neg", vrf_neg, { &type_fgoldi, &type_fgoldi_ref } },
1250 { "condneg", vrf_condneg,
1251 { &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1252 { "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1253 { "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
1254 { "pick2", vrf_pick2,
1255 { &type_fgoldi, &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
1256 { "pickn", vrf_pickn,
1257 { &type_string, &type_uint32, &type_fgoldi_ref } },
1258 { "condswap", vrf_condswap,
1259 { &type_fgoldi, &type_fgoldi, &type_uint32,
1260 &type_fgoldi_ref, &type_fgoldi_ref } },
1261 { "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
1262 { "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
1263 { "quosqrt", vrf_quosqrt,
1264 { &type_fgoldi, &type_fgoldi, &type_hex, &type_hex } },
1265 { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
1266 { &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
1267 &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
1268 { 0, 0, { 0 } }
1269 };
1270
1271 int main(int argc, char *argv[])
1272 {
1273 test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
1274 return (0);
1275 }
1276
1277 #endif
1278
1279 /*----- That's all, folks -------------------------------------------------*/