@@@ import-catacomb-crypto keccak/sha3
[secnet] / fgoldi.c
CommitLineData
8a7654a6
MW
1/* -*-c-*-
2 *
3 * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of secnet.
11 * See README for full list of copyright holders.
12 *
13 * secnet is free software; you can redistribute it and/or modify it
14 * under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version d of the License, or
16 * (at your option) any later version.
17 *
18 * secnet is distributed in the hope that it will be useful, but
19 * WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 * General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * version 3 along with secnet; if not, see
25 * https://www.gnu.org/licenses/gpl.html.
26 *
27 * This file was originally part of Catacomb, but has been automatically
28 * modified for incorporation into secnet: see `import-catacomb-crypto'
29 * for details.
30 *
31 * Catacomb is free software; you can redistribute it and/or modify
32 * it under the terms of the GNU Library General Public License as
33 * published by the Free Software Foundation; either version 2 of the
34 * License, or (at your option) any later version.
35 *
36 * Catacomb is distributed in the hope that it will be useful,
37 * but WITHOUT ANY WARRANTY; without even the implied warranty of
38 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
39 * GNU Library General Public License for more details.
40 *
41 * You should have received a copy of the GNU Library General Public
42 * License along with Catacomb; if not, write to the Free
43 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
44 * MA 02111-1307, USA.
45 */
46
47/*----- Header files ------------------------------------------------------*/
48
49#include "fgoldi.h"
50
51/*----- Basic setup -------------------------------------------------------*
52 *
53 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
54 * (hence the name).
55 */
56
a1a6042e
MW
57typedef fgoldi_piece piece;
58
8a7654a6
MW
59/* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
60 * x = SUM_{0<=i<16} x_i 2^(28i).
61 */
62
a1a6042e 63 typedef int64 dblpiece;
8a7654a6
MW
64typedef uint32 upiece; typedef uint64 udblpiece;
65#define PIECEWD(i) 28
66#define NPIECE 16
67#define P p28
68
8a7654a6
MW
69#define B27 0x08000000u
70#define M28 0x0fffffffu
8a7654a6
MW
71#define M32 0xffffffffu
72
73/*----- Debugging machinery -----------------------------------------------*/
74
75#if defined(FGOLDI_DEBUG)
76
77#include <stdio.h>
78
79#include "mp.h"
80#include "mptext.h"
81
82static mp *get_pgoldi(void)
83{
84 mp *p = MP_NEW, *t = MP_NEW;
85
86 p = mp_setbit(p, MP_ZERO, 448);
87 t = mp_setbit(t, MP_ZERO, 224);
88 p = mp_sub(p, p, t);
89 p = mp_sub(p, p, MP_ONE);
90 mp_drop(t);
91 return (p);
92}
93
94DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
95
96#endif
97
98/*----- Loading and storing -----------------------------------------------*/
99
100/* --- @fgoldi_load@ --- *
101 *
102 * Arguments: @fgoldi *z@ = where to store the result
103 * @const octet xv[56]@ = source to read
104 *
105 * Returns: ---
106 *
107 * Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
108 * external representation from @xv@ and stores it in @z@.
109 *
110 * External representation is little-endian base-256. Some
111 * elements have multiple encodings, which are not produced by
112 * correct software; use of noncanonical encodings is not an
113 * error, and toleration of them is considered a performance
114 * feature.
115 */
116
117void fgoldi_load(fgoldi *z, const octet xv[56])
118{
119
120 unsigned i;
121 uint32 xw[14];
122 piece b, c;
123
124 /* First, read the input value as words. */
125 for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
126
127 /* Extract unsigned 28-bit pieces from the words. */
128 z->P[ 0] = (xw[ 0] >> 0)&M28;
129 z->P[ 7] = (xw[ 6] >> 4)&M28;
130 z->P[ 8] = (xw[ 7] >> 0)&M28;
131 z->P[15] = (xw[13] >> 4)&M28;
132 for (i = 1; i < 7; i++) {
133 z->P[i + 0] = ((xw[i + 0] << (4*i)) | (xw[i - 1] >> (32 - 4*i)))&M28;
134 z->P[i + 8] = ((xw[i + 7] << (4*i)) | (xw[i + 6] >> (32 - 4*i)))&M28;
135 }
136
137 /* Convert the nonnegative pieces into a balanced signed representation, so
138 * each piece ends up in the interval |z_i| <= 2^27. For each piece, if
139 * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
140 * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
141 * φ^2 = φ + 1. We delay this carry until after all of the pieces have
142 * been balanced. If we don't do this, then we have to do a more expensive
143 * test for nonzeroness to decide whether to lend a bit leftwards rather
144 * than just testing a single bit.
145 *
146 * Note that we don't try for a canonical representation here: both upper
147 * and lower bounds are achievable.
148 */
149 b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
150 for (i = NPIECE - 1; i--; )
151 { b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
152 z->P[0] += c; z->P[8] += c;
153}
154
155/* --- @fgoldi_store@ --- *
156 *
157 * Arguments: @octet zv[56]@ = where to write the result
158 * @const fgoldi *x@ = the field element to write
159 *
160 * Returns: ---
161 *
162 * Use: Stores a field element in the given octet vector in external
163 * representation. A canonical encoding is always stored.
164 */
165
166void fgoldi_store(octet zv[56], const fgoldi *x)
167{
168
169 piece y[NPIECE], yy[NPIECE], c, d;
170 uint32 u, v;
171 mask32 m;
172 unsigned i;
173
174 for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
175
176 /* First, propagate the carries. By the end of this, we'll have all of the
177 * the pieces canonically sized and positive, and maybe there'll be
178 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
179 * value will be in the half-open interval [0, φ^2). The whole represented
180 * value is then y + φ^2 c.
181 *
182 * Assume that we start out with |y_i| <= 2^30. We start off by cutting
183 * off and reducing the carry c_15 from the topmost piece, y_15. This
184 * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4. We'll add this
185 * onto y_0 and y_8, and propagate the carries. It's very clear that we'll
186 * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
187 *
188 * Here, the y_i are signed, so we must be cautious about bithacking them.
189 */
190 c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
191 for (i = 0; i < NPIECE; i++)
192 { y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
193
194 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
195 * y >= p, then we should subtract p from the whole value; if c = -1 then
196 * we should add p; and otherwise we should do nothing.
197 *
198 * But conditional behaviour is bad, m'kay. So here's what we do instead.
199 *
200 * The first job is to sort out what we wanted to do. If c = -1 then we
201 * want to (a) invert the constant addend and (b) feed in a carry-in;
202 * otherwise, we don't.
203 */
204 m = SIGN(c)&M28;
205 d = m&1;
206
207 /* Now do the addition/subtraction. Remember that all of the y_i are
208 * nonnegative, so shifting and masking are safe and easy.
209 */
210 d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
211 for (i = 1; i < 8; i++)
212 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
213 d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
214 for (i = 9; i < 16; i++)
215 { d += y[i] + m; yy[i] = d&M28; d >>= 28; }
216
217 /* The final carry-out is in d; since we only did addition, and the y_i are
218 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
219 * if (a) c /= 0 (in which case we know that the old value was
220 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
221 * the subtraction didn't cause a borrow, so we must be in the case where
222 * p <= y < φ^2.
223 */
224 m = NONZEROP(c) | ~NONZEROP(d - 1);
225 for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) | (y[i]&~m);
226
227 /* Extract 32-bit words from the value. */
228 for (i = 0; i < 7; i++) {
229 u = ((y[i + 0] >> (4*i)) | ((uint32)y[i + 1] << (28 - 4*i)))&M32;
230 v = ((y[i + 8] >> (4*i)) | ((uint32)y[i + 9] << (28 - 4*i)))&M32;
231 STORE32_L(zv + 4*i, u);
232 STORE32_L(zv + 4*i + 28, v);
233 }
234}
235
236/* --- @fgoldi_set@ --- *
237 *
238 * Arguments: @fgoldi *z@ = where to write the result
239 * @int a@ = a small-ish constant
240 *
241 * Returns: ---
242 *
243 * Use: Sets @z@ to equal @a@.
244 */
245
246void fgoldi_set(fgoldi *x, int a)
247{
248 unsigned i;
249
250 x->P[0] = a;
251 for (i = 1; i < NPIECE; i++) x->P[i] = 0;
252}
253
254/*----- Basic arithmetic --------------------------------------------------*/
255
256/* --- @fgoldi_add@ --- *
257 *
258 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
259 * @const fgoldi *x, *y@ = two operands
260 *
261 * Returns: ---
262 *
263 * Use: Set @z@ to the sum %$x + y$%.
264 */
265
266void fgoldi_add(fgoldi *z, const fgoldi *x, const fgoldi *y)
267{
268 unsigned i;
269 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
270}
271
272/* --- @fgoldi_sub@ --- *
273 *
274 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
275 * @const fgoldi *x, *y@ = two operands
276 *
277 * Returns: ---
278 *
279 * Use: Set @z@ to the difference %$x - y$%.
280 */
281
282void fgoldi_sub(fgoldi *z, const fgoldi *x, const fgoldi *y)
283{
284 unsigned i;
285 for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
286}
287
a1a6042e
MW
288/* --- @fgoldi_neg@ --- *
289 *
290 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
291 * @const fgoldi *x@ = an operand
292 *
293 * Returns: ---
294 *
295 * Use: Set @z = -x@.
296 */
297
298void fgoldi_neg(fgoldi *z, const fgoldi *x)
299{
300 unsigned i;
301 for (i = 0; i < NPIECE; i++) z->P[i] = -x->P[i];
302}
303
8a7654a6
MW
304/*----- Constant-time utilities -------------------------------------------*/
305
a1a6042e
MW
306/* --- @fgoldi_pick2@ --- *
307 *
308 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
309 * @const fgoldi *x, *y@ = two operands
310 * @uint32 m@ = a mask
311 *
312 * Returns: ---
313 *
314 * Use: If @m@ is zero, set @z = y@; if @m@ is all-bits-set, then set
315 * @z = x@. If @m@ has some other value, then scramble @z@ in
316 * an unhelpful way.
317 */
318
319void fgoldi_pick2(fgoldi *z, const fgoldi *x, const fgoldi *y, uint32 m)
320{
321 mask32 mm = FIX_MASK32(m);
322 unsigned i;
323 for (i = 0; i < NPIECE; i++) z->P[i] = PICK2(x->P[i], y->P[i], mm);
324}
325
326/* --- @fgoldi_pickn@ --- *
327 *
328 * Arguments: @fgoldi *z@ = where to put the result
329 * @const fgoldi *v@ = a table of entries
330 * @size_t n@ = the number of entries in @v@
331 * @size_t i@ = an index
332 *
333 * Returns: ---
334 *
335 * Use: If @0 <= i < n < 32@ then set @z = v[i]@. If @n >= 32@ then
336 * do something unhelpful; otherwise, if @i >= n@ then set @z@
337 * to zero.
338 */
339
340void fgoldi_pickn(fgoldi *z, const fgoldi *v, size_t n, size_t i)
341{
342 uint32 b = (uint32)1 << (31 - i);
343 mask32 m;
344 unsigned j;
345
346 for (j = 0; j < NPIECE; j++) z->P[j] = 0;
347 while (n--) {
348 m = SIGN(b);
349 for (j = 0; j < NPIECE; j++) CONDPICK(z->P[j], v->P[j], m);
350 v++; b <<= 1;
351 }
352}
353
8a7654a6
MW
354/* --- @fgoldi_condswap@ --- *
355 *
356 * Arguments: @fgoldi *x, *y@ = two operands
357 * @uint32 m@ = a mask
358 *
359 * Returns: ---
360 *
361 * Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
362 * exchange @x@ and @y@. If @m@ has some other value, then
363 * scramble @x@ and @y@ in an unhelpful way.
364 */
365
366void fgoldi_condswap(fgoldi *x, fgoldi *y, uint32 m)
367{
368 unsigned i;
369 mask32 mm = FIX_MASK32(m);
370
371 for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
372}
373
a1a6042e
MW
374/* --- @fgoldi_condneg@ --- *
375 *
376 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
377 * @const fgoldi *x@ = an operand
378 * @uint32 m@ = a mask
379 *
380 * Returns: ---
381 *
382 * Use: If @m@ is zero, set @z = x@; if @m@ is all-bits-set, then set
383 * @z = -x@. If @m@ has some other value then scramble @z@ in
384 * an unhelpful way.
385 */
386
387void fgoldi_condneg(fgoldi *z, const fgoldi *x, uint32 m)
388{
389 mask32 m_xor = FIX_MASK32(m);
390 piece m_add = m&1;
391# define CONDNEG(x) (((x) ^ m_xor) + m_add)
392
393 unsigned i;
394 for (i = 0; i < NPIECE; i++) z->P[i] = CONDNEG(x->P[i]);
395
396#undef CONDNEG
397}
398
8a7654a6
MW
399/*----- Multiplication ----------------------------------------------------*/
400
401/* Let B = 2^63 - 1 be the largest value such that +B and -B can be
402 * represented in a double-precision piece. On entry, it must be the case
403 * that |X_i| <= M <= B - 2^27 for some M. If this is the case, then, on
404 * exit, we will have |Z_i| <= 2^27 + M/2^27.
405 */
406#define CARRY_REDUCE(z, x) do { \
407 dblpiece _t[NPIECE], _c; \
408 unsigned _i; \
409 \
410 /* Bias the input pieces. This keeps the carries and so on centred \
411 * around zero rather than biased positive. \
412 */ \
413 for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
414 \
415 /* Calculate the reduced pieces. Careful with the bithacking. */ \
416 _c = ASR(dblpiece, _t[15], 28); \
417 (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
418 for (_i = 1; _i < NPIECE; _i++) { \
419 (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
420 ASR(dblpiece, _t[_i - 1], 28); \
421 } \
422 (z)[8] += _c; \
423} while (0)
424
425/* --- @fgoldi_mulconst@ --- *
426 *
427 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
428 * @const fgoldi *x@ = an operand
429 * @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
430 *
431 * Returns: ---
432 *
433 * Use: Set @z@ to the product %$a x$%.
434 */
435
436void fgoldi_mulconst(fgoldi *z, const fgoldi *x, long a)
437{
438 unsigned i;
439 dblpiece zz[NPIECE], aa = a;
440
441 for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
442 CARRY_REDUCE(z->P, zz);
443}
444
445/* --- @fgoldi_mul@ --- *
446 *
447 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
448 * @const fgoldi *x, *y@ = two operands
449 *
450 * Returns: ---
451 *
452 * Use: Set @z@ to the product %$x y$%.
453 */
454
455void fgoldi_mul(fgoldi *z, const fgoldi *x, const fgoldi *y)
456{
457 dblpiece zz[NPIECE], u[NPIECE];
458 piece ab[NPIECE/2], cd[NPIECE/2];
459 const piece
460 *a = x->P + NPIECE/2, *b = x->P,
461 *c = y->P + NPIECE/2, *d = y->P;
462 unsigned i, j;
463
464# define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
465
466 /* Behold the magic.
467 *
468 * Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
469 * (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
470 * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
471 * the prime p so that φ^2 = φ + 1. So
472 *
473 * x y = ((a + b) (c + d) - b d) φ + a c + b d
474 */
475
476 for (i = 0; i < NPIECE; i++) zz[i] = 0;
477
478 /* Our first job will be to calculate (1 - φ) b d, and write the result
479 * into z. As we do this, an interesting thing will happen. Write
480 * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
481 * So, what we do is to write the product end-swapped and negated, and then
482 * we'll subtract the (negated, remember) high half from the low half.
483 */
484 for (i = 0; i < NPIECE/2; i++) {
485 for (j = 0; j < NPIECE/2 - i; j++)
486 zz[i + j + NPIECE/2] -= M(b,i, d,j);
487 for (; j < NPIECE/2; j++)
488 zz[i + j - NPIECE/2] -= M(b,i, d,j);
489 }
490 for (i = 0; i < NPIECE/2; i++)
491 zz[i] -= zz[i + NPIECE/2];
492
493 /* Next, we add on a c. There are no surprises here. */
494 for (i = 0; i < NPIECE/2; i++)
495 for (j = 0; j < NPIECE/2; j++)
496 zz[i + j] += M(a,i, c,j);
497
498 /* Now, calculate a + b and c + d. */
499 for (i = 0; i < NPIECE/2; i++)
500 { ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
501
502 /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
503 * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
504 * v φ + (1 + φ) u. We'll store u in a temporary place and add it on
505 * twice.
506 */
507 for (i = 0; i < NPIECE; i++) u[i] = 0;
508 for (i = 0; i < NPIECE/2; i++) {
509 for (j = 0; j < NPIECE/2 - i; j++)
510 zz[i + j + NPIECE/2] += M(ab,i, cd,j);
511 for (; j < NPIECE/2; j++)
512 u[i + j - NPIECE/2] += M(ab,i, cd,j);
513 }
514 for (i = 0; i < NPIECE/2; i++)
515 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
516
517#undef M
518
519 /* That wraps it up for the multiplication. Let's figure out some bounds.
520 * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
521 * end up the way they'd be if we'd done the thing the easy way, which
522 * simplifies the analysis. Suppose we started with |x_i|, |y_i| <= 9/5
523 * 2^28. The overheads in the result are given by the coefficients of
524 *
525 * ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
526 *
527 * the greatest of which is 38. So |z_i| <= 38*81/25*2^56 < 2^63.
528 *
529 * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
530 * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
531 */
532 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
533 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
534}
535
536/* --- @fgoldi_sqr@ --- *
537 *
538 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
539 * @const fgoldi *x@ = an operand
540 *
541 * Returns: ---
542 *
543 * Use: Set @z@ to the square %$x^2$%.
544 */
545
546void fgoldi_sqr(fgoldi *z, const fgoldi *x)
547{
548
549 dblpiece zz[NPIECE], u[NPIECE];
550 piece ab[NPIECE];
551 const piece *a = x->P + NPIECE/2, *b = x->P;
552 unsigned i, j;
553
554# define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
555
556 /* The magic is basically the same as `fgoldi_mul' above. We write
557 * x = a φ + b and use Karatsuba and the special prime shape. This time,
558 * we have
559 *
560 * x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
561 */
562
563 for (i = 0; i < NPIECE; i++) zz[i] = 0;
564
565 /* Our first job will be to calculate (1 - φ) b^2, and write the result
566 * into z. Again, this interacts pleasantly with the prime shape.
567 */
568 for (i = 0; i < NPIECE/4; i++) {
569 zz[2*i + NPIECE/2] -= M(b,i, b,i);
570 for (j = i + 1; j < NPIECE/2 - i; j++)
571 zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
572 for (; j < NPIECE/2; j++)
573 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
574 }
575 for (; i < NPIECE/2; i++) {
576 zz[2*i - NPIECE/2] -= M(b,i, b,i);
577 for (j = i + 1; j < NPIECE/2; j++)
578 zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
579 }
580 for (i = 0; i < NPIECE/2; i++)
581 zz[i] -= zz[i + NPIECE/2];
582
583 /* Next, we add on a^2. There are no surprises here. */
584 for (i = 0; i < NPIECE/2; i++) {
585 zz[2*i] += M(a,i, a,i);
586 for (j = i + 1; j < NPIECE/2; j++)
587 zz[i + j] += 2*M(a,i, a,j);
588 }
589
590 /* Now, calculate a + b. */
591 for (i = 0; i < NPIECE/2; i++)
592 ab[i] = a[i] + b[i];
593
594 /* Finally (for the multiplication) we must add on (a + b)^2 φ.
595 * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
596 * store u in a temporary place and add it on twice.
597 */
598 for (i = 0; i < NPIECE; i++) u[i] = 0;
599 for (i = 0; i < NPIECE/4; i++) {
600 zz[2*i + NPIECE/2] += M(ab,i, ab,i);
601 for (j = i + 1; j < NPIECE/2 - i; j++)
602 zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
603 for (; j < NPIECE/2; j++)
604 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
605 }
606 for (; i < NPIECE/2; i++) {
607 u[2*i - NPIECE/2] += M(ab,i, ab,i);
608 for (j = i + 1; j < NPIECE/2; j++)
609 u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
610 }
611 for (i = 0; i < NPIECE/2; i++)
612 { zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
613
614#undef M
615
616 /* Finally, carrying. */
617 for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
618 for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
619}
620
621/*----- More advanced operations ------------------------------------------*/
622
623/* --- @fgoldi_inv@ --- *
624 *
625 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
626 * @const fgoldi *x@ = an operand
627 *
628 * Returns: ---
629 *
630 * Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
631 * %$x = 0$% then @z@ is set to zero. This is considered a
632 * feature.
633 */
634
635void fgoldi_inv(fgoldi *z, const fgoldi *x)
636{
637 fgoldi t, u;
638 unsigned i;
639
640#define SQRN(z, x, n) do { \
641 fgoldi_sqr((z), (x)); \
642 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
643} while (0)
644
645 /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
646 * x = 0 as intended. The addition chain is home-made.
647 */ /* step | value */
648 fgoldi_sqr(&u, x); /* 1 | 2 */
649 fgoldi_mul(&t, &u, x); /* 2 | 3 */
650 SQRN(&u, &t, 2); /* 4 | 12 */
651 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
652 SQRN(&u, &t, 4); /* 9 | 240 */
653 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
654 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
655 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
656 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
657 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
658 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
659 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
660 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
661 fgoldi_mul(&t, &t, x); /* 43 | 2^37 - 1 */
662 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
663 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
664 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
665 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
666 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
667 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
668 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
669 fgoldi_mul(&u, &u, x); /* 233 | 2^223 - 1 */
670 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
671 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
672 SQRN(&t, &t, 2); /* 459 | 2^448 - 2^224 - 4 */
673 fgoldi_mul(z, &t, x); /* 460 | 2^448 - 2^224 - 3 */
674
675#undef SQRN
676}
677
a1a6042e
MW
678/* --- @fgoldi_quosqrt@ --- *
679 *
680 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
681 * @const fgoldi *x, *y@ = two operands
682 *
683 * Returns: Zero if successful, @-1@ if %$x/y$% is not a square.
684 *
685 * Use: Stores in @z@ the one of the square roots %$\pm\sqrt{x/y}$%.
686 * If %$x = y = 0% then the result is zero; if %$y = 0$% but %$x
687 * \ne 0$% then the operation fails. If you wanted a specific
688 * square root then you'll have to pick it yourself.
689 */
690
691int fgoldi_quosqrt(fgoldi *z, const fgoldi *x, const fgoldi *y)
692{
693 fgoldi t, u, v;
694 octet xb[56], b0[56];
695 int32 rc = -1;
696 mask32 m;
697 unsigned i;
698
699#define SQRN(z, x, n) do { \
700 fgoldi_sqr((z), (x)); \
701 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
702} while (0)
703
704 /* This is, fortunately, significantly easier than the equivalent problem
705 * in GF(2^255 - 19), since p == 3 (mod 4).
706 *
707 * If x/y is square, then so is v = y^2 x/y = x y, and therefore u has
708 * order r = (p - 1)/2. Let w = v^{(p-3)/4}. Then w^2 = v^{(p-3)/2} =
709 * u^{r-1} = 1/v = 1/x y. Clearly, then, (x w)^2 = x^2/x y = x/y, so x w
710 * is one of the square roots we seek.
711 *
712 * The addition chain, then, is a prefix of the previous one.
713 */
714 fgoldi_mul(&v, x, y);
715
716 fgoldi_sqr(&u, &v); /* 1 | 2 */
717 fgoldi_mul(&t, &u, &v); /* 2 | 3 */
718 SQRN(&u, &t, 2); /* 4 | 12 */
719 fgoldi_mul(&t, &u, &t); /* 5 | 15 */
720 SQRN(&u, &t, 4); /* 9 | 240 */
721 fgoldi_mul(&u, &u, &t); /* 10 | 255 = 2^8 - 1 */
722 SQRN(&u, &u, 4); /* 14 | 2^12 - 16 */
723 fgoldi_mul(&t, &u, &t); /* 15 | 2^12 - 1 */
724 SQRN(&u, &t, 12); /* 27 | 2^24 - 2^12 */
725 fgoldi_mul(&u, &u, &t); /* 28 | 2^24 - 1 */
726 SQRN(&u, &u, 12); /* 40 | 2^36 - 2^12 */
727 fgoldi_mul(&t, &u, &t); /* 41 | 2^36 - 1 */
728 fgoldi_sqr(&t, &t); /* 42 | 2^37 - 2 */
729 fgoldi_mul(&t, &t, &v); /* 43 | 2^37 - 1 */
730 SQRN(&u, &t, 37); /* 80 | 2^74 - 2^37 */
731 fgoldi_mul(&u, &u, &t); /* 81 | 2^74 - 1 */
732 SQRN(&u, &u, 37); /* 118 | 2^111 - 2^37 */
733 fgoldi_mul(&t, &u, &t); /* 119 | 2^111 - 1 */
734 SQRN(&u, &t, 111); /* 230 | 2^222 - 2^111 */
735 fgoldi_mul(&t, &u, &t); /* 231 | 2^222 - 1 */
736 fgoldi_sqr(&u, &t); /* 232 | 2^223 - 2 */
737 fgoldi_mul(&u, &u, &v); /* 233 | 2^223 - 1 */
738 SQRN(&u, &u, 223); /* 456 | 2^446 - 2^223 */
739 fgoldi_mul(&t, &u, &t); /* 457 | 2^446 - 2^222 - 1 */
740
741#undef SQRN
742
743 /* Now we must decide whether the answer was right. We should have z^2 =
744 * x/y, so y z^2 = x.
745 *
746 * The easiest way to compare is to encode. This isn't as wasteful as it
747 * sounds: the hard part is normalizing the representations, which we have
748 * to do anyway.
749 */
750 fgoldi_mul(z, x, &t);
751 fgoldi_sqr(&t, z);
752 fgoldi_mul(&t, &t, y);
753 fgoldi_store(xb, x);
754 fgoldi_store(b0, &t);
755 m = -consttime_memeq(xb, b0, 56);
756 rc = PICK2(0, rc, m);
757 return (rc);
758}
759
8a7654a6 760/*----- That's all, folks -------------------------------------------------*/