3 * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
5 * (c) 2017 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Catacomb.
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
28 /*----- Header files ------------------------------------------------------*/
34 /*----- Basic setup -------------------------------------------------------*
36 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
41 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
42 * x = SUM_{0<=i<16} x_i 2^(28i).
45 typedef int32 piece
; typedef int64 dblpiece
;
46 typedef uint32 upiece
; typedef uint64 udblpiece
;
51 #define B28 0x10000000u
52 #define B27 0x08000000u
53 #define M28 0x0fffffffu
54 #define M27 0x07ffffffu
55 #define M32 0xffffffffu
57 #elif FGOLDI_IMPL == 12
58 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
59 * SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
60 * bits wide; the others are 11 bits wide, so they form eight groups of 56
64 typedef int16 piece
; typedef int32 dblpiece
;
65 typedef uint16 upiece
; typedef uint32 udblpiece
;
66 #define PIECEWD(i) ((i)%5 ? 11 : 12)
80 /*----- Debugging machinery -----------------------------------------------*/
82 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
89 static mp
*get_pgoldi(void)
91 mp
*p
= MP_NEW
, *t
= MP_NEW
;
93 p
= mp_setbit(p
, MP_ZERO
, 448);
94 t
= mp_setbit(t
, MP_ZERO
, 224);
96 p
= mp_sub(p
, p
, MP_ONE
);
101 DEF_FDUMP(fdump
, piece
, PIECEWD
, NPIECE
, 56, get_pgoldi())
105 /*----- Loading and storing -----------------------------------------------*/
107 /* --- @fgoldi_load@ --- *
109 * Arguments: @fgoldi *z@ = where to store the result
110 * @const octet xv[56]@ = source to read
114 * Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
115 * external representation from @xv@ and stores it in @z@.
117 * External representation is little-endian base-256. Some
118 * elements have multiple encodings, which are not produced by
119 * correct software; use of noncanonical encodings is not an
120 * error, and toleration of them is considered a performance
124 void fgoldi_load(fgoldi
*z
, const octet xv
[56])
126 #if FGOLDI_IMPL == 28
132 /* First, read the input value as words. */
133 for (i
= 0; i
< 14; i
++) xw
[i
] = LOAD32_L(xv
+ 4*i
);
135 /* Extract unsigned 28-bit pieces from the words. */
136 z
->P
[ 0] = (xw
[ 0] >> 0)&M28
;
137 z
->P
[ 7] = (xw
[ 6] >> 4)&M28
;
138 z
->P
[ 8] = (xw
[ 7] >> 0)&M28
;
139 z
->P
[15] = (xw
[13] >> 4)&M28
;
140 for (i
= 1; i
< 7; i
++) {
141 z
->P
[i
+ 0] = ((xw
[i
+ 0] << (4*i
)) | (xw
[i
- 1] >> (32 - 4*i
)))&M28
;
142 z
->P
[i
+ 8] = ((xw
[i
+ 7] << (4*i
)) | (xw
[i
+ 6] >> (32 - 4*i
)))&M28
;
145 /* Convert the nonnegative pieces into a balanced signed representation, so
146 * each piece ends up in the interval |z_i| <= 2^27. For each piece, if
147 * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
148 * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
149 * φ^2 = φ + 1. We delay this carry until after all of the pieces have
150 * been balanced. If we don't do this, then we have to do a more expensive
151 * test for nonzeroness to decide whether to lend a bit leftwards rather
152 * than just testing a single bit.
154 * Note that we don't try for a canonical representation here: both upper
155 * and lower bounds are achievable.
157 b
= z
->P
[15]&B27
; z
->P
[15] -= b
<< 1; c
= b
>> 27;
158 for (i
= NPIECE
- 1; i
--; )
159 { b
= z
->P
[i
]&B27
; z
->P
[i
] -= b
<< 1; z
->P
[i
+ 1] += b
>> 27; }
160 z
->P
[0] += c
; z
->P
[8] += c
;
162 #elif FGOLDI_IMPL == 12
164 unsigned i
, j
, n
, w
, b
;
168 /* First, convert the bytes into nonnegative pieces. */
169 for (i
= j
= a
= n
= 0, w
= PIECEWD(0); i
< 56; i
++) {
170 a
|= (uint32
)xv
[i
] << n
; n
+= 8;
172 z
->P
[j
++] = a
&MASK(w
);
173 a
>>= w
; n
-= w
; w
= PIECEWD(j
);
177 /* Convert the nonnegative pieces into a balanced signed representation, so
178 * each piece ends up in the interval |z_i| <= 2^11 + 1.
180 b
= z
->P
[39]&B10
; z
->P
[39] -= b
<< 1; c
= b
>> 10;
181 for (i
= NPIECE
- 1; i
--; ) {
185 z
->P
[i
+ 1] += b
>> w
;
187 z
->P
[0] += c
; z
->P
[20] += c
;
192 /* --- @fgoldi_store@ --- *
194 * Arguments: @octet zv[56]@ = where to write the result
195 * @const fgoldi *x@ = the field element to write
199 * Use: Stores a field element in the given octet vector in external
200 * representation. A canonical encoding is always stored.
203 void fgoldi_store(octet zv
[56], const fgoldi
*x
)
205 #if FGOLDI_IMPL == 28
207 piece y
[NPIECE
], yy
[NPIECE
], c
, d
;
212 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = x
->P
[i
];
214 /* First, propagate the carries. By the end of this, we'll have all of the
215 * the pieces canonically sized and positive, and maybe there'll be
216 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
217 * value will be in the half-open interval [0, φ^2). The whole represented
218 * value is then y + φ^2 c.
220 * Assume that we start out with |y_i| <= 2^30. We start off by cutting
221 * off and reducing the carry c_15 from the topmost piece, y_15. This
222 * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4. We'll add this
223 * onto y_0 and y_8, and propagate the carries. It's very clear that we'll
224 * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
226 * Here, the y_i are signed, so we must be cautious about bithacking them.
228 c
= ASR(piece
, y
[15], 28); y
[15] = (upiece
)y
[15]&M28
; y
[8] += c
;
229 for (i
= 0; i
< NPIECE
; i
++)
230 { y
[i
] += c
; c
= ASR(piece
, y
[i
], 28); y
[i
] = (upiece
)y
[i
]&M28
; }
232 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
233 * y >= p, then we should subtract p from the whole value; if c = -1 then
234 * we should add p; and otherwise we should do nothing.
236 * But conditional behaviour is bad, m'kay. So here's what we do instead.
238 * The first job is to sort out what we wanted to do. If c = -1 then we
239 * want to (a) invert the constant addend and (b) feed in a carry-in;
240 * otherwise, we don't.
245 /* Now do the addition/subtraction. Remember that all of the y_i are
246 * nonnegative, so shifting and masking are safe and easy.
248 d
+= y
[0] + (1 ^ m
); yy
[0] = d
&M28
; d
>>= 28;
249 for (i
= 1; i
< 8; i
++)
250 { d
+= y
[i
] + m
; yy
[i
] = d
&M28
; d
>>= 28; }
251 d
+= y
[8] + (1 ^ m
); yy
[8] = d
&M28
; d
>>= 28;
252 for (i
= 9; i
< 16; i
++)
253 { d
+= y
[i
] + m
; yy
[i
] = d
&M28
; d
>>= 28; }
255 /* The final carry-out is in d; since we only did addition, and the y_i are
256 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
257 * if (a) c /= 0 (in which case we know that the old value was
258 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
259 * the subtraction didn't cause a borrow, so we must be in the case where
262 m
= NONZEROP(c
) | ~NONZEROP(d
- 1);
263 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = (yy
[i
]&m
) | (y
[i
]&~m
);
265 /* Extract 32-bit words from the value. */
266 for (i
= 0; i
< 7; i
++) {
267 u
= ((y
[i
+ 0] >> (4*i
)) | ((uint32
)y
[i
+ 1] << (28 - 4*i
)))&M32
;
268 v
= ((y
[i
+ 8] >> (4*i
)) | ((uint32
)y
[i
+ 9] << (28 - 4*i
)))&M32
;
269 STORE32_L(zv
+ 4*i
, u
);
270 STORE32_L(zv
+ 4*i
+ 28, v
);
273 #elif FGOLDI_IMPL == 12
275 piece y
[NPIECE
], yy
[NPIECE
], c
, d
;
280 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = x
->P
[i
];
282 /* First, propagate the carries. By the end of this, we'll have all of the
283 * the pieces canonically sized and positive, and maybe there'll be
284 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
285 * value will be in the half-open interval [0, φ^2). The whole represented
286 * value is then y + φ^2 c.
288 * Assume that we start out with |y_i| <= 2^14. We start off by cutting
289 * off and reducing the carry c_39 from the topmost piece, y_39. This
290 * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16. We'll add this
291 * onto y_0 and y_20, and propagate the carries. It's very clear that
292 * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
294 * Here, the y_i are signed, so we must be cautious about bithacking them.
296 c
= ASR(piece
, y
[39], 11); y
[39] = (piece
)y
[39]&M11
; y
[20] += c
;
297 for (i
= 0; i
< NPIECE
; i
++) {
298 w
= PIECEWD(i
); m
= (1 << w
) - 1;
299 y
[i
] += c
; c
= ASR(piece
, y
[i
], w
); y
[i
] = (upiece
)y
[i
]&m
;
302 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
303 * y >= p, then we should subtract p from the whole value; if c = -1 then
304 * we should add p; and otherwise we should do nothing.
306 * But conditional behaviour is bad, m'kay. So here's what we do instead.
308 * The first job is to sort out what we wanted to do. If c = -1 then we
309 * want to (a) invert the constant addend and (b) feed in a carry-in;
310 * otherwise, we don't.
315 /* Now do the addition/subtraction. Remember that all of the y_i are
316 * nonnegative, so shifting and masking are safe and easy.
318 d
+= y
[ 0] + (1 ^ (mm
&M12
)); yy
[ 0] = d
&M12
; d
>>= 12;
319 for (i
= 1; i
< 20; i
++) {
320 w
= PIECEWD(i
); m
= MASK(w
);
321 d
+= y
[ i
] + (mm
&m
); yy
[ i
] = d
&m
; d
>>= w
;
323 d
+= y
[20] + (1 ^ (mm
&M12
)); yy
[20] = d
&M12
; d
>>= 12;
324 for (i
= 21; i
< 40; i
++) {
325 w
= PIECEWD(i
); m
= MASK(w
);
326 d
+= y
[ i
] + (mm
&m
); yy
[ i
] = d
&m
; d
>>= w
;
329 /* The final carry-out is in d; since we only did addition, and the y_i are
330 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
331 * if (a) c /= 0 (in which case we know that the old value was
332 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
333 * the subtraction didn't cause a borrow, so we must be in the case where
336 m
= NONZEROP(c
) | ~NONZEROP(d
- 1);
337 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = (yy
[i
]&m
) | (y
[i
]&~m
);
339 /* Convert that back into octets. */
340 for (i
= j
= a
= n
= 0; i
< NPIECE
; i
++) {
341 a
|= (uint32
)y
[i
] << n
; n
+= PIECEWD(i
);
342 while (n
>= 8) { zv
[j
++] = a
&M8
; a
>>= 8; n
-= 8; }
348 /* --- @fgoldi_set@ --- *
350 * Arguments: @fgoldi *z@ = where to write the result
351 * @int a@ = a small-ish constant
355 * Use: Sets @z@ to equal @a@.
358 void fgoldi_set(fgoldi
*x
, int a
)
363 for (i
= 1; i
< NPIECE
; i
++) x
->P
[i
] = 0;
366 /*----- Basic arithmetic --------------------------------------------------*/
368 /* --- @fgoldi_add@ --- *
370 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
371 * @const fgoldi *x, *y@ = two operands
375 * Use: Set @z@ to the sum %$x + y$%.
378 void fgoldi_add(fgoldi
*z
, const fgoldi
*x
, const fgoldi
*y
)
381 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = x
->P
[i
] + y
->P
[i
];
384 /* --- @fgoldi_sub@ --- *
386 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
387 * @const fgoldi *x, *y@ = two operands
391 * Use: Set @z@ to the difference %$x - y$%.
394 void fgoldi_sub(fgoldi
*z
, const fgoldi
*x
, const fgoldi
*y
)
397 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = x
->P
[i
] - y
->P
[i
];
400 /*----- Constant-time utilities -------------------------------------------*/
402 /* --- @fgoldi_condswap@ --- *
404 * Arguments: @fgoldi *x, *y@ = two operands
405 * @uint32 m@ = a mask
409 * Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
410 * exchange @x@ and @y@. If @m@ has some other value, then
411 * scramble @x@ and @y@ in an unhelpful way.
414 void fgoldi_condswap(fgoldi
*x
, fgoldi
*y
, uint32 m
)
417 mask32 mm
= FIX_MASK32(m
);
419 for (i
= 0; i
< NPIECE
; i
++) CONDSWAP(x
->P
[i
], y
->P
[i
], mm
);
422 /*----- Multiplication ----------------------------------------------------*/
424 #if FGOLDI_IMPL == 28
426 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
427 * represented in a double-precision piece. On entry, it must be the case
428 * that |X_i| <= M <= B - 2^27 for some M. If this is the case, then, on
429 * exit, we will have |Z_i| <= 2^27 + M/2^27.
431 #define CARRY_REDUCE(z, x) do { \
432 dblpiece _t[NPIECE], _c; \
435 /* Bias the input pieces. This keeps the carries and so on centred \
436 * around zero rather than biased positive. \
438 for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
440 /* Calculate the reduced pieces. Careful with the bithacking. */ \
441 _c = ASR(dblpiece, _t[15], 28); \
442 (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
443 for (_i = 1; _i < NPIECE; _i++) { \
444 (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
445 ASR(dblpiece, _t[_i - 1], 28); \
450 #elif FGOLDI_IMPL == 12
452 static void carry_reduce(dblpiece x
[NPIECE
])
454 /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
459 /* The result is nearly canonical, because we do sequential carry
460 * propagation, because smaller processors are more likely to prefer the
461 * smaller working set than the instruction-level parallelism.
463 * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
464 * Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
465 * 10 bits, and add the carry onto x_0 and x_20. And so on.
467 * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
468 * The carry into x_37 is at most 2^21; so the carry out into x_38 has
469 * magnitude at most 2^10. In turn, |x_38| <= 2^10 before the carry, so is
470 * now no more than 2^11 in magnitude, and the carry out into x_39 is at
471 * most 1. This leaves |x_39| <= 2^10 + 1 after carry propagation.
473 * Be careful with the bit hacking because the quantities involved are
477 /* For each piece, we bias it so that floor division (as done by an
478 * arithmetic right shift) and modulus (as done by bitwise-AND) does the
481 #define CARRY(i, wd, b, m) do { \
483 c = ASR(dblpiece, x[i], (wd)); \
484 x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
487 { CARRY(37, 11, B10
, M11
); }
488 { x
[38] += c
; CARRY(38, 11, B10
, M11
); }
489 { x
[39] += c
; CARRY(39, 11, B10
, M11
); }
491 for (i
= 0; i
< 35; ) {
492 { x
[i
] += c
; CARRY( i
, 12, B11
, M12
); i
++; }
493 for (j
= i
+ 4; i
< j
; ) { x
[i
] += c
; CARRY( i
, 11, B10
, M11
); i
++; }
495 { x
[i
] += c
; CARRY( i
, 12, B11
, M12
); i
++; }
496 while (i
< 39) { x
[i
] += c
; CARRY( i
, 11, B10
, M11
); i
++; }
502 /* --- @fgoldi_mulconst@ --- *
504 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
505 * @const fgoldi *x@ = an operand
506 * @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
510 * Use: Set @z@ to the product %$a x$%.
513 void fgoldi_mulconst(fgoldi
*z
, const fgoldi
*x
, long a
)
516 dblpiece zz
[NPIECE
], aa
= a
;
518 for (i
= 0; i
< NPIECE
; i
++) zz
[i
] = aa
*x
->P
[i
];
519 #if FGOLDI_IMPL == 28
520 CARRY_REDUCE(z
->P
, zz
);
521 #elif FGOLDI_IMPL == 12
523 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
527 /* --- @fgoldi_mul@ --- *
529 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
530 * @const fgoldi *x, *y@ = two operands
534 * Use: Set @z@ to the product %$x y$%.
537 void fgoldi_mul(fgoldi
*z
, const fgoldi
*x
, const fgoldi
*y
)
539 dblpiece zz
[NPIECE
], u
[NPIECE
];
540 piece ab
[NPIECE
/2], cd
[NPIECE
/2];
542 *a
= x
->P
+ NPIECE
/2, *b
= x
->P
,
543 *c
= y
->P
+ NPIECE
/2, *d
= y
->P
;
546 #if FGOLDI_IMPL == 28
548 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
550 #elif FGOLDI_IMPL == 12
552 static const unsigned short off
[39] = {
553 0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
554 112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
555 224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
556 336, 348, 359, 370, 381, 392, 404, 415, 426
559 #define M(x,i, y,j) \
560 (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
566 * Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
567 * (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
568 * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
569 * the prime p so that φ^2 = φ + 1. So
571 * x y = ((a + b) (c + d) - b d) φ + a c + b d
574 for (i
= 0; i
< NPIECE
; i
++) zz
[i
] = 0;
576 /* Our first job will be to calculate (1 - φ) b d, and write the result
577 * into z. As we do this, an interesting thing will happen. Write
578 * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
579 * So, what we do is to write the product end-swapped and negated, and then
580 * we'll subtract the (negated, remember) high half from the low half.
582 for (i
= 0; i
< NPIECE
/2; i
++) {
583 for (j
= 0; j
< NPIECE
/2 - i
; j
++)
584 zz
[i
+ j
+ NPIECE
/2] -= M(b
,i
, d
,j
);
585 for (; j
< NPIECE
/2; j
++)
586 zz
[i
+ j
- NPIECE
/2] -= M(b
,i
, d
,j
);
588 for (i
= 0; i
< NPIECE
/2; i
++)
589 zz
[i
] -= zz
[i
+ NPIECE
/2];
591 /* Next, we add on a c. There are no surprises here. */
592 for (i
= 0; i
< NPIECE
/2; i
++)
593 for (j
= 0; j
< NPIECE
/2; j
++)
594 zz
[i
+ j
] += M(a
,i
, c
,j
);
596 /* Now, calculate a + b and c + d. */
597 for (i
= 0; i
< NPIECE
/2; i
++)
598 { ab
[i
] = a
[i
] + b
[i
]; cd
[i
] = c
[i
] + d
[i
]; }
600 /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
601 * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
602 * v φ + (1 + φ) u. We'll store u in a temporary place and add it on
605 for (i
= 0; i
< NPIECE
; i
++) u
[i
] = 0;
606 for (i
= 0; i
< NPIECE
/2; i
++) {
607 for (j
= 0; j
< NPIECE
/2 - i
; j
++)
608 zz
[i
+ j
+ NPIECE
/2] += M(ab
,i
, cd
,j
);
609 for (; j
< NPIECE
/2; j
++)
610 u
[i
+ j
- NPIECE
/2] += M(ab
,i
, cd
,j
);
612 for (i
= 0; i
< NPIECE
/2; i
++)
613 { zz
[i
] += u
[i
]; zz
[i
+ NPIECE
/2] += u
[i
]; }
617 #if FGOLDI_IMPL == 28
618 /* That wraps it up for the multiplication. Let's figure out some bounds.
619 * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
620 * end up the way they'd be if we'd done the thing the easy way, which
621 * simplifies the analysis. Suppose we started with |x_i|, |y_i| <= 9/5
622 * 2^28. The overheads in the result are given by the coefficients of
624 * ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
626 * the greatest of which is 38. So |z_i| <= 38*81/25*2^56 < 2^63.
628 * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
629 * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
631 for (i
= 0; i
< 2; i
++) CARRY_REDUCE(zz
, zz
);
632 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
633 #elif FGOLDI_IMPL == 12
635 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
639 /* --- @fgoldi_sqr@ --- *
641 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
642 * @const fgoldi *x@ = an operand
646 * Use: Set @z@ to the square %$x^2$%.
649 void fgoldi_sqr(fgoldi
*z
, const fgoldi
*x
)
651 #if FGOLDI_IMPL == 28
653 dblpiece zz
[NPIECE
], u
[NPIECE
];
655 const piece
*a
= x
->P
+ NPIECE
/2, *b
= x
->P
;
658 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
660 /* The magic is basically the same as `fgoldi_mul' above. We write
661 * x = a φ + b and use Karatsuba and the special prime shape. This time,
664 * x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
667 for (i
= 0; i
< NPIECE
; i
++) zz
[i
] = 0;
669 /* Our first job will be to calculate (1 - φ) b^2, and write the result
670 * into z. Again, this interacts pleasantly with the prime shape.
672 for (i
= 0; i
< NPIECE
/4; i
++) {
673 zz
[2*i
+ NPIECE
/2] -= M(b
,i
, b
,i
);
674 for (j
= i
+ 1; j
< NPIECE
/2 - i
; j
++)
675 zz
[i
+ j
+ NPIECE
/2] -= 2*M(b
,i
, b
,j
);
676 for (; j
< NPIECE
/2; j
++)
677 zz
[i
+ j
- NPIECE
/2] -= 2*M(b
,i
, b
,j
);
679 for (; i
< NPIECE
/2; i
++) {
680 zz
[2*i
- NPIECE
/2] -= M(b
,i
, b
,i
);
681 for (j
= i
+ 1; j
< NPIECE
/2; j
++)
682 zz
[i
+ j
- NPIECE
/2] -= 2*M(b
,i
, b
,j
);
684 for (i
= 0; i
< NPIECE
/2; i
++)
685 zz
[i
] -= zz
[i
+ NPIECE
/2];
687 /* Next, we add on a^2. There are no surprises here. */
688 for (i
= 0; i
< NPIECE
/2; i
++) {
689 zz
[2*i
] += M(a
,i
, a
,i
);
690 for (j
= i
+ 1; j
< NPIECE
/2; j
++)
691 zz
[i
+ j
] += 2*M(a
,i
, a
,j
);
694 /* Now, calculate a + b. */
695 for (i
= 0; i
< NPIECE
/2; i
++)
698 /* Finally (for the multiplication) we must add on (a + b)^2 φ.
699 * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
700 * store u in a temporary place and add it on twice.
702 for (i
= 0; i
< NPIECE
; i
++) u
[i
] = 0;
703 for (i
= 0; i
< NPIECE
/4; i
++) {
704 zz
[2*i
+ NPIECE
/2] += M(ab
,i
, ab
,i
);
705 for (j
= i
+ 1; j
< NPIECE
/2 - i
; j
++)
706 zz
[i
+ j
+ NPIECE
/2] += 2*M(ab
,i
, ab
,j
);
707 for (; j
< NPIECE
/2; j
++)
708 u
[i
+ j
- NPIECE
/2] += 2*M(ab
,i
, ab
,j
);
710 for (; i
< NPIECE
/2; i
++) {
711 u
[2*i
- NPIECE
/2] += M(ab
,i
, ab
,i
);
712 for (j
= i
+ 1; j
< NPIECE
/2; j
++)
713 u
[i
+ j
- NPIECE
/2] += 2*M(ab
,i
, ab
,j
);
715 for (i
= 0; i
< NPIECE
/2; i
++)
716 { zz
[i
] += u
[i
]; zz
[i
+ NPIECE
/2] += u
[i
]; }
720 /* Finally, carrying. */
721 for (i
= 0; i
< 2; i
++) CARRY_REDUCE(zz
, zz
);
722 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
724 #elif FGOLDI_IMPL == 12
729 /*----- More advanced operations ------------------------------------------*/
731 /* --- @fgoldi_inv@ --- *
733 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
734 * @const fgoldi *x@ = an operand
738 * Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
739 * %$x = 0$% then @z@ is set to zero. This is considered a
743 void fgoldi_inv(fgoldi
*z
, const fgoldi
*x
)
748 #define SQRN(z, x, n) do { \
749 fgoldi_sqr((z), (x)); \
750 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
753 /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
754 * x = 0 as intended. The addition chain is home-made.
755 */ /* step | value */
756 fgoldi_sqr(&u
, x
); /* 1 | 2 */
757 fgoldi_mul(&t
, &u
, x
); /* 2 | 3 */
758 SQRN(&u
, &t
, 2); /* 4 | 12 */
759 fgoldi_mul(&t
, &u
, &t
); /* 5 | 15 */
760 SQRN(&u
, &t
, 4); /* 9 | 240 */
761 fgoldi_mul(&u
, &u
, &t
); /* 10 | 255 = 2^8 - 1 */
762 SQRN(&u
, &u
, 4); /* 14 | 2^12 - 16 */
763 fgoldi_mul(&t
, &u
, &t
); /* 15 | 2^12 - 1 */
764 SQRN(&u
, &t
, 12); /* 27 | 2^24 - 2^12 */
765 fgoldi_mul(&u
, &u
, &t
); /* 28 | 2^24 - 1 */
766 SQRN(&u
, &u
, 12); /* 40 | 2^36 - 2^12 */
767 fgoldi_mul(&t
, &u
, &t
); /* 41 | 2^36 - 1 */
768 fgoldi_sqr(&t
, &t
); /* 42 | 2^37 - 2 */
769 fgoldi_mul(&t
, &t
, x
); /* 43 | 2^37 - 1 */
770 SQRN(&u
, &t
, 37); /* 80 | 2^74 - 2^37 */
771 fgoldi_mul(&u
, &u
, &t
); /* 81 | 2^74 - 1 */
772 SQRN(&u
, &u
, 37); /* 118 | 2^111 - 2^37 */
773 fgoldi_mul(&t
, &u
, &t
); /* 119 | 2^111 - 1 */
774 SQRN(&u
, &t
, 111); /* 230 | 2^222 - 2^111 */
775 fgoldi_mul(&t
, &u
, &t
); /* 231 | 2^222 - 1 */
776 fgoldi_sqr(&u
, &t
); /* 232 | 2^223 - 2 */
777 fgoldi_mul(&u
, &u
, x
); /* 233 | 2^223 - 1 */
778 SQRN(&u
, &u
, 223); /* 456 | 2^446 - 2^223 */
779 fgoldi_mul(&t
, &u
, &t
); /* 457 | 2^446 - 2^222 - 1 */
780 SQRN(&t
, &t
, 2); /* 459 | 2^448 - 2^224 - 4 */
781 fgoldi_mul(z
, &t
, x
); /* 460 | 2^448 - 2^224 - 3 */
786 /*----- Test rig ----------------------------------------------------------*/
790 #include <mLib/report.h>
791 #include <mLib/str.h>
792 #include <mLib/testrig.h>
794 static void fixdstr(dstr
*d
)
797 die(1, "invalid length for fgoldi");
798 else if (d
->len
< 56) {
800 memset(d
->buf
+ d
->len
, 0, 56 - d
->len
);
805 static void cvt_fgoldi(const char *buf
, dstr
*d
)
809 type_hex
.cvt(buf
, &dd
); fixdstr(&dd
);
810 dstr_ensure(d
, sizeof(fgoldi
)); d
->len
= sizeof(fgoldi
);
811 fgoldi_load((fgoldi
*)d
->buf
, (const octet
*)dd
.buf
);
815 static void dump_fgoldi(dstr
*d
, FILE *fp
)
816 { fdump(stderr
, "???", (const piece
*)d
->buf
); }
818 static void cvt_fgoldi_ref(const char *buf
, dstr
*d
)
819 { type_hex
.cvt(buf
, d
); fixdstr(d
); }
821 static void dump_fgoldi_ref(dstr
*d
, FILE *fp
)
825 fgoldi_load(&x
, (const octet
*)d
->buf
);
826 fdump(stderr
, "???", x
.P
);
829 static int eq(const fgoldi
*x
, dstr
*d
)
830 { octet b
[56]; fgoldi_store(b
, x
); return (memcmp(b
, d
->buf
, 56) == 0); }
832 static const test_type
833 type_fgoldi
= { cvt_fgoldi
, dump_fgoldi
},
834 type_fgoldi_ref
= { cvt_fgoldi_ref
, dump_fgoldi_ref
};
836 #define TEST_UNOP(op) \
837 static int vrf_##op(dstr dv[]) \
839 fgoldi *x = (fgoldi *)dv[0].buf; \
843 fgoldi_##op(&z, x); \
844 if (!eq(&z, &dv[1])) { \
846 fprintf(stderr, "failed!\n"); \
847 fdump(stderr, "x", x->P); \
848 fdump(stderr, "calc", z.P); \
849 fgoldi_load(&zz, (const octet *)dv[1].buf); \
850 fdump(stderr, "z", zz.P); \
859 #define TEST_BINOP(op) \
860 static int vrf_##op(dstr dv[]) \
862 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf; \
866 fgoldi_##op(&z, x, y); \
867 if (!eq(&z, &dv[2])) { \
869 fprintf(stderr, "failed!\n"); \
870 fdump(stderr, "x", x->P); \
871 fdump(stderr, "y", y->P); \
872 fdump(stderr, "calc", z.P); \
873 fgoldi_load(&zz, (const octet *)dv[2].buf); \
874 fdump(stderr, "z", zz.P); \
884 static int vrf_mulc(dstr dv
[])
886 fgoldi
*x
= (fgoldi
*)dv
[0].buf
;
887 long a
= *(const long *)dv
[1].buf
;
891 fgoldi_mulconst(&z
, x
, a
);
892 if (!eq(&z
, &dv
[2])) {
894 fprintf(stderr
, "failed!\n");
895 fdump(stderr
, "x", x
->P
);
896 fprintf(stderr
, "a = %ld\n", a
);
897 fdump(stderr
, "calc", z
.P
);
898 fgoldi_load(&zz
, (const octet
*)dv
[2].buf
);
899 fdump(stderr
, "z", zz
.P
);
905 static int vrf_condswap(dstr dv
[])
907 fgoldi
*x
= (fgoldi
*)dv
[0].buf
, *y
= (fgoldi
*)dv
[1].buf
;
908 fgoldi xx
= *x
, yy
= *y
;
909 uint32 m
= *(uint32
*)dv
[2].buf
;
912 fgoldi_condswap(&xx
, &yy
, m
);
913 if (!eq(&xx
, &dv
[3]) || !eq(&yy
, &dv
[4])) {
915 fprintf(stderr
, "failed!\n");
916 fdump(stderr
, "x", x
->P
);
917 fdump(stderr
, "y", y
->P
);
918 fprintf(stderr
, "m = 0x%08lx\n", (unsigned long)m
);
919 fdump(stderr
, "calc xx", xx
.P
);
920 fdump(stderr
, "calc yy", yy
.P
);
921 fgoldi_load(&xx
, (const octet
*)dv
[3].buf
);
922 fgoldi_load(&yy
, (const octet
*)dv
[4].buf
);
923 fdump(stderr
, "want xx", xx
.P
);
924 fdump(stderr
, "want yy", yy
.P
);
930 static int vrf_sub_mulc_add_sub_mul(dstr dv
[])
932 fgoldi
*u
= (fgoldi
*)dv
[0].buf
, *v
= (fgoldi
*)dv
[1].buf
,
933 *w
= (fgoldi
*)dv
[3].buf
, *x
= (fgoldi
*)dv
[4].buf
,
934 *y
= (fgoldi
*)dv
[5].buf
;
935 long a
= *(const long *)dv
[2].buf
;
936 fgoldi umv
, aumv
, wpaumv
, xmy
, z
, zz
;
939 fgoldi_sub(&umv
, u
, v
);
940 fgoldi_mulconst(&aumv
, &umv
, a
);
941 fgoldi_add(&wpaumv
, w
, &aumv
);
942 fgoldi_sub(&xmy
, x
, y
);
943 fgoldi_mul(&z
, &wpaumv
, &xmy
);
945 if (!eq(&z
, &dv
[6])) {
947 fprintf(stderr
, "failed!\n");
948 fdump(stderr
, "u", u
->P
);
949 fdump(stderr
, "v", v
->P
);
950 fdump(stderr
, "u - v", umv
.P
);
951 fprintf(stderr
, "a = %ld\n", a
);
952 fdump(stderr
, "a (u - v)", aumv
.P
);
953 fdump(stderr
, "w + a (u - v)", wpaumv
.P
);
954 fdump(stderr
, "x", x
->P
);
955 fdump(stderr
, "y", y
->P
);
956 fdump(stderr
, "x - y", xmy
.P
);
957 fdump(stderr
, "(x - y) (w + a (u - v))", z
.P
);
958 fgoldi_load(&zz
, (const octet
*)dv
[6].buf
); fdump(stderr
, "z", zz
.P
);
964 static test_chunk tests
[] = {
965 { "add", vrf_add
, { &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
966 { "sub", vrf_sub
, { &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
967 { "mul", vrf_mul
, { &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
968 { "mulconst", vrf_mulc
, { &type_fgoldi
, &type_long
, &type_fgoldi_ref
} },
969 { "condswap", vrf_condswap
,
970 { &type_fgoldi
, &type_fgoldi
, &type_uint32
,
971 &type_fgoldi_ref
, &type_fgoldi_ref
} },
972 { "sqr", vrf_sqr
, { &type_fgoldi
, &type_fgoldi_ref
} },
973 { "inv", vrf_inv
, { &type_fgoldi
, &type_fgoldi_ref
} },
974 { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul
,
975 { &type_fgoldi
, &type_fgoldi
, &type_long
, &type_fgoldi
,
976 &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
980 int main(int argc
, char *argv
[])
982 test_run(argc
, argv
, tests
, SRCDIR
"/t/fgoldi");
988 /*----- That's all, folks -------------------------------------------------*/