3 * Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
5 * (c) 2017 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Catacomb.
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
28 /*----- Header files ------------------------------------------------------*/
34 /*----- Basic setup -------------------------------------------------------*
36 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
40 typedef fgoldi_piece piece
;
43 /* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
44 * x = SUM_{0<=i<16} x_i 2^(28i).
47 typedef int64 dblpiece
;
48 typedef uint32 upiece
; typedef uint64 udblpiece
;
53 #define B28 0x10000000u
54 #define B27 0x08000000u
55 #define M28 0x0fffffffu
56 #define M27 0x07ffffffu
57 #define M32 0xffffffffu
59 #elif FGOLDI_IMPL == 12
60 /* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
61 * SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
62 * bits wide; the others are 11 bits wide, so they form eight groups of 56
66 typedef int32 dblpiece
;
67 typedef uint16 upiece
; typedef uint32 udblpiece
;
68 #define PIECEWD(i) ((i)%5 ? 11 : 12)
82 /*----- Debugging machinery -----------------------------------------------*/
84 #if defined(FGOLDI_DEBUG) || defined(TEST_RIG)
91 static mp
*get_pgoldi(void)
93 mp
*p
= MP_NEW
, *t
= MP_NEW
;
95 p
= mp_setbit(p
, MP_ZERO
, 448);
96 t
= mp_setbit(t
, MP_ZERO
, 224);
98 p
= mp_sub(p
, p
, MP_ONE
);
103 DEF_FDUMP(fdump
, piece
, PIECEWD
, NPIECE
, 56, get_pgoldi())
107 /*----- Loading and storing -----------------------------------------------*/
109 /* --- @fgoldi_load@ --- *
111 * Arguments: @fgoldi *z@ = where to store the result
112 * @const octet xv[56]@ = source to read
116 * Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
117 * external representation from @xv@ and stores it in @z@.
119 * External representation is little-endian base-256. Some
120 * elements have multiple encodings, which are not produced by
121 * correct software; use of noncanonical encodings is not an
122 * error, and toleration of them is considered a performance
126 void fgoldi_load(fgoldi
*z
, const octet xv
[56])
128 #if FGOLDI_IMPL == 28
134 /* First, read the input value as words. */
135 for (i
= 0; i
< 14; i
++) xw
[i
] = LOAD32_L(xv
+ 4*i
);
137 /* Extract unsigned 28-bit pieces from the words. */
138 z
->P
[ 0] = (xw
[ 0] >> 0)&M28
;
139 z
->P
[ 7] = (xw
[ 6] >> 4)&M28
;
140 z
->P
[ 8] = (xw
[ 7] >> 0)&M28
;
141 z
->P
[15] = (xw
[13] >> 4)&M28
;
142 for (i
= 1; i
< 7; i
++) {
143 z
->P
[i
+ 0] = ((xw
[i
+ 0] << (4*i
)) | (xw
[i
- 1] >> (32 - 4*i
)))&M28
;
144 z
->P
[i
+ 8] = ((xw
[i
+ 7] << (4*i
)) | (xw
[i
+ 6] >> (32 - 4*i
)))&M28
;
147 /* Convert the nonnegative pieces into a balanced signed representation, so
148 * each piece ends up in the interval |z_i| <= 2^27. For each piece, if
149 * its top bit is set, lend a bit leftwards; in the case of z_15, reduce
150 * this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
151 * φ^2 = φ + 1. We delay this carry until after all of the pieces have
152 * been balanced. If we don't do this, then we have to do a more expensive
153 * test for nonzeroness to decide whether to lend a bit leftwards rather
154 * than just testing a single bit.
156 * Note that we don't try for a canonical representation here: both upper
157 * and lower bounds are achievable.
159 b
= z
->P
[15]&B27
; z
->P
[15] -= b
<< 1; c
= b
>> 27;
160 for (i
= NPIECE
- 1; i
--; )
161 { b
= z
->P
[i
]&B27
; z
->P
[i
] -= b
<< 1; z
->P
[i
+ 1] += b
>> 27; }
162 z
->P
[0] += c
; z
->P
[8] += c
;
164 #elif FGOLDI_IMPL == 12
166 unsigned i
, j
, n
, w
, b
;
170 /* First, convert the bytes into nonnegative pieces. */
171 for (i
= j
= a
= n
= 0, w
= PIECEWD(0); i
< 56; i
++) {
172 a
|= (uint32
)xv
[i
] << n
; n
+= 8;
174 z
->P
[j
++] = a
&MASK(w
);
175 a
>>= w
; n
-= w
; w
= PIECEWD(j
);
179 /* Convert the nonnegative pieces into a balanced signed representation, so
180 * each piece ends up in the interval |z_i| <= 2^11 + 1.
182 b
= z
->P
[39]&B10
; z
->P
[39] -= b
<< 1; c
= b
>> 10;
183 for (i
= NPIECE
- 1; i
--; ) {
187 z
->P
[i
+ 1] += b
>> w
;
189 z
->P
[0] += c
; z
->P
[20] += c
;
194 /* --- @fgoldi_store@ --- *
196 * Arguments: @octet zv[56]@ = where to write the result
197 * @const fgoldi *x@ = the field element to write
201 * Use: Stores a field element in the given octet vector in external
202 * representation. A canonical encoding is always stored.
205 void fgoldi_store(octet zv
[56], const fgoldi
*x
)
207 #if FGOLDI_IMPL == 28
209 piece y
[NPIECE
], yy
[NPIECE
], c
, d
;
214 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = x
->P
[i
];
216 /* First, propagate the carries. By the end of this, we'll have all of the
217 * the pieces canonically sized and positive, and maybe there'll be
218 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
219 * value will be in the half-open interval [0, φ^2). The whole represented
220 * value is then y + φ^2 c.
222 * Assume that we start out with |y_i| <= 2^30. We start off by cutting
223 * off and reducing the carry c_15 from the topmost piece, y_15. This
224 * leaves 0 <= y_15 < 2^28; and we'll have |c_15| <= 4. We'll add this
225 * onto y_0 and y_8, and propagate the carries. It's very clear that we'll
226 * end up with |y + (φ + 1) c_15 - φ^2/2| << φ^2.
228 * Here, the y_i are signed, so we must be cautious about bithacking them.
230 c
= ASR(piece
, y
[15], 28); y
[15] = (upiece
)y
[15]&M28
; y
[8] += c
;
231 for (i
= 0; i
< NPIECE
; i
++)
232 { y
[i
] += c
; c
= ASR(piece
, y
[i
], 28); y
[i
] = (upiece
)y
[i
]&M28
; }
234 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
235 * y >= p, then we should subtract p from the whole value; if c = -1 then
236 * we should add p; and otherwise we should do nothing.
238 * But conditional behaviour is bad, m'kay. So here's what we do instead.
240 * The first job is to sort out what we wanted to do. If c = -1 then we
241 * want to (a) invert the constant addend and (b) feed in a carry-in;
242 * otherwise, we don't.
247 /* Now do the addition/subtraction. Remember that all of the y_i are
248 * nonnegative, so shifting and masking are safe and easy.
250 d
+= y
[0] + (1 ^ m
); yy
[0] = d
&M28
; d
>>= 28;
251 for (i
= 1; i
< 8; i
++)
252 { d
+= y
[i
] + m
; yy
[i
] = d
&M28
; d
>>= 28; }
253 d
+= y
[8] + (1 ^ m
); yy
[8] = d
&M28
; d
>>= 28;
254 for (i
= 9; i
< 16; i
++)
255 { d
+= y
[i
] + m
; yy
[i
] = d
&M28
; d
>>= 28; }
257 /* The final carry-out is in d; since we only did addition, and the y_i are
258 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
259 * if (a) c /= 0 (in which case we know that the old value was
260 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
261 * the subtraction didn't cause a borrow, so we must be in the case where
264 m
= NONZEROP(c
) | ~NONZEROP(d
- 1);
265 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = (yy
[i
]&m
) | (y
[i
]&~m
);
267 /* Extract 32-bit words from the value. */
268 for (i
= 0; i
< 7; i
++) {
269 u
= ((y
[i
+ 0] >> (4*i
)) | ((uint32
)y
[i
+ 1] << (28 - 4*i
)))&M32
;
270 v
= ((y
[i
+ 8] >> (4*i
)) | ((uint32
)y
[i
+ 9] << (28 - 4*i
)))&M32
;
271 STORE32_L(zv
+ 4*i
, u
);
272 STORE32_L(zv
+ 4*i
+ 28, v
);
275 #elif FGOLDI_IMPL == 12
277 piece y
[NPIECE
], yy
[NPIECE
], c
, d
;
282 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = x
->P
[i
];
284 /* First, propagate the carries. By the end of this, we'll have all of the
285 * the pieces canonically sized and positive, and maybe there'll be
286 * (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
287 * value will be in the half-open interval [0, φ^2). The whole represented
288 * value is then y + φ^2 c.
290 * Assume that we start out with |y_i| <= 2^14. We start off by cutting
291 * off and reducing the carry c_39 from the topmost piece, y_39. This
292 * leaves 0 <= y_39 < 2^11; and we'll have |c_39| <= 16. We'll add this
293 * onto y_0 and y_20, and propagate the carries. It's very clear that
294 * we'll end up with |y + (φ + 1) c_39 - φ^2/2| << φ^2.
296 * Here, the y_i are signed, so we must be cautious about bithacking them.
298 c
= ASR(piece
, y
[39], 11); y
[39] = (piece
)y
[39]&M11
; y
[20] += c
;
299 for (i
= 0; i
< NPIECE
; i
++) {
300 w
= PIECEWD(i
); m
= (1 << w
) - 1;
301 y
[i
] += c
; c
= ASR(piece
, y
[i
], w
); y
[i
] = (upiece
)y
[i
]&m
;
304 /* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
305 * y >= p, then we should subtract p from the whole value; if c = -1 then
306 * we should add p; and otherwise we should do nothing.
308 * But conditional behaviour is bad, m'kay. So here's what we do instead.
310 * The first job is to sort out what we wanted to do. If c = -1 then we
311 * want to (a) invert the constant addend and (b) feed in a carry-in;
312 * otherwise, we don't.
317 /* Now do the addition/subtraction. Remember that all of the y_i are
318 * nonnegative, so shifting and masking are safe and easy.
320 d
+= y
[ 0] + (1 ^ (mm
&M12
)); yy
[ 0] = d
&M12
; d
>>= 12;
321 for (i
= 1; i
< 20; i
++) {
322 w
= PIECEWD(i
); m
= MASK(w
);
323 d
+= y
[ i
] + (mm
&m
); yy
[ i
] = d
&m
; d
>>= w
;
325 d
+= y
[20] + (1 ^ (mm
&M12
)); yy
[20] = d
&M12
; d
>>= 12;
326 for (i
= 21; i
< 40; i
++) {
327 w
= PIECEWD(i
); m
= MASK(w
);
328 d
+= y
[ i
] + (mm
&m
); yy
[ i
] = d
&m
; d
>>= w
;
331 /* The final carry-out is in d; since we only did addition, and the y_i are
332 * nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
333 * if (a) c /= 0 (in which case we know that the old value was
334 * unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
335 * the subtraction didn't cause a borrow, so we must be in the case where
338 m
= NONZEROP(c
) | ~NONZEROP(d
- 1);
339 for (i
= 0; i
< NPIECE
; i
++) y
[i
] = (yy
[i
]&m
) | (y
[i
]&~m
);
341 /* Convert that back into octets. */
342 for (i
= j
= a
= n
= 0; i
< NPIECE
; i
++) {
343 a
|= (uint32
)y
[i
] << n
; n
+= PIECEWD(i
);
344 while (n
>= 8) { zv
[j
++] = a
&M8
; a
>>= 8; n
-= 8; }
350 /* --- @fgoldi_set@ --- *
352 * Arguments: @fgoldi *z@ = where to write the result
353 * @int a@ = a small-ish constant
357 * Use: Sets @z@ to equal @a@.
360 void fgoldi_set(fgoldi
*x
, int a
)
365 for (i
= 1; i
< NPIECE
; i
++) x
->P
[i
] = 0;
368 /*----- Basic arithmetic --------------------------------------------------*/
370 /* --- @fgoldi_add@ --- *
372 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
373 * @const fgoldi *x, *y@ = two operands
377 * Use: Set @z@ to the sum %$x + y$%.
380 void fgoldi_add(fgoldi
*z
, const fgoldi
*x
, const fgoldi
*y
)
383 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = x
->P
[i
] + y
->P
[i
];
386 /* --- @fgoldi_sub@ --- *
388 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
389 * @const fgoldi *x, *y@ = two operands
393 * Use: Set @z@ to the difference %$x - y$%.
396 void fgoldi_sub(fgoldi
*z
, const fgoldi
*x
, const fgoldi
*y
)
399 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = x
->P
[i
] - y
->P
[i
];
402 /*----- Constant-time utilities -------------------------------------------*/
404 /* --- @fgoldi_condswap@ --- *
406 * Arguments: @fgoldi *x, *y@ = two operands
407 * @uint32 m@ = a mask
411 * Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
412 * exchange @x@ and @y@. If @m@ has some other value, then
413 * scramble @x@ and @y@ in an unhelpful way.
416 void fgoldi_condswap(fgoldi
*x
, fgoldi
*y
, uint32 m
)
419 mask32 mm
= FIX_MASK32(m
);
421 for (i
= 0; i
< NPIECE
; i
++) CONDSWAP(x
->P
[i
], y
->P
[i
], mm
);
424 /*----- Multiplication ----------------------------------------------------*/
426 #if FGOLDI_IMPL == 28
428 /* Let B = 2^63 - 1 be the largest value such that +B and -B can be
429 * represented in a double-precision piece. On entry, it must be the case
430 * that |X_i| <= M <= B - 2^27 for some M. If this is the case, then, on
431 * exit, we will have |Z_i| <= 2^27 + M/2^27.
433 #define CARRY_REDUCE(z, x) do { \
434 dblpiece _t[NPIECE], _c; \
437 /* Bias the input pieces. This keeps the carries and so on centred \
438 * around zero rather than biased positive. \
440 for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
442 /* Calculate the reduced pieces. Careful with the bithacking. */ \
443 _c = ASR(dblpiece, _t[15], 28); \
444 (z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
445 for (_i = 1; _i < NPIECE; _i++) { \
446 (z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
447 ASR(dblpiece, _t[_i - 1], 28); \
452 #elif FGOLDI_IMPL == 12
454 static void carry_reduce(dblpiece x
[NPIECE
])
456 /* Initial bounds: we assume |x_i| < 2^31 - 2^27. */
461 /* The result is nearly canonical, because we do sequential carry
462 * propagation, because smaller processors are more likely to prefer the
463 * smaller working set than the instruction-level parallelism.
465 * Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
466 * Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
467 * 10 bits, and add the carry onto x_0 and x_20. And so on.
469 * Once we reach x_37 for the second time, we start with |x_37| <= 2^10.
470 * The carry into x_37 is at most 2^21; so the carry out into x_38 has
471 * magnitude at most 2^10. In turn, |x_38| <= 2^10 before the carry, so is
472 * now no more than 2^11 in magnitude, and the carry out into x_39 is at
473 * most 1. This leaves |x_39| <= 2^10 + 1 after carry propagation.
475 * Be careful with the bit hacking because the quantities involved are
479 /* For each piece, we bias it so that floor division (as done by an
480 * arithmetic right shift) and modulus (as done by bitwise-AND) does the
483 #define CARRY(i, wd, b, m) do { \
485 c = ASR(dblpiece, x[i], (wd)); \
486 x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
489 { CARRY(37, 11, B10
, M11
); }
490 { x
[38] += c
; CARRY(38, 11, B10
, M11
); }
491 { x
[39] += c
; CARRY(39, 11, B10
, M11
); }
493 for (i
= 0; i
< 35; ) {
494 { x
[i
] += c
; CARRY( i
, 12, B11
, M12
); i
++; }
495 for (j
= i
+ 4; i
< j
; ) { x
[i
] += c
; CARRY( i
, 11, B10
, M11
); i
++; }
497 { x
[i
] += c
; CARRY( i
, 12, B11
, M12
); i
++; }
498 while (i
< 39) { x
[i
] += c
; CARRY( i
, 11, B10
, M11
); i
++; }
504 /* --- @fgoldi_mulconst@ --- *
506 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
507 * @const fgoldi *x@ = an operand
508 * @long a@ = a small-ish constant; %$|a| < 2^{20}$%.
512 * Use: Set @z@ to the product %$a x$%.
515 void fgoldi_mulconst(fgoldi
*z
, const fgoldi
*x
, long a
)
518 dblpiece zz
[NPIECE
], aa
= a
;
520 for (i
= 0; i
< NPIECE
; i
++) zz
[i
] = aa
*x
->P
[i
];
521 #if FGOLDI_IMPL == 28
522 CARRY_REDUCE(z
->P
, zz
);
523 #elif FGOLDI_IMPL == 12
525 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
529 /* --- @fgoldi_mul@ --- *
531 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
532 * @const fgoldi *x, *y@ = two operands
536 * Use: Set @z@ to the product %$x y$%.
539 void fgoldi_mul(fgoldi
*z
, const fgoldi
*x
, const fgoldi
*y
)
541 dblpiece zz
[NPIECE
], u
[NPIECE
];
542 piece ab
[NPIECE
/2], cd
[NPIECE
/2];
544 *a
= x
->P
+ NPIECE
/2, *b
= x
->P
,
545 *c
= y
->P
+ NPIECE
/2, *d
= y
->P
;
548 #if FGOLDI_IMPL == 28
550 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
552 #elif FGOLDI_IMPL == 12
554 static const unsigned short off
[39] = {
555 0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
556 112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
557 224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
558 336, 348, 359, 370, 381, 392, 404, 415, 426
561 #define M(x,i, y,j) \
562 (((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
568 * Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
569 * (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
570 * (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
571 * the prime p so that φ^2 = φ + 1. So
573 * x y = ((a + b) (c + d) - b d) φ + a c + b d
576 for (i
= 0; i
< NPIECE
; i
++) zz
[i
] = 0;
578 /* Our first job will be to calculate (1 - φ) b d, and write the result
579 * into z. As we do this, an interesting thing will happen. Write
580 * b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
581 * So, what we do is to write the product end-swapped and negated, and then
582 * we'll subtract the (negated, remember) high half from the low half.
584 for (i
= 0; i
< NPIECE
/2; i
++) {
585 for (j
= 0; j
< NPIECE
/2 - i
; j
++)
586 zz
[i
+ j
+ NPIECE
/2] -= M(b
,i
, d
,j
);
587 for (; j
< NPIECE
/2; j
++)
588 zz
[i
+ j
- NPIECE
/2] -= M(b
,i
, d
,j
);
590 for (i
= 0; i
< NPIECE
/2; i
++)
591 zz
[i
] -= zz
[i
+ NPIECE
/2];
593 /* Next, we add on a c. There are no surprises here. */
594 for (i
= 0; i
< NPIECE
/2; i
++)
595 for (j
= 0; j
< NPIECE
/2; j
++)
596 zz
[i
+ j
] += M(a
,i
, c
,j
);
598 /* Now, calculate a + b and c + d. */
599 for (i
= 0; i
< NPIECE
/2; i
++)
600 { ab
[i
] = a
[i
] + b
[i
]; cd
[i
] = c
[i
] + d
[i
]; }
602 /* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
603 * Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
604 * v φ + (1 + φ) u. We'll store u in a temporary place and add it on
607 for (i
= 0; i
< NPIECE
; i
++) u
[i
] = 0;
608 for (i
= 0; i
< NPIECE
/2; i
++) {
609 for (j
= 0; j
< NPIECE
/2 - i
; j
++)
610 zz
[i
+ j
+ NPIECE
/2] += M(ab
,i
, cd
,j
);
611 for (; j
< NPIECE
/2; j
++)
612 u
[i
+ j
- NPIECE
/2] += M(ab
,i
, cd
,j
);
614 for (i
= 0; i
< NPIECE
/2; i
++)
615 { zz
[i
] += u
[i
]; zz
[i
+ NPIECE
/2] += u
[i
]; }
619 #if FGOLDI_IMPL == 28
620 /* That wraps it up for the multiplication. Let's figure out some bounds.
621 * Fortunately, Karatsuba is a polynomial identity, so all of the pieces
622 * end up the way they'd be if we'd done the thing the easy way, which
623 * simplifies the analysis. Suppose we started with |x_i|, |y_i| <= 9/5
624 * 2^28. The overheads in the result are given by the coefficients of
626 * ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
628 * the greatest of which is 38. So |z_i| <= 38*81/25*2^56 < 2^63.
630 * Anyway, a round of `CARRY_REDUCE' will leave us with |z_i| < 2^27 +
631 * 2^36; and a second round will leave us with |z_i| < 2^27 + 512.
633 for (i
= 0; i
< 2; i
++) CARRY_REDUCE(zz
, zz
);
634 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
635 #elif FGOLDI_IMPL == 12
637 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
641 /* --- @fgoldi_sqr@ --- *
643 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
644 * @const fgoldi *x@ = an operand
648 * Use: Set @z@ to the square %$x^2$%.
651 void fgoldi_sqr(fgoldi
*z
, const fgoldi
*x
)
653 #if FGOLDI_IMPL == 28
655 dblpiece zz
[NPIECE
], u
[NPIECE
];
657 const piece
*a
= x
->P
+ NPIECE
/2, *b
= x
->P
;
660 # define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
662 /* The magic is basically the same as `fgoldi_mul' above. We write
663 * x = a φ + b and use Karatsuba and the special prime shape. This time,
666 * x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
669 for (i
= 0; i
< NPIECE
; i
++) zz
[i
] = 0;
671 /* Our first job will be to calculate (1 - φ) b^2, and write the result
672 * into z. Again, this interacts pleasantly with the prime shape.
674 for (i
= 0; i
< NPIECE
/4; i
++) {
675 zz
[2*i
+ NPIECE
/2] -= M(b
,i
, b
,i
);
676 for (j
= i
+ 1; j
< NPIECE
/2 - i
; j
++)
677 zz
[i
+ j
+ NPIECE
/2] -= 2*M(b
,i
, b
,j
);
678 for (; j
< NPIECE
/2; j
++)
679 zz
[i
+ j
- NPIECE
/2] -= 2*M(b
,i
, b
,j
);
681 for (; i
< NPIECE
/2; i
++) {
682 zz
[2*i
- NPIECE
/2] -= M(b
,i
, b
,i
);
683 for (j
= i
+ 1; j
< NPIECE
/2; j
++)
684 zz
[i
+ j
- NPIECE
/2] -= 2*M(b
,i
, b
,j
);
686 for (i
= 0; i
< NPIECE
/2; i
++)
687 zz
[i
] -= zz
[i
+ NPIECE
/2];
689 /* Next, we add on a^2. There are no surprises here. */
690 for (i
= 0; i
< NPIECE
/2; i
++) {
691 zz
[2*i
] += M(a
,i
, a
,i
);
692 for (j
= i
+ 1; j
< NPIECE
/2; j
++)
693 zz
[i
+ j
] += 2*M(a
,i
, a
,j
);
696 /* Now, calculate a + b. */
697 for (i
= 0; i
< NPIECE
/2; i
++)
700 /* Finally (for the multiplication) we must add on (a + b)^2 φ.
701 * Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
702 * store u in a temporary place and add it on twice.
704 for (i
= 0; i
< NPIECE
; i
++) u
[i
] = 0;
705 for (i
= 0; i
< NPIECE
/4; i
++) {
706 zz
[2*i
+ NPIECE
/2] += M(ab
,i
, ab
,i
);
707 for (j
= i
+ 1; j
< NPIECE
/2 - i
; j
++)
708 zz
[i
+ j
+ NPIECE
/2] += 2*M(ab
,i
, ab
,j
);
709 for (; j
< NPIECE
/2; j
++)
710 u
[i
+ j
- NPIECE
/2] += 2*M(ab
,i
, ab
,j
);
712 for (; i
< NPIECE
/2; i
++) {
713 u
[2*i
- NPIECE
/2] += M(ab
,i
, ab
,i
);
714 for (j
= i
+ 1; j
< NPIECE
/2; j
++)
715 u
[i
+ j
- NPIECE
/2] += 2*M(ab
,i
, ab
,j
);
717 for (i
= 0; i
< NPIECE
/2; i
++)
718 { zz
[i
] += u
[i
]; zz
[i
+ NPIECE
/2] += u
[i
]; }
722 /* Finally, carrying. */
723 for (i
= 0; i
< 2; i
++) CARRY_REDUCE(zz
, zz
);
724 for (i
= 0; i
< NPIECE
; i
++) z
->P
[i
] = zz
[i
];
726 #elif FGOLDI_IMPL == 12
731 /*----- More advanced operations ------------------------------------------*/
733 /* --- @fgoldi_inv@ --- *
735 * Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
736 * @const fgoldi *x@ = an operand
740 * Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
741 * %$x = 0$% then @z@ is set to zero. This is considered a
745 void fgoldi_inv(fgoldi
*z
, const fgoldi
*x
)
750 #define SQRN(z, x, n) do { \
751 fgoldi_sqr((z), (x)); \
752 for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
755 /* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
756 * x = 0 as intended. The addition chain is home-made.
757 */ /* step | value */
758 fgoldi_sqr(&u
, x
); /* 1 | 2 */
759 fgoldi_mul(&t
, &u
, x
); /* 2 | 3 */
760 SQRN(&u
, &t
, 2); /* 4 | 12 */
761 fgoldi_mul(&t
, &u
, &t
); /* 5 | 15 */
762 SQRN(&u
, &t
, 4); /* 9 | 240 */
763 fgoldi_mul(&u
, &u
, &t
); /* 10 | 255 = 2^8 - 1 */
764 SQRN(&u
, &u
, 4); /* 14 | 2^12 - 16 */
765 fgoldi_mul(&t
, &u
, &t
); /* 15 | 2^12 - 1 */
766 SQRN(&u
, &t
, 12); /* 27 | 2^24 - 2^12 */
767 fgoldi_mul(&u
, &u
, &t
); /* 28 | 2^24 - 1 */
768 SQRN(&u
, &u
, 12); /* 40 | 2^36 - 2^12 */
769 fgoldi_mul(&t
, &u
, &t
); /* 41 | 2^36 - 1 */
770 fgoldi_sqr(&t
, &t
); /* 42 | 2^37 - 2 */
771 fgoldi_mul(&t
, &t
, x
); /* 43 | 2^37 - 1 */
772 SQRN(&u
, &t
, 37); /* 80 | 2^74 - 2^37 */
773 fgoldi_mul(&u
, &u
, &t
); /* 81 | 2^74 - 1 */
774 SQRN(&u
, &u
, 37); /* 118 | 2^111 - 2^37 */
775 fgoldi_mul(&t
, &u
, &t
); /* 119 | 2^111 - 1 */
776 SQRN(&u
, &t
, 111); /* 230 | 2^222 - 2^111 */
777 fgoldi_mul(&t
, &u
, &t
); /* 231 | 2^222 - 1 */
778 fgoldi_sqr(&u
, &t
); /* 232 | 2^223 - 2 */
779 fgoldi_mul(&u
, &u
, x
); /* 233 | 2^223 - 1 */
780 SQRN(&u
, &u
, 223); /* 456 | 2^446 - 2^223 */
781 fgoldi_mul(&t
, &u
, &t
); /* 457 | 2^446 - 2^222 - 1 */
782 SQRN(&t
, &t
, 2); /* 459 | 2^448 - 2^224 - 4 */
783 fgoldi_mul(z
, &t
, x
); /* 460 | 2^448 - 2^224 - 3 */
788 /*----- Test rig ----------------------------------------------------------*/
792 #include <mLib/report.h>
793 #include <mLib/str.h>
794 #include <mLib/testrig.h>
796 static void fixdstr(dstr
*d
)
799 die(1, "invalid length for fgoldi");
800 else if (d
->len
< 56) {
802 memset(d
->buf
+ d
->len
, 0, 56 - d
->len
);
807 static void cvt_fgoldi(const char *buf
, dstr
*d
)
811 type_hex
.cvt(buf
, &dd
); fixdstr(&dd
);
812 dstr_ensure(d
, sizeof(fgoldi
)); d
->len
= sizeof(fgoldi
);
813 fgoldi_load((fgoldi
*)d
->buf
, (const octet
*)dd
.buf
);
817 static void dump_fgoldi(dstr
*d
, FILE *fp
)
818 { fdump(stderr
, "???", (const piece
*)d
->buf
); }
820 static void cvt_fgoldi_ref(const char *buf
, dstr
*d
)
821 { type_hex
.cvt(buf
, d
); fixdstr(d
); }
823 static void dump_fgoldi_ref(dstr
*d
, FILE *fp
)
827 fgoldi_load(&x
, (const octet
*)d
->buf
);
828 fdump(stderr
, "???", x
.P
);
831 static int eq(const fgoldi
*x
, dstr
*d
)
832 { octet b
[56]; fgoldi_store(b
, x
); return (memcmp(b
, d
->buf
, 56) == 0); }
834 static const test_type
835 type_fgoldi
= { cvt_fgoldi
, dump_fgoldi
},
836 type_fgoldi_ref
= { cvt_fgoldi_ref
, dump_fgoldi_ref
};
838 #define TEST_UNOP(op) \
839 static int vrf_##op(dstr dv[]) \
841 fgoldi *x = (fgoldi *)dv[0].buf; \
845 fgoldi_##op(&z, x); \
846 if (!eq(&z, &dv[1])) { \
848 fprintf(stderr, "failed!\n"); \
849 fdump(stderr, "x", x->P); \
850 fdump(stderr, "calc", z.P); \
851 fgoldi_load(&zz, (const octet *)dv[1].buf); \
852 fdump(stderr, "z", zz.P); \
861 #define TEST_BINOP(op) \
862 static int vrf_##op(dstr dv[]) \
864 fgoldi *x = (fgoldi *)dv[0].buf, *y = (fgoldi *)dv[1].buf; \
868 fgoldi_##op(&z, x, y); \
869 if (!eq(&z, &dv[2])) { \
871 fprintf(stderr, "failed!\n"); \
872 fdump(stderr, "x", x->P); \
873 fdump(stderr, "y", y->P); \
874 fdump(stderr, "calc", z.P); \
875 fgoldi_load(&zz, (const octet *)dv[2].buf); \
876 fdump(stderr, "z", zz.P); \
886 static int vrf_mulc(dstr dv
[])
888 fgoldi
*x
= (fgoldi
*)dv
[0].buf
;
889 long a
= *(const long *)dv
[1].buf
;
893 fgoldi_mulconst(&z
, x
, a
);
894 if (!eq(&z
, &dv
[2])) {
896 fprintf(stderr
, "failed!\n");
897 fdump(stderr
, "x", x
->P
);
898 fprintf(stderr
, "a = %ld\n", a
);
899 fdump(stderr
, "calc", z
.P
);
900 fgoldi_load(&zz
, (const octet
*)dv
[2].buf
);
901 fdump(stderr
, "z", zz
.P
);
907 static int vrf_condswap(dstr dv
[])
909 fgoldi
*x
= (fgoldi
*)dv
[0].buf
, *y
= (fgoldi
*)dv
[1].buf
;
910 fgoldi xx
= *x
, yy
= *y
;
911 uint32 m
= *(uint32
*)dv
[2].buf
;
914 fgoldi_condswap(&xx
, &yy
, m
);
915 if (!eq(&xx
, &dv
[3]) || !eq(&yy
, &dv
[4])) {
917 fprintf(stderr
, "failed!\n");
918 fdump(stderr
, "x", x
->P
);
919 fdump(stderr
, "y", y
->P
);
920 fprintf(stderr
, "m = 0x%08lx\n", (unsigned long)m
);
921 fdump(stderr
, "calc xx", xx
.P
);
922 fdump(stderr
, "calc yy", yy
.P
);
923 fgoldi_load(&xx
, (const octet
*)dv
[3].buf
);
924 fgoldi_load(&yy
, (const octet
*)dv
[4].buf
);
925 fdump(stderr
, "want xx", xx
.P
);
926 fdump(stderr
, "want yy", yy
.P
);
932 static int vrf_sub_mulc_add_sub_mul(dstr dv
[])
934 fgoldi
*u
= (fgoldi
*)dv
[0].buf
, *v
= (fgoldi
*)dv
[1].buf
,
935 *w
= (fgoldi
*)dv
[3].buf
, *x
= (fgoldi
*)dv
[4].buf
,
936 *y
= (fgoldi
*)dv
[5].buf
;
937 long a
= *(const long *)dv
[2].buf
;
938 fgoldi umv
, aumv
, wpaumv
, xmy
, z
, zz
;
941 fgoldi_sub(&umv
, u
, v
);
942 fgoldi_mulconst(&aumv
, &umv
, a
);
943 fgoldi_add(&wpaumv
, w
, &aumv
);
944 fgoldi_sub(&xmy
, x
, y
);
945 fgoldi_mul(&z
, &wpaumv
, &xmy
);
947 if (!eq(&z
, &dv
[6])) {
949 fprintf(stderr
, "failed!\n");
950 fdump(stderr
, "u", u
->P
);
951 fdump(stderr
, "v", v
->P
);
952 fdump(stderr
, "u - v", umv
.P
);
953 fprintf(stderr
, "a = %ld\n", a
);
954 fdump(stderr
, "a (u - v)", aumv
.P
);
955 fdump(stderr
, "w + a (u - v)", wpaumv
.P
);
956 fdump(stderr
, "x", x
->P
);
957 fdump(stderr
, "y", y
->P
);
958 fdump(stderr
, "x - y", xmy
.P
);
959 fdump(stderr
, "(x - y) (w + a (u - v))", z
.P
);
960 fgoldi_load(&zz
, (const octet
*)dv
[6].buf
); fdump(stderr
, "z", zz
.P
);
966 static test_chunk tests
[] = {
967 { "add", vrf_add
, { &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
968 { "sub", vrf_sub
, { &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
969 { "mul", vrf_mul
, { &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
970 { "mulconst", vrf_mulc
, { &type_fgoldi
, &type_long
, &type_fgoldi_ref
} },
971 { "condswap", vrf_condswap
,
972 { &type_fgoldi
, &type_fgoldi
, &type_uint32
,
973 &type_fgoldi_ref
, &type_fgoldi_ref
} },
974 { "sqr", vrf_sqr
, { &type_fgoldi
, &type_fgoldi_ref
} },
975 { "inv", vrf_inv
, { &type_fgoldi
, &type_fgoldi_ref
} },
976 { "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul
,
977 { &type_fgoldi
, &type_fgoldi
, &type_long
, &type_fgoldi
,
978 &type_fgoldi
, &type_fgoldi
, &type_fgoldi_ref
} },
982 int main(int argc
, char *argv
[])
984 test_run(argc
, argv
, tests
, SRCDIR
"/t/fgoldi");
990 /*----- That's all, folks -------------------------------------------------*/