mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
/* -*-c-*-
	2	*
	3	* Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
/*----- Licensing notice --------------------------------------------------*
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software; you can redistribute it and/or modify
	13	* it under the terms of the GNU Library General Public License as
	14	* published by the Free Software Foundation; either version 2 of the
	15	* License, or (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb; if not, write to the Free
	24	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	* MA 02111-1307, USA.
	26	*/
	27
/*----- Header files ------------------------------------------------------*/
	29
	30	#include "config.h"
	31
	32	#include "fgoldi.h"
	33
/*----- Basic setup -------------------------------------------------------*
 *
 * Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
 * (hence the name).
 */
	39
	40	typedef fgoldi_piece piece;
	41
	42	#if FGOLDI_IMPL == 28
	43	/* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
	44	* x = SUM_{0<=i<16} x_i 2^(28i).
	45	*/
	46
	47	typedef int64 dblpiece;
	48	typedef uint32 upiece; typedef uint64 udblpiece;
	49	#define PIECEWD(i) 28
	50	#define NPIECE 16
	51	#define P p28
	52
	53	#define B28 0x10000000u
	54	#define B27 0x08000000u
	55	#define M28 0x0fffffffu
	56	#define M27 0x07ffffffu
	57	#define M32 0xffffffffu
	58
	59	#elif FGOLDI_IMPL == 12
	60	/* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
	61	* SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
	62	* bits wide; the others are 11 bits wide, so they form eight groups of 56
	63	* bits.
	64	*/
	65
	66	typedef int32 dblpiece;
	67	typedef uint16 upiece; typedef uint32 udblpiece;
	68	#define PIECEWD(i) ((i)%5 ? 11 : 12)
	69	#define NPIECE 40
	70	#define P p12
	71
	72	#define B12 0x1000u
	73	#define B11 0x0800u
	74	#define B10 0x0400u
	75	#define M12 0xfffu
	76	#define M11 0x7ffu
	77	#define M10 0x3ffu
	78	#define M8 0xffu
	79
	80	#endif
	81
/*----- Debugging machinery -----------------------------------------------*/
	83
	84	#if defined(FGOLDI_DEBUG) \|\| defined(TEST_RIG)
	85
	86	#include <stdio.h>
	87
	88	#include "mp.h"
	89	#include "mptext.h"
	90
	91	static mp *get_pgoldi(void)
	92	{
	93	mp p = MP_NEW, t = MP_NEW;
	94
	95	p = mp_setbit(p, MP_ZERO, 448);
	96	t = mp_setbit(t, MP_ZERO, 224);
	97	p = mp_sub(p, p, t);
	98	p = mp_sub(p, p, MP_ONE);
	99	mp_drop(t);
	100	return (p);
	101	}
	102
	103	DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
	104
	105	#endif
	106
/*----- Loading and storing -----------------------------------------------*/
	108
	109	/* --- @fgoldi_load@ --- *
	110	*
	111	* Arguments: @fgoldi *z@ = where to store the result
	112	* @const octet xv[56]@ = source to read
	113	*
	114	* Returns: ---
	115	*
	116	* Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
	117	* external representation from @xv@ and stores it in @z@.
	118	*
	119	* External representation is little-endian base-256. Some
	120	* elements have multiple encodings, which are not produced by
	121	* correct software; use of noncanonical encodings is not an
	122	* error, and toleration of them is considered a performance
	123	* feature.
	124	*/
	125
	126	void fgoldi_load(fgoldi *z, const octet xv[56])
	127	{
	128	#if FGOLDI_IMPL == 28
	129
	130	unsigned i;
	131	uint32 xw[14];
	132	piece b, c;
	133
	134	/* First, read the input value as words. */
	135	for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
	136
	137	/* Extract unsigned 28-bit pieces from the words. */
	138	z->P[ 0] = (xw[ 0] >> 0)&M28;
	139	z->P[ 7] = (xw[ 6] >> 4)&M28;
	140	z->P[ 8] = (xw[ 7] >> 0)&M28;
	141	z->P[15] = (xw[13] >> 4)&M28;
	142	for (i = 1; i < 7; i++) {
	143	z->P[i + 0] = ((xw[i + 0] << (4i)) \| (xw[i - 1] >> (32 - 4i)))&M28;
	144	z->P[i + 8] = ((xw[i + 7] << (4i)) \| (xw[i + 6] >> (32 - 4i)))&M28;
	145	}
	146
	147	/* Convert the nonnegative pieces into a balanced signed representation, so
	148	* each piece ends up in the interval \|z_i\| <= 2^27. For each piece, if
	149	* its top bit is set, lend a bit leftwards; in the case of z_15, reduce
	150	* this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
	151	* φ^2 = φ + 1. We delay this carry until after all of the pieces have
	152	* been balanced. If we don't do this, then we have to do a more expensive
	153	* test for nonzeroness to decide whether to lend a bit leftwards rather
	154	* than just testing a single bit.
	155	*
	156	* Note that we don't try for a canonical representation here: both upper
	157	* and lower bounds are achievable.
	158	*/
	159	b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
	160	for (i = NPIECE - 1; i--; )
	161	{ b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
	162	z->P[0] += c; z->P[8] += c;
	163
	164	#elif FGOLDI_IMPL == 12
	165
	166	unsigned i, j, n, w, b;
	167	uint32 a;
	168	int c;
	169
	170	/* First, convert the bytes into nonnegative pieces. */
	171	for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
	172	a \|= (uint32)xv[i] << n; n += 8;
	173	if (n >= w) {
	174	z->P[j++] = a&MASK(w);
	175	a >>= w; n -= w; w = PIECEWD(j);
	176	}
	177	}
	178
	179	/* Convert the nonnegative pieces into a balanced signed representation, so
	180	* each piece ends up in the interval \|z_i\| <= 2^11 + 1.
	181	*/
	182	b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
	183	for (i = NPIECE - 1; i--; ) {
	184	w = PIECEWD(i) - 1;
	185	b = z->P[i]&BIT(w);
	186	z->P[i] -= b << 1;
	187	z->P[i + 1] += b >> w;
	188	}
	189	z->P[0] += c; z->P[20] += c;
	190
	191	#endif
	192	}
	193
	194	/* --- @fgoldi_store@ --- *
	195	*
	196	* Arguments: @octet zv[56]@ = where to write the result
	197	* @const fgoldi *x@ = the field element to write
	198	*
	199	* Returns: ---
	200	*
	201	* Use: Stores a field element in the given octet vector in external
	202	* representation. A canonical encoding is always stored.
	203	*/
	204
	205	void fgoldi_store(octet zv[56], const fgoldi *x)
	206	{
	207	#if FGOLDI_IMPL == 28
	208
	209	piece y[NPIECE], yy[NPIECE], c, d;
	210	uint32 u, v;
	211	mask32 m;
	212	unsigned i;
	213
	214	for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
	215
	216	/* First, propagate the carries. By the end of this, we'll have all of the
	217	* the pieces canonically sized and positive, and maybe there'll be
	218	* (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
	219	* value will be in the half-open interval [0, φ^2). The whole represented
	220	* value is then y + φ^2 c.
	221	*
	222	* Assume that we start out with \|y_i\| <= 2^30. We start off by cutting
	223	* off and reducing the carry c_15 from the topmost piece, y_15. This
	224	* leaves 0 <= y_15 < 2^28; and we'll have \|c_15\| <= 4. We'll add this
	225	* onto y_0 and y_8, and propagate the carries. It's very clear that we'll
	226	* end up with \|y + (φ + 1) c_15 - φ^2/2\| << φ^2.
	227	*
	228	* Here, the y_i are signed, so we must be cautious about bithacking them.
	229	*/
	230	c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
	231	for (i = 0; i < NPIECE; i++)
	232	{ y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
	233
	234	/* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
	235	* y >= p, then we should subtract p from the whole value; if c = -1 then
	236	* we should add p; and otherwise we should do nothing.
	237	*
	238	* But conditional behaviour is bad, m'kay. So here's what we do instead.
	239	*
	240	* The first job is to sort out what we wanted to do. If c = -1 then we
	241	* want to (a) invert the constant addend and (b) feed in a carry-in;
	242	* otherwise, we don't.
	243	*/
	244	m = SIGN(c)&M28;
	245	d = m&1;
	246
	247	/* Now do the addition/subtraction. Remember that all of the y_i are
	248	* nonnegative, so shifting and masking are safe and easy.
	249	*/
	250	d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
	251	for (i = 1; i < 8; i++)
	252	{ d += y[i] + m; yy[i] = d&M28; d >>= 28; }
	253	d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
	254	for (i = 9; i < 16; i++)
	255	{ d += y[i] + m; yy[i] = d&M28; d >>= 28; }
	256
	257	/* The final carry-out is in d; since we only did addition, and the y_i are
	258	* nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
	259	* if (a) c /= 0 (in which case we know that the old value was
	260	* unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
	261	* the subtraction didn't cause a borrow, so we must be in the case where
	262	* p <= y < φ^2.
	263	*/
	264	m = NONZEROP(c) \| ~NONZEROP(d - 1);
	265	for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) \| (y[i]&~m);
	266
	267	/* Extract 32-bit words from the value. */
	268	for (i = 0; i < 7; i++) {
	269	u = ((y[i + 0] >> (4i)) \| ((uint32)y[i + 1] << (28 - 4i)))&M32;
	270	v = ((y[i + 8] >> (4i)) \| ((uint32)y[i + 9] << (28 - 4i)))&M32;
	271	STORE32_L(zv + 4*i, u);
	272	STORE32_L(zv + 4*i + 28, v);
	273	}
	274
	275	#elif FGOLDI_IMPL == 12
	276
	277	piece y[NPIECE], yy[NPIECE], c, d;
	278	uint32 a;
	279	mask32 m, mm;
	280	unsigned i, j, n, w;
	281
	282	for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
	283
	284	/* First, propagate the carries. By the end of this, we'll have all of the
	285	* the pieces canonically sized and positive, and maybe there'll be
	286	* (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
	287	* value will be in the half-open interval [0, φ^2). The whole represented
	288	* value is then y + φ^2 c.
	289	*
	290	* Assume that we start out with \|y_i\| <= 2^14. We start off by cutting
	291	* off and reducing the carry c_39 from the topmost piece, y_39. This
	292	* leaves 0 <= y_39 < 2^11; and we'll have \|c_39\| <= 16. We'll add this
	293	* onto y_0 and y_20, and propagate the carries. It's very clear that
	294	* we'll end up with \|y + (φ + 1) c_39 - φ^2/2\| << φ^2.
	295	*
	296	* Here, the y_i are signed, so we must be cautious about bithacking them.
	297	*/
	298	c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
	299	for (i = 0; i < NPIECE; i++) {
	300	w = PIECEWD(i); m = (1 << w) - 1;
	301	y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
	302	}
	303
	304	/* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
	305	* y >= p, then we should subtract p from the whole value; if c = -1 then
	306	* we should add p; and otherwise we should do nothing.
	307	*
	308	* But conditional behaviour is bad, m'kay. So here's what we do instead.
	309	*
	310	* The first job is to sort out what we wanted to do. If c = -1 then we
	311	* want to (a) invert the constant addend and (b) feed in a carry-in;
	312	* otherwise, we don't.
	313	*/
	314	mm = SIGN(c);
	315	d = m&1;
	316
	317	/* Now do the addition/subtraction. Remember that all of the y_i are
	318	* nonnegative, so shifting and masking are safe and easy.
	319	*/
	320	d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
	321	for (i = 1; i < 20; i++) {
	322	w = PIECEWD(i); m = MASK(w);
	323	d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
	324	}
	325	d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
	326	for (i = 21; i < 40; i++) {
	327	w = PIECEWD(i); m = MASK(w);
	328	d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
	329	}
	330
	331	/* The final carry-out is in d; since we only did addition, and the y_i are
	332	* nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
	333	* if (a) c /= 0 (in which case we know that the old value was
	334	* unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
	335	* the subtraction didn't cause a borrow, so we must be in the case where
	336	* p <= y < φ^2.
	337	*/
	338	m = NONZEROP(c) \| ~NONZEROP(d - 1);
	339	for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) \| (y[i]&~m);
	340
	341	/* Convert that back into octets. */
	342	for (i = j = a = n = 0; i < NPIECE; i++) {
	343	a \|= (uint32)y[i] << n; n += PIECEWD(i);
	344	while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
	345	}
	346
	347	#endif
	348	}
	349
	350	/* --- @fgoldi_set@ --- *
	351	*
	352	* Arguments: @fgoldi *z@ = where to write the result
	353	* @int a@ = a small-ish constant
	354	*
	355	* Returns: ---
	356	*
	357	* Use: Sets @z@ to equal @a@.
	358	*/
	359
	360	void fgoldi_set(fgoldi *x, int a)
	361	{
	362	unsigned i;
	363
	364	x->P[0] = a;
	365	for (i = 1; i < NPIECE; i++) x->P[i] = 0;
	366	}
	367
/*----- Basic arithmetic --------------------------------------------------*/
	369
	370	/* --- @fgoldi_add@ --- *
	371	*
	372	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	373	* @const fgoldi x, y@ = two operands
	374	*
	375	* Returns: ---
	376	*
	377	* Use: Set @z@ to the sum %$x + y$%.
	378	*/
	379
	380	void fgoldi_add(fgoldi z, const fgoldi x, const fgoldi *y)
	381	{
	382	unsigned i;
	383	for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
	384	}
	385
	386	/* --- @fgoldi_sub@ --- *
	387	*
	388	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	389	* @const fgoldi x, y@ = two operands
	390	*
	391	* Returns: ---
	392	*
	393	* Use: Set @z@ to the difference %$x - y$%.
	394	*/
	395
	396	void fgoldi_sub(fgoldi z, const fgoldi x, const fgoldi *y)
	397	{
	398	unsigned i;
	399	for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
	400	}
	401
/*----- Constant-time utilities -------------------------------------------*/
	403
	404	/* --- @fgoldi_condswap@ --- *
	405	*
	406	* Arguments: @fgoldi x, y@ = two operands
	407	* @uint32 m@ = a mask
	408	*
	409	* Returns: ---
	410	*
	411	* Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
	412	* exchange @x@ and @y@. If @m@ has some other value, then
	413	* scramble @x@ and @y@ in an unhelpful way.
	414	*/
	415
	416	void fgoldi_condswap(fgoldi x, fgoldi y, uint32 m)
	417	{
	418	unsigned i;
	419	mask32 mm = FIX_MASK32(m);
	420
	421	for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
	422	}
	423
/*----- Multiplication ----------------------------------------------------*/
	425
	426	#if FGOLDI_IMPL == 28
	427
	428	/* Let B = 2^63 - 1 be the largest value such that +B and -B can be
	429	* represented in a double-precision piece. On entry, it must be the case
	430	* that \|X_i\| <= M <= B - 2^27 for some M. If this is the case, then, on
	431	* exit, we will have \|Z_i\| <= 2^27 + M/2^27.
	432	*/
	433	#define CARRY_REDUCE(z, x) do { \
	434	dblpiece _t[NPIECE], _c; \
	435	unsigned _i; \
	436	\
	437	/* Bias the input pieces. This keeps the carries and so on centred \
	438	* around zero rather than biased positive. \
	439	*/ \
	440	for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
	441	\
	442	/* Calculate the reduced pieces. Careful with the bithacking. */ \
	443	_c = ASR(dblpiece, _t[15], 28); \
	444	(z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
	445	for (_i = 1; _i < NPIECE; _i++) { \
	446	(z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
	447	ASR(dblpiece, _t[_i - 1], 28); \
	448	} \
	449	(z)[8] += _c; \
	450	} while (0)
	451
	452	#elif FGOLDI_IMPL == 12
	453
	454	static void carry_reduce(dblpiece x[NPIECE])
	455	{
	456	/* Initial bounds: we assume \|x_i\| < 2^31 - 2^27. */
	457
	458	unsigned i, j;
	459	dblpiece c;
	460
	461	/* The result is nearly canonical, because we do sequential carry
	462	* propagation, because smaller processors are more likely to prefer the
	463	* smaller working set than the instruction-level parallelism.
	464	*
	465	* Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
	466	* Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
	467	* 10 bits, and add the carry onto x_0 and x_20. And so on.
	468	*
	469	* Once we reach x_37 for the second time, we start with \|x_37\| <= 2^10.
	470	* The carry into x_37 is at most 2^21; so the carry out into x_38 has
	471	* magnitude at most 2^10. In turn, \|x_38\| <= 2^10 before the carry, so is
	472	* now no more than 2^11 in magnitude, and the carry out into x_39 is at
	473	* most 1. This leaves \|x_39\| <= 2^10 + 1 after carry propagation.
	474	*
	475	* Be careful with the bit hacking because the quantities involved are
	476	* signed.
	477	*/
	478
	479	/* For each piece, we bias it so that floor division (as done by an
	480	* arithmetic right shift) and modulus (as done by bitwise-AND) does the
	481	* right thing.
	482	*/
	483	#define CARRY(i, wd, b, m) do { \
	484	x[i] += (b); \
	485	c = ASR(dblpiece, x[i], (wd)); \
	486	x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
	487	} while (0)
	488
	489	{ CARRY(37, 11, B10, M11); }
	490	{ x[38] += c; CARRY(38, 11, B10, M11); }
	491	{ x[39] += c; CARRY(39, 11, B10, M11); }
	492	x[20] += c;
	493	for (i = 0; i < 35; ) {
	494	{ x[i] += c; CARRY( i, 12, B11, M12); i++; }
	495	for (j = i + 4; i < j; ) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
	496	}
	497	{ x[i] += c; CARRY( i, 12, B11, M12); i++; }
	498	while (i < 39) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
	499	x[39] += c;
	500	}
	501
	502	#endif
	503
	504	/* --- @fgoldi_mulconst@ --- *
	505	*
	506	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
	507	* @const fgoldi *x@ = an operand
	508	* @long a@ = a small-ish constant; %$\|a\| < 2^{20}$%.
	509	*
	510	* Returns: ---
	511	*
	512	* Use: Set @z@ to the product %$a x$%.
	513	*/
	514
	515	void fgoldi_mulconst(fgoldi z, const fgoldi x, long a)
	516	{
	517	unsigned i;
	518	dblpiece zz[NPIECE], aa = a;
	519
	520	for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
	521	#if FGOLDI_IMPL == 28
	522	CARRY_REDUCE(z->P, zz);
	523	#elif FGOLDI_IMPL == 12
	524	carry_reduce(zz);
	525	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	526	#endif
	527	}
	528
	529	/* --- @fgoldi_mul@ --- *
	530	*
	531	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	532	* @const fgoldi x, y@ = two operands
	533	*
	534	* Returns: ---
	535	*
	536	* Use: Set @z@ to the product %$x y$%.
	537	*/
	538
	539	void fgoldi_mul(fgoldi z, const fgoldi x, const fgoldi *y)
	540	{
	541	dblpiece zz[NPIECE], u[NPIECE];
	542	piece ab[NPIECE/2], cd[NPIECE/2];
	543	const piece
	544	a = x->P + NPIECE/2, b = x->P,
	545	c = y->P + NPIECE/2, d = y->P;
	546	unsigned i, j;
	547
	548	#if FGOLDI_IMPL == 28
	549
	550	# define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
	551
	552	#elif FGOLDI_IMPL == 12
	553
	554	static const unsigned short off[39] = {
	555	0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
	556	112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
	557	224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
	558	336, 348, 359, 370, 381, 392, 404, 415, 426
	559	};
	560
	561	#define M(x,i, y,j) \
	562	(((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
	563
	564	#endif
	565
	566	/* Behold the magic.
	567	*
	568	* Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
	569	* (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
	570	* (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
	571	* the prime p so that φ^2 = φ + 1. So
	572	*
	573	* x y = ((a + b) (c + d) - b d) φ + a c + b d
	574	*/
	575
	576	for (i = 0; i < NPIECE; i++) zz[i] = 0;
	577
	578	/* Our first job will be to calculate (1 - φ) b d, and write the result
	579	* into z. As we do this, an interesting thing will happen. Write
	580	* b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
	581	* So, what we do is to write the product end-swapped and negated, and then
	582	* we'll subtract the (negated, remember) high half from the low half.
	583	*/
	584	for (i = 0; i < NPIECE/2; i++) {
	585	for (j = 0; j < NPIECE/2 - i; j++)
	586	zz[i + j + NPIECE/2] -= M(b,i, d,j);
	587	for (; j < NPIECE/2; j++)
	588	zz[i + j - NPIECE/2] -= M(b,i, d,j);
	589	}
	590	for (i = 0; i < NPIECE/2; i++)
	591	zz[i] -= zz[i + NPIECE/2];
	592
	593	/* Next, we add on a c. There are no surprises here. */
	594	for (i = 0; i < NPIECE/2; i++)
	595	for (j = 0; j < NPIECE/2; j++)
	596	zz[i + j] += M(a,i, c,j);
	597
	598	/* Now, calculate a + b and c + d. */
	599	for (i = 0; i < NPIECE/2; i++)
	600	{ ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
	601
	602	/* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
	603	* Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
	604	* v φ + (1 + φ) u. We'll store u in a temporary place and add it on
	605	* twice.
	606	*/
	607	for (i = 0; i < NPIECE; i++) u[i] = 0;
	608	for (i = 0; i < NPIECE/2; i++) {
	609	for (j = 0; j < NPIECE/2 - i; j++)
	610	zz[i + j + NPIECE/2] += M(ab,i, cd,j);
	611	for (; j < NPIECE/2; j++)
	612	u[i + j - NPIECE/2] += M(ab,i, cd,j);
	613	}
	614	for (i = 0; i < NPIECE/2; i++)
	615	{ zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
	616
	617	#undef M
	618
	619	#if FGOLDI_IMPL == 28
	620	/* That wraps it up for the multiplication. Let's figure out some bounds.
	621	* Fortunately, Karatsuba is a polynomial identity, so all of the pieces
	622	* end up the way they'd be if we'd done the thing the easy way, which
	623	* simplifies the analysis. Suppose we started with \|x_i\|, \|y_i\| <= 9/5
	624	* 2^28. The overheads in the result are given by the coefficients of
	625	*
	626	* ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
	627	*
	628	* the greatest of which is 38. So \|z_i\| <= 3881/252^56 < 2^63.
	629	*
	630	* Anyway, a round of `CARRY_REDUCE' will leave us with \|z_i\| < 2^27 +
	631	* 2^36; and a second round will leave us with \|z_i\| < 2^27 + 512.
	632	*/
	633	for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
	634	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	635	#elif FGOLDI_IMPL == 12
	636	carry_reduce(zz);
	637	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	638	#endif
	639	}
	640
	641	/* --- @fgoldi_sqr@ --- *
	642	*
	643	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	644	* @const fgoldi *x@ = an operand
	645	*
	646	* Returns: ---
	647	*
	648	* Use: Set @z@ to the square %$x^2$%.
	649	*/
	650
	651	void fgoldi_sqr(fgoldi z, const fgoldi x)
	652	{
	653	#if FGOLDI_IMPL == 28
	654
	655	dblpiece zz[NPIECE], u[NPIECE];
	656	piece ab[NPIECE];
	657	const piece a = x->P + NPIECE/2, b = x->P;
	658	unsigned i, j;
	659
	660	# define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
	661
	662	/* The magic is basically the same as `fgoldi_mul' above. We write
	663	* x = a φ + b and use Karatsuba and the special prime shape. This time,
	664	* we have
	665	*
	666	* x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
	667	*/
	668
	669	for (i = 0; i < NPIECE; i++) zz[i] = 0;
	670
	671	/* Our first job will be to calculate (1 - φ) b^2, and write the result
	672	* into z. Again, this interacts pleasantly with the prime shape.
	673	*/
	674	for (i = 0; i < NPIECE/4; i++) {
	675	zz[2*i + NPIECE/2] -= M(b,i, b,i);
	676	for (j = i + 1; j < NPIECE/2 - i; j++)
	677	zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
	678	for (; j < NPIECE/2; j++)
	679	zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
	680	}
	681	for (; i < NPIECE/2; i++) {
	682	zz[2*i - NPIECE/2] -= M(b,i, b,i);
	683	for (j = i + 1; j < NPIECE/2; j++)
	684	zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
	685	}
	686	for (i = 0; i < NPIECE/2; i++)
	687	zz[i] -= zz[i + NPIECE/2];
	688
	689	/* Next, we add on a^2. There are no surprises here. */
	690	for (i = 0; i < NPIECE/2; i++) {
	691	zz[2*i] += M(a,i, a,i);
	692	for (j = i + 1; j < NPIECE/2; j++)
	693	zz[i + j] += 2*M(a,i, a,j);
	694	}
	695
	696	/* Now, calculate a + b. */
	697	for (i = 0; i < NPIECE/2; i++)
	698	ab[i] = a[i] + b[i];
	699
	700	/* Finally (for the multiplication) we must add on (a + b)^2 φ.
	701	* Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
	702	* store u in a temporary place and add it on twice.
	703	*/
	704	for (i = 0; i < NPIECE; i++) u[i] = 0;
	705	for (i = 0; i < NPIECE/4; i++) {
	706	zz[2*i + NPIECE/2] += M(ab,i, ab,i);
	707	for (j = i + 1; j < NPIECE/2 - i; j++)
	708	zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
	709	for (; j < NPIECE/2; j++)
	710	u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
	711	}
	712	for (; i < NPIECE/2; i++) {
	713	u[2*i - NPIECE/2] += M(ab,i, ab,i);
	714	for (j = i + 1; j < NPIECE/2; j++)
	715	u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
	716	}
	717	for (i = 0; i < NPIECE/2; i++)
	718	{ zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
	719
	720	#undef M
	721
	722	/* Finally, carrying. */
	723	for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
	724	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	725
	726	#elif FGOLDI_IMPL == 12
	727	fgoldi_mul(z, x, x);
	728	#endif
	729	}
	730
/*----- More advanced operations ------------------------------------------*/
	732
	733	/* --- @fgoldi_inv@ --- *
	734	*
	735	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
	736	* @const fgoldi *x@ = an operand
	737	*
	738	* Returns: ---
	739	*
	740	* Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
	741	* %$x = 0$% then @z@ is set to zero. This is considered a
	742	* feature.
	743	*/
	744
	745	void fgoldi_inv(fgoldi z, const fgoldi x)
	746	{
	747	fgoldi t, u;
	748	unsigned i;
	749
	750	#define SQRN(z, x, n) do { \
	751	fgoldi_sqr((z), (x)); \
	752	for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
	753	} while (0)
	754
	755	/* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
	756	* x = 0 as intended. The addition chain is home-made.
	757	/ / step \| value */
	758	fgoldi_sqr(&u, x); /* 1 \| 2 */
	759	fgoldi_mul(&t, &u, x); /* 2 \| 3 */
	760	SQRN(&u, &t, 2); /* 4 \| 12 */
	761	fgoldi_mul(&t, &u, &t); /* 5 \| 15 */
	762	SQRN(&u, &t, 4); /* 9 \| 240 */
	763	fgoldi_mul(&u, &u, &t); /* 10 \| 255 = 2^8 - 1 */
	764	SQRN(&u, &u, 4); /* 14 \| 2^12 - 16 */
	765	fgoldi_mul(&t, &u, &t); /* 15 \| 2^12 - 1 */
	766	SQRN(&u, &t, 12); /* 27 \| 2^24 - 2^12 */
	767	fgoldi_mul(&u, &u, &t); /* 28 \| 2^24 - 1 */
	768	SQRN(&u, &u, 12); /* 40 \| 2^36 - 2^12 */
	769	fgoldi_mul(&t, &u, &t); /* 41 \| 2^36 - 1 */
	770	fgoldi_sqr(&t, &t); /* 42 \| 2^37 - 2 */
	771	fgoldi_mul(&t, &t, x); /* 43 \| 2^37 - 1 */
	772	SQRN(&u, &t, 37); /* 80 \| 2^74 - 2^37 */
	773	fgoldi_mul(&u, &u, &t); /* 81 \| 2^74 - 1 */
	774	SQRN(&u, &u, 37); /* 118 \| 2^111 - 2^37 */
	775	fgoldi_mul(&t, &u, &t); /* 119 \| 2^111 - 1 */
	776	SQRN(&u, &t, 111); /* 230 \| 2^222 - 2^111 */
	777	fgoldi_mul(&t, &u, &t); /* 231 \| 2^222 - 1 */
	778	fgoldi_sqr(&u, &t); /* 232 \| 2^223 - 2 */
	779	fgoldi_mul(&u, &u, x); /* 233 \| 2^223 - 1 */
	780	SQRN(&u, &u, 223); /* 456 \| 2^446 - 2^223 */
	781	fgoldi_mul(&t, &u, &t); /* 457 \| 2^446 - 2^222 - 1 */
	782	SQRN(&t, &t, 2); /* 459 \| 2^448 - 2^224 - 4 */
	783	fgoldi_mul(z, &t, x); /* 460 \| 2^448 - 2^224 - 3 */
	784
	785	#undef SQRN
	786	}
	787
/*----- Test rig ----------------------------------------------------------*/
	789
	790	#ifdef TEST_RIG
	791
	792	#include <mLib/report.h>
	793	#include <mLib/str.h>
	794	#include <mLib/testrig.h>
	795
	796	static void fixdstr(dstr *d)
	797	{
	798	if (d->len > 56)
	799	die(1, "invalid length for fgoldi");
	800	else if (d->len < 56) {
	801	dstr_ensure(d, 56);
	802	memset(d->buf + d->len, 0, 56 - d->len);
	803	d->len = 56;
	804	}
	805	}
	806
	807	static void cvt_fgoldi(const char buf, dstr d)
	808	{
	809	dstr dd = DSTR_INIT;
	810
	811	type_hex.cvt(buf, &dd); fixdstr(&dd);
	812	dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
	813	fgoldi_load((fgoldi )d->buf, (const octet )dd.buf);
	814	dstr_destroy(&dd);
	815	}
	816
	817	static void dump_fgoldi(dstr d, FILE fp)
	818	{ fdump(stderr, "???", (const piece *)d->buf); }
	819
	820	static void cvt_fgoldi_ref(const char buf, dstr d)
	821	{ type_hex.cvt(buf, d); fixdstr(d); }
	822
	823	static void dump_fgoldi_ref(dstr d, FILE fp)
	824	{
	825	fgoldi x;
	826
	827	fgoldi_load(&x, (const octet *)d->buf);
	828	fdump(stderr, "???", x.P);
	829	}
	830
	831	static int eq(const fgoldi x, dstr d)
	832	{ octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
	833
	834	static const test_type
	835	type_fgoldi = { cvt_fgoldi, dump_fgoldi },
	836	type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
	837
	838	#define TEST_UNOP(op) \
	839	static int vrf_##op(dstr dv[]) \
	840	{ \
	841	fgoldi x = (fgoldi )dv[0].buf; \
	842	fgoldi z, zz; \
	843	int ok = 1; \
	844	\
	845	fgoldi_##op(&z, x); \
	846	if (!eq(&z, &dv[1])) { \
	847	ok = 0; \
	848	fprintf(stderr, "failed!\n"); \
	849	fdump(stderr, "x", x->P); \
	850	fdump(stderr, "calc", z.P); \
	851	fgoldi_load(&zz, (const octet *)dv[1].buf); \
	852	fdump(stderr, "z", zz.P); \
	853	} \
	854	\
	855	return (ok); \
	856	}
	857
	858	TEST_UNOP(sqr)
	859	TEST_UNOP(inv)
	860
	861	#define TEST_BINOP(op) \
	862	static int vrf_##op(dstr dv[]) \
	863	{ \
	864	fgoldi x = (fgoldi )dv[0].buf, y = (fgoldi )dv[1].buf; \
	865	fgoldi z, zz; \
	866	int ok = 1; \
	867	\
	868	fgoldi_##op(&z, x, y); \
	869	if (!eq(&z, &dv[2])) { \
	870	ok = 0; \
	871	fprintf(stderr, "failed!\n"); \
	872	fdump(stderr, "x", x->P); \
	873	fdump(stderr, "y", y->P); \
	874	fdump(stderr, "calc", z.P); \
	875	fgoldi_load(&zz, (const octet *)dv[2].buf); \
	876	fdump(stderr, "z", zz.P); \
	877	} \
	878	\
	879	return (ok); \
	880	}
	881
	882	TEST_BINOP(add)
	883	TEST_BINOP(sub)
	884	TEST_BINOP(mul)
	885
	886	static int vrf_mulc(dstr dv[])
	887	{
	888	fgoldi x = (fgoldi )dv[0].buf;
	889	long a = (const long )dv[1].buf;
	890	fgoldi z, zz;
	891	int ok = 1;
	892
	893	fgoldi_mulconst(&z, x, a);
	894	if (!eq(&z, &dv[2])) {
	895	ok = 0;
	896	fprintf(stderr, "failed!\n");
	897	fdump(stderr, "x", x->P);
	898	fprintf(stderr, "a = %ld\n", a);
	899	fdump(stderr, "calc", z.P);
	900	fgoldi_load(&zz, (const octet *)dv[2].buf);
	901	fdump(stderr, "z", zz.P);
	902	}
	903
	904	return (ok);
	905	}
	906
	907	static int vrf_condswap(dstr dv[])
	908	{
	909	fgoldi x = (fgoldi )dv[0].buf, y = (fgoldi )dv[1].buf;
	910	fgoldi xx = x, yy = y;
	911	uint32 m = (uint32 )dv[2].buf;
	912	int ok = 1;
	913
	914	fgoldi_condswap(&xx, &yy, m);
	915	if (!eq(&xx, &dv[3]) \|\| !eq(&yy, &dv[4])) {
	916	ok = 0;
	917	fprintf(stderr, "failed!\n");
	918	fdump(stderr, "x", x->P);
	919	fdump(stderr, "y", y->P);
	920	fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
	921	fdump(stderr, "calc xx", xx.P);
	922	fdump(stderr, "calc yy", yy.P);
	923	fgoldi_load(&xx, (const octet *)dv[3].buf);
	924	fgoldi_load(&yy, (const octet *)dv[4].buf);
	925	fdump(stderr, "want xx", xx.P);
	926	fdump(stderr, "want yy", yy.P);
	927	}
	928
	929	return (ok);
	930	}
	931
	932	static int vrf_sub_mulc_add_sub_mul(dstr dv[])
	933	{
	934	fgoldi u = (fgoldi )dv[0].buf, v = (fgoldi )dv[1].buf,
	935	w = (fgoldi )dv[3].buf, x = (fgoldi )dv[4].buf,
	936	y = (fgoldi )dv[5].buf;
	937	long a = (const long )dv[2].buf;
	938	fgoldi umv, aumv, wpaumv, xmy, z, zz;
	939	int ok = 1;
	940
	941	fgoldi_sub(&umv, u, v);
	942	fgoldi_mulconst(&aumv, &umv, a);
	943	fgoldi_add(&wpaumv, w, &aumv);
	944	fgoldi_sub(&xmy, x, y);
	945	fgoldi_mul(&z, &wpaumv, &xmy);
	946
	947	if (!eq(&z, &dv[6])) {
	948	ok = 0;
	949	fprintf(stderr, "failed!\n");
	950	fdump(stderr, "u", u->P);
	951	fdump(stderr, "v", v->P);
	952	fdump(stderr, "u - v", umv.P);
	953	fprintf(stderr, "a = %ld\n", a);
	954	fdump(stderr, "a (u - v)", aumv.P);
	955	fdump(stderr, "w + a (u - v)", wpaumv.P);
	956	fdump(stderr, "x", x->P);
	957	fdump(stderr, "y", y->P);
	958	fdump(stderr, "x - y", xmy.P);
	959	fdump(stderr, "(x - y) (w + a (u - v))", z.P);
	960	fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
	961	}
	962
	963	return (ok);
	964	}
	965
	966	static test_chunk tests[] = {
	967	{ "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	968	{ "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	969	{ "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	970	{ "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
	971	{ "condswap", vrf_condswap,
	972	{ &type_fgoldi, &type_fgoldi, &type_uint32,
	973	&type_fgoldi_ref, &type_fgoldi_ref } },
	974	{ "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
	975	{ "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
	976	{ "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
	977	{ &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
	978	&type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	979	{ 0, 0, { 0 } }
	980	};
	981
	982	int main(int argc, char *argv[])
	983	{
	984	test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
	985	return (0);
	986	}
	987
	988	#endif
	989
/*----- That's all, folks -------------------------------------------------*/