mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* Arithmetic in the Goldilocks field GF(2^448 - 2^224 - 1)
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software; you can redistribute it and/or modify
	13	* it under the terms of the GNU Library General Public License as
	14	* published by the Free Software Foundation; either version 2 of the
	15	* License, or (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb; if not, write to the Free
	24	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	* MA 02111-1307, USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include "config.h"
	31
	32	#include "ct.h"
	33	#include "fgoldi.h"
	34
	35	/----- Basic setup -------------------------------------------------------
	36	*
	37	* Let φ = 2^224; then p = φ^2 - φ - 1, and, in GF(p), we have φ^2 = φ + 1
	38	* (hence the name).
	39	*/
	40
	41	typedef fgoldi_piece piece;
	42
	43	#if FGOLDI_IMPL == 28
	44	/* We represent an element of GF(p) as 16 28-bit signed integer pieces x_i:
	45	* x = SUM_{0<=i<16} x_i 2^(28i).
	46	*/
	47
	48	typedef int64 dblpiece;
	49	typedef uint32 upiece; typedef uint64 udblpiece;
	50	#define PIECEWD(i) 28
	51	#define NPIECE 16
	52	#define P p28
	53
	54	#define B28 0x10000000u
	55	#define B27 0x08000000u
	56	#define M28 0x0fffffffu
	57	#define M27 0x07ffffffu
	58	#define M32 0xffffffffu
	59
	60	#elif FGOLDI_IMPL == 12
	61	/* We represent an element of GF(p) as 40 signed integer pieces x_i: x =
	62	* SUM_{0<=i<40} x_i 2^ceil(224i/20). Pieces i with i == 0 (mod 5) are 12
	63	* bits wide; the others are 11 bits wide, so they form eight groups of 56
	64	* bits.
	65	*/
	66
	67	typedef int32 dblpiece;
	68	typedef uint16 upiece; typedef uint32 udblpiece;
	69	#define PIECEWD(i) ((i)%5 ? 11 : 12)
	70	#define NPIECE 40
	71	#define P p12
	72
	73	#define B12 0x1000u
	74	#define B11 0x0800u
	75	#define B10 0x0400u
	76	#define M12 0xfffu
	77	#define M11 0x7ffu
	78	#define M10 0x3ffu
	79	#define M8 0xffu
	80
	81	#endif
	82
	83	/----- Debugging machinery -----------------------------------------------/
	84
	85	#if defined(FGOLDI_DEBUG) \|\| defined(TEST_RIG)
	86
	87	#include <stdio.h>
	88
	89	#include "mp.h"
	90	#include "mptext.h"
	91
	92	static mp *get_pgoldi(void)
	93	{
	94	mp p = MP_NEW, t = MP_NEW;
	95
	96	p = mp_setbit(p, MP_ZERO, 448);
	97	t = mp_setbit(t, MP_ZERO, 224);
	98	p = mp_sub(p, p, t);
	99	p = mp_sub(p, p, MP_ONE);
	100	mp_drop(t);
	101	return (p);
	102	}
	103
	104	DEF_FDUMP(fdump, piece, PIECEWD, NPIECE, 56, get_pgoldi())
	105
	106	#endif
	107
	108	/----- Loading and storing -----------------------------------------------/
	109
	110	/* --- @fgoldi_load@ --- *
	111	*
	112	* Arguments: @fgoldi *z@ = where to store the result
	113	* @const octet xv[56]@ = source to read
	114	*
	115	* Returns: ---
	116	*
	117	* Use: Reads an element of %$\gf{2^{448} - 2^{224} - 19}$% in
	118	* external representation from @xv@ and stores it in @z@.
	119	*
	120	* External representation is little-endian base-256. Some
	121	* elements have multiple encodings, which are not produced by
	122	* correct software; use of noncanonical encodings is not an
	123	* error, and toleration of them is considered a performance
	124	* feature.
	125	*/
	126
	127	void fgoldi_load(fgoldi *z, const octet xv[56])
	128	{
	129	#if FGOLDI_IMPL == 28
	130
	131	unsigned i;
	132	uint32 xw[14];
	133	piece b, c;
	134
	135	/* First, read the input value as words. */
	136	for (i = 0; i < 14; i++) xw[i] = LOAD32_L(xv + 4*i);
	137
	138	/* Extract unsigned 28-bit pieces from the words. */
	139	z->P[ 0] = (xw[ 0] >> 0)&M28;
	140	z->P[ 7] = (xw[ 6] >> 4)&M28;
	141	z->P[ 8] = (xw[ 7] >> 0)&M28;
	142	z->P[15] = (xw[13] >> 4)&M28;
	143	for (i = 1; i < 7; i++) {
	144	z->P[i + 0] = ((xw[i + 0] << (4i)) \| (xw[i - 1] >> (32 - 4i)))&M28;
	145	z->P[i + 8] = ((xw[i + 7] << (4i)) \| (xw[i + 6] >> (32 - 4i)))&M28;
	146	}
	147
	148	/* Convert the nonnegative pieces into a balanced signed representation, so
	149	* each piece ends up in the interval \|z_i\| <= 2^27. For each piece, if
	150	* its top bit is set, lend a bit leftwards; in the case of z_15, reduce
	151	* this bit by adding it onto z_0 and z_8, since this is the φ^2 bit, and
	152	* φ^2 = φ + 1. We delay this carry until after all of the pieces have
	153	* been balanced. If we don't do this, then we have to do a more expensive
	154	* test for nonzeroness to decide whether to lend a bit leftwards rather
	155	* than just testing a single bit.
	156	*
	157	* Note that we don't try for a canonical representation here: both upper
	158	* and lower bounds are achievable.
	159	*/
	160	b = z->P[15]&B27; z->P[15] -= b << 1; c = b >> 27;
	161	for (i = NPIECE - 1; i--; )
	162	{ b = z->P[i]&B27; z->P[i] -= b << 1; z->P[i + 1] += b >> 27; }
	163	z->P[0] += c; z->P[8] += c;
	164
	165	#elif FGOLDI_IMPL == 12
	166
	167	unsigned i, j, n, w, b;
	168	uint32 a;
	169	int c;
	170
	171	/* First, convert the bytes into nonnegative pieces. */
	172	for (i = j = a = n = 0, w = PIECEWD(0); i < 56; i++) {
	173	a \|= (uint32)xv[i] << n; n += 8;
	174	if (n >= w) {
	175	z->P[j++] = a&MASK(w);
	176	a >>= w; n -= w; w = PIECEWD(j);
	177	}
	178	}
	179
	180	/* Convert the nonnegative pieces into a balanced signed representation, so
	181	* each piece ends up in the interval \|z_i\| <= 2^11 + 1.
	182	*/
	183	b = z->P[39]&B10; z->P[39] -= b << 1; c = b >> 10;
	184	for (i = NPIECE - 1; i--; ) {
	185	w = PIECEWD(i) - 1;
	186	b = z->P[i]&BIT(w);
	187	z->P[i] -= b << 1;
	188	z->P[i + 1] += b >> w;
	189	}
	190	z->P[0] += c; z->P[20] += c;
	191
	192	#endif
	193	}
	194
	195	/* --- @fgoldi_store@ --- *
	196	*
	197	* Arguments: @octet zv[56]@ = where to write the result
	198	* @const fgoldi *x@ = the field element to write
	199	*
	200	* Returns: ---
	201	*
	202	* Use: Stores a field element in the given octet vector in external
	203	* representation. A canonical encoding is always stored.
	204	*/
	205
	206	void fgoldi_store(octet zv[56], const fgoldi *x)
	207	{
	208	#if FGOLDI_IMPL == 28
	209
	210	piece y[NPIECE], yy[NPIECE], c, d;
	211	uint32 u, v;
	212	mask32 m;
	213	unsigned i;
	214
	215	for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
	216
	217	/* First, propagate the carries. By the end of this, we'll have all of the
	218	* the pieces canonically sized and positive, and maybe there'll be
	219	* (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
	220	* value will be in the half-open interval [0, φ^2). The whole represented
	221	* value is then y + φ^2 c.
	222	*
	223	* Assume that we start out with \|y_i\| <= 2^30. We start off by cutting
	224	* off and reducing the carry c_15 from the topmost piece, y_15. This
	225	* leaves 0 <= y_15 < 2^28; and we'll have \|c_15\| <= 4. We'll add this
	226	* onto y_0 and y_8, and propagate the carries. It's very clear that we'll
	227	* end up with \|y + (φ + 1) c_15 - φ^2/2\| << φ^2.
	228	*
	229	* Here, the y_i are signed, so we must be cautious about bithacking them.
	230	*/
	231	c = ASR(piece, y[15], 28); y[15] = (upiece)y[15]&M28; y[8] += c;
	232	for (i = 0; i < NPIECE; i++)
	233	{ y[i] += c; c = ASR(piece, y[i], 28); y[i] = (upiece)y[i]&M28; }
	234
	235	/* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
	236	* y >= p, then we should subtract p from the whole value; if c = -1 then
	237	* we should add p; and otherwise we should do nothing.
	238	*
	239	* But conditional behaviour is bad, m'kay. So here's what we do instead.
	240	*
	241	* The first job is to sort out what we wanted to do. If c = -1 then we
	242	* want to (a) invert the constant addend and (b) feed in a carry-in;
	243	* otherwise, we don't.
	244	*/
	245	m = SIGN(c)&M28;
	246	d = m&1;
	247
	248	/* Now do the addition/subtraction. Remember that all of the y_i are
	249	* nonnegative, so shifting and masking are safe and easy.
	250	*/
	251	d += y[0] + (1 ^ m); yy[0] = d&M28; d >>= 28;
	252	for (i = 1; i < 8; i++)
	253	{ d += y[i] + m; yy[i] = d&M28; d >>= 28; }
	254	d += y[8] + (1 ^ m); yy[8] = d&M28; d >>= 28;
	255	for (i = 9; i < 16; i++)
	256	{ d += y[i] + m; yy[i] = d&M28; d >>= 28; }
	257
	258	/* The final carry-out is in d; since we only did addition, and the y_i are
	259	* nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
	260	* if (a) c /= 0 (in which case we know that the old value was
	261	* unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
	262	* the subtraction didn't cause a borrow, so we must be in the case where
	263	* p <= y < φ^2.
	264	*/
	265	m = NONZEROP(c) \| ~NONZEROP(d - 1);
	266	for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) \| (y[i]&~m);
	267
	268	/* Extract 32-bit words from the value. */
	269	for (i = 0; i < 7; i++) {
	270	u = ((y[i + 0] >> (4i)) \| ((uint32)y[i + 1] << (28 - 4i)))&M32;
	271	v = ((y[i + 8] >> (4i)) \| ((uint32)y[i + 9] << (28 - 4i)))&M32;
	272	STORE32_L(zv + 4*i, u);
	273	STORE32_L(zv + 4*i + 28, v);
	274	}
	275
	276	#elif FGOLDI_IMPL == 12
	277
	278	piece y[NPIECE], yy[NPIECE], c, d;
	279	uint32 a;
	280	mask32 m, mm;
	281	unsigned i, j, n, w;
	282
	283	for (i = 0; i < NPIECE; i++) y[i] = x->P[i];
	284
	285	/* First, propagate the carries. By the end of this, we'll have all of the
	286	* the pieces canonically sized and positive, and maybe there'll be
	287	* (signed) carry out. The carry c is in { -1, 0, +1 }, and the remaining
	288	* value will be in the half-open interval [0, φ^2). The whole represented
	289	* value is then y + φ^2 c.
	290	*
	291	* Assume that we start out with \|y_i\| <= 2^14. We start off by cutting
	292	* off and reducing the carry c_39 from the topmost piece, y_39. This
	293	* leaves 0 <= y_39 < 2^11; and we'll have \|c_39\| <= 16. We'll add this
	294	* onto y_0 and y_20, and propagate the carries. It's very clear that
	295	* we'll end up with \|y + (φ + 1) c_39 - φ^2/2\| << φ^2.
	296	*
	297	* Here, the y_i are signed, so we must be cautious about bithacking them.
	298	*/
	299	c = ASR(piece, y[39], 11); y[39] = (piece)y[39]&M11; y[20] += c;
	300	for (i = 0; i < NPIECE; i++) {
	301	w = PIECEWD(i); m = (1 << w) - 1;
	302	y[i] += c; c = ASR(piece, y[i], w); y[i] = (upiece)y[i]&m;
	303	}
	304
	305	/* Now we have a slightly fiddly job to do. If c = +1, or if c = 0 and
	306	* y >= p, then we should subtract p from the whole value; if c = -1 then
	307	* we should add p; and otherwise we should do nothing.
	308	*
	309	* But conditional behaviour is bad, m'kay. So here's what we do instead.
	310	*
	311	* The first job is to sort out what we wanted to do. If c = -1 then we
	312	* want to (a) invert the constant addend and (b) feed in a carry-in;
	313	* otherwise, we don't.
	314	*/
	315	mm = SIGN(c);
	316	d = m&1;
	317
	318	/* Now do the addition/subtraction. Remember that all of the y_i are
	319	* nonnegative, so shifting and masking are safe and easy.
	320	*/
	321	d += y[ 0] + (1 ^ (mm&M12)); yy[ 0] = d&M12; d >>= 12;
	322	for (i = 1; i < 20; i++) {
	323	w = PIECEWD(i); m = MASK(w);
	324	d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
	325	}
	326	d += y[20] + (1 ^ (mm&M12)); yy[20] = d&M12; d >>= 12;
	327	for (i = 21; i < 40; i++) {
	328	w = PIECEWD(i); m = MASK(w);
	329	d += y[ i] + (mm&m); yy[ i] = d&m; d >>= w;
	330	}
	331
	332	/* The final carry-out is in d; since we only did addition, and the y_i are
	333	* nonnegative, then d is in { 0, 1 }. We want to keep y', rather than y,
	334	* if (a) c /= 0 (in which case we know that the old value was
	335	* unsatisfactory), or (b) if d = 1 (in which case, if c = 0, we know that
	336	* the subtraction didn't cause a borrow, so we must be in the case where
	337	* p <= y < φ^2.
	338	*/
	339	m = NONZEROP(c) \| ~NONZEROP(d - 1);
	340	for (i = 0; i < NPIECE; i++) y[i] = (yy[i]&m) \| (y[i]&~m);
	341
	342	/* Convert that back into octets. */
	343	for (i = j = a = n = 0; i < NPIECE; i++) {
	344	a \|= (uint32)y[i] << n; n += PIECEWD(i);
	345	while (n >= 8) { zv[j++] = a&M8; a >>= 8; n -= 8; }
	346	}
	347
	348	#endif
	349	}
	350
	351	/* --- @fgoldi_set@ --- *
	352	*
	353	* Arguments: @fgoldi *z@ = where to write the result
	354	* @int a@ = a small-ish constant
	355	*
	356	* Returns: ---
	357	*
	358	* Use: Sets @z@ to equal @a@.
	359	*/
	360
	361	void fgoldi_set(fgoldi *x, int a)
	362	{
	363	unsigned i;
	364
	365	x->P[0] = a;
	366	for (i = 1; i < NPIECE; i++) x->P[i] = 0;
	367	}
	368
	369	/----- Basic arithmetic --------------------------------------------------/
	370
	371	/* --- @fgoldi_add@ --- *
	372	*
	373	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	374	* @const fgoldi x, y@ = two operands
	375	*
	376	* Returns: ---
	377	*
	378	* Use: Set @z@ to the sum %$x + y$%.
	379	*/
	380
	381	void fgoldi_add(fgoldi z, const fgoldi x, const fgoldi *y)
	382	{
	383	unsigned i;
	384	for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] + y->P[i];
	385	}
	386
	387	/* --- @fgoldi_sub@ --- *
	388	*
	389	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	390	* @const fgoldi x, y@ = two operands
	391	*
	392	* Returns: ---
	393	*
	394	* Use: Set @z@ to the difference %$x - y$%.
	395	*/
	396
	397	void fgoldi_sub(fgoldi z, const fgoldi x, const fgoldi *y)
	398	{
	399	unsigned i;
	400	for (i = 0; i < NPIECE; i++) z->P[i] = x->P[i] - y->P[i];
	401	}
	402
	403	/* --- @fgoldi_neg@ --- *
	404	*
	405	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
	406	* @const fgoldi *x@ = an operand
	407	*
	408	* Returns: ---
	409	*
	410	* Use: Set @z = -x@.
	411	*/
	412
	413	void fgoldi_neg(fgoldi z, const fgoldi x)
	414	{
	415	unsigned i;
	416	for (i = 0; i < NPIECE; i++) z->P[i] = -x->P[i];
	417	}
	418
	419	/----- Constant-time utilities -------------------------------------------/
	420
	421	/* --- @fgoldi_pick2@ --- *
	422	*
	423	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	424	* @const fgoldi x, y@ = two operands
	425	* @uint32 m@ = a mask
	426	*
	427	* Returns: ---
	428	*
	429	* Use: If @m@ is zero, set @z = y@; if @m@ is all-bits-set, then set
	430	* @z = x@. If @m@ has some other value, then scramble @z@ in
	431	* an unhelpful way.
	432	*/
	433
	434	void fgoldi_pick2(fgoldi z, const fgoldi x, const fgoldi *y, uint32 m)
	435	{
	436	mask32 mm = FIX_MASK32(m);
	437	unsigned i;
	438	for (i = 0; i < NPIECE; i++) z->P[i] = PICK2(x->P[i], y->P[i], mm);
	439	}
	440
	441	/* --- @fgoldi_pickn@ --- *
	442	*
	443	* Arguments: @fgoldi *z@ = where to put the result
	444	* @const fgoldi *v@ = a table of entries
	445	* @size_t n@ = the number of entries in @v@
	446	* @size_t i@ = an index
	447	*
	448	* Returns: ---
	449	*
	450	* Use: If @0 <= i < n < 32@ then set @z = v[i]@. If @n >= 32@ then
	451	* do something unhelpful; otherwise, if @i >= n@ then set @z@
	452	* to zero.
	453	*/
	454
	455	void fgoldi_pickn(fgoldi z, const fgoldi v, size_t n, size_t i)
	456	{
	457	uint32 b = (uint32)1 << (31 - i);
	458	mask32 m;
	459	unsigned j;
	460
	461	for (j = 0; j < NPIECE; j++) z->P[j] = 0;
	462	while (n--) {
	463	m = SIGN(b);
	464	for (j = 0; j < NPIECE; j++) CONDPICK(z->P[j], v->P[j], m);
	465	v++; b <<= 1;
	466	}
	467	}
	468
	469	/* --- @fgoldi_condswap@ --- *
	470	*
	471	* Arguments: @fgoldi x, y@ = two operands
	472	* @uint32 m@ = a mask
	473	*
	474	* Returns: ---
	475	*
	476	* Use: If @m@ is zero, do nothing; if @m@ is all-bits-set, then
	477	* exchange @x@ and @y@. If @m@ has some other value, then
	478	* scramble @x@ and @y@ in an unhelpful way.
	479	*/
	480
	481	void fgoldi_condswap(fgoldi x, fgoldi y, uint32 m)
	482	{
	483	unsigned i;
	484	mask32 mm = FIX_MASK32(m);
	485
	486	for (i = 0; i < NPIECE; i++) CONDSWAP(x->P[i], y->P[i], mm);
	487	}
	488
	489	/* --- @fgoldi_condneg@ --- *
	490	*
	491	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
	492	* @const fgoldi *x@ = an operand
	493	* @uint32 m@ = a mask
	494	*
	495	* Returns: ---
	496	*
	497	* Use: If @m@ is zero, set @z = x@; if @m@ is all-bits-set, then set
	498	* @z = -x@. If @m@ has some other value then scramble @z@ in
	499	* an unhelpful way.
	500	*/
	501
	502	void fgoldi_condneg(fgoldi z, const fgoldi x, uint32 m)
	503	{
	504	#ifdef NEG_TWOC
	505	mask32 m_xor = FIX_MASK32(m);
	506	piece m_add = m&1;
	507	# define CONDNEG(x) (((x) ^ m_xor) + m_add)
	508	#else
	509	int s = PICK2(-1, +1, m);
	510	# define CONDNEG(x) (s*(x))
	511	#endif
	512
	513	unsigned i;
	514	for (i = 0; i < NPIECE; i++) z->P[i] = CONDNEG(x->P[i]);
	515
	516	#undef CONDNEG
	517	}
	518
	519	/----- Multiplication ----------------------------------------------------/
	520
	521	#if FGOLDI_IMPL == 28
	522
	523	/* Let B = 2^63 - 1 be the largest value such that +B and -B can be
	524	* represented in a double-precision piece. On entry, it must be the case
	525	* that \|X_i\| <= M <= B - 2^27 for some M. If this is the case, then, on
	526	* exit, we will have \|Z_i\| <= 2^27 + M/2^27.
	527	*/
	528	#define CARRY_REDUCE(z, x) do { \
	529	dblpiece _t[NPIECE], _c; \
	530	unsigned _i; \
	531	\
	532	/* Bias the input pieces. This keeps the carries and so on centred \
	533	* around zero rather than biased positive. \
	534	*/ \
	535	for (_i = 0; _i < NPIECE; _i++) _t[_i] = (x)[_i] + B27; \
	536	\
	537	/* Calculate the reduced pieces. Careful with the bithacking. */ \
	538	_c = ASR(dblpiece, _t[15], 28); \
	539	(z)[0] = (dblpiece)((udblpiece)_t[0]&M28) - B27 + _c; \
	540	for (_i = 1; _i < NPIECE; _i++) { \
	541	(z)[_i] = (dblpiece)((udblpiece)_t[_i]&M28) - B27 + \
	542	ASR(dblpiece, _t[_i - 1], 28); \
	543	} \
	544	(z)[8] += _c; \
	545	} while (0)
	546
	547	#elif FGOLDI_IMPL == 12
	548
	549	static void carry_reduce(dblpiece x[NPIECE])
	550	{
	551	/* Initial bounds: we assume \|x_i\| < 2^31 - 2^27. */
	552
	553	unsigned i, j;
	554	dblpiece c;
	555
	556	/* The result is nearly canonical, because we do sequential carry
	557	* propagation, because smaller processors are more likely to prefer the
	558	* smaller working set than the instruction-level parallelism.
	559	*
	560	* Start at x_37; truncate it to 10 bits, and propagate the carry to x_38.
	561	* Truncate x_38 to 10 bits, and add the carry onto x_39. Truncate x_39 to
	562	* 10 bits, and add the carry onto x_0 and x_20. And so on.
	563	*
	564	* Once we reach x_37 for the second time, we start with \|x_37\| <= 2^10.
	565	* The carry into x_37 is at most 2^21; so the carry out into x_38 has
	566	* magnitude at most 2^10. In turn, \|x_38\| <= 2^10 before the carry, so is
	567	* now no more than 2^11 in magnitude, and the carry out into x_39 is at
	568	* most 1. This leaves \|x_39\| <= 2^10 + 1 after carry propagation.
	569	*
	570	* Be careful with the bit hacking because the quantities involved are
	571	* signed.
	572	*/
	573
	574	/* For each piece, we bias it so that floor division (as done by an
	575	* arithmetic right shift) and modulus (as done by bitwise-AND) does the
	576	* right thing.
	577	*/
	578	#define CARRY(i, wd, b, m) do { \
	579	x[i] += (b); \
	580	c = ASR(dblpiece, x[i], (wd)); \
	581	x[i] = (dblpiece)((udblpiece)x[i]&(m)) - (b); \
	582	} while (0)
	583
	584	{ CARRY(37, 11, B10, M11); }
	585	{ x[38] += c; CARRY(38, 11, B10, M11); }
	586	{ x[39] += c; CARRY(39, 11, B10, M11); }
	587	x[20] += c;
	588	for (i = 0; i < 35; ) {
	589	{ x[i] += c; CARRY( i, 12, B11, M12); i++; }
	590	for (j = i + 4; i < j; ) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
	591	}
	592	{ x[i] += c; CARRY( i, 12, B11, M12); i++; }
	593	while (i < 39) { x[i] += c; CARRY( i, 11, B10, M11); i++; }
	594	x[39] += c;
	595	}
	596
	597	#endif
	598
	599	/* --- @fgoldi_mulconst@ --- *
	600	*
	601	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
	602	* @const fgoldi *x@ = an operand
	603	* @long a@ = a small-ish constant; %$\|a\| < 2^{20}$%.
	604	*
	605	* Returns: ---
	606	*
	607	* Use: Set @z@ to the product %$a x$%.
	608	*/
	609
	610	void fgoldi_mulconst(fgoldi z, const fgoldi x, long a)
	611	{
	612	unsigned i;
	613	dblpiece zz[NPIECE], aa = a;
	614
	615	for (i = 0; i < NPIECE; i++) zz[i] = aa*x->P[i];
	616	#if FGOLDI_IMPL == 28
	617	CARRY_REDUCE(z->P, zz);
	618	#elif FGOLDI_IMPL == 12
	619	carry_reduce(zz);
	620	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	621	#endif
	622	}
	623
	624	/* --- @fgoldi_mul@ --- *
	625	*
	626	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	627	* @const fgoldi x, y@ = two operands
	628	*
	629	* Returns: ---
	630	*
	631	* Use: Set @z@ to the product %$x y$%.
	632	*/
	633
	634	void fgoldi_mul(fgoldi z, const fgoldi x, const fgoldi *y)
	635	{
	636	dblpiece zz[NPIECE], u[NPIECE];
	637	piece ab[NPIECE/2], cd[NPIECE/2];
	638	const piece
	639	a = x->P + NPIECE/2, b = x->P,
	640	c = y->P + NPIECE/2, d = y->P;
	641	unsigned i, j;
	642
	643	#if FGOLDI_IMPL == 28
	644
	645	# define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
	646
	647	#elif FGOLDI_IMPL == 12
	648
	649	static const unsigned short off[39] = {
	650	0, 12, 23, 34, 45, 56, 68, 79, 90, 101,
	651	112, 124, 135, 146, 157, 168, 180, 191, 202, 213,
	652	224, 236, 247, 258, 269, 280, 292, 303, 314, 325,
	653	336, 348, 359, 370, 381, 392, 404, 415, 426
	654	};
	655
	656	#define M(x,i, y,j) \
	657	(((dblpiece)(x)[i]*(y)[j]) << (off[i] + off[j] - off[(i) + (j)]))
	658
	659	#endif
	660
	661	/* Behold the magic.
	662	*
	663	* Write x = a φ + b, and y = c φ + d. Then x y = a c φ^2 +
	664	* (a d + b c) φ + b d. Karatsuba and Ofman observed that a d + b c =
	665	* (a + b) (c + d) - a c - b d, saving a multiplication, and Hamburg chose
	666	* the prime p so that φ^2 = φ + 1. So
	667	*
	668	* x y = ((a + b) (c + d) - b d) φ + a c + b d
	669	*/
	670
	671	for (i = 0; i < NPIECE; i++) zz[i] = 0;
	672
	673	/* Our first job will be to calculate (1 - φ) b d, and write the result
	674	* into z. As we do this, an interesting thing will happen. Write
	675	* b d = u φ + v; then (1 - φ) b d = u φ + v - u φ^2 - v φ = (1 - φ) v - u.
	676	* So, what we do is to write the product end-swapped and negated, and then
	677	* we'll subtract the (negated, remember) high half from the low half.
	678	*/
	679	for (i = 0; i < NPIECE/2; i++) {
	680	for (j = 0; j < NPIECE/2 - i; j++)
	681	zz[i + j + NPIECE/2] -= M(b,i, d,j);
	682	for (; j < NPIECE/2; j++)
	683	zz[i + j - NPIECE/2] -= M(b,i, d,j);
	684	}
	685	for (i = 0; i < NPIECE/2; i++)
	686	zz[i] -= zz[i + NPIECE/2];
	687
	688	/* Next, we add on a c. There are no surprises here. */
	689	for (i = 0; i < NPIECE/2; i++)
	690	for (j = 0; j < NPIECE/2; j++)
	691	zz[i + j] += M(a,i, c,j);
	692
	693	/* Now, calculate a + b and c + d. */
	694	for (i = 0; i < NPIECE/2; i++)
	695	{ ab[i] = a[i] + b[i]; cd[i] = c[i] + d[i]; }
	696
	697	/* Finally (for the multiplication) we must add on (a + b) (c + d) φ.
	698	* Write (a + b) (c + d) as u φ + v; then we actually want u φ^2 + v φ =
	699	* v φ + (1 + φ) u. We'll store u in a temporary place and add it on
	700	* twice.
	701	*/
	702	for (i = 0; i < NPIECE; i++) u[i] = 0;
	703	for (i = 0; i < NPIECE/2; i++) {
	704	for (j = 0; j < NPIECE/2 - i; j++)
	705	zz[i + j + NPIECE/2] += M(ab,i, cd,j);
	706	for (; j < NPIECE/2; j++)
	707	u[i + j - NPIECE/2] += M(ab,i, cd,j);
	708	}
	709	for (i = 0; i < NPIECE/2; i++)
	710	{ zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
	711
	712	#undef M
	713
	714	#if FGOLDI_IMPL == 28
	715	/* That wraps it up for the multiplication. Let's figure out some bounds.
	716	* Fortunately, Karatsuba is a polynomial identity, so all of the pieces
	717	* end up the way they'd be if we'd done the thing the easy way, which
	718	* simplifies the analysis. Suppose we started with \|x_i\|, \|y_i\| <= 9/5
	719	* 2^28. The overheads in the result are given by the coefficients of
	720	*
	721	* ((u^16 - 1)/(u - 1))^2 mod u^16 - u^8 - 1
	722	*
	723	* the greatest of which is 38. So \|z_i\| <= 3881/252^56 < 2^63.
	724	*
	725	* Anyway, a round of `CARRY_REDUCE' will leave us with \|z_i\| < 2^27 +
	726	* 2^36; and a second round will leave us with \|z_i\| < 2^27 + 512.
	727	*/
	728	for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
	729	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	730	#elif FGOLDI_IMPL == 12
	731	carry_reduce(zz);
	732	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	733	#endif
	734	}
	735
	736	/* --- @fgoldi_sqr@ --- *
	737	*
	738	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	739	* @const fgoldi *x@ = an operand
	740	*
	741	* Returns: ---
	742	*
	743	* Use: Set @z@ to the square %$x^2$%.
	744	*/
	745
	746	void fgoldi_sqr(fgoldi z, const fgoldi x)
	747	{
	748	#if FGOLDI_IMPL == 28
	749
	750	dblpiece zz[NPIECE], u[NPIECE];
	751	piece ab[NPIECE];
	752	const piece a = x->P + NPIECE/2, b = x->P;
	753	unsigned i, j;
	754
	755	# define M(x,i, y,j) ((dblpiece)(x)[i]*(y)[j])
	756
	757	/* The magic is basically the same as `fgoldi_mul' above. We write
	758	* x = a φ + b and use Karatsuba and the special prime shape. This time,
	759	* we have
	760	*
	761	* x^2 = ((a + b)^2 - b^2) φ + a^2 + b^2
	762	*/
	763
	764	for (i = 0; i < NPIECE; i++) zz[i] = 0;
	765
	766	/* Our first job will be to calculate (1 - φ) b^2, and write the result
	767	* into z. Again, this interacts pleasantly with the prime shape.
	768	*/
	769	for (i = 0; i < NPIECE/4; i++) {
	770	zz[2*i + NPIECE/2] -= M(b,i, b,i);
	771	for (j = i + 1; j < NPIECE/2 - i; j++)
	772	zz[i + j + NPIECE/2] -= 2*M(b,i, b,j);
	773	for (; j < NPIECE/2; j++)
	774	zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
	775	}
	776	for (; i < NPIECE/2; i++) {
	777	zz[2*i - NPIECE/2] -= M(b,i, b,i);
	778	for (j = i + 1; j < NPIECE/2; j++)
	779	zz[i + j - NPIECE/2] -= 2*M(b,i, b,j);
	780	}
	781	for (i = 0; i < NPIECE/2; i++)
	782	zz[i] -= zz[i + NPIECE/2];
	783
	784	/* Next, we add on a^2. There are no surprises here. */
	785	for (i = 0; i < NPIECE/2; i++) {
	786	zz[2*i] += M(a,i, a,i);
	787	for (j = i + 1; j < NPIECE/2; j++)
	788	zz[i + j] += 2*M(a,i, a,j);
	789	}
	790
	791	/* Now, calculate a + b. */
	792	for (i = 0; i < NPIECE/2; i++)
	793	ab[i] = a[i] + b[i];
	794
	795	/* Finally (for the multiplication) we must add on (a + b)^2 φ.
	796	* Write (a + b)^2 as u φ + v; then we actually want (u + v) φ + u. We'll
	797	* store u in a temporary place and add it on twice.
	798	*/
	799	for (i = 0; i < NPIECE; i++) u[i] = 0;
	800	for (i = 0; i < NPIECE/4; i++) {
	801	zz[2*i + NPIECE/2] += M(ab,i, ab,i);
	802	for (j = i + 1; j < NPIECE/2 - i; j++)
	803	zz[i + j + NPIECE/2] += 2*M(ab,i, ab,j);
	804	for (; j < NPIECE/2; j++)
	805	u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
	806	}
	807	for (; i < NPIECE/2; i++) {
	808	u[2*i - NPIECE/2] += M(ab,i, ab,i);
	809	for (j = i + 1; j < NPIECE/2; j++)
	810	u[i + j - NPIECE/2] += 2*M(ab,i, ab,j);
	811	}
	812	for (i = 0; i < NPIECE/2; i++)
	813	{ zz[i] += u[i]; zz[i + NPIECE/2] += u[i]; }
	814
	815	#undef M
	816
	817	/* Finally, carrying. */
	818	for (i = 0; i < 2; i++) CARRY_REDUCE(zz, zz);
	819	for (i = 0; i < NPIECE; i++) z->P[i] = zz[i];
	820
	821	#elif FGOLDI_IMPL == 12
	822	fgoldi_mul(z, x, x);
	823	#endif
	824	}
	825
	826	/----- More advanced operations ------------------------------------------/
	827
	828	/* --- @fgoldi_inv@ --- *
	829	*
	830	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@)
	831	* @const fgoldi *x@ = an operand
	832	*
	833	* Returns: ---
	834	*
	835	* Use: Stores in @z@ the multiplicative inverse %$x^{-1}$%. If
	836	* %$x = 0$% then @z@ is set to zero. This is considered a
	837	* feature.
	838	*/
	839
	840	void fgoldi_inv(fgoldi z, const fgoldi x)
	841	{
	842	fgoldi t, u;
	843	unsigned i;
	844
	845	#define SQRN(z, x, n) do { \
	846	fgoldi_sqr((z), (x)); \
	847	for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
	848	} while (0)
	849
	850	/* Calculate x^-1 = x^(p - 2) = x^(2^448 - 2^224 - 3), which also handles
	851	* x = 0 as intended. The addition chain is home-made.
	852	/ / step \| value */
	853	fgoldi_sqr(&u, x); /* 1 \| 2 */
	854	fgoldi_mul(&t, &u, x); /* 2 \| 3 */
	855	SQRN(&u, &t, 2); /* 4 \| 12 */
	856	fgoldi_mul(&t, &u, &t); /* 5 \| 15 */
	857	SQRN(&u, &t, 4); /* 9 \| 240 */
	858	fgoldi_mul(&u, &u, &t); /* 10 \| 255 = 2^8 - 1 */
	859	SQRN(&u, &u, 4); /* 14 \| 2^12 - 16 */
	860	fgoldi_mul(&t, &u, &t); /* 15 \| 2^12 - 1 */
	861	SQRN(&u, &t, 12); /* 27 \| 2^24 - 2^12 */
	862	fgoldi_mul(&u, &u, &t); /* 28 \| 2^24 - 1 */
	863	SQRN(&u, &u, 12); /* 40 \| 2^36 - 2^12 */
	864	fgoldi_mul(&t, &u, &t); /* 41 \| 2^36 - 1 */
	865	fgoldi_sqr(&t, &t); /* 42 \| 2^37 - 2 */
	866	fgoldi_mul(&t, &t, x); /* 43 \| 2^37 - 1 */
	867	SQRN(&u, &t, 37); /* 80 \| 2^74 - 2^37 */
	868	fgoldi_mul(&u, &u, &t); /* 81 \| 2^74 - 1 */
	869	SQRN(&u, &u, 37); /* 118 \| 2^111 - 2^37 */
	870	fgoldi_mul(&t, &u, &t); /* 119 \| 2^111 - 1 */
	871	SQRN(&u, &t, 111); /* 230 \| 2^222 - 2^111 */
	872	fgoldi_mul(&t, &u, &t); /* 231 \| 2^222 - 1 */
	873	fgoldi_sqr(&u, &t); /* 232 \| 2^223 - 2 */
	874	fgoldi_mul(&u, &u, x); /* 233 \| 2^223 - 1 */
	875	SQRN(&u, &u, 223); /* 456 \| 2^446 - 2^223 */
	876	fgoldi_mul(&t, &u, &t); /* 457 \| 2^446 - 2^222 - 1 */
	877	SQRN(&t, &t, 2); /* 459 \| 2^448 - 2^224 - 4 */
	878	fgoldi_mul(z, &t, x); /* 460 \| 2^448 - 2^224 - 3 */
	879
	880	#undef SQRN
	881	}
	882
	883	/* --- @fgoldi_quosqrt@ --- *
	884	*
	885	* Arguments: @fgoldi *z@ = where to put the result (may alias @x@ or @y@)
	886	* @const fgoldi x, y@ = two operands
	887	*
	888	* Returns: Zero if successful, @-1@ if %$x/y$% is not a square.
	889	*
	890	* Use: Stores in @z@ the one of the square roots %$\pm\sqrt{x/y}$%.
	891	* If %$x = y = 0% then the result is zero; if %$y = 0$% but %$x
	892	* \ne 0$% then the operation fails. If you wanted a specific
	893	* square root then you'll have to pick it yourself.
	894	*/
	895
	896	int fgoldi_quosqrt(fgoldi z, const fgoldi x, const fgoldi *y)
	897	{
	898	fgoldi t, u, v;
	899	octet xb[56], b0[56];
	900	int32 rc = -1;
	901	mask32 m;
	902	unsigned i;
	903
	904	#define SQRN(z, x, n) do { \
	905	fgoldi_sqr((z), (x)); \
	906	for (i = 1; i < (n); i++) fgoldi_sqr((z), (z)); \
	907	} while (0)
	908
	909	/* This is, fortunately, significantly easier than the equivalent problem
	910	* in GF(2^255 - 19), since p == 3 (mod 4).
	911	*
	912	* If x/y is square, then so is v = y^2 x/y = x y, and therefore u has
	913	* order r = (p - 1)/2. Let w = v^{(p-3)/4}. Then w^2 = v^{(p-3)/2} =
	914	* u^{r-1} = 1/v = 1/x y. Clearly, then, (x w)^2 = x^2/x y = x/y, so x w
	915	* is one of the square roots we seek.
	916	*
	917	* The addition chain, then, is a prefix of the previous one.
	918	*/
	919	fgoldi_mul(&v, x, y);
	920
	921	fgoldi_sqr(&u, &v); /* 1 \| 2 */
	922	fgoldi_mul(&t, &u, &v); /* 2 \| 3 */
	923	SQRN(&u, &t, 2); /* 4 \| 12 */
	924	fgoldi_mul(&t, &u, &t); /* 5 \| 15 */
	925	SQRN(&u, &t, 4); /* 9 \| 240 */
	926	fgoldi_mul(&u, &u, &t); /* 10 \| 255 = 2^8 - 1 */
	927	SQRN(&u, &u, 4); /* 14 \| 2^12 - 16 */
	928	fgoldi_mul(&t, &u, &t); /* 15 \| 2^12 - 1 */
	929	SQRN(&u, &t, 12); /* 27 \| 2^24 - 2^12 */
	930	fgoldi_mul(&u, &u, &t); /* 28 \| 2^24 - 1 */
	931	SQRN(&u, &u, 12); /* 40 \| 2^36 - 2^12 */
	932	fgoldi_mul(&t, &u, &t); /* 41 \| 2^36 - 1 */
	933	fgoldi_sqr(&t, &t); /* 42 \| 2^37 - 2 */
	934	fgoldi_mul(&t, &t, &v); /* 43 \| 2^37 - 1 */
	935	SQRN(&u, &t, 37); /* 80 \| 2^74 - 2^37 */
	936	fgoldi_mul(&u, &u, &t); /* 81 \| 2^74 - 1 */
	937	SQRN(&u, &u, 37); /* 118 \| 2^111 - 2^37 */
	938	fgoldi_mul(&t, &u, &t); /* 119 \| 2^111 - 1 */
	939	SQRN(&u, &t, 111); /* 230 \| 2^222 - 2^111 */
	940	fgoldi_mul(&t, &u, &t); /* 231 \| 2^222 - 1 */
	941	fgoldi_sqr(&u, &t); /* 232 \| 2^223 - 2 */
	942	fgoldi_mul(&u, &u, &v); /* 233 \| 2^223 - 1 */
	943	SQRN(&u, &u, 223); /* 456 \| 2^446 - 2^223 */
	944	fgoldi_mul(&t, &u, &t); /* 457 \| 2^446 - 2^222 - 1 */
	945
	946	#undef SQRN
	947
	948	/* Now we must decide whether the answer was right. We should have z^2 =
	949	* x/y, so y z^2 = x.
	950	*
	951	* The easiest way to compare is to encode. This isn't as wasteful as it
	952	* sounds: the hard part is normalizing the representations, which we have
	953	* to do anyway.
	954	*/
	955	fgoldi_mul(z, x, &t);
	956	fgoldi_sqr(&t, z);
	957	fgoldi_mul(&t, &t, y);
	958	fgoldi_store(xb, x);
	959	fgoldi_store(b0, &t);
	960	m = -ct_memeq(xb, b0, 56);
	961	rc = PICK2(0, rc, m);
	962	return (rc);
	963	}
	964
	965	/----- Test rig ----------------------------------------------------------/
	966
	967	#ifdef TEST_RIG
	968
	969	#include <mLib/report.h>
	970	#include <mLib/str.h>
	971	#include <mLib/testrig.h>
	972
	973	static void fixdstr(dstr *d)
	974	{
	975	if (d->len > 56)
	976	die(1, "invalid length for fgoldi");
	977	else if (d->len < 56) {
	978	dstr_ensure(d, 56);
	979	memset(d->buf + d->len, 0, 56 - d->len);
	980	d->len = 56;
	981	}
	982	}
	983
	984	static void cvt_fgoldi(const char buf, dstr d)
	985	{
	986	dstr dd = DSTR_INIT;
	987
	988	type_hex.cvt(buf, &dd); fixdstr(&dd);
	989	dstr_ensure(d, sizeof(fgoldi)); d->len = sizeof(fgoldi);
	990	fgoldi_load((fgoldi )d->buf, (const octet )dd.buf);
	991	dstr_destroy(&dd);
	992	}
	993
	994	static void dump_fgoldi(dstr d, FILE fp)
	995	{ fdump(stderr, "???", (const piece *)d->buf); }
	996
	997	static void cvt_fgoldi_ref(const char buf, dstr d)
	998	{ type_hex.cvt(buf, d); fixdstr(d); }
	999
	1000	static void dump_fgoldi_ref(dstr d, FILE fp)
	1001	{
	1002	fgoldi x;
	1003
	1004	fgoldi_load(&x, (const octet *)d->buf);
	1005	fdump(stderr, "???", x.P);
	1006	}
	1007
	1008	static int eq(const fgoldi x, dstr d)
	1009	{ octet b[56]; fgoldi_store(b, x); return (memcmp(b, d->buf, 56) == 0); }
	1010
	1011	static const test_type
	1012	type_fgoldi = { cvt_fgoldi, dump_fgoldi },
	1013	type_fgoldi_ref = { cvt_fgoldi_ref, dump_fgoldi_ref };
	1014
	1015	#define TEST_UNOP(op) \
	1016	static int vrf_##op(dstr dv[]) \
	1017	{ \
	1018	fgoldi x = (fgoldi )dv[0].buf; \
	1019	fgoldi z, zz; \
	1020	int ok = 1; \
	1021	\
	1022	fgoldi_##op(&z, x); \
	1023	if (!eq(&z, &dv[1])) { \
	1024	ok = 0; \
	1025	fprintf(stderr, "failed!\n"); \
	1026	fdump(stderr, "x", x->P); \
	1027	fdump(stderr, "calc", z.P); \
	1028	fgoldi_load(&zz, (const octet *)dv[1].buf); \
	1029	fdump(stderr, "z", zz.P); \
	1030	} \
	1031	\
	1032	return (ok); \
	1033	}
	1034
	1035	TEST_UNOP(sqr)
	1036	TEST_UNOP(inv)
	1037	TEST_UNOP(neg)
	1038
	1039	#define TEST_BINOP(op) \
	1040	static int vrf_##op(dstr dv[]) \
	1041	{ \
	1042	fgoldi x = (fgoldi )dv[0].buf, y = (fgoldi )dv[1].buf; \
	1043	fgoldi z, zz; \
	1044	int ok = 1; \
	1045	\
	1046	fgoldi_##op(&z, x, y); \
	1047	if (!eq(&z, &dv[2])) { \
	1048	ok = 0; \
	1049	fprintf(stderr, "failed!\n"); \
	1050	fdump(stderr, "x", x->P); \
	1051	fdump(stderr, "y", y->P); \
	1052	fdump(stderr, "calc", z.P); \
	1053	fgoldi_load(&zz, (const octet *)dv[2].buf); \
	1054	fdump(stderr, "z", zz.P); \
	1055	} \
	1056	\
	1057	return (ok); \
	1058	}
	1059
	1060	TEST_BINOP(add)
	1061	TEST_BINOP(sub)
	1062	TEST_BINOP(mul)
	1063
	1064	static int vrf_mulc(dstr dv[])
	1065	{
	1066	fgoldi x = (fgoldi )dv[0].buf;
	1067	long a = (const long )dv[1].buf;
	1068	fgoldi z, zz;
	1069	int ok = 1;
	1070
	1071	fgoldi_mulconst(&z, x, a);
	1072	if (!eq(&z, &dv[2])) {
	1073	ok = 0;
	1074	fprintf(stderr, "failed!\n");
	1075	fdump(stderr, "x", x->P);
	1076	fprintf(stderr, "a = %ld\n", a);
	1077	fdump(stderr, "calc", z.P);
	1078	fgoldi_load(&zz, (const octet *)dv[2].buf);
	1079	fdump(stderr, "z", zz.P);
	1080	}
	1081
	1082	return (ok);
	1083	}
	1084
	1085	static int vrf_condneg(dstr dv[])
	1086	{
	1087	fgoldi x = (fgoldi )dv[0].buf;
	1088	uint32 m = (uint32 )dv[1].buf;
	1089	fgoldi z;
	1090	int ok = 1;
	1091
	1092	fgoldi_condneg(&z, x, m);
	1093	if (!eq(&z, &dv[2])) {
	1094	ok = 0;
	1095	fprintf(stderr, "failed!\n");
	1096	fdump(stderr, "x", x->P);
	1097	fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
	1098	fdump(stderr, "calc z", z.P);
	1099	fgoldi_load(&z, (const octet *)dv[1].buf);
	1100	fdump(stderr, "want z", z.P);
	1101	}
	1102
	1103	return (ok);
	1104	}
	1105
	1106	static int vrf_pick2(dstr dv[])
	1107	{
	1108	fgoldi x = (fgoldi )dv[0].buf, y = (fgoldi )dv[1].buf;
	1109	uint32 m = (uint32 )dv[2].buf;
	1110	fgoldi z;
	1111	int ok = 1;
	1112
	1113	fgoldi_pick2(&z, x, y, m);
	1114	if (!eq(&z, &dv[3])) {
	1115	ok = 0;
	1116	fprintf(stderr, "failed!\n");
	1117	fdump(stderr, "x", x->P);
	1118	fdump(stderr, "y", y->P);
	1119	fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
	1120	fdump(stderr, "calc z", z.P);
	1121	fgoldi_load(&z, (const octet *)dv[3].buf);
	1122	fdump(stderr, "want z", z.P);
	1123	}
	1124
	1125	return (ok);
	1126	}
	1127
	1128	static int vrf_pickn(dstr dv[])
	1129	{
	1130	dstr d = DSTR_INIT;
	1131	fgoldi v[32], z;
	1132	size_t i = (uint32 )dv[1].buf, j, n;
	1133	const char *p;
	1134	char *q;
	1135	int ok = 1;
	1136
	1137	for (q = dv[0].buf, n = 0; (p = str_qword(&q, 0)) != 0; n++)
	1138	{ cvt_fgoldi(p, &d); v[n] = (fgoldi )d.buf; }
	1139
	1140	fgoldi_pickn(&z, v, n, i);
	1141	if (!eq(&z, &dv[2])) {
	1142	ok = 0;
	1143	fprintf(stderr, "failed!\n");
	1144	for (j = 0; j < n; j++) {
	1145	fprintf(stderr, "v[%2u]", (unsigned)j);
	1146	fdump(stderr, "", v[j].P);
	1147	}
	1148	fprintf(stderr, "i = %u\n", (unsigned)i);
	1149	fdump(stderr, "calc z", z.P);
	1150	fgoldi_load(&z, (const octet *)dv[2].buf);
	1151	fdump(stderr, "want z", z.P);
	1152	}
	1153
	1154	dstr_destroy(&d);
	1155	return (ok);
	1156	}
	1157
	1158	static int vrf_condswap(dstr dv[])
	1159	{
	1160	fgoldi x = (fgoldi )dv[0].buf, y = (fgoldi )dv[1].buf;
	1161	fgoldi xx = x, yy = y;
	1162	uint32 m = (uint32 )dv[2].buf;
	1163	int ok = 1;
	1164
	1165	fgoldi_condswap(&xx, &yy, m);
	1166	if (!eq(&xx, &dv[3]) \|\| !eq(&yy, &dv[4])) {
	1167	ok = 0;
	1168	fprintf(stderr, "failed!\n");
	1169	fdump(stderr, "x", x->P);
	1170	fdump(stderr, "y", y->P);
	1171	fprintf(stderr, "m = 0x%08lx\n", (unsigned long)m);
	1172	fdump(stderr, "calc xx", xx.P);
	1173	fdump(stderr, "calc yy", yy.P);
	1174	fgoldi_load(&xx, (const octet *)dv[3].buf);
	1175	fgoldi_load(&yy, (const octet *)dv[4].buf);
	1176	fdump(stderr, "want xx", xx.P);
	1177	fdump(stderr, "want yy", yy.P);
	1178	}
	1179
	1180	return (ok);
	1181	}
	1182
	1183	static int vrf_quosqrt(dstr dv[])
	1184	{
	1185	fgoldi x = (fgoldi )dv[0].buf, y = (fgoldi )dv[1].buf;
	1186	fgoldi z, zz;
	1187	int rc;
	1188	int ok = 1;
	1189
	1190	if (dv[2].len) { fixdstr(&dv[2]); fixdstr(&dv[3]); }
	1191	rc = fgoldi_quosqrt(&z, x, y);
	1192	if (!dv[2].len ? !rc : (rc \|\| (!eq(&z, &dv[2]) && !eq(&z, &dv[3])))) {
	1193	ok = 0;
	1194	fprintf(stderr, "failed!\n");
	1195	fdump(stderr, "x", x->P);
	1196	fdump(stderr, "y", y->P);
	1197	if (rc) fprintf(stderr, "calc: FAIL\n");
	1198	else fdump(stderr, "calc", z.P);
	1199	if (!dv[2].len)
	1200	fprintf(stderr, "exp: FAIL\n");
	1201	else {
	1202	fgoldi_load(&zz, (const octet *)dv[2].buf);
	1203	fdump(stderr, "z", zz.P);
	1204	fgoldi_load(&zz, (const octet *)dv[3].buf);
	1205	fdump(stderr, "z'", zz.P);
	1206	}
	1207	}
	1208
	1209	return (ok);
	1210	}
	1211
	1212	static int vrf_sub_mulc_add_sub_mul(dstr dv[])
	1213	{
	1214	fgoldi u = (fgoldi )dv[0].buf, v = (fgoldi )dv[1].buf,
	1215	w = (fgoldi )dv[3].buf, x = (fgoldi )dv[4].buf,
	1216	y = (fgoldi )dv[5].buf;
	1217	long a = (const long )dv[2].buf;
	1218	fgoldi umv, aumv, wpaumv, xmy, z, zz;
	1219	int ok = 1;
	1220
	1221	fgoldi_sub(&umv, u, v);
	1222	fgoldi_mulconst(&aumv, &umv, a);
	1223	fgoldi_add(&wpaumv, w, &aumv);
	1224	fgoldi_sub(&xmy, x, y);
	1225	fgoldi_mul(&z, &wpaumv, &xmy);
	1226
	1227	if (!eq(&z, &dv[6])) {
	1228	ok = 0;
	1229	fprintf(stderr, "failed!\n");
	1230	fdump(stderr, "u", u->P);
	1231	fdump(stderr, "v", v->P);
	1232	fdump(stderr, "u - v", umv.P);
	1233	fprintf(stderr, "a = %ld\n", a);
	1234	fdump(stderr, "a (u - v)", aumv.P);
	1235	fdump(stderr, "w + a (u - v)", wpaumv.P);
	1236	fdump(stderr, "x", x->P);
	1237	fdump(stderr, "y", y->P);
	1238	fdump(stderr, "x - y", xmy.P);
	1239	fdump(stderr, "(x - y) (w + a (u - v))", z.P);
	1240	fgoldi_load(&zz, (const octet *)dv[6].buf); fdump(stderr, "z", zz.P);
	1241	}
	1242
	1243	return (ok);
	1244	}
	1245
	1246	static test_chunk tests[] = {
	1247	{ "add", vrf_add, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	1248	{ "sub", vrf_sub, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	1249	{ "neg", vrf_neg, { &type_fgoldi, &type_fgoldi_ref } },
	1250	{ "condneg", vrf_condneg,
	1251	{ &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
	1252	{ "mul", vrf_mul, { &type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	1253	{ "mulconst", vrf_mulc, { &type_fgoldi, &type_long, &type_fgoldi_ref } },
	1254	{ "pick2", vrf_pick2,
	1255	{ &type_fgoldi, &type_fgoldi, &type_uint32, &type_fgoldi_ref } },
	1256	{ "pickn", vrf_pickn,
	1257	{ &type_string, &type_uint32, &type_fgoldi_ref } },
	1258	{ "condswap", vrf_condswap,
	1259	{ &type_fgoldi, &type_fgoldi, &type_uint32,
	1260	&type_fgoldi_ref, &type_fgoldi_ref } },
	1261	{ "sqr", vrf_sqr, { &type_fgoldi, &type_fgoldi_ref } },
	1262	{ "inv", vrf_inv, { &type_fgoldi, &type_fgoldi_ref } },
	1263	{ "quosqrt", vrf_quosqrt,
	1264	{ &type_fgoldi, &type_fgoldi, &type_hex, &type_hex } },
	1265	{ "sub-mulc-add-sub-mul", vrf_sub_mulc_add_sub_mul,
	1266	{ &type_fgoldi, &type_fgoldi, &type_long, &type_fgoldi,
	1267	&type_fgoldi, &type_fgoldi, &type_fgoldi_ref } },
	1268	{ 0, 0, { 0 } }
	1269	};
	1270
	1271	int main(int argc, char *argv[])
	1272	{
	1273	test_run(argc, argv, tests, SRCDIR "/t/fgoldi");
	1274	return (0);
	1275	}
	1276
	1277	#endif
	1278
	1279	/----- That's all, folks -------------------------------------------------/