[secnet] / scaf.c

/* -*-c-*-
 *
 * Simple scalar fields
 *
 * (c) 2017 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of secnet.
 * See README for full list of copyright holders.
 *
 * secnet is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * secnet is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * version 3 along with secnet; if not, see
 * https://www.gnu.org/licenses/gpl.html.
 *
 * This file was originally part of Catacomb, but has been automatically
 * modified for incorporation into secnet: see `import-catacomb-crypto'
 * for details.
 *
 * Catacomb is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Library General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * Catacomb is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with Catacomb; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
 * MA 02111-1307, USA.
 */

/*----- Header files ------------------------------------------------------*/

#include <string.h>

#include "scaf.h"

/*----- Debugging utilities -----------------------------------------------*/

#ifdef SCAF_DEBUG

#include <stdio.h>

#include "mp.h"
#include "mpint.h"
#include "mptext.h"

/* Debugging aid: recombine the @npiece@ single-precision pieces of @x@,
 * each weighted by successive multiples of @piecewd@ bits, into one
 * multiprecision integer, and print it on standard output after the
 * label @what@.
 */
static void scaf_dump(const char *what, const scaf_piece *x,
		      size_t npiece, size_t piecewd)
{
  mp *sum = MP_ZERO, *term = MP_NEW;
  unsigned shift = 0;
  size_t k = 0;

  /* Accumulate x[k] 2^(k piecewd), one piece at a time. */
  while (k < npiece) {
    term = mp_fromuint64(term, x[k]);
    term = mp_lsl(term, term, shift);
    sum = mp_add(sum, sum, term);
    shift += piecewd;
    k++;
  }

  printf(";; %s", what);
  MP_PRINT("", sum);
  putchar('\n');

  mp_drop(sum);
  mp_drop(term);
}

/* Debugging aid: as @scaf_dump@, but for a vector of double-precision
 * pieces.  The pieces are simply added together at their nominal
 * positions, so pieces wider than @piecewd@ bits (pending carries)
 * overlap and are summed.
 */
static void scaf_dumpdbl(const char *what, const scaf_dblpiece *x,
		      size_t npiece, size_t piecewd)
{
  mp *sum = MP_ZERO, *term = MP_NEW;
  unsigned shift = 0;
  size_t k = 0;

  /* Accumulate x[k] 2^(k piecewd), one piece at a time. */
  while (k < npiece) {
    term = mp_fromuint64(term, x[k]);
    term = mp_lsl(term, term, shift);
    sum = mp_add(sum, sum, term);
    shift += piecewd;
    k++;
  }

  printf(";; %s", what);
  MP_PRINT("", sum);
  putchar('\n');

  mp_drop(sum);
  mp_drop(term);
}

#endif

/*----- Main code ---------------------------------------------------------*/

/* --- @scaf_load@ --- *
 *
 * Arguments:	@scaf_piece *z@ = where to write the result
 *		@const octet *b@ = source buffer to read
 *		@size_t sz@ = size of the source buffer
 *		@size_t npiece@ = number of pieces to read
 *		@unsigned piecewd@ = nominal width of pieces in bits
 *
 * Returns:	---
 *
 * Use:		Loads a little-endian encoded scalar into a vector @z@ of
 *		single-precision pieces.  Exactly @npiece@ pieces are
 *		written: input bits which don't fit are ignored, and if the
 *		input runs out early then the remaining pieces are set to
 *		zero.  Nothing at all is written if @npiece@ is zero.
 */

void scaf_load(scaf_piece *z, const octet *b, size_t sz,
	       size_t npiece, unsigned piecewd)
{
  uint32 a = 0, m = ((scaf_piece)1 << piecewd) - 1;
  size_t i, j = 0;
  unsigned n = 0;

  /* With no output pieces there's nowhere to put even the leftover bits. */
  if (!npiece) return;

  for (i = 0; i < sz; i++) {

    /* Widen the byte before shifting.  Left to the integer promotions, it
     * would become a (signed) @int@, and shifting a set bit into the sign
     * position is undefined behaviour.
     */
    a |= (uint32)b[i] << n; n += 8;

    /* Once a whole piece has been collected, emit it. */
    if (n >= piecewd) {
      z[j++] = a&m; a >>= piecewd; n -= piecewd;
      if (j >= npiece) return;
    }
  }

  /* Flush the leftover bits as a final partial piece and zero the rest. */
  z[j++] = a;
  while (j < npiece) z[j++] = 0;
}

/* --- @scaf_loaddbl@ --- *
 *
 * Arguments:	@scaf_dblpiece *z@ = where to write the result
 *		@const octet *b@ = source buffer to read
 *		@size_t sz@ = size of the source buffer
 *		@size_t npiece@ = number of pieces to read
 *		@unsigned piecewd@ = nominal width of pieces in bits
 *
 * Returns:	---
 *
 * Use:		Loads a little-endian encoded scalar into a vector @z@ of
 *		double-precision pieces.  Each piece still holds only
 *		@piecewd@ bits of the input.  Exactly @npiece@ pieces are
 *		written: input bits which don't fit are ignored, and if the
 *		input runs out early then the remaining pieces are set to
 *		zero.  Nothing at all is written if @npiece@ is zero.
 */

void scaf_loaddbl(scaf_dblpiece *z, const octet *b, size_t sz,
		  size_t npiece, unsigned piecewd)
{
  uint32 a = 0, m = ((scaf_piece)1 << piecewd) - 1;
  size_t i, j = 0;
  unsigned n = 0;

  /* With no output pieces there's nowhere to put even the leftover bits. */
  if (!npiece) return;

  for (i = 0; i < sz; i++) {

    /* Widen the byte before shifting.  Left to the integer promotions, it
     * would become a (signed) @int@, and shifting a set bit into the sign
     * position is undefined behaviour.
     */
    a |= (uint32)b[i] << n; n += 8;

    /* Once a whole piece has been collected, emit it. */
    if (n >= piecewd) {
      z[j++] = a&m; a >>= piecewd; n -= piecewd;
      if (j >= npiece) return;
    }
  }

  /* Flush the leftover bits as a final partial piece and zero the rest. */
  z[j++] = a;
  while (j < npiece) z[j++] = 0;
}

/* --- @scaf_store@ --- *
 *
 * Arguments:	@octet *b@ = buffer to fill in
 *		@size_t sz@ = size of the buffer
 *		@const scaf_piece *x@ = scalar to store
 *		@size_t npiece@ = number of pieces in @x@
 *		@unsigned piecewd@ = nominal width of pieces in bits
 *
 * Returns:	---
 *
 * Use:		Stores a scalar in a vector of single-precision pieces as a
 *		little-endian vector of bytes.  Exactly @sz@ bytes are
 *		written: the output is truncated if the buffer is too small
 *		to hold the whole scalar, and padded with zero bytes if it's
 *		larger than necessary.  Nothing at all is written if @sz@ is
 *		zero.
 */

void scaf_store(octet *b, size_t sz, const scaf_piece *x,
		size_t npiece, unsigned piecewd)
{
  uint32 a = 0;
  size_t i, j = 0;
  unsigned n = 0;

  /* An empty buffer can't even take the final partial byte; and the
   * padding length below would wrap around to something enormous.
   */
  if (!sz) return;

  for (i = 0; i < npiece; i++) {

    /* Widen the piece before shifting, so that a narrow @scaf_piece@ isn't
     * shifted as a (signed) @int@ after the integer promotions.
     */
    a |= (uint32)x[i] << n; n += piecewd;

    /* Drain as many whole bytes as are available. */
    while (n >= 8) {
      b[j++] = a&0xffu; a >>= 8; n -= 8;
      if (j >= sz) return;
    }
  }

  /* Here @j < sz@: flush the leftover bits and zero-pad what remains. */
  b[j++] = a;
  memset(b + j, 0, sz - j);
}

/* --- @scaf_mul@ --- *
 *
 * Arguments:	@scaf_dblpiece *z@ = where to put the answer
 *		@const scaf_piece *x, *y@ = the operands
 *		@size_t npiece@ = the length of the operands
 *
 * Returns:	---
 *
 * Use:		Multiply two scalars, each @npiece@ pieces long.  The
 *		destination must have space for @2*npiece@ pieces; the
 *		topmost of these is only ever cleared, never accumulated
 *		into, so it always ends up zero.  The partial products are
 *		simply summed into double-precision accumulators: no carries
 *		are propagated and the result is not reduced.
 */

void scaf_mul(scaf_dblpiece *z, const scaf_piece *x, const scaf_piece *y,
	      size_t npiece)
{
  unsigned k, ix, iy;

  /* Clear every accumulator before we start summing into them. */
  k = 0;
  while (k < 2*npiece) z[k++] = 0;

  /* Operand scanning: row @ix@ adds x[ix] y into z, starting at piece
   * @ix@.
   */
  for (ix = 0; ix < npiece; ix++) {
    const scaf_dblpiece xi = x[ix];
    scaf_dblpiece *row = z + ix;

    for (iy = 0; iy < npiece; iy++) row[iy] += xi*y[iy];
  }
}

/* --- @scaf_reduce@ --- *
 *
 * Arguments:	@scaf_piece *z@ = where to write the result
 *		@const scaf_dblpiece *x@ = the operand to reduce
 *		@const scaf_piece *l@ = the modulus, in internal format
 *		@const scaf_piece *mu@ = scaled approximation to @1/l@
 *		@size_t npiece@ = number of pieces in @l@
 *		@unsigned piecewd@ = nominal width of a piece in bits
 *		@scaf_piece *scratch@ = @3*npiece@ scratch pieces
 *
 * Returns:	---
 *
 * Use:		Reduce @x@ (a vector of @2*npiece@ double-precision pieces)
 *		modulo @l@ (a vector of @npiece@ single-precision pieces),
 *		writing the result to @z@.
 *
 *		Write @n = npiece@, @w = piecewd@, and %$B = 2^w$%.  The
 *		operand @mu@ must contain %$\lfloor B^{2n}/l \rfloor$%, in
 *		@npiece + 1@ pieces.  Furthermore, we must have
 *		%$3 l < B^n$%.  (Fiddle with %$w$% if necessary.)
 *
 *		The scratch space is used as follows: the first @2*npiece@
 *		pieces hold a carry-propagated copy of @x@ (and are later
 *		reused for the trial subtractions); the final @npiece@
 *		pieces hold the quotient estimate.
 */

void scaf_reduce(scaf_piece *z, const scaf_dblpiece *x,
		 const scaf_piece *l, const scaf_piece *mu,
		 size_t npiece, unsigned piecewd, scaf_piece *scratch)
{
  unsigned i, j;
  scaf_piece *t = scratch, *q = scratch + 2*npiece;
  scaf_piece u, m = ((scaf_piece)1 << piecewd) - 1;
  scaf_dblpiece c;

  /* This here is the hard part.
   *
   * Let w = PIECEWD, let n = NPIECE, and let B = 2^w.  We must have
   * B^(n-1) <= l < B^n.
   *
   * The argument MU contains pieces of the quantity µ = floor(B^2n/l), which
   * is a scaled approximation to 1/l.  We'll calculate
   *
   *	q = floor(µ floor(x/B^(n-1))/B^(n+1))
   *
   * which is an underestimate of x/l.
   *
   * With a bit more precision: by definition, u - 1 < floor(u) <= u.  Hence,
   *
   *	B^2n/l - 1 < µ <= B^2n/l
   *
   * and
   *
   *	x/B^(n-1) - 1 < floor(x/B^(n-1)) <= x/B^(n-1)
   *
   * Multiplying these together, and dividing through by B^(n+1), gives
   *
   *	floor(x/l - B^(n-1)/l - x/B^2n + 1/B^(n+1)) <=
   *		q <= µ floor(x/B^(n-1))/B^(n+1) <= floor(x/l)
   *
   * Now, noticing that x < B^2n and l > B^(n-1) shows that x/B^2n and
   * B^(n-1)/l are each less than 1; hence
   *
   *	floor(x/l) - 2 <= q <= floor(x/l) <= x/l
   *
   * Now we set r = x - q l.  Certainly, r == x (mod l); and
   *
   *	0 <= r < x - l floor(x/l) + 2 l < 3 l < B^n
   */

  /* Before we start on the fancy stuff, we need to resolve the pending
   * carries in x.  We'll be doing the floor division by just ignoring some
   * of the pieces, and it would be bad if we missed some significant bits.
   * Of course, this means that we don't actually have to store the low
   * NPIECE - 1 pieces of the result.
   */
  for (i = 0, c = 0; i < 2*npiece; i++)
    { c += x[i]; t[i] = c&m; c >>= piecewd; }

  /* Now we calculate q.  If we calculate this in product-scanning order, we
   * can avoid having to store the low NPIECE + 1 pieces of the product as
   * long as we keep track of the carry out properly.  Conveniently, NMU =
   * NPIECE + 1, which keeps the loop bounds easy in the first pass.
   *
   * Furthermore, because we know that r fits in NPIECE pieces, we only need
   * the low NPIECE pieces of q.
   *
   * The first loop walks the low NPIECE + 1 columns of the product, keeping
   * only the running carry; the second walks the next NPIECE columns and
   * stores them as q.
   */
  for (i = 0, c = 0; i < npiece + 1; i++) {
    for (j = 0; j <= i; j++)
      c += (scaf_dblpiece)t[j + npiece - 1]*mu[i - j];
    c >>= piecewd;
  }
  for (i = 0; i < npiece; i++) {
    for (j = i + 1; j < npiece + 1; j++)
      c += (scaf_dblpiece)t[j + npiece - 1]*mu[npiece + 1 + i - j];
    q[i] = c&m; c >>= piecewd;
  }

  /* Next, we calculate r = x - q l in z.  Product-scanning seems to be
   * working out for us, and this time it will save us needing a large
   * temporary space for the product q l as we go.  On the downside, we have
   * to track the carries from the multiplication and subtraction
   * separately.
   *
   * The subtraction is done by adding the complement: each piece of q l is
   * inverted within its PIECEWD bits, and the subtraction carry u starts
   * out as 1.
   *
   * Notice that the result r is at most NPIECE pieces long, so we can stop
   * once we have that many.
   */
  u = 1; c = 0;
  for (i = 0; i < npiece; i++) {
    for (j = 0; j <= i; j++) c += (scaf_dblpiece)q[j]*l[i - j];
    u += t[i] + ((scaf_piece)(c&m) ^ m);
    z[i] = u&m; u >>= piecewd; c >>= piecewd;
  }

  /* Finally, two passes of conditional subtraction.  Calculate t = z - l; if
   * there's no borrow out the top, then update z = t; otherwise leave z
   * alone.
   *
   * The carry out u is 1 exactly when there was no borrow; negating it
   * turns it into an all-ones or all-zeroes mask, which selects between t
   * and z without branching.
   */
  for (i = 0; i < 2; i++) {
    for (j = 0, u = 1; j < npiece; j++) {
      u += z[j] + (l[j] ^ m);
      t[j] = u&m; u >>= piecewd;
    }
    for (j = 0, u = -u; j < npiece; j++) z[j] = (t[j]&u) | (z[j]&~u);
  }
}

/*----- That's all, folks -------------------------------------------------*/
Commit	Line	Data
a1a6042e MW	1	/* --c--
	2	*
	3	* Simple scalar fields
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of secnet.
	11	* See README for full list of copyright holders.
	12	*
	13	* secnet is free software; you can redistribute it and/or modify it
	14	* under the terms of the GNU General Public License as published by
	15	* the Free Software Foundation; either version 3 of the License, or
	16	* (at your option) any later version.
	17	*
	18	* secnet is distributed in the hope that it will be useful, but
	19	* WITHOUT ANY WARRANTY; without even the implied warranty of
	20	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	21	* General Public License for more details.
	22	*
	23	* You should have received a copy of the GNU General Public License
	24	* version 3 along with secnet; if not, see
	25	* https://www.gnu.org/licenses/gpl.html.
	26	*
	27	* This file was originally part of Catacomb, but has been automatically
	28	* modified for incorporation into secnet: see `import-catacomb-crypto'
	29	* for details.
	30	*
	31	* Catacomb is free software; you can redistribute it and/or modify
	32	* it under the terms of the GNU Library General Public License as
	33	* published by the Free Software Foundation; either version 2 of the
	34	* License, or (at your option) any later version.
	35	*
	36	* Catacomb is distributed in the hope that it will be useful,
	37	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	38	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	39	* GNU Library General Public License for more details.
	40	*
	41	* You should have received a copy of the GNU Library General Public
	42	* License along with Catacomb; if not, write to the Free
	43	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	44	* MA 02111-1307, USA.
	45	*/
	46
	47	/----- Header files ------------------------------------------------------/
	48
	49	#include <string.h>
	50
	51	#include "scaf.h"
	52
	53	/----- Debugging utilities -----------------------------------------------/
	54
	55	#ifdef SCAF_DEBUG
	56
	57	#include <stdio.h>
	58
	59	#include "mp.h"
	60	#include "mpint.h"
	61	#include "mptext.h"
	62
	63	static void scaf_dump(const char what, const scaf_piece x,
	64	size_t npiece, size_t piecewd)
65	{
66	mp y = MP_ZERO, t = MP_NEW;
67	size_t i;
68	unsigned o = 0;
69
70	for (i = 0; i < npiece; i++) {
71	t = mp_fromuint64(t, x[i]);
72	t = mp_lsl(t, t, o);
73	y = mp_add(y, y, t);
74	o += piecewd;
75	}
76	printf(";; %s", what); MP_PRINT("", y); putchar('\n');
77	mp_drop(y); mp_drop(t);
78	}
79
80	static void scaf_dumpdbl(const char what, const scaf_dblpiece x,
81	size_t npiece, size_t piecewd)
82	{
83	mp y = MP_ZERO, t = MP_NEW;
84	size_t i;
85	unsigned o = 0;
86
87	for (i = 0; i < npiece; i++) {
88	t = mp_fromuint64(t, x[i]);
89	t = mp_lsl(t, t, o);
90	y = mp_add(y, y, t);
91	o += piecewd;
92	}
93	printf(";; %s", what); MP_PRINT("", y); putchar('\n');
94	mp_drop(y); mp_drop(t);
95	}
96
97	#endif
98
99	/----- Main code ---------------------------------------------------------/
100
101	/* --- @scaf_load@ --- *
102	*
103	* Arguments: @scaf_piece *z@ = where to write the result
104	* @const octet *b@ = source buffer to read
105	* @size_t sz@ = size of the source buffer
106	* @size_t npiece@ = number of pieces to read
107	* @unsigned piecewd@ = nominal width of pieces in bits
108	*
109	* Returns: ---
110	*
111	* Use: Loads a little-endian encoded scalar into a vector @z@ of
112	* single-precision pieces.
113	*/
114
115	void scaf_load(scaf_piece z, const octet b, size_t sz,
116	size_t npiece, unsigned piecewd)
117	{
118	uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
119	unsigned i, j, n;
120
121	for (i = j = n = 0, a = 0; i < sz; i++) {
122	a \|= b[i] << n; n += 8;
123	if (n >= piecewd) {
124	z[j++] = a&m; a >>= piecewd; n -= piecewd;
125	if (j >= npiece) return;
126	}
127	}
128	z[j++] = a;
129	while (j < npiece) z[j++] = 0;
130	}
131
132	/* --- @scaf_loaddbl@ --- *
133	*
134	* Arguments: @scaf_dblpiece *z@ = where to write the result
135	* @const octet *b@ = source buffer to read
136	* @size_t sz@ = size of the source buffer
137	* @size_t npiece@ = number of pieces to read
138	* @unsigned piecewd@ = nominal width of pieces in bits
139	*
140	* Returns: ---
141	*
142	* Use: Loads a little-endian encoded scalar into a vector @z@ of
143	* double-precision pieces.
144	*/
145
146	void scaf_loaddbl(scaf_dblpiece z, const octet b, size_t sz,
147	size_t npiece, unsigned piecewd)
148	{
149	uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
150	unsigned i, j, n;
151
152	for (i = j = n = 0, a = 0; i < sz; i++) {
153	a \|= b[i] << n; n += 8;
154	if (n >= piecewd) {
155	z[j++] = a&m; a >>= piecewd; n -= piecewd;
156	if (j >= npiece) return;
157	}
158	}
159	z[j++] = a;
160	while (j < npiece) z[j++] = 0;
161	}
162
163	/* --- @scaf_store@ --- *
164	*
165	* Arguments: @octet *b@ = buffer to fill in
166	* @size_t sz@ = size of the buffer
167	* @const scaf_piece *x@ = scalar to store
168	* @size_t npiece@ = number of pieces in @x@
169	* @unsigned piecewd@ = nominal width of pieces in bits
170	*
171	* Returns: ---
172	*
	173	* Use: Stores a scalar in a vector of single-precision pieces as a
174	* little-endian vector of bytes.
175	*/
176
177	void scaf_store(octet b, size_t sz, const scaf_piece x,
178	size_t npiece, unsigned piecewd)
179	{
180	uint32 a;
181	unsigned i, j, n;
182
183	for (i = j = n = 0, a = 0; i < npiece; i++) {
184	a \|= x[i] << n; n += piecewd;
185	while (n >= 8) {
186	b[j++] = a&0xffu; a >>= 8; n -= 8;
187	if (j >= sz) return;
188	}
189	}
190	b[j++] = a;
191	memset(b + j, 0, sz - j);
192	}
193
194	/* --- @scaf_mul@ --- *
195	*
196	* Arguments: @scaf_dblpiece *z@ = where to put the answer
197	* @const scaf_piece x, y@ = the operands
198	* @size_t npiece@ = the length of the operands
199	*
200	* Returns: ---
201	*
202	* Use: Multiply two scalars. The destination must have space for
203	* @2*npiece@ pieces (though the last one will always be zero).
204	* The result is not reduced.
205	*/
206
207	void scaf_mul(scaf_dblpiece z, const scaf_piece x, const scaf_piece *y,
208	size_t npiece)
209	{
210	unsigned i, j;
211
212	for (i = 0; i < 2*npiece; i++) z[i] = 0;
213
214	for (i = 0; i < npiece; i++)
215	for (j = 0; j < npiece; j++)
216	z[i + j] += (scaf_dblpiece)x[i]*y[j];
217	}
218
219	/* --- @scaf_reduce@ --- *
220	*
221	* Arguments: @scaf_piece *z@ = where to write the result
222	* @const scaf_dblpiece *x@ = the operand to reduce
223	* @const scaf_piece *l@ = the modulus, in internal format
224	* @const scaf_piece *mu@ = scaled approximation to @1/l@
225	* @size_t npiece@ = number of pieces in @l@
226	* @unsigned piecewd@ = nominal width of a piece in bits
227	* @scaf_piece scratch@ = @3npiece@ scratch pieces
228	*
229	* Returns: ---
230	*
231	* Use: Reduce @x@ (a vector of @2*npiece@ double-precision pieces)
232	* modulo @l@ (a vector of @npiece@ single-precision pieces),
233	* writing the result to @z@.
234	*
235	* Write @n = npiece@, @w = piecewd@, and %$B = 2^w$%. The
236	* operand @mu@ must contain %$\lfloor B^{2n}/l \rfloor$%, in
237	* @npiece + 1@ pieces. Furthermore, we must have
238	* %$3 l < B^n$%. (Fiddle with %$w$% if necessary.)
239	*/
240
241	void scaf_reduce(scaf_piece z, const scaf_dblpiece x,
242	const scaf_piece l, const scaf_piece mu,
243	size_t npiece, unsigned piecewd, scaf_piece *scratch)
244	{
245	unsigned i, j;
246	scaf_piece t = scratch, q = scratch + 2*npiece;
247	scaf_piece u, m = ((scaf_piece)1 << piecewd) - 1;
248	scaf_dblpiece c;
249
250	/* This here is the hard part.
251	*
252	* Let w = PIECEWD, let n = NPIECE, and let B = 2^w. We must have
253	* B^(n-1) <= l < B^n.
254	*
255	* The argument MU contains pieces of the quantity µ = floor(B^2n/l), which
256	* is a scaled approximation to 1/l. We'll calculate
257	*
258	* q = floor(µ floor(x/B^(n-1))/B^(n+1))
259	*
260	* which is an underestimate of x/l.
261	*
262	* With a bit more precision: by definition, u - 1 < floor(u) <= u. Hence,
263	*
	264	* B^2n/l - 1 < µ <= B^2n/l
265	*
266	* and
267	*
268	* x/B^(n-1) - 1 < floor(x/B^(n-1)) <= x/B^(n-1)
269	*
270	* Multiplying these together, and dividing through by B^(n+1), gives
271	*
272	* floor(x/l - B^(n-1)/l - x/B^2n + 1/B^(n+1)) <=
273	* q <= µ floor(x/B^(n-1))/B^(n+1) <= floor(x/l)
274	*
275	* Now, noticing that x < B^2n and l > B^(n-1) shows that x/B^2n and
276	* B^(n-1)/l are each less than 1; hence
277	*
278	* floor(x/l) - 2 <= q <= floor(x/l) <= x/l
279	*
280	* Now we set r = x - q l. Certainly, r == x (mod l); and
281	*
282	* 0 <= r < x - l floor(x/l) + 2 l < 3 l < B^n
283	*/
284
285	/* Before we start on the fancy stuff, we need to resolve the pending
286	* carries in x. We'll be doing the floor division by just ignoring some
287	* of the pieces, and it would be bad if we missed some significant bits.
288	* Of course, this means that we don't actually have to store the low
289	* NPIECE - 1 pieces of the result.
290	*/
291	for (i = 0, c = 0; i < 2*npiece; i++)
292	{ c += x[i]; t[i] = c&m; c >>= piecewd; }
293
294	/* Now we calculate q. If we calculate this in product-scanning order, we
295	* can avoid having to store the low NPIECE + 1 pieces of the product as
296	* long as we keep track of the carry out properly. Conveniently, NMU =
297	* NPIECE + 1, which keeps the loop bounds easy in the first pass.
298	*
299	* Furthermore, because we know that r fits in NPIECE pieces, we only need
300	* the low NPIECE pieces of q.
301	*/
302	for (i = 0, c = 0; i < npiece + 1; i++) {
303	for (j = 0; j <= i; j++)
304	c += (scaf_dblpiece)t[j + npiece - 1]*mu[i - j];
305	c >>= piecewd;
306	}
307	for (i = 0; i < npiece; i++) {
308	for (j = i + 1; j < npiece + 1; j++)
309	c += (scaf_dblpiece)t[j + npiece - 1]*mu[npiece + 1 + i - j];
310	q[i] = c&m; c >>= piecewd;
311	}
312
313	/* Next, we calculate r - q l in z. Product-scanning seems to be working
314	* out for us, and this time it will save us needing a large temporary
315	* space for the product q l as we go. On the downside, we have to track
316	* the carries from the multiplication and subtraction separately.
317	*
318	* Notice that the result r is at most NPIECE pieces long, so we can stop
319	* once we have that many.
320	*/
321	u = 1; c = 0;
322	for (i = 0; i < npiece; i++) {
323	for (j = 0; j <= i; j++) c += (scaf_dblpiece)q[j]*l[i - j];
324	u += t[i] + ((scaf_piece)(c&m) ^ m);
325	z[i] = u&m; u >>= piecewd; c >>= piecewd;
326	}
327
328	/* Finally, two passes of conditional subtraction. Calculate t = z - l; if
	329	* there's no borrow out the top, then update z = t; otherwise leave z
330	* alone.
331	*/
332	for (i = 0; i < 2; i++) {
333	for (j = 0, u = 1; j < npiece; j++) {
334	u += z[j] + (l[j] ^ m);
335	t[j] = u&m; u >>= piecewd;
336	}
337	for (j = 0, u = -u; j < npiece; j++) z[j] = (t[j]&u) \| (z[j]&~u);
338	}
339	}
340
341	/----- That's all, folks -------------------------------------------------/