mdw@git.distorted.org.uk Git - catacomb/blame_incremental

... / ...

Commit	Line	Data
	1	/* --c--
	2	*
	3	* Simple scalar fields
	4	*
	5	* (c) 2017 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of Catacomb.
	11	*
	12	* Catacomb is free software; you can redistribute it and/or modify
	13	* it under the terms of the GNU Library General Public License as
	14	* published by the Free Software Foundation; either version 2 of the
	15	* License, or (at your option) any later version.
	16	*
	17	* Catacomb is distributed in the hope that it will be useful,
	18	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	* GNU Library General Public License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with Catacomb; if not, write to the Free
	24	* Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
	25	* MA 02111-1307, USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include <string.h>
	31
	32	#include "scaf.h"
	33
	34	/----- Debugging utilties ------------------------------------------------/
	35
	36	#ifdef SCAF_DEBUG
	37
	38	#include <stdio.h>
	39
	40	#include "mp.h"
	41	#include "mpint.h"
	42	#include "mptext.h"
	43
	44	static void scaf_dump(const char what, const scaf_piece x,
	45	size_t npiece, size_t piecewd)
	46	{
	47	mp y = MP_ZERO, t = MP_NEW;
	48	size_t i;
	49	unsigned o = 0;
	50
	51	for (i = 0; i < npiece; i++) {
	52	t = mp_fromuint64(t, x[i]);
	53	t = mp_lsl(t, t, o);
	54	y = mp_add(y, y, t);
	55	o += piecewd;
	56	}
	57	printf(";; %s", what); MP_PRINT("", y); putchar('\n');
	58	mp_drop(y); mp_drop(t);
	59	}
	60
	61	static void scaf_dumpdbl(const char what, const scaf_dblpiece x,
	62	size_t npiece, size_t piecewd)
	63	{
	64	mp y = MP_ZERO, t = MP_NEW;
	65	size_t i;
	66	unsigned o = 0;
	67
	68	for (i = 0; i < npiece; i++) {
	69	t = mp_fromuint64(t, x[i]);
	70	t = mp_lsl(t, t, o);
	71	y = mp_add(y, y, t);
	72	o += piecewd;
	73	}
	74	printf(";; %s", what); MP_PRINT("", y); putchar('\n');
	75	mp_drop(y); mp_drop(t);
	76	}
	77
	78	#endif
	79
	80	/----- Main code ---------------------------------------------------------/
	81
	82	/* --- @scaf_load@ --- *
	83	*
	84	* Arguments: @scaf_piece *z@ = where to write the result
	85	* @const octet *b@ = source buffer to read
	86	* @size_t sz@ = size of the source buffer
	87	* @size_t npiece@ = number of pieces to read
	88	* @unsigned piecewd@ = nominal width of pieces in bits
	89	*
	90	* Returns: ---
	91	*
	92	* Use: Loads a little-endian encoded scalar into a vector @z@ of
	93	* single-precision pieces.
	94	*/
	95
	96	void scaf_load(scaf_piece z, const octet b, size_t sz,
	97	size_t npiece, unsigned piecewd)
	98	{
	99	uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
	100	unsigned i, j, n;
	101
	102	for (i = j = n = 0, a = 0; i < sz; i++) {
	103	a \|= b[i] << n; n += 8;
	104	if (n >= piecewd) {
	105	z[j++] = a&m; a >>= piecewd; n -= piecewd;
	106	if (j >= npiece) return;
	107	}
	108	}
	109	z[j++] = a;
	110	while (j < npiece) z[j++] = 0;
	111	}
	112
	113	/* --- @scaf_loaddbl@ --- *
	114	*
	115	* Arguments: @scaf_dblpiece *z@ = where to write the result
	116	* @const octet *b@ = source buffer to read
	117	* @size_t sz@ = size of the source buffer
	118	* @size_t npiece@ = number of pieces to read
	119	* @unsigned piecewd@ = nominal width of pieces in bits
	120	*
	121	* Returns: ---
	122	*
	123	* Use: Loads a little-endian encoded scalar into a vector @z@ of
	124	* double-precision pieces.
	125	*/
	126
	127	void scaf_loaddbl(scaf_dblpiece z, const octet b, size_t sz,
	128	size_t npiece, unsigned piecewd)
	129	{
	130	uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
	131	unsigned i, j, n;
	132
	133	for (i = j = n = 0, a = 0; i < sz; i++) {
	134	a \|= b[i] << n; n += 8;
	135	if (n >= piecewd) {
	136	z[j++] = a&m; a >>= piecewd; n -= piecewd;
	137	if (j >= npiece) return;
	138	}
	139	}
	140	z[j++] = a;
	141	while (j < npiece) z[j++] = 0;
	142	}
	143
	144	/* --- @scaf_store@ --- *
	145	*
	146	* Arguments: @octet *b@ = buffer to fill in
	147	* @size_t sz@ = size of the buffer
	148	* @const scaf_piece *x@ = scalar to store
	149	* @size_t npiece@ = number of pieces in @x@
	150	* @unsigned piecewd@ = nominal width of pieces in bits
	151	*
	152	* Returns: ---
	153	*
	154	* Use: Stores a scalar in a vector of single-precison pieces as a
	155	* little-endian vector of bytes.
	156	*/
	157
	158	void scaf_store(octet b, size_t sz, const scaf_piece x,
	159	size_t npiece, unsigned piecewd)
	160	{
	161	uint32 a;
	162	unsigned i, j, n;
	163
	164	for (i = j = n = 0, a = 0; i < npiece; i++) {
	165	a \|= x[i] << n; n += piecewd;
	166	while (n >= 8) {
	167	b[j++] = a&0xffu; a >>= 8; n -= 8;
	168	if (j >= sz) return;
	169	}
	170	}
	171	b[j++] = a;
	172	memset(b + j, 0, sz - j);
	173	}
	174
	175	/* --- @scaf_mul@ --- *
	176	*
	177	* Arguments: @scaf_dblpiece *z@ = where to put the answer
	178	* @const scaf_piece x, y@ = the operands
	179	* @size_t npiece@ = the length of the operands
	180	*
	181	* Returns: ---
	182	*
	183	* Use: Multiply two scalars. The destination must have space for
	184	* @2*npiece@ pieces (though the last one will always be zero).
	185	* The result is not reduced.
	186	*/
	187
	188	void scaf_mul(scaf_dblpiece z, const scaf_piece x, const scaf_piece *y,
	189	size_t npiece)
	190	{
	191	unsigned i, j;
	192
	193	for (i = 0; i < 2*npiece; i++) z[i] = 0;
	194
	195	for (i = 0; i < npiece; i++)
	196	for (j = 0; j < npiece; j++)
	197	z[i + j] += (scaf_dblpiece)x[i]*y[j];
	198	}
	199
	200	/* --- @scaf_reduce@ --- *
	201	*
	202	* Arguments: @scaf_piece *z@ = where to write the result
	203	* @const scaf_dblpiece *x@ = the operand to reduce
	204	* @const scaf_piece *l@ = the modulus, in internal format
	205	* @const scaf_piece *mu@ = scaled approximation to @1/l@
	206	* @size_t npiece@ = number of pieces in @l@
	207	* @unsigned piecewd@ = nominal width of a piece in bits
	208	* @scaf_piece scratch@ = @3npiece@ scratch pieces
	209	*
	210	* Returns: ---
	211	*
	212	* Use: Reduce @x@ (a vector of @2*npiece@ double-precision pieces)
	213	* modulo @l@ (a vector of @npiece@ single-precision pieces),
	214	* writing the result to @z@.
	215	*
	216	* Write @n = npiece@, @w = piecewd@, and %$B = 2^w$%. The
	217	* operand @mu@ must contain %$\lfloor B^{2n}/l \rfloor$%, in
	218	* @npiece + 1@ pieces. Furthermore, we must have
	219	* %$3 l < B^n$%. (Fiddle with %$w$% if necessary.)
	220	*/
	221
	222	void scaf_reduce(scaf_piece z, const scaf_dblpiece x,
	223	const scaf_piece l, const scaf_piece mu,
	224	size_t npiece, unsigned piecewd, scaf_piece *scratch)
	225	{
	226	unsigned i, j;
	227	scaf_piece t = scratch, q = scratch + 2*npiece;
	228	scaf_piece u, m = ((scaf_piece)1 << piecewd) - 1;
	229	scaf_dblpiece c;
	230
	231	/* This here is the hard part.
	232	*
	233	* Let w = PIECEWD, let n = NPIECE, and let B = 2^w. We must have
	234	* B^(n-1) <= l < B^n.
	235	*
	236	* The argument MU contains pieces of the quantity µ = floor(B^2n/l), which
	237	* is a scaled approximation to 1/l. We'll calculate
	238	*
	239	* q = floor(µ floor(x/B^(n-1))/B^(n+1))
	240	*
	241	* which is an underestimate of x/l.
	242	*
	243	* With a bit more precision: by definition, u - 1 < floor(u) <= u. Hence,
	244	*
	245	* B^2n/l - 1 < µ <= B^2/l
	246	*
	247	* and
	248	*
	249	* x/B^(n-1) - 1 < floor(x/B^(n-1)) <= x/B^(n-1)
	250	*
	251	* Multiplying these together, and dividing through by B^(n+1), gives
	252	*
	253	* floor(x/l - B^(n-1)/l - x/B^2n + 1/B^(n+1)) <=
	254	* q <= µ floor(x/B^(n-1))/B^(n+1) <= floor(x/l)
	255	*
	256	* Now, noticing that x < B^2n and l > B^(n-1) shows that x/B^2n and
	257	* B^(n-1)/l are each less than 1; hence
	258	*
	259	* floor(x/l) - 2 <= q <= floor(x/l) <= x/l
	260	*
	261	* Now we set r = x - q l. Certainly, r == x (mod l); and
	262	*
	263	* 0 <= r < x - l floor(x/l) + 2 l < 3 l < B^n
	264	*/
	265
	266	/* Before we start on the fancy stuff, we need to resolve the pending
	267	* carries in x. We'll be doing the floor division by just ignoring some
	268	* of the pieces, and it would be bad if we missed some significant bits.
	269	* Of course, this means that we don't actually have to store the low
	270	* NPIECE - 1 pieces of the result.
	271	*/
	272	for (i = 0, c = 0; i < 2*npiece; i++)
	273	{ c += x[i]; t[i] = c&m; c >>= piecewd; }
	274
	275	/* Now we calculate q. If we calculate this in product-scanning order, we
	276	* can avoid having to store the low NPIECE + 1 pieces of the product as
	277	* long as we keep track of the carry out properly. Conveniently, NMU =
	278	* NPIECE + 1, which keeps the loop bounds easy in the first pass.
	279	*
	280	* Furthermore, because we know that r fits in NPIECE pieces, we only need
	281	* the low NPIECE pieces of q.
	282	*/
	283	for (i = 0, c = 0; i < npiece + 1; i++) {
	284	for (j = 0; j <= i; j++)
	285	c += (scaf_dblpiece)t[j + npiece - 1]*mu[i - j];
	286	c >>= piecewd;
	287	}
	288	for (i = 0; i < npiece; i++) {
	289	for (j = i + 1; j < npiece + 1; j++)
	290	c += (scaf_dblpiece)t[j + npiece - 1]*mu[npiece + 1 + i - j];
	291	q[i] = c&m; c >>= piecewd;
	292	}
	293
	294	/* Next, we calculate r - q l in z. Product-scanning seems to be working
	295	* out for us, and this time it will save us needing a large temporary
	296	* space for the product q l as we go. On the downside, we have to track
	297	* the carries from the multiplication and subtraction separately.
	298	*
	299	* Notice that the result r is at most NPIECE pieces long, so we can stop
	300	* once we have that many.
	301	*/
	302	u = 1; c = 0;
	303	for (i = 0; i < npiece; i++) {
	304	for (j = 0; j <= i; j++) c += (scaf_dblpiece)q[j]*l[i - j];
	305	u += t[i] + ((scaf_piece)(c&m) ^ m);
	306	z[i] = u&m; u >>= piecewd; c >>= piecewd;
	307	}
	308
	309	/* Finally, two passes of conditional subtraction. Calculate t = z - l; if
	310	* there's no borrow out the top, then update z = t; otherwise leave t
	311	* alone.
	312	*/
	313	for (i = 0; i < 2; i++) {
	314	for (j = 0, u = 1; j < npiece; j++) {
	315	u += z[j] + (l[j] ^ m);
	316	t[j] = u&m; u >>= piecewd;
	317	}
	318	for (j = 0, u = -u; j < npiece; j++) z[j] = (t[j]&u) \| (z[j]&~u);
	319	}
	320	}
	321
	322	/----- That's all, folks -------------------------------------------------/