progs/perftest.c: Use from Glibc syscall numbers.
[catacomb] / math / scaf.c
CommitLineData
d56fd9d1
MW
1/* -*-c-*-
2 *
3 * Simple scalar fields
4 *
5 * (c) 2017 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of Catacomb.
11 *
12 * Catacomb is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU Library General Public License as
14 * published by the Free Software Foundation; either version 2 of the
15 * License, or (at your option) any later version.
16 *
17 * Catacomb is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU Library General Public License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with Catacomb; if not, write to the Free
24 * Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
25 * MA 02111-1307, USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include <string.h>
31
32#include "scaf.h"
33
fac7ff6e
MW
34/*----- Debugging utilties ------------------------------------------------*/
35
36#ifdef SCAF_DEBUG
37
38#include <stdio.h>
39
40#include "mp.h"
41#include "mpint.h"
42#include "mptext.h"
43
44static void scaf_dump(const char *what, const scaf_piece *x,
45 size_t npiece, size_t piecewd)
46{
47 mp *y = MP_ZERO, *t = MP_NEW;
48 size_t i;
49 unsigned o = 0;
50
51 for (i = 0; i < npiece; i++) {
52 t = mp_fromuint64(t, x[i]);
53 t = mp_lsl(t, t, o);
54 y = mp_add(y, y, t);
55 o += piecewd;
56 }
57 printf(";; %s", what); MP_PRINT("", y); putchar('\n');
58 mp_drop(y); mp_drop(t);
59}
60
61static void scaf_dumpdbl(const char *what, const scaf_dblpiece *x,
62 size_t npiece, size_t piecewd)
63{
64 mp *y = MP_ZERO, *t = MP_NEW;
65 size_t i;
66 unsigned o = 0;
67
68 for (i = 0; i < npiece; i++) {
69 t = mp_fromuint64(t, x[i]);
70 t = mp_lsl(t, t, o);
71 y = mp_add(y, y, t);
72 o += piecewd;
73 }
74 printf(";; %s", what); MP_PRINT("", y); putchar('\n');
75 mp_drop(y); mp_drop(t);
76}
77
78#endif
79
d56fd9d1
MW
80/*----- Main code ---------------------------------------------------------*/
81
82/* --- @scaf_load@ --- *
83 *
84 * Arguments: @scaf_piece *z@ = where to write the result
85 * @const octet *b@ = source buffer to read
86 * @size_t sz@ = size of the source buffer
87 * @size_t npiece@ = number of pieces to read
88 * @unsigned piecewd@ = nominal width of pieces in bits
89 *
90 * Returns: ---
91 *
92 * Use: Loads a little-endian encoded scalar into a vector @z@ of
93 * single-precision pieces.
94 */
95
96void scaf_load(scaf_piece *z, const octet *b, size_t sz,
97 size_t npiece, unsigned piecewd)
98{
99 uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
100 unsigned i, j, n;
101
102 for (i = j = n = 0, a = 0; i < sz; i++) {
103 a |= b[i] << n; n += 8;
104 if (n >= piecewd) {
105 z[j++] = a&m; a >>= piecewd; n -= piecewd;
106 if (j >= npiece) return;
107 }
108 }
109 z[j++] = a;
110 while (j < npiece) z[j++] = 0;
111}
112
113/* --- @scaf_loaddbl@ --- *
114 *
115 * Arguments: @scaf_dblpiece *z@ = where to write the result
116 * @const octet *b@ = source buffer to read
117 * @size_t sz@ = size of the source buffer
118 * @size_t npiece@ = number of pieces to read
119 * @unsigned piecewd@ = nominal width of pieces in bits
120 *
121 * Returns: ---
122 *
123 * Use: Loads a little-endian encoded scalar into a vector @z@ of
124 * double-precision pieces.
125 */
126
127void scaf_loaddbl(scaf_dblpiece *z, const octet *b, size_t sz,
128 size_t npiece, unsigned piecewd)
129{
130 uint32 a, m = ((scaf_piece)1 << piecewd) - 1;
131 unsigned i, j, n;
132
133 for (i = j = n = 0, a = 0; i < sz; i++) {
134 a |= b[i] << n; n += 8;
135 if (n >= piecewd) {
136 z[j++] = a&m; a >>= piecewd; n -= piecewd;
137 if (j >= npiece) return;
138 }
139 }
140 z[j++] = a;
141 while (j < npiece) z[j++] = 0;
142}
143
144/* --- @scaf_store@ --- *
145 *
146 * Arguments: @octet *b@ = buffer to fill in
147 * @size_t sz@ = size of the buffer
148 * @const scaf_piece *x@ = scalar to store
149 * @size_t npiece@ = number of pieces in @x@
150 * @unsigned piecewd@ = nominal width of pieces in bits
151 *
152 * Returns: ---
153 *
154 * Use: Stores a scalar in a vector of single-precison pieces as a
155 * little-endian vector of bytes.
156 */
157
158void scaf_store(octet *b, size_t sz, const scaf_piece *x,
159 size_t npiece, unsigned piecewd)
160{
161 uint32 a;
162 unsigned i, j, n;
163
164 for (i = j = n = 0, a = 0; i < npiece; i++) {
165 a |= x[i] << n; n += piecewd;
166 while (n >= 8) {
167 b[j++] = a&0xffu; a >>= 8; n -= 8;
168 if (j >= sz) return;
169 }
170 }
171 b[j++] = a;
172 memset(b + j, 0, sz - j);
173}
174
175/* --- @scaf_mul@ --- *
176 *
177 * Arguments: @scaf_dblpiece *z@ = where to put the answer
178 * @const scaf_piece *x, *y@ = the operands
179 * @size_t npiece@ = the length of the operands
180 *
181 * Returns: ---
182 *
183 * Use: Multiply two scalars. The destination must have space for
184 * @2*npiece@ pieces (though the last one will always be zero).
185 * The result is not reduced.
186 */
187
188void scaf_mul(scaf_dblpiece *z, const scaf_piece *x, const scaf_piece *y,
189 size_t npiece)
190{
191 unsigned i, j;
192
193 for (i = 0; i < 2*npiece; i++) z[i] = 0;
194
195 for (i = 0; i < npiece; i++)
196 for (j = 0; j < npiece; j++)
197 z[i + j] += (scaf_dblpiece)x[i]*y[j];
198}
199
200/* --- @scaf_reduce@ --- *
201 *
202 * Arguments: @scaf_piece *z@ = where to write the result
203 * @const scaf_dblpiece *x@ = the operand to reduce
204 * @const scaf_piece *l@ = the modulus, in internal format
205 * @const scaf_piece *mu@ = scaled approximation to @1/l@
206 * @size_t npiece@ = number of pieces in @l@
207 * @unsigned piecewd@ = nominal width of a piece in bits
e057fe08 208 * @scaf_piece *scratch@ = @3*npiece@ scratch pieces
d56fd9d1
MW
209 *
210 * Returns: ---
211 *
212 * Use: Reduce @x@ (a vector of @2*npiece@ double-precision pieces)
213 * modulo @l@ (a vector of @npiece@ single-precision pieces),
214 * writing the result to @z@.
215 *
216 * Write @n = npiece@, @w = piecewd@, and %$B = 2^w$%. The
217 * operand @mu@ must contain %$\lfloor B^{2n}/l \rfloor$%, in
218 * @npiece + 1@ pieces. Furthermore, we must have
219 * %$3 l < B^n$%. (Fiddle with %$w$% if necessary.)
220 */
221
222void scaf_reduce(scaf_piece *z, const scaf_dblpiece *x,
223 const scaf_piece *l, const scaf_piece *mu,
224 size_t npiece, unsigned piecewd, scaf_piece *scratch)
225{
226 unsigned i, j;
227 scaf_piece *t = scratch, *q = scratch + 2*npiece;
228 scaf_piece u, m = ((scaf_piece)1 << piecewd) - 1;
229 scaf_dblpiece c;
230
231 /* This here is the hard part.
232 *
d1dace00 233 * Let w = PIECEWD, let n = NPIECE, and let B = 2^w. We must have
d56fd9d1
MW
234 * B^(n-1) <= l < B^n.
235 *
236 * The argument MU contains pieces of the quantity µ = floor(B^2n/l), which
237 * is a scaled approximation to 1/l. We'll calculate
238 *
239 * q = floor(µ floor(x/B^(n-1))/B^(n+1))
240 *
241 * which is an underestimate of x/l.
242 *
243 * With a bit more precision: by definition, u - 1 < floor(u) <= u. Hence,
244 *
245 * B^2n/l - 1 < µ <= B^2/l
246 *
247 * and
248 *
249 * x/B^(n-1) - 1 < floor(x/B^(n-1)) <= x/B^(n-1)
250 *
251 * Multiplying these together, and dividing through by B^(n+1), gives
252 *
253 * floor(x/l - B^(n-1)/l - x/B^2n + 1/B^(n+1)) <=
254 * q <= µ floor(x/B^(n-1))/B^(n+1) <= floor(x/l)
255 *
256 * Now, noticing that x < B^2n and l > B^(n-1) shows that x/B^2n and
257 * B^(n-1)/l are each less than 1; hence
258 *
259 * floor(x/l) - 2 <= q <= floor(x/l) <= x/l
260 *
261 * Now we set r = x - q l. Certainly, r == x (mod l); and
262 *
263 * 0 <= r < x - l floor(x/l) + 2 l < 3 l < B^n
264 */
265
266 /* Before we start on the fancy stuff, we need to resolve the pending
267 * carries in x. We'll be doing the floor division by just ignoring some
268 * of the pieces, and it would be bad if we missed some significant bits.
269 * Of course, this means that we don't actually have to store the low
270 * NPIECE - 1 pieces of the result.
271 */
272 for (i = 0, c = 0; i < 2*npiece; i++)
273 { c += x[i]; t[i] = c&m; c >>= piecewd; }
274
275 /* Now we calculate q. If we calculate this in product-scanning order, we
276 * can avoid having to store the low NPIECE + 1 pieces of the product as
277 * long as we keep track of the carry out properly. Conveniently, NMU =
278 * NPIECE + 1, which keeps the loop bounds easy in the first pass.
279 *
280 * Furthermore, because we know that r fits in NPIECE pieces, we only need
281 * the low NPIECE pieces of q.
282 */
283 for (i = 0, c = 0; i < npiece + 1; i++) {
284 for (j = 0; j <= i; j++)
285 c += (scaf_dblpiece)t[j + npiece - 1]*mu[i - j];
286 c >>= piecewd;
287 }
288 for (i = 0; i < npiece; i++) {
289 for (j = i + 1; j < npiece + 1; j++)
290 c += (scaf_dblpiece)t[j + npiece - 1]*mu[npiece + 1 + i - j];
291 q[i] = c&m; c >>= piecewd;
292 }
293
294 /* Next, we calculate r - q l in z. Product-scanning seems to be working
295 * out for us, and this time it will save us needing a large temporary
296 * space for the product q l as we go. On the downside, we have to track
297 * the carries from the multiplication and subtraction separately.
298 *
299 * Notice that the result r is at most NPIECE pieces long, so we can stop
300 * once we have that many.
301 */
302 u = 1; c = 0;
303 for (i = 0; i < npiece; i++) {
304 for (j = 0; j <= i; j++) c += (scaf_dblpiece)q[j]*l[i - j];
305 u += t[i] + ((scaf_piece)(c&m) ^ m);
306 z[i] = u&m; u >>= piecewd; c >>= piecewd;
307 }
308
309 /* Finally, two passes of conditional subtraction. Calculate t = z - l; if
310 * there's no borrow out the top, then update z = t; otherwise leave t
311 * alone.
312 */
313 for (i = 0; i < 2; i++) {
314 for (j = 0, u = 1; j < npiece; j++) {
315 u += z[j] + (l[j] ^ m);
316 t[j] = u&m; u >>= piecewd;
317 }
06325636 318 for (j = 0, u = -u; j < npiece; j++) z[j] = (t[j]&u) | (z[j]&~u);
d56fd9d1
MW
319 }
320}
321
322/*----- That's all, folks -------------------------------------------------*/