@@@ doc wip
[mLib] / utils / fltfmt.c
CommitLineData
b1a20bee
MW
1/* -*-c-*-
2 *
3 * Floating-point format conversions
4 *
5 * (c) 2024 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of the mLib utilities library.
11 *
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
16 *
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include "config.h"
31
32#include <assert.h>
33#include <float.h>
34#include <limits.h>
35#include <math.h>
36
37#include "alloc.h"
38#include "arena.h"
39#include "bits.h"
40#include "fltfmt.h"
41#include "growbuf.h"
42#include "macros.h"
43#include "maths.h"
44
45#if GCC_VERSION_P(4, 4)
46# pragma GCC optimize "-frounding-math"
47#elif CLANG_VERSION_P(11, 0) && !CLANG_VERSION_P(12, 0)
48# pragma clang optimize "-frounding-math"
49#elif GCC_VERSION_P(0, 0) || \
50 (CLANG_VERSION_P(0, 0) && !CLANG_VERSION_P(12, 0))
51 /* We just lose. */
52#elif __STDC_VERSION__ >= 199001
53# include <fenv.h>
54# pragma STDC FENV_ACCESS ON
55#endif
56
57/*----- Some useful constants ---------------------------------------------*/
58
289651a7
MW
59#define B31 0x80000000u /* just bit 31 */
60#define B30 0x40000000u /* just bit 30 */
b1a20bee
MW
61
62#define SH32 4294967296.0 /* 2^32 as floating-point */
63
64/*----- Useful macros -----------------------------------------------------*/
65
66#define B32(k) ((uint32)1 << (k))
67#define M32(k) (B32(k) - 1)
68
69#define FINITEP(x) (!((x)->f&(FLTF_INF | FLTF_NANMASK | FLTF_ZERO)))
70
71/*----- Utility functions -------------------------------------------------*/
72
73/* --- @clz32@ --- *
74 *
75 * Arguments: @uint32 x@ = a 32-bit value
76 *
77 * Returns: The number of leading zeros in @x@, i.e., the nonnegative
78 * integer %$n$% such that %$2^{31} \le 2^n x < 2^{32}$%.
79 * Returns a nonsensical value if %$x = 0$%.
80 */
81
82static unsigned clz32(uint32 x)
83{
84 unsigned n = 0;
85
86 /* Divide and conquer. If the top half of the bits are clear, then there
87 * must be at least 16 leading zero bits, so accumulate and shift. Repeat
88 * for smaller powers of two.
89 *
90 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
91 */
92 if (!(x&0xffff0000)) { x <<= 16; n += 16; }
93 if (!(x&0xff000000)) { x <<= 8; n += 8; }
94 if (!(x&0xf0000000)) { x <<= 4; n += 4; }
95 if (!(x&0xc0000000)) { x <<= 2; n += 2; }
96 if (!(x&0x80000000)) { n += 1; }
97 return (n);
98}
99
100/* --- @ctz32@ --- *
101 *
102 * Arguments: @uint32 x@ = a 32-bit value
103 *
104 * Returns: The number of trailing zeros in @x@, i.e., the nonnegative
105 * integer %$n$% such that %$x/2^n$% is an odd integer.
106 * Returns a nonsensical value if %$x = 0$%.
107 */
108
109static unsigned ctz32(uint32 x)
110{
111#ifdef CTZ_TRADITIONAL
112
113 unsigned n = 0;
114
115 /* Divide and conquer. If the bottom half of the bits are clear, then
116 * there must be at least 16 trailing zero bits, so accumulate and shift.
117 * Repeat for smaller powers of two.
118 *
119 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
120 */
121 if (!(x&0x0000ffff)) { x >>= 16; n += 16; }
122 if (!(x&0x000000ff)) { x >>= 8; n += 8; }
123 if (!(x&0x0000000f)) { x >>= 4; n += 4; }
124 if (!(x&0x00000003)) { x >>= 2; n += 2; }
125 if (!(x&0x00000001)) { n += 1; }
126 return (n);
127
128#else
129
130 static unsigned char tab[] =
131 /*
132 ;;; Compute the decoding table for the de Bruijn sequence trick below.
133
134 (let ((db #x04653adf)
135 (rv (make-vector 32 nil)))
136 (dotimes (i 32)
137 (aset rv (logand (ash db (- i 27)) #x1f) i))
138 (save-excursion
139 (goto-char (point-min))
140 (search-forward (concat "***" "BEGIN ctz32tab" "***"))
141 (beginning-of-line 2)
142 (delete-region (point)
143 (progn
144 (search-forward "***END***")
145 (beginning-of-line)
146 (point)))
147 (dotimes (i 32)
148 (cond ((zerop i) (insert " { "))
149 ((zerop (mod i 16)) (insert ",\n "))
150 ((zerop (mod i 4)) (insert ", "))
151 (t (insert ", ")))
152 (insert (format "%2d" (aref rv i))))
153 (insert " };\n")))
154 */
155
156 /* ***BEGIN ctz32tab*** */
157 { 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26,
158 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27 };
159 /* ***END*** */
160
161 /* Sneaky trick. Two's complement negation (which you effectively get
162 * using C unsigned arithmetic, whether you like it or not) complements all
163 * of the bits of an operand more significant than the least significant
164 * set bit. Therefore, this bit is the only one set in both %$x$% and
165 * %$-x$%, so @x&-x@ will isolate it for us.
166 *
167 * The magic number @0x04653adf@ is a %%\emph{de Bruijn} number%%: every
168 * group of five consecutive bits is distinct, including groups which `wrap
169 * around', including some low bits and some high bits. Multiplying this
170 * number by a power of two is equivalent to a left shift; and, because the
171 * top five bits are all zero, the most significant five bits of the
172 * product are the same as if we'd applied a rotation. The result is that
173 * we end up with a distinctive pattern in those bits which perfectly
174 * diagnose each shift from 0 up to 31, which we can decode using a table.
175 *
176 * David Seal described a similar trick -- using the six-bit pattern
177 * generated by the constant @0x0450fbaf@ -- in `comp.sys.arm' in 1994;
178 * this constant was particularly convenient to multiply by on early ARM
179 * processors. The use of a de Bruijn number is described in Henry
180 * Warren's %%\emph{Hacker's Delight}%%.
181 */
182 return (tab[((x&-x)*0x04653adf >> 27)&0x1f]);
183
184#endif
185}
186
187/* --- @shl@, @shr@ --- *
188 *
189 * Arguments: @uint32 *z@ = destination vector
190 * @const uint32 *x@ = source vector
191 * @size_t sz@ = size of source vector, in elements
192 * @unsigned n@ = number of bits to shift by; must be less than
193 * 32
194 *
195 * Returns: The bits shifted off the end of the vector.
196 *
197 * Use: Shift a vector of 32-bit words left (@shl@) or right (@shr@)
198 * by some number of bit positions. These functions work
199 * correctly if @z@ and @x@ are the same pointer, but not if
200 * they otherwise overlap.
201 */
202
203static uint32 shl(uint32 *z, const uint32 *x, size_t sz, unsigned n)
204{
205 size_t i;
206 uint32 t, u;
207 unsigned r;
208
209 if (!n) {
210 for (i = 0; i < sz; i++) z[i] = x[i];
211 t = 0;
212 } else {
213 r = 32 - n;
214 for (t = 0, i = sz; i--; )
215 { u = x[i]; z[i] = ((u << n) | t)&MASK32; t = u >> r; }
216 }
217 return (t);
218}
219
220static uint32 shr(uint32 *z, const uint32 *x, size_t sz, unsigned n)
221{
222 size_t i;
223 uint32 t, u;
224 unsigned r;
225
226 if (!n) {
227 for (i = 0; i < sz; i++) z[i] = x[i];
228 t = 0;
229 } else {
230 r = 32 - n;
231 for (t = 0, i = 0; i < sz; i++)
232 { u = x[i]; z[i] = ((u >> n) | t)&MASK32; t = u << r; }
233 }
234 return (t);
235}
236
237/* --- @sigbits@ --- *
238 *
239 * Arguments: @const struct floatbits *x@ = decoded floating-point number
240 *
241 * Returns: The number of significant digits in @x@'s fraction. This
242 * will be zero if @x@ is zero or infinite.
243 */
244
245static unsigned sigbits(const struct floatbits *x)
246{
247 unsigned i;
248 uint32 w;
249
250 if (x->f&(FLTF_ZERO | FLTF_INF)) return (0);
251 i = x->n;
252 for (;;) {
253 if (!i) return (0);
254 w = x->frac[--i]; if (w) return (32*(i + 1) - ctz32(w));
255 }
256}
257
258/* --- @ms_set_bit@ --- *
259 *
260 * Arguments: @const uint32 *x@ = pointer to the %%\emph{end}%% of the
261 * buffer
262 * @unsigned from, to@ = lower (inclusive) and upper (exclusive)
263 * bounds on the region of bits to inspect
264 *
265 * Returns: Index of the most significant set bit, or @ALLCLEAR@.
266 *
267 * Use: For the (rather unusual) purposes of this function, the bits
268 * of the input are numbered from zero, being the least
269 * significant bit of @x[-1]@, upwards through more significant
270 * bits of @x[-1]@, and then through @x[-2]@ and so on.
271 *
272 * If all of the bits in the half-open region are clear then
273 * @ALLCLEAR@ is returned; otherwise, the return value is the
274 * index of the most significant set bit in the region. Note
275 * that @ALLCLEAR@ is equal to @UINT_MAX@: since this is the
276 * largest possible value of @to@, and the upper bound is
277 * exclusive, this cannot be the index of a bit in the region.
278 */
279
280#define ALLCLEAR UINT_MAX
281static unsigned ms_set_bit(const uint32 *x, unsigned from, unsigned to)
282{
283 unsigned n0, n, b, base;
284 uint32 m, w;
285
286 /* <--- increasing indices <---
287 *
288 * ---+-------+-------+-------+-------+-------+---
289 * ...S |///| | | | | |//| S...
290 * ---+-------+-------+-------+-------+-------+---
291 */
292
293 /* If the region is empty then it's technically true that all of the bits
294 * are zero. It's important to be able to do answer the case where
295 * %$\id{from} = \id{to} = 0$% without accessing memory.
296 */
297 assert(to >= from); if (to == from) return (ALLCLEAR);
298
299 /* This is distressingly complicated. Here's how it's going to work.
300 *
301 * There's at least one bit to check, or we'd have returned already --
302 * specifically, we must check the bit with index @from@. But that's at
303 * the wrong end. Instead, we start at the most significant end, with the
304 * word containing the bit one short of the @to@ position. Even though
305 * it's actually one off, because we use a half-open interval, we'll call
306 * that the `@to@ bit'.
307 *
308 * We start by loading the word containing the @to@ bit, and start @base@
309 * off as the bit index of the least significant bit of this word. We mask
310 * off the high bits (if any), leaving only the @to@ bit and the less
311 * significant ones. We %%\emph{don't}%% check the remaining bits yet.
312 *
313 * We then start an offset loop. In each iteration, we check the word
314 * we're currently holding: if it's not zero, then we return @base@ plus
315 * the position of the most-significant set bit, using @clz32@. Otherwise,
316 * we load the next (less significant) word, and drop @base@ by 32, but
317 * don't check it yet. We end this loop when the word we're holding
318 * contains the @from@ bit. It's possible that we didn't do any iterations
319 * of the loop, in which case we're still holding the word containing the
320 * @to@ bit at this point.
321 *
322 * Finally, we mask off the bits below the @from@ bit, and check that what
323 * we have left is zero. If it isn't, we return @base@ plus the position
324 * of the most significant bit; if it is, we return @ALLCEAR@.
325 */
326
327 /* The first job is to find the word containing the @to@ bit and mask off
328 * any higher bits that we don't care about.
329 *
330 * Recall that the bit's index is @to - 1@, but this must be a valid index
331 * because there is at least one bit in the region. But we start out
332 * pointing beyond the vector, so we must add an extra 32 bits.
333 */
334 n0 = (to + 31)/32; x -= n0; base = (to - 1)&~31u; w = *x++;
335 b = to%32; if (b) w &= M32(b);
336
337 /* Before we start the loop, it'd be useful to know how many iterations we
338 * need. This is going to be the offset from the word containing the @to@
339 * bit to the word containing the @from@ bit. Again, we're off by one
340 * because that's how our initial indexing is set up.
341 */
342 n = n0 - from/32 - 1;
343
344 /* Now it's time to do the loop. This is the easy bit. */
345 while (n--) {
346 if (w) return (base + 31 - clz32(w));
347 w = *x++&MASK32; base -= 32;
348 }
349
350 /* We're now holding the final word -- the one containing the @from@ bit.
351 * We need to mask off any low bits that we don't care about.
352 */
353 m = M32(from%32); w &= MASK32&~m;
354
355 /* Final check. */
356 if (w) return (base + 31 - clz32(w));
357 else return (ALLCLEAR);
358}
359
360/*----- General floating-point hacking ------------------------------------*/
361
362/* --- @fltfmt_initbits@ --- *
363 *
364 * Arguments: @struct floatbits *x@ = pointer to structure to initialize
365 *
366 * Returns: ---
367 *
368 * Use: Dynamically initialize @x@ to (positive) zero so that it can
369 * be used as the destination operand by other operations. This
370 * doesn't allocate resources and cannot fail. The
371 * @FLOATBITS_INIT@ macro is a suitable static initializer for
372 * performing the same task.
373 */
374
375void fltfmt_initbits(struct floatbits *x)
376{
377 x->f = FLTF_ZERO;
378 x->a = arena_global;
379 x->frac = 0; x->n = x->fracsz = 0;
380}
381
382/* --- @fltfmt_freebits@ --- *
383 *
384 * Arguments: @struct floatbits *x@ = pointer to structure to free
385 *
386 * Returns: ---
387 *
388 * Use: Releases the memory held by @x@. Afterwards, @x@ is a valid
389 * (positive) zero, but can safely be discarded.
390 */
391
392void fltfmt_freebits(struct floatbits *x)
393{
394 if (x->frac) x_free(x->a, x->frac);
395 x->f = FLTF_ZERO;
396 x->frac = 0; x->n = x->fracsz = 0;
397}
398
399/* --- @fltfmt_allocfrac@ --- *
400 *
401 * Arguments: @struct floatbits *x@ = structure to adjust
402 * @unsigned n@ = number of words required
403 *
404 * Returns: ---
405 *
406 * Use: Reallocate the @frac@ vector so that it has space for at
407 * least @n@ 32-bit words, and set @x->n@ equal to @n@. If the
408 * current size is already @n@ or greater, then just update the
409 * active length @n@ and return; otherwise, any existing vector
410 * is discarded and a fresh, larger one allocated.
411 */
412
413void fltfmt_allocfrac(struct floatbits *x, unsigned n)
414 { GROWBUF_REPLACEV(unsigned, x->a, x->frac, x->fracsz, n, 4); x->n = n; }
415
416/* --- @fltfmt_copybits@ --- *
417 *
418 * Arguments: @struct floatbits *z_out@ = where to leave the result
419 * @const struct floatbits *x@ = source to copy
420 *
421 * Returns: ---
422 *
423 * Use: Make @z_out@ be a copy of @x@. If @z_out@ is the same object
424 * as @x@ then do nothing.
425 */
426
427void fltfmt_copybits(struct floatbits *z_out, const struct floatbits *x)
428{
429 unsigned i;
430
431 if (z_out == x) return;
432 z_out->f = x->f;
433 if (!FINITEP(x)) z_out->exp = 0;
434 else z_out->exp = x->exp;
435 if ((x->f&(FLTF_ZERO | FLTF_INF)) || !x->n)
436 { z_out->n = 0; z_out->frac = 0; }
437 else {
438 fltfmt_allocfrac(z_out, x->n);
439 for (i = 0; i < x->n; i++) z_out->frac[i] = x->frac[i];
440 }
441}
442
443/* --- @fltfmt_round@ --- *
444 *
445 * Arguments: @struct floatbits *z_out@ = destination (may equal source)
446 * @const struct floatbits *x@ = source
447 * @unsigned r@ = rounding mode (@FLTRND_...@ code)
448 * @unsigned n@ = nonzero number of bits to leave
449 *
450 * Returns: A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if
451 * rounding discarded some nonzero value bits, or @FLTERR_OK@ if
452 * rounding was unnecessary.
453 *
454 * Use: Rounds a floating-point value to a given number of
455 * significant bits, using the given rounding rule.
456 */
457
458unsigned fltfmt_round(struct floatbits *z_out, const struct floatbits *x,
459 unsigned r, unsigned n)
460{
461 unsigned rf, i, uw, ub, hw, hb, rc = 0;
462 uint32 um, hm, w;
463 int exp;
464
465 /* Check that this is likely to work. We must have at least one bit
466 * remaining, so that we can inspect the last-place unit bit. And we
467 * mustn't round up if the current value is already exact, because that
468 * would be nonsensical (and inconvenient).
469 */
470 assert(n > 0); assert(!(r&~(FRPMASK_LOW | FRPMASK_HALF)));
471
472 /* Eliminate trivial cases. There's nothing to do if the value is infinite
473 * or zero, or if we don't have enough precision already.
474 *
475 * The caller will have set the rounding mode and length suitably for a
476 * NaN.
477 */
478 if (x->f&(FLTF_ZERO | FLTF_INF) || n >= 32*x->n)
479 { fltfmt_copybits(z_out, x); return (FLTERR_OK); }
480
481 /* Determine various indices.
482 *
483 * The quantities @uw@ and @ub@ are the word and bit number which will hold
484 * the unit bit when we've finished; @hw@ and @hb@ similarly index the
485 * `half' bit, which is the next less significant bit.
486 */
487 uw = (n - 1)/32; ub = -n&31;
488 if (!ub) { hw = uw + 1; hb = 31; }
489 else { hw = uw; hb = ub - 1; }
490
491 /* Determine the necessary predicates for the rounding decision. */
492 rf = 0;
493 if (x->f&FLTF_NEG) rf |= FRPF_NEG;
494 um = B32(ub); if (x->frac[uw]&um) rf |= FRPF_ODD;
495 hm = B32(hb); if (x->frac[hw]&hm) rf |= FRPF_HALF;
496 if (x->frac[hw]&(hm - 1)) rf |= FRPF_LOW;
497 for (i = hw + 1; i < x->n; i++) if (x->frac[i]) rf |= FRPF_LOW;
498 if (rf&(FRPF_LOW | FRPF_HALF)) rc |= FLTERR_INEXACT;
499
500 /* Allocate space for the result. */
501 fltfmt_allocfrac(z_out, uw + 1);
502
503 /* We start looking at the least significant word of the result. Clear the
504 * low bits.
505 */
506 i = uw; exp = x->exp; w = x->frac[i]&~(um - 1);
507
508 /* If the rounding function is true then we need to add one to the
509 * truncated fraction and propagate carries.
510 */
511 if ((r >> rf)&1) {
512 w = (w + um)&MASK32;
513 while (i && !w) {
514 z_out->frac[i] = w;
515 w = (x->frac[--i] + 1)&MASK32;
516 }
517 if (!w) { w = B31; exp++; }
518 }
519
520 /* Store, and copy the remaining words. */
521 for (;;) {
522 z_out->frac[i] = w;
523 if (!i) break;
524 w = x->frac[--i];
525 }
526
527 /* Done. */
528 z_out->f = x->f&(FLTF_NEG | FLTF_NANMASK);
529 if (x->f&FLTF_NANMASK) z_out->exp = 0;
530 else z_out->exp = exp;
531 return (rc);
532}
533
534/*----- IEEE formats ------------------------------------------------------*/
535
536/* IEEE (and related) format descriptions. */
537const struct fltfmt_ieeefmt
c752173d
MW
538 fltfmt_mini = { FLTIF_HIDDEN, 4, 4 },
539 fltfmt_bf16 = { FLTIF_HIDDEN, 8, 8 },
540 fltfmt_f16 = { FLTIF_HIDDEN, 5, 11 },
541 fltfmt_f32 = { FLTIF_HIDDEN, 8, 24 },
542 fltfmt_f64 = { FLTIF_HIDDEN, 11, 53 },
543 fltfmt_f128 = { FLTIF_HIDDEN, 15, 113 },
b1a20bee
MW
544 fltfmt_idblext80 = { 0, 15, 64 };
545
546/* --- @fltfmt_encieee@ ---
547 *
548 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
549 * @uint32 *z@ = output vector
550 * @const struct floatbits *x@ = value to encode
551 * @unsigned r@ = rounding mode
552 * @unsigned errmask@ = error mask
553 *
554 * Returns: Error flags (@FLTERR_...@).
555 *
556 * Use: Encode a floating-point value in an IEEE format. This is the
557 * machinery shared by the @fltfmt_enc...@ functions for
558 * encoding IEEE-format values. Most of the arguments and
559 * behaviour are as described for those functions.
560 *
561 * The encoded value is right-aligned and big-endian; i.e., the
562 * sign bit ends up in @z[0]@, and the least significant bit of
563 * the significand ends up in the least significant bit of
564 * @z[n - 1]@.
565 */
566
567unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *fmt,
568 uint32 *z, const struct floatbits *x,
569 unsigned r, unsigned errmask)
570{
571 struct floatbits y;
572 unsigned sigwd, fracwd, err = 0, f = x->f, rf;
573 unsigned i, j, n, nb, nw, mb, mw, esh, sh;
574 int exp, minexp, maxexp;
575 uint32 z0, t;
576
577#define ERR(e) do { err |= (e); if (err&~errmask) goto end; } while (0)
578
579 /* The following code assumes that the sign, biased exponent, unit, and
580 * quiet/signalling bits can all fit into the most significant 32 bits of
581 * the result.
582 */
583 assert(fmt->expwd + 3 <= 32);
584 esh = 31 - fmt->expwd;
585
586 /* Determine the output size. */
587 nb = fmt->prec + fmt->expwd + 1;
c752173d 588 if (fmt->f&FLTIF_HIDDEN) nb--;
b1a20bee
MW
589 nw = (nb + 31)/32;
590
591 /* Determine the top bits. */
592 z0 = 0; i = 0;
593 if (x->f&FLTF_NEG) z0 |= B31;
594
595 /* And now for the main case analysis. */
596
597 if (f&FLTF_ZERO) {
598 /* Zero. There's very little to do here. */
599
600 } else if (f&FLTF_INF) {
601 /* Infinity. Set the exponent and, for non-hidden-bit formats, the unit
602 * bit.
603 */
604
605 z0 |= M32(fmt->expwd) << esh;
c752173d 606 if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
b1a20bee
MW
607
608 } else if (f&FLTF_NANMASK) {
609 /* Not-a-number.
610 *
611 * We must check that we won't lose significant bits. We need a bit for
612 * the quiet/signalling flag, and enough space for the significant
613 * payload bits. The unit bit is not in play here, so the available
614 * space is always one less than the advertised precision. To put it
615 * another way, we need space for the payload, a bit for the
616 * quiet/signalling flag, and a bit for the unit place.
617 */
618
619 fracwd = sigbits(x);
620 if (fracwd + 2 > fmt->prec) ERR(FLTERR_INEXACT);
621
622 /* Copy the payload.
623 *
624 * If the payload is all-zero and we're meant to set a signalling NaN
625 * then report an exactness failure and set the low bit.
626 */
627 mb = fmt->prec - 2; mw = (mb + 31)/32; sh = -mb%32;
628 for (i = 0; i < nw - mw; i++) z[i] = 0;
629 n = x->n; if (n > mw) n = nw;
630 t = shr(z + i, x->frac, n, sh); i += n;
631 if (i < nw) z[i++] = t;
c752173d 632 sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
b1a20bee
MW
633 if (f&FLTF_QNAN) z0 |= B32(sh);
634 else if (!fracwd) { ERR(FLTERR_INEXACT); z[nw - 1] |= 1; }
635
636 /* Set the exponent and, for non-hidden-bit formats, the unit bit. */
637 z0 |= M32(fmt->expwd) << esh;
c752173d 638 if (!(fmt->f&FLTIF_HIDDEN)) z0 |= B32(esh - 1);
b1a20bee
MW
639
640 } else {
641 /* A finite value.
642 *
643 * Adjust the exponent by one place to compensate for the difference in
644 * significant conventions. Our significand lies between zero (in fact,
645 * a half, because we require normalization) and one, while an IEEE
646 * significand lies between zero (in fact, one) and two. Our exponent is
647 * therefore one larger than the IEEE exponent will be.
648 */
649
650 /* Determine the maximum true (unbiased) exponent. As noted above, this
651 * is also the bias.
652 */
653 exp = x->exp - 1;
654 maxexp = (1 << (fmt->expwd - 1)) - 1;
655 minexp = 1 - maxexp;
656
657 if (exp <= minexp - (int)fmt->prec) {
658 /* If the exponent is very small then we underflow. We have %$p - 1$%
659 * bits available to represent a subnormal significand, and therefore
660 * can represent at least one bit of a value as small as
661 * %$2^{e_{\text{min}}-p+1}$%.
662 *
663 * If the exponent is one short of the threshold, then we check to see
664 * whether the value will round up.
665 */
666
667 if ((minexp - exp == fmt->prec) &&
668 ((r >> (FRPF_HALF |
669 (sigbits(x) > 1 ? FRPF_LOW : 0) |
670 (f&FLTF_NEG ? FRPF_NEG : 0)))&1)) {
671 ERR(FLTERR_INEXACT);
672 for (i = 0; i < nw - 1; i++) z[i] = 0;
673 z[i++] = 1;
674 } else {
675 ERR(FLTERR_UFLOW | FLTERR_INEXACT);
676 /* Return (signed) zero. */
677 }
678
679 } else {
680 /* We can at least try to store some bits. */
681
682 /* Let's see how many we need to deal with and how much space we have.
683 * We might as well set the biased exponent here while we're at it.
684 *
685 * If %$e \ge e_{\text{min}}$% then we can store %$p$% bits of
686 * significand. Otherwise, we must make a subnormal and we can only
687 * store %$p + e - e_{\text{min}}$% bits. (Cross-check: if %$e \le
688 * e_{\text{min}} - p$% then we can store zero bits or fewer and have
689 * underflowed to zero, which matches the previous case.) In the
690 * subnormal case, we also `correct' the exponent so that we store the
691 * correct sentinel value later.
692 */
693 fracwd = sigbits(x);
694 if (exp >= minexp) sigwd = fmt->prec;
695 else { sigwd = fmt->prec + exp - minexp; exp = minexp - 1; }
696 mw = (sigwd + 31)/32; sh = -sigwd%32;
697
698 /* If we don't have enough significand bits then we must round. This
699 * might increase the exponent, so we must reload.
700 */
701 if (fracwd > sigwd) {
702 ERR(FLTERR_INEXACT);
703 y.frac = z + nw - mw; y.fracsz = mw; fltfmt_round(&y, x, r, sigwd);
704 x = &y; exp = y.exp - 1; fracwd = sigwd;
705 }
706
707 if (exp > maxexp) {
708 /* If the exponent is too large, then we overflow. If the error is
709 * masked, then we must produce a default value, choosing between
710 * infinity and the largest representable finite value according to
711 * the rounding mode.
712 */
713
c752173d 714 ERR(FLTERR_OFLOW | FLTERR_INEXACT);
b1a20bee
MW
715 rf = FRPF_ODD | FRPF_HALF | FRPF_LOW;
716 if (f&FLTF_NEG) rf |= FRPF_NEG;
c752173d 717 if ((r >> rf)&1)
b1a20bee 718 z0 |= M32(fmt->expwd) << esh;
c752173d 719 else {
b1a20bee 720 z0 |= (B32(fmt->expwd) - 2) << esh;
c752173d 721 mb = fmt->prec; if (fmt->f&FLTIF_HIDDEN) mb--;
b1a20bee
MW
722 mw = (mb + 31)/32;
723 i = nw - mw;
724 z[i++] = M32(mb%32);
725 while (i < nw) z[i++] = MASK32;
726 }
727
728 } else {
729 /* The exponent is in range. Everything is ready. */
730
731 /* Store the significand. */
732 n = (fracwd + 31)/32; i = nw - mw;
733 t = shr(z + i, x->frac, n, sh); i += n;
734 if (i < nw) z[i++] = t;
735
736 /* Fill in the top end. */
737 for (j = nw - mw; j--; ) z[j] = 0;
738
739 /* Set the biased exponent. */
740 z0 |= (exp + maxexp) << esh;
741
742 /* Clear the unit bit if we're suppose to use a hidden-bit convention. */
c752173d 743 if (fmt->f&FLTIF_HIDDEN) {
b1a20bee
MW
744 mb = fmt->prec - 1; mw = (mb + 31)/32; mb = mb%32;
745 z[nw - mw] &= ~B32(mb);
746 }
747 }
748 }
749 }
750
751 /* Clear the significand bits that we haven't set explicitly yet. */
752 while (i < nw) z[i++] = 0;
753
754 /* All that remains is to insert the top bits @z0@ in the right place.
755 * This will set the exponent, and the unit and quiet bits.
756 */
757 sh = -nb%32;
758 z[0] |= z0 >> sh;
759 if (sh && nb >= 32) z[1] |= z0 << (32 - sh);
760
761end:
762 return (err);
763
764#undef ERR
765}
766
767/* --- @fltfmt_encTY@ --- *
768 *
769 * Arguments: @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@,
770 * @kludge64 *z_out@ = where to put the encoded value
771 * @uint16 *se_out@, @kludge64 *m_out@ = where to put the
772 * encoded sign-and-exponent and significand
773 * @const struct floatbits *x@ = value to encode
774 * @unsigned r@ = rounding mode
775 * @unsigned errmask@ = error mask
776 *
777 * Returns: Error flags (@FLTERR_...@).
778 *
779 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
780 * format.
781 *
782 * If an error is encountered during the encoding, and the
783 * corresponding bit of @errmask@ is clear, then processing
784 * stops immediately and the error is returned; if the bit is
785 * set, then processing continues as described below.
786 *
787 * The @TY@ may be
788 *
789 * * @mini@ for the 8-bit `1.4.3 minifloat' format, with
790 * four-bit exponent and four-bit significand, represented
791 * as a single octet;
792 *
793 * * @bf16@ for the Google `bfloat16' format, with eight-bit
794 * exponent and eight-bit significand, represented as a
795 * @uint16@;
796 *
797 * * @f16@ for the IEEE `binary16' format, with five-bit
798 * exponent and eleven-bit significand, represented as a
799 * @uint16@;
800 *
801 * * @f32@ for the IEEE `binary32' format, with eight-bit
802 * exponent and 24-bit significand, represented as a
803 * @uint32@;
804 *
805 * * @f64@ for the IEEE `binary64' format, with eleven-bit
806 * exponent and 53-bit significand, represented as a
807 * @kludge64@;
808 *
809 * * @f128@ for the IEEE `binary128' format, with fifteen-bit
810 * exponent and 113-bit significand, represented as four
811 * @uint32@ limbs, most significant first; or
812 *
813 * * @idblext80@ for the Intel 80-bit `double extended'
814 * format, with fifteen-bit exponent and 64-bit significand
815 * with no hidden bit, represented as a @uint16 se@
816 * holding the sign and exponent, and a @kludge64 m@
817 * holding the significand.
818 *
819 * Positive and negative zero and infinity are representable
820 * exactly.
821 *
822 * Following IEEE recommendations (and most implementations),
823 * the most significant fraction bit of a quiet NaN is set; this
824 * bit is clear in a signalling NaN. The most significant
825 * payload bits of a NaN, held in the top bits of @x->frac[0]@,
826 * are encoded in the output significand following the `quiet'
827 * bit. If the chosen format's significand field is too small
828 * to accommodate all of the set payload bits then the
829 * @FLTERR_INEXACT@ error bit is set and, if masked, the
830 * excess payload bits are discarded. No rounding of NaN
831 * payloads is performed.
832 *
833 * Otherwise, the input value is finite and nonzero. If the
834 * significand cannot be represented exactly then the
835 * @FLTERR_INEXACT@ error bit is set, and, if masked, the value
836 * will be rounded (internally -- the input @x@ is not changed).
837 * If the (rounded) value's exponent is too large to represent,
838 * then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are
839 * set and, if masked, the result is either the (absolute)
840 * largest representable finite value or infinity, with the
841 * appropriate sign, chosen according to the rounding mode. If
842 * the exponent is too small to represent, then the
843 * @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and,
844 * if masked, the result is either the (absolute) smallest
845 * nonzero value or zero, with the appropriate sign, chosen
846 * according to the rounding mode.
847 */
848
849unsigned fltfmt_encmini(octet *z_out, const struct floatbits *x,
850 unsigned r, unsigned errmask)
851{
852 uint32 t[1];
853 unsigned rc;
854
855 rc = fltfmt_encieee(&fltfmt_mini, t, x, r, errmask);
856 if (!(rc&~errmask)) *z_out = t[0];
857 return (rc);
858}
859
860unsigned fltfmt_encbf16(uint16 *z_out, const struct floatbits *x,
861 unsigned r, unsigned errmask)
862{
863 uint32 t[1];
864 unsigned rc;
865
866 rc = fltfmt_encieee(&fltfmt_bf16, t, x, r, errmask);
867 if (!(rc&~errmask)) *z_out = t[0];
868 return (rc);
869}
870
871unsigned fltfmt_encf16(uint16 *z_out, const struct floatbits *x,
872 unsigned r, unsigned errmask)
873{
874 uint32 t[1];
875 unsigned rc;
876
877 rc = fltfmt_encieee(&fltfmt_f16, t, x, r, errmask);
878 if (!(rc&~errmask)) *z_out = t[0];
879 return (rc);
880}
881
882unsigned fltfmt_encf32(uint32 *z_out, const struct floatbits *x,
883 unsigned r, unsigned errmask)
884 { return (fltfmt_encieee(&fltfmt_f32, z_out, x, r, errmask)); }
885
886unsigned fltfmt_encf64(kludge64 *z_out, const struct floatbits *x,
887 unsigned r, unsigned errmask)
888{
889 uint32 t[2];
890 unsigned rc;
891
892 rc = fltfmt_encieee(&fltfmt_f64, t, x, r, errmask);
893 if (!(rc&~errmask)) SET64(*z_out, t[0], t[1]);
894 return (rc);
895}
896
897unsigned fltfmt_encf128(uint32 *z_out, const struct floatbits *x,
898 unsigned r, unsigned errmask)
899 { return (fltfmt_encieee(&fltfmt_f128, z_out, x, r, errmask)); }
900
901unsigned fltfmt_encidblext80(uint16 *se_out, kludge64 *m_out,
902 const struct floatbits *x,
903 unsigned r, unsigned errmask)
904{
905 uint32 t[3];
906 unsigned rc;
907
908 rc = fltfmt_encieee(&fltfmt_idblext80, t, x, r, errmask);
909 if (!(rc&~errmask)) { *se_out = t[0]; SET64(*m_out, t[1], t[2]); }
910 return (rc);
911}
912
913/* --- @fltfmt_decieee@ --- *
914 *
915 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
916 * @struct floatbits *z_out@ = output decoded representation
917 * @const uint32 *x@ = input encoding
918 *
919 * Returns: Error flags (@FLTERR_...@).
920 *
921 * Use: Decode a floating-point value in an IEEE format. This is the
922 * machinery shared by the @fltfmt_dec...@ functions for
923 * deccoding IEEE-format values. Most of the arguments and
924 * behaviour are as described for those functions.
925 *
926 * The encoded value should be right-aligned and big-endian;
927 * i.e., the sign bit ends up in @z[0]@, and the least
928 * significant bit of the significand ends up in the least
929 * significant bit of @z[n - 1]@.
930 */
931
932unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *fmt,
933 struct floatbits *z_out, const uint32 *x)
934{
935 unsigned sigwd, err = 0, f = 0;
936 unsigned i, nb, nw, mw, esh, sh;
937 int exp, minexp, maxexp;
938 uint32 x0, t, u, emask;
939
940 /* The following code assumes that the sign, biased exponent, unit, and
941 * quiet/signalling bits can all fit into the most significant 32 bits of
942 * the result.
943 */
944 assert(fmt->expwd + 3 <= 32);
945 esh = 31 - fmt->expwd; emask = M32(fmt->expwd);
c752173d 946 sigwd = fmt->prec; if (fmt->f&FLTIF_HIDDEN) sigwd--;
b1a20bee
MW
947
948 /* Determine the input size. */
949 nb = sigwd + fmt->expwd + 1; nw = (nb + 31)/32;
950
951 /* Extract the sign, exponent, and top of the significand. */
952 sh = -nb%32;
953 x0 = x[0] << sh;
954 if (sh && nb >= 32) x0 |= x[1] >> (32 - sh);
955 if (x0&B31) f |= FLTF_NEG;
956 t = (x0 >> esh)&emask;
957
958 /* Time for a case analysis. */
959
960 if (t == emask) {
961 /* Infinity or NaN.
962 *
963 * Note that we don't include the quiet bit in our decoded payload.
964 */
965
c752173d 966 if (!(fmt->f&FLTIF_HIDDEN)) {
b1a20bee
MW
967 /* No hidden bit, so we expect the unit bit to be set. If it isn't,
968 * that's technically invalid, and its absence won't survive a round
969 * trip, since the bit isn't considered part of a NaN payload -- or
970 * even to distinguish a NaN from an infinity. In any event, reduce
971 * the notional significand size to exclude this bit from further
972 * consideration.
973 */
974
975 if (!(x0&B32(esh - 1))) err = FLTERR_INVAL;
976 sigwd--;
977 }
978
979 if (ms_set_bit(x + nw, 0, sigwd) == ALLCLEAR)
980 f |= FLTF_INF;
981 else {
c752173d 982 sh = esh - 2; if (fmt->f&FLTIF_HIDDEN) sh++;
b1a20bee
MW
983 if (x0&B32(sh)) f |= FLTF_QNAN;
984 else f |= FLTF_SNAN;
985 sigwd--; mw = (sigwd + 31)/32;
986 fltfmt_allocfrac(z_out, mw);
987 shl(z_out->frac, x + nw - mw, mw, -sigwd%32);
988 }
989 goto end;
990 }
991
992 /* Determine the exponent bounds. */
993 maxexp = (1 << (fmt->expwd - 1)) - 1;
994 minexp = 1 - maxexp;
995
996 /* Dispatch. If there's a hidden bit then everything is well defined.
997 * Otherwise, we'll normalize the incoming value regardless, but report
998 * settings of the unit bit which are inconsistent with the exponent.
999 */
c752173d 1000 if (fmt->f&FLTIF_HIDDEN) {
b1a20bee
MW
1001 if (!t) { exp = minexp; goto normalize; }
1002 else { exp = t - maxexp; goto hidden; }
1003 } else {
1004 u = x0&B32(esh - 1);
1005 if (!t) { exp = minexp; if (u) err |= FLTERR_INVAL; }
1006 else { exp = t - maxexp; if (!u) err |= FLTERR_INVAL; }
1007 goto normalize;
1008 }
1009
1010hidden:
1011 /* We have a normal real number with a hidden bit. */
1012
1013 mw = (sigwd + 31)/32;
1014
1015 if (!(sigwd%32)) {
1016 /* The bits we have occupy a whole number of words, but we need to shift
1017 * to make space for the unit bit.
1018 */
1019
1020 fltfmt_allocfrac(z_out, mw + 1);
1021 z_out->frac[mw] = shr(z_out->frac, x + nw - mw, mw, 1);
1022 } else {
1023 fltfmt_allocfrac(z_out, mw);
1024 shl(z_out->frac, x + nw - mw, mw, -(sigwd + 1)%32);
1025 }
1026 z_out->frac[0] |= B31;
1027 z_out->exp = exp + 1;
1028 goto end;
1029
1030normalize:
1031 /* We have, at least potentially, a subnormal number, with no hidden
1032 * bit.
1033 */
1034
1035 i = ms_set_bit(x + nw, 0, sigwd);
1036 if (i == ALLCLEAR) { f |= FLTF_ZERO; goto end; }
1037 mw = i/32 + 1; sh = 32*mw - i - 1;
1038 fltfmt_allocfrac(z_out, mw);
1039 shl(z_out->frac, x + nw - mw, mw, sh);
1040 z_out->exp = exp - fmt->prec + 2 + i;
1041 goto end;
1042
1043end:
1044 /* All done. */
1045 z_out->f = f; return (err);
1046}
1047
1048/* --- @fltfmt_decTY@ --- *
1049 *
1050 * Arguments: @const struct floatbits *z_out@ = storage for the result
1051 * @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ =
1052 * encoded input
1053 * @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and
1054 * significand
1055 *
1056 * Returns: Error flags (@FLTERR_...@).
1057 *
1058 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
1059 * format.
1060 *
1061 * The options for @TY@ are as documented for the encoding
1062 * functions above.
1063 *
1064 * In formats without a hidden bit -- currently only @idblext80@
1065 * -- not all bit patterns are valid encodings. If the explicit
1066 * unit bit is set when the exponent field is all-bits-zero, or
1067 * clear when the exponent field is not all-bits-zero, then the
1068 * @FLTERR_INVAL@ error bit is set. If the exponent is all-
1069 * bits-set, denoting infinity or a NaN, then the unit bit is
1070 * otherwise ignored -- in particular, it does not affect the
1071 * NaN payload, or even whether the input encodes a NaN or
1072 * infinity. Otherwise, the unit bit is considered significant,
1073 * and the result is normalized as one would expect.
1074 * Consequently, biased exponent values 0 and 1 are distinct
1075 * only with respect to which bit patterns are considered valid,
1076 * and not with respect to the set of values denoted.
1077 */
1078
1079unsigned fltfmt_decmini(struct floatbits *z_out, octet x)
1080 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_mini, z_out, t)); }
1081
1082unsigned fltfmt_decbf16(struct floatbits *z_out, uint16 x)
1083 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_bf16, z_out, t)); }
1084
1085unsigned fltfmt_decf16(struct floatbits *z_out, uint16 x)
1086 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f16, z_out, t)); }
1087
1088unsigned fltfmt_decf32(struct floatbits *z_out, uint32 x)
1089 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f32, z_out, t)); }
1090
1091unsigned fltfmt_decf64(struct floatbits *z_out, kludge64 x)
1092{
1093 uint32 t[2];
1094
1095 t[0] = HI64(x); t[1] = LO64(x);
1096 return (fltfmt_decieee(&fltfmt_f64, z_out, t));
1097}
1098
1099unsigned fltfmt_decf128(struct floatbits *z_out, const uint32 *x)
1100 { return (fltfmt_decieee(&fltfmt_f128, z_out, x)); }
1101
1102unsigned fltfmt_decidblext80(struct floatbits *z_out, uint16 se, kludge64 m)
1103{
1104 uint32 t[3];
1105
1106 t[0] = se; t[1] = HI64(m); t[2] = LO64(m);
1107 return (fltfmt_decieee(&fltfmt_idblext80, z_out, t));
1108}
1109
1110/*----- Native formats ----------------------------------------------------*/
1111
1112/* If the floating-point radix is a power of two, determine how many bits
1113 * there are in each digit. This isn't exhaustive, but it covers most of the
1114 * bases, so to speak.
1115 */
1116#if FLT_RADIX == 2
1117# define DIGIT_BITS 1
1118#elif FLT_RADIX == 4
1119# define DIGIT_BITS 2
1120#elif FLT_RADIX == 8
1121# define DIGIT_BITS 3
1122#elif FLT_RADIX == 16
1123# define DIGIT_BITS 4
1124#endif
1125
1126/* --- @ENCFLT@ --- *
1127 *
1128 * Arguments: @ty@ = the C type to encode
1129 * @TY@ = the uppercase prefix for macros
1130 * @ty (*ldexp)(ty, int)@ = function to scale a @ty@ value by a
1131 * power of two
1132 * @unsigned &rc@ = error code to set
1133 * @ty *z_out@ = storage for the result
1134 * @const struct floatbits *x@ = value to convert
1135 * @unsigned r@ = rounding mode
1136 *
1137 * Returns: ---
1138 *
1139 * Use: Encode a floating-point value @x@ as a native C object of
1140 * type @ty@. This is the machinery shared by the
1141 * @fltfmt_enc...@ functions for enccoding native-format values.
1142 * Most of the behaviour is as described for those functions.
1143 */
1144
1145/* Utilities based on conditional compilation, because we can't use
1146 * %|#ifdef|% directly in macros.
1147 */
1148
1149#ifdef NAN
1150 /* The C implementation acknowledges the existence of (quiet) NaN values,
1151 * but will neither let us set the payload in a useful way, nor
1152 * acknowledge the existence of signalling NaNs. We have no good way to
1153 * determine which NaN the @NAN@ macro produces, so report this conversion
1154 * as inexact.
1155 */
1156
1157# define SETNAN(rc, z) do { (z) = NAN; (rc) = FLTERR_INEXACT; } while (0)
1158#else
1159 /* This C implementation doesn't recognize NaNs. This value is totally
1160 * unrepresentable, so just report the error. (Maybe it's C89 and would
1161 * actually do the right thing with @0/0@. I'm not sure the requisite
1162 * compile-time configuration machinery is worth the effort.)
1163 */
1164
1165# define SETNAN(rc, z) do { (z) = 0; (rc) = FLTERR_REPR; } while (0)
1166#endif
1167
1168#ifdef INFINITY
1169 /* The C implementation supports infinities. This is a simple win. */
1170
1171# define SETINF(TY, rc, z) \
1172 do { (z) = INFINITY; (rc) = FLTERR_OK; } while (0)
1173#else
1174 /* The C implementation doesn't support infinities. Return the maximum
1175 * value and report it as an overflow; I think this is more useful than
1176 * reporting a complete representation failure. (Maybe it's C89 and would
1177 * actually do the right thing with @1/0@. Again, I'm not sure the
1178 * requisite compile-time configuration machinery is worth the effort.)
1179 */
1180
1181# define SETINF(TY, rc, z) \
1182 do { (z) = TY##_MAX; (rc) = FLTERR_OFLOW | FLTERR_INEXACT; } while (0)
1183#endif
1184
1185#ifdef DIGIT_BITS
1186 /* The floating point formats use a power-of-two radix. This means that
1187 * we can determine the correctly rounded value before we start building
1188 * the native floating-point value.
1189 */
1190
1191# define ENC_ROUND_DECLS struct floatbits _y;
1192# define ENC_ROUND(TY, rc, x, r) do { \
1193 (rc) |= fltfmt_round(&_y, (x), (r), DIGIT_BITS*TY##_MANT_DIG); \
1194 (x) = &_y; \
1195 } while (0)
1196#else
1197 /* The floating point formats use a non-power-of-two radix. This means
1198 * that conversion is inherently inexact.
1199 */
1200
1201# define ENC_ROUND_DECLS
1202# define ENC_ROUND(TY, rc, x, r) \
1203 do (rc) |= FLTERR_INEXACT; while (0)
1204# define ENC_FIXUP(...)
1205
1206#endif
1207
1208#define ENCFLT(ty, TY, ldexp, rc, z_out, x, r) do { \
1209 unsigned _rc = 0; \
1210 \
1211 /* See if the native format is one that we recognize. */ \
1212 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1213 \
1214 case FLTFMT_IEEE_F32: { \
1215 uint32 _t[1]; \
1216 unsigned char *_z = (unsigned char *)(z_out); \
1217 \
1218 (rc) = fltfmt_encieee(&fltfmt_f32, _t, (x), (r), FLTERR_ALLERRS); \
1219 FLTFMT__FROB_NAN_F32(_t, _rc); \
1220 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1221 case FLTFMT_BE: STORE32_B(_z, _t[0]); break; \
1222 case FLTFMT_LE: STORE32_L(_z, _t[0]); break; \
1223 default: assert(!"unimplemented byte order"); break; \
1224 } \
1225 } break; \
1226 \
1227 case FLTFMT_IEEE_F64: { \
1228 uint32 _t[2]; \
1229 unsigned char *_z = (unsigned char *)(z_out); \
1230 (rc) = fltfmt_encieee(&fltfmt_f64, _t, (x), (r), FLTERR_ALLERRS); \
1231 FLTFMT__FROB_NAN_F64(_t, _rc); \
1232 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1233 case FLTFMT_BE: \
1234 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1235 break; \
1236 case FLTFMT_LE: \
1237 STORE32_L(_z + 0, _t[1]); STORE32_L(_z + 4, _t[0]); \
1238 break; \
1239 case FLTFMT_ARME: \
1240 STORE32_L(_z + 0, _t[0]); STORE32_L(_z + 4, _t[1]); \
1241 break; \
1242 default: assert(!"unimplemented byte order"); break; \
1243 } \
1244 } break; \
1245 \
1246 case FLTFMT_IEEE_F128: { \
1247 uint32 _t[4]; \
1248 unsigned char *_z = (unsigned char *)(z_out); \
1249 \
1250 FLTFMT__FROB_NAN_F128(_t, _rc); \
1251 (rc) = fltfmt_encieee(&fltfmt_f128, _t, (x), (r), FLTERR_ALLERRS); \
1252 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1253 case FLTFMT_BE: \
1254 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1255 STORE32_B(_z + 8, _t[0]); STORE32_B(_z + 12, _t[1]); \
1256 break; \
1257 case FLTFMT_LE: \
1258 STORE32_L(_z + 0, _t[3]); STORE32_L(_z + 4, _t[2]); \
1259 STORE32_L(_z + 8, _t[1]); STORE32_L(_z + 12, _t[0]); \
1260 break; \
1261 default: assert(!"unimplemented byte order"); break; \
1262 } \
1263 } break; \
1264 \
1265 case FLTFMT_INTEL_F80: { \
1266 uint32 _t[3]; \
1267 unsigned char *_z = (unsigned char *)(z_out); \
1268 \
1269 (rc) = fltfmt_encieee(&fltfmt_idblext80, _t, (x), (r), FLTERR_ALLERRS); \
1270 FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
1271 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1272 case FLTFMT_BE: \
1273 STORE16_B(_z + 0, _t[0]); \
1274 STORE32_B(_z + 2, _t[1]); STORE32_B(_z + 6, _t[2]); \
1275 break; \
1276 case FLTFMT_LE: \
1277 STORE32_L(_z + 0, _t[2]); STORE32_L(_z + 4, _t[1]); \
1278 STORE16_L(_z + 8, _t[0]); \
1279 break; \
1280 default: assert(!"unimplemented byte order"); break; \
1281 } \
1282 } break; \
1283 \
1284 default: { \
1285 /* We must do this the hard way. */ \
1286 \
1287 const struct floatbits *_x = (x); \
1288 ty _z; \
1289 unsigned _i; \
1290 ENC_ROUND_DECLS; \
1291 \
1292 /* Case analysis... */ \
1293 if (_x->f&FLTF_NANMASK) { \
1294 /* A NaN. Use the macro above. */ \
1295 \
1296 SETNAN(_rc, _z); \
1297 if (x->f&FLTF_NEG) _z = -_z; \
1298 } else if (_x->f&FLTF_INF) { \
1299 /* Infinity. Use the macro. */ \
1300 \
1301 SETINF(TY, _rc, _z); \
1302 if (_x->f&FLTF_NEG) _z = -_z; \
1303 } else if (_x->f&FLTF_ZERO) { \
1304 /* Zero. If we're asked for a negative zero then check that we \
1305 * produced one: if not, then report an exactness failure. \
1306 */ \
1307 \
1308 _z = 0.0; \
1309 if (_x->f&FLTF_NEG) \
1310 { _z = -_z; if (!NEGP(_z)) _rc |= FLTERR_INEXACT; } \
1311 } else { \
1312 /* A finite value. */ \
1313 \
1314 /* If the radix is a power of two, we can round to the correct \
1315 * precision, which will save inexactness later. \
1316 */ \
1317 ENC_ROUND(TY, _rc, _x, (r)); \
1318 \
1319 /* Insert the 32-bit pieces of the fraction one at a time, \
1320 * starting from the least-significant end. This minimizes the \
1321 * inaccuracy from the overall approach, but it's imperfect \
1322 * unless the value has already been rounded correctly. \
1323 */ \
1324 _z = 0.0; \
1325 for (_i = _x->n, _z = 0.0; _i--; ) \
1326 _z += ldexp(_x->frac[_i], _x->exp - 32*_i); \
1327 \
1328 /* Negate the value if we need to. */ \
1329 if (_x->f&FLTF_NEG) _z = -_z; \
1330 } \
1331 \
1332 /* All done. */ \
1333 *(z_out) = _z; \
1334 } break; \
1335 } \
1336 \
1337 /* Set the error code. */ \
1338 (rc) = _rc; \
1339} while (0)
1340
1341/* --- @fltfmt_encTY@ --- *
1342 *
1343 * Arguments: @ty *z_out@ = storage for the result
1344 * @const struct floatbits *x@ = value to encode
1345 * @unsigned r@ = rounding mode
1346 *
1347 * Returns: Error flags (@FLTERR_...@).
1348 *
1349 * Use: Encode the floating-point value @x@ as a native C object and
1350 * store the result in @z_out@.
1351 *
1352 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1353 * @double@, or (on C99 implementations) @ldbl@ to encode a
1354 * @long double@.
1355 *
1356 * In detail, conversion is performed as follows.
1357 *
1358 * * If a non-finite value cannot be represented by the
1359 * implementation then the @FLTERR_REPR@ error bit is set
1360 * and @*z_out@ is set to zero if @x@ is a NaN, or the
1361 * (absolute) largest representable value, with appropriate
1362 * sign, if @x@ is an infinity.
1363 *
1364 * * If the implementation can represent NaNs, but cannot set
1365 * NaN payloads, then the @FLTERR_INEXACT@ error bit is set,
1366 * and @*z_out@ is set to an arbitrary (quiet) NaN value.
1367 *
1368 * * If @x@ is negative zero, but the implementation does not
1369 * distinguish negative and positive zero, then the
1370 * @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to
1371 * zero.
1372 *
1373 * * If the implementation's floating-point radix is not a
1374 * power of two, and @x@ is a nonzero finite value, then
1375 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1376 * the value is rounded by the implementation using its
1377 * prevailing rounding policy. If the radix is a power of
1378 * two, then the @FLTERR_INEXACT@ error bit is set only if
1379 * rounding is necessary, and rounding is performed using
1380 * the rounding mode @r@.
1381 */
1382
1383unsigned fltfmt_encflt(float *z_out, const struct floatbits *x, unsigned r)
1384{
1385 unsigned rc;
1386
1387 ENCFLT(double, FLT, ldexp, rc, z_out, x, r);
1388 return (rc);
1389}
1390
1391unsigned fltfmt_encdbl(double *z_out, const struct floatbits *x, unsigned r)
1392{
1393 unsigned rc;
1394
1395 ENCFLT(double, DBL, ldexp, rc, z_out, x, r);
1396 return (rc);
1397}
1398
1399#if __STDC_VERSION__ >= 199001
1400unsigned fltfmt_encldbl(long double *z_out,
1401 const struct floatbits *x, unsigned r)
1402{
1403 unsigned rc;
1404
1405 ENCFLT(long double, LDBL, ldexpl, rc, z_out, x, r);
1406 return (rc);
1407}
1408#endif
1409
1410/* --- @DECFLT@ --- *
1411 *
1412 * Arguments: @ty@ = the C type to encode
1413 * @TY@ = the uppercase prefix for macros
1414 * @ty (*frexp)(ty, int *)@ = function to decompose a @ty@ value
1415 * into a binary exponent and normalized fraction
1416 * @unsigned &rc@ = error code to set
1417 * @struct floatbits *z_out@ = storage for the result
1418 * @ty x@ = value to convert
1419 * @unsigned r@ = rounding mode
1420 *
1421 * Returns: ---
1422 *
1423 * Use: Decode a C native floating-point object. This is the
1424 * machinery shared by the @fltfmt_dec...@ functions for
1425 * decoding native-format values. Most of the behaviour is as
1426 * described for those functions.
1427 */
1428
1429/* Define some utilities for decoding native floating-point formats.
1430 *
1431 * * @NFRAC(d)@ is the number of fraction limbs we'll need for @d@ native
1432 * digits.
1433 *
1434 * * @CONVFIX@ is a final fixup applied to the decoded value.
1435 */
1436#ifdef DIGIT_BITS
1437# define NFRAC(TY) ((DIGIT_BITS*TY##_MANT_DIG + 31)/32)
1438# define CONVFIX(ty, rc, z, x, n, f, r) do assert(!(x)); while (0)
1439#else
1440# define NFRAC(TY) \
1441 (ceil(log(pow(FLT_RADIX, TY##_MANT_DIG) - 1)/32.0*log(2.0)) + 1)
1442# define CONVFIX(ty, rc, z, x, n, f, r) do { \
1443 ty _x_ = (x); \
1444 struct floatbits *_z_ = (z); \
1445 uint32 _w_; \
1446 unsigned _i_, _n_ = (n), _f_; \
1447 \
1448 /* Round the result according to the remainder left in %$x$%. */ \
1449 _f_ = 0; _i_ = _n_ - 1; _w_ = _z_->frac[_i_]; \
1450 if ((f)&FLTF_NEG) _f_ |= FRPF_NEG; \
1451 if (_w_&1) _f_ |= FRPF_ODD; \
1452 if (_y_ >= 0.5) _f_ |= FRPF_HALF; \
1453 if (_y_ != 0 && _y_ != 0.5) _f_ |= FRPF_LOW; \
1454 if (((r) >> _f_)&1) { \
1455 for (;;) { \
1456 _w_ = (_w_ + 1)&MASK32; \
1457 if (_w_ || !_i_) break; \
1458 _z_->frac[_i_] = 0; _w_ = _z_->frac[--_i_]; \
1459 } \
1460 if (!_w_) { _z_->exp++; _w_ = B31; } \
1461 _z_->frac[_i_] = w; \
1462 } \
1463 \
1464 /* The result is not exact. */ \
1465 (rc) |= FLTERR_INEXACT; \
1466 } while (0)
1467#endif
1468
1469#define DECFLT(ty, TY, frexp, rc, z_out, x, r) do { \
1470 unsigned _rc = 0; \
1471 \
1472 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1473 \
1474 case FLTFMT_IEEE_F32: { \
1475 unsigned _t[1]; \
1476 unsigned char *_x = (unsigned char *)&(x); \
1477 \
1478 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1479 case FLTFMT_BE: _t[0] = LOAD32_B(_x); break; \
1480 case FLTFMT_LE: _t[0] = LOAD32_L(_x); break; \
1481 default: assert(!"unimplemented byte order"); break; \
1482 } \
1483 FLTFMT__FROB_NAN_F32(_t, _rc); \
1484 _rc |= fltfmt_decieee(&fltfmt_f32, (z_out), _t); \
1485 } break; \
1486 \
1487 case FLTFMT_IEEE_F64: { \
1488 unsigned _t[2]; \
1489 unsigned char *_x = (unsigned char *)&(x); \
1490 \
1491 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1492 case FLTFMT_BE: \
1493 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1494 break; \
1495 case FLTFMT_LE: \
1496 _t[1] = LOAD32_L(_x + 0); _t[0] = LOAD32_L(_x + 4); \
1497 break; \
1498 case FLTFMT_ARME: \
1499 _t[0] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1500 break; \
1501 default: assert(!"unimplemented byte order"); break; \
1502 } \
1503 FLTFMT__FROB_NAN_F64(_t, _rc); \
1504 _rc |= fltfmt_decieee(&fltfmt_f64, (z_out), _t); \
1505 } break; \
1506 \
1507 case FLTFMT_IEEE_F128: { \
1508 unsigned _t[4]; \
1509 unsigned char *_x = (unsigned char *)&(x); \
1510 \
1511 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1512 case FLTFMT_BE: \
1513 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1514 _t[2] = LOAD32_B(_x + 8); _t[3] = LOAD32_B(_x + 12); \
1515 break; \
1516 case FLTFMT_LE: \
1517 _t[3] = LOAD32_L(_x + 0); _t[2] = LOAD32_L(_x + 4); \
1518 _t[1] = LOAD32_L(_x + 8); _t[0] = LOAD32_L(_x + 12); \
1519 break; \
1520 default: assert(!"unimplemented byte order"); break; \
1521 } \
1522 FLTFMT__FROB_NAN_F128(_t, _rc); \
1523 _rc |= fltfmt_decieee(&fltfmt_f128, (z_out), _t); \
1524 } break; \
1525 \
1526 case FLTFMT_INTEL_F80: { \
1527 unsigned _t[3]; \
1528 unsigned char *_x = (unsigned char *)&(x); \
1529 \
1530 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1531 case FLTFMT_BE: \
1532 _t[0] = LOAD16_B(_x + 0); \
1533 _t[1] = LOAD32_B(_x + 2); _t[2] = LOAD32_B(_x + 6); \
1534 break; \
1535 case FLTFMT_LE: \
1536 _t[2] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1537 _t[0] = LOAD16_L(_x + 8); \
1538 break; \
1539 default: assert(!"unimplemented byte order"); break; \
1540 } \
1541 FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
1542 _rc |= fltfmt_decieee(&fltfmt_idblext80, (z_out), _t); \
1543 } break; \
1544 \
1545 default: { \
1546 struct floatbits *_z = (z_out); \
1547 ty _x = (x), _y; \
1548 unsigned _i, _n, _f = 0; \
1549 uint32 _t; \
1550 \
1551 /* If the value looks negative then negate it and set the sign \
1552 * flag. \
1553 */ \
1554 if (NEGP(_x)) { _f |= FLTF_NEG; _x = -_x; } \
1555 \
1556 /* Now for the case analysis. Infinities and zero are \
1557 * unproblematic. NaNs can't be decoded exactly using the \
1558 * portable machinery. \
1559 */ \
1560 if (INFP(_x)) _f |= FLTF_INF; \
1561 else if (_x == 0.0) _f |= FLTF_ZERO; \
1562 else if (NANP(_x)) { _f |= FLTF_QNAN; _rc |= FLTERR_INEXACT; } \
1563 else { \
1564 /* A finite value. Determine the number of fraction limbs \
1565 * we'll need based on the precision and radix and pull out \
1566 * 32-bit chunks one at a time. This will be unproblematic \
1567 * for power-of-two radices, requiring at most shifting the \
1568 * significand left by a few bits, but inherently inexact (for \
1569 * the most part) for others. \
1570 */ \
1571 \
1572 _n = NFRAC(TY); fltfmt_allocfrac(_z, _n); \
1573 _y = frexp(_x, &_z->exp); \
1574 for (_i = 0; _i < _n; _i++) \
1575 { _y *= SH32; _t = _y; _y -= _t; _z->frac[_i] = _t; } \
1576 CONVFIX(ty, _rc, _z, _y, _n, _f, (r)); \
1577 } \
1578 \
1579 /* Done. */ \
1580 _z->f = _f; \
1581 } break; \
1582 } \
1583 \
1584 /* Set the error code. */ \
1585 (rc) = _rc; \
1586} while (0)
1587
1588/* --- @fltfmt_decTY@ --- *
1589 *
1590 * Arguments: @struct floatbits *z_out@ = storage for the result
1591 * @ty x@ = value to decode
1592 * @unsigned r@ = rounding mode
1593 *
1594 * Returns: Error flags (@FLTERR_...@).
1595 *
1596 * Use: Decode the native C floatingpoint value @x@ and store the
1597 * result in @z_out@.
1598 *
1599 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1600 * @double@, or (on C99 implementations) @ldbl@ to encode a
1601 * @long double@.
1602 *
1603 * In detail, conversion is performed as follows.
1604 *
1605 * * If the implementation supports negative zeros and/or
1606 * infinity, then these are recognized and decoded.
1607 *
1608 * * If the input as a NaN, but the implementation cannot
1609 * usefully report NaN payloads, then the @FLTERR_INEXACT@
1610 * error bit is set and the decoded payload is left empty.
1611 *
1612 * * If the implementation's floating-point radix is not a
1613 * power of two, and @x@ is a nonzero finite value, then
1614 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1615 * the rounded value (according to the rounding mode @r@) is
1616 * stored in as many fraction words as necessary to identify
1617 * the original value uniquely. If the radix is a power of
1618 * two, then the value is represented exactly.
1619 */
1620
1621unsigned fltfmt_decflt(struct floatbits *z_out, float x, unsigned r)
1622{
1623 unsigned rc;
1624
1625 DECFLT(double, FLT, frexp, rc, z_out, x, r);
1626 return (rc);
1627}
1628
1629unsigned fltfmt_decdbl(struct floatbits *z_out, double x, unsigned r)
1630{
1631 unsigned rc;
1632
1633 DECFLT(double, DBL, frexp, rc, z_out, x, r);
1634 return (rc);
1635}
1636
1637#if __STDC_VERSION__ >= 199001
1638unsigned fltfmt_decldbl(struct floatbits *z_out, long double x, unsigned r)
1639{
1640 unsigned rc;
1641
1642 DECFLT(long double, LDBL, frexpl, rc, z_out, x, r);
1643 return (rc);
1644}
1645#endif
1646
1647/*----- That's all, folks -------------------------------------------------*/