@@@ fltfmt mess
[mLib] / utils / fltfmt.c
CommitLineData
b1a20bee
MW
1/* -*-c-*-
2 *
3 * Floating-point format conversions
4 *
5 * (c) 2024 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of the mLib utilities library.
11 *
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
16 *
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include "config.h"
31
32#include <assert.h>
33#include <float.h>
34#include <limits.h>
35#include <math.h>
36
37#include "alloc.h"
38#include "arena.h"
39#include "bits.h"
40#include "fltfmt.h"
41#include "growbuf.h"
42#include "macros.h"
43#include "maths.h"
44
45#if GCC_VERSION_P(4, 4)
46# pragma GCC optimize "-frounding-math"
47#elif CLANG_VERSION_P(11, 0) && !CLANG_VERSION_P(12, 0)
48# pragma clang optimize "-frounding-math"
49#elif GCC_VERSION_P(0, 0) || \
50 (CLANG_VERSION_P(0, 0) && !CLANG_VERSION_P(12, 0))
51 /* We just lose. */
52#elif __STDC_VERSION__ >= 199001
53# include <fenv.h>
54# pragma STDC FENV_ACCESS ON
55#endif
56
57/*----- Some useful constants ---------------------------------------------*/
58
59#define B31 0x80000000 /* just bit 31 */
60#define B30 0x40000000 /* just bit 30 */
61
62#define SH32 4294967296.0 /* 2^32 as floating-point */
63
64/*----- Useful macros -----------------------------------------------------*/
65
66#define B32(k) ((uint32)1 << (k))
67#define M32(k) (B32(k) - 1)
68
69#define FINITEP(x) (!((x)->f&(FLTF_INF | FLTF_NANMASK | FLTF_ZERO)))
70
71/*----- Utility functions -------------------------------------------------*/
72
73/* --- @clz32@ --- *
74 *
75 * Arguments: @uint32 x@ = a 32-bit value
76 *
77 * Returns: The number of leading zeros in @x@, i.e., the nonnegative
78 * integer %$n$% such that %$2^{31} \le 2^n x < 2^{32}$%.
79 * Returns a nonsensical value if %$x = 0$%.
80 */
81
82static unsigned clz32(uint32 x)
83{
84 unsigned n = 0;
85
86 /* Divide and conquer. If the top half of the bits are clear, then there
87 * must be at least 16 leading zero bits, so accumulate and shift. Repeat
88 * for smaller powers of two.
89 *
90 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
91 */
92 if (!(x&0xffff0000)) { x <<= 16; n += 16; }
93 if (!(x&0xff000000)) { x <<= 8; n += 8; }
94 if (!(x&0xf0000000)) { x <<= 4; n += 4; }
95 if (!(x&0xc0000000)) { x <<= 2; n += 2; }
96 if (!(x&0x80000000)) { n += 1; }
97 return (n);
98}
99
100/* --- @ctz32@ --- *
101 *
102 * Arguments: @uint32 x@ = a 32-bit value
103 *
104 * Returns: The number of trailing zeros in @x@, i.e., the nonnegative
105 * integer %$n$% such that %$x/2^n$% is an odd integer.
106 * Returns a nonsensical value if %$x = 0$%.
107 */
108
109static unsigned ctz32(uint32 x)
110{
111#ifdef CTZ_TRADITIONAL
112
113 unsigned n = 0;
114
115 /* Divide and conquer. If the bottom half of the bits are clear, then
116 * there must be at least 16 trailing zero bits, so accumulate and shift.
117 * Repeat for smaller powers of two.
118 *
119 * This ends up returning 31 if %$x = 0$%, but don't rely on this.
120 */
121 if (!(x&0x0000ffff)) { x >>= 16; n += 16; }
122 if (!(x&0x000000ff)) { x >>= 8; n += 8; }
123 if (!(x&0x0000000f)) { x >>= 4; n += 4; }
124 if (!(x&0x00000003)) { x >>= 2; n += 2; }
125 if (!(x&0x00000001)) { n += 1; }
126 return (n);
127
128#else
129
130 static unsigned char tab[] =
131 /*
132 ;;; Compute the decoding table for the de Bruijn sequence trick below.
133
134 (let ((db #x04653adf)
135 (rv (make-vector 32 nil)))
136 (dotimes (i 32)
137 (aset rv (logand (ash db (- i 27)) #x1f) i))
138 (save-excursion
139 (goto-char (point-min))
140 (search-forward (concat "***" "BEGIN ctz32tab" "***"))
141 (beginning-of-line 2)
142 (delete-region (point)
143 (progn
144 (search-forward "***END***")
145 (beginning-of-line)
146 (point)))
147 (dotimes (i 32)
148 (cond ((zerop i) (insert " { "))
149 ((zerop (mod i 16)) (insert ",\n "))
150 ((zerop (mod i 4)) (insert ", "))
151 (t (insert ", ")))
152 (insert (format "%2d" (aref rv i))))
153 (insert " };\n")))
154 */
155
156 /* ***BEGIN ctz32tab*** */
157 { 0, 1, 2, 6, 3, 11, 7, 16, 4, 14, 12, 21, 8, 23, 17, 26,
158 31, 5, 10, 15, 13, 20, 22, 25, 30, 9, 19, 24, 29, 18, 28, 27 };
159 /* ***END*** */
160
161 /* Sneaky trick. Two's complement negation (which you effectively get
162 * using C unsigned arithmetic, whether you like it or not) complements all
163 * of the bits of an operand more significant than the least significant
164 * set bit. Therefore, this bit is the only one set in both %$x$% and
165 * %$-x$%, so @x&-x@ will isolate it for us.
166 *
167 * The magic number @0x04653adf@ is a %%\emph{de Bruijn} number%%: every
168 * group of five consecutive bits is distinct, including groups which `wrap
169 * around', including some low bits and some high bits. Multiplying this
170 * number by a power of two is equivalent to a left shift; and, because the
171 * top five bits are all zero, the most significant five bits of the
172 * product are the same as if we'd applied a rotation. The result is that
173 * we end up with a distinctive pattern in those bits which perfectly
174 * diagnose each shift from 0 up to 31, which we can decode using a table.
175 *
176 * David Seal described a similar trick -- using the six-bit pattern
177 * generated by the constant @0x0450fbaf@ -- in `comp.sys.arm' in 1994;
178 * this constant was particularly convenient to multiply by on early ARM
179 * processors. The use of a de Bruijn number is described in Henry
180 * Warren's %%\emph{Hacker's Delight}%%.
181 */
182 return (tab[((x&-x)*0x04653adf >> 27)&0x1f]);
183
184#endif
185}
186
187/* --- @shl@, @shr@ --- *
188 *
189 * Arguments: @uint32 *z@ = destination vector
190 * @const uint32 *x@ = source vector
191 * @size_t sz@ = size of source vector, in elements
192 * @unsigned n@ = number of bits to shift by; must be less than
193 * 32
194 *
195 * Returns: The bits shifted off the end of the vector.
196 *
197 * Use: Shift a vector of 32-bit words left (@shl@) or right (@shr@)
198 * by some number of bit positions. These functions work
199 * correctly if @z@ and @x@ are the same pointer, but not if
200 * they otherwise overlap.
201 */
202
203static uint32 shl(uint32 *z, const uint32 *x, size_t sz, unsigned n)
204{
205 size_t i;
206 uint32 t, u;
207 unsigned r;
208
209 if (!n) {
210 for (i = 0; i < sz; i++) z[i] = x[i];
211 t = 0;
212 } else {
213 r = 32 - n;
214 for (t = 0, i = sz; i--; )
215 { u = x[i]; z[i] = ((u << n) | t)&MASK32; t = u >> r; }
216 }
217 return (t);
218}
219
220static uint32 shr(uint32 *z, const uint32 *x, size_t sz, unsigned n)
221{
222 size_t i;
223 uint32 t, u;
224 unsigned r;
225
226 if (!n) {
227 for (i = 0; i < sz; i++) z[i] = x[i];
228 t = 0;
229 } else {
230 r = 32 - n;
231 for (t = 0, i = 0; i < sz; i++)
232 { u = x[i]; z[i] = ((u >> n) | t)&MASK32; t = u << r; }
233 }
234 return (t);
235}
236
237/* --- @sigbits@ --- *
238 *
239 * Arguments: @const struct floatbits *x@ = decoded floating-point number
240 *
241 * Returns: The number of significant digits in @x@'s fraction. This
242 * will be zero if @x@ is zero or infinite.
243 */
244
245static unsigned sigbits(const struct floatbits *x)
246{
247 unsigned i;
248 uint32 w;
249
250 if (x->f&(FLTF_ZERO | FLTF_INF)) return (0);
251 i = x->n;
252 for (;;) {
253 if (!i) return (0);
254 w = x->frac[--i]; if (w) return (32*(i + 1) - ctz32(w));
255 }
256}
257
258/* --- @ms_set_bit@ --- *
259 *
260 * Arguments: @const uint32 *x@ = pointer to the %%\emph{end}%% of the
261 * buffer
262 * @unsigned from, to@ = lower (inclusive) and upper (exclusive)
263 * bounds on the region of bits to inspect
264 *
265 * Returns: Index of the most significant set bit, or @ALLCLEAR@.
266 *
267 * Use: For the (rather unusual) purposes of this function, the bits
268 * of the input are numbered from zero, being the least
269 * significant bit of @x[-1]@, upwards through more significant
270 * bits of @x[-1]@, and then through @x[-2]@ and so on.
271 *
272 * If all of the bits in the half-open region are clear then
273 * @ALLCLEAR@ is returned; otherwise, the return value is the
274 * index of the most significant set bit in the region. Note
275 * that @ALLCLEAR@ is equal to @UINT_MAX@: since this is the
276 * largest possible value of @to@, and the upper bound is
277 * exclusive, this cannot be the index of a bit in the region.
278 */
279
280#define ALLCLEAR UINT_MAX
281static unsigned ms_set_bit(const uint32 *x, unsigned from, unsigned to)
282{
283 unsigned n0, n, b, base;
284 uint32 m, w;
285
286 /* <--- increasing indices <---
287 *
288 * ---+-------+-------+-------+-------+-------+---
289 * ...S |///| | | | | |//| S...
290 * ---+-------+-------+-------+-------+-------+---
291 */
292
293 /* If the region is empty then it's technically true that all of the bits
294 * are zero. It's important to be able to do answer the case where
295 * %$\id{from} = \id{to} = 0$% without accessing memory.
296 */
297 assert(to >= from); if (to == from) return (ALLCLEAR);
298
299 /* This is distressingly complicated. Here's how it's going to work.
300 *
301 * There's at least one bit to check, or we'd have returned already --
302 * specifically, we must check the bit with index @from@. But that's at
303 * the wrong end. Instead, we start at the most significant end, with the
304 * word containing the bit one short of the @to@ position. Even though
305 * it's actually one off, because we use a half-open interval, we'll call
306 * that the `@to@ bit'.
307 *
308 * We start by loading the word containing the @to@ bit, and start @base@
309 * off as the bit index of the least significant bit of this word. We mask
310 * off the high bits (if any), leaving only the @to@ bit and the less
311 * significant ones. We %%\emph{don't}%% check the remaining bits yet.
312 *
313 * We then start an offset loop. In each iteration, we check the word
314 * we're currently holding: if it's not zero, then we return @base@ plus
315 * the position of the most-significant set bit, using @clz32@. Otherwise,
316 * we load the next (less significant) word, and drop @base@ by 32, but
317 * don't check it yet. We end this loop when the word we're holding
318 * contains the @from@ bit. It's possible that we didn't do any iterations
319 * of the loop, in which case we're still holding the word containing the
320 * @to@ bit at this point.
321 *
322 * Finally, we mask off the bits below the @from@ bit, and check that what
323 * we have left is zero. If it isn't, we return @base@ plus the position
324 * of the most significant bit; if it is, we return @ALLCEAR@.
325 */
326
327 /* The first job is to find the word containing the @to@ bit and mask off
328 * any higher bits that we don't care about.
329 *
330 * Recall that the bit's index is @to - 1@, but this must be a valid index
331 * because there is at least one bit in the region. But we start out
332 * pointing beyond the vector, so we must add an extra 32 bits.
333 */
334 n0 = (to + 31)/32; x -= n0; base = (to - 1)&~31u; w = *x++;
335 b = to%32; if (b) w &= M32(b);
336
337 /* Before we start the loop, it'd be useful to know how many iterations we
338 * need. This is going to be the offset from the word containing the @to@
339 * bit to the word containing the @from@ bit. Again, we're off by one
340 * because that's how our initial indexing is set up.
341 */
342 n = n0 - from/32 - 1;
343
344 /* Now it's time to do the loop. This is the easy bit. */
345 while (n--) {
346 if (w) return (base + 31 - clz32(w));
347 w = *x++&MASK32; base -= 32;
348 }
349
350 /* We're now holding the final word -- the one containing the @from@ bit.
351 * We need to mask off any low bits that we don't care about.
352 */
353 m = M32(from%32); w &= MASK32&~m;
354
355 /* Final check. */
356 if (w) return (base + 31 - clz32(w));
357 else return (ALLCLEAR);
358}
359
360/*----- General floating-point hacking ------------------------------------*/
361
362/* --- @fltfmt_initbits@ --- *
363 *
364 * Arguments: @struct floatbits *x@ = pointer to structure to initialize
365 *
366 * Returns: ---
367 *
368 * Use: Dynamically initialize @x@ to (positive) zero so that it can
369 * be used as the destination operand by other operations. This
370 * doesn't allocate resources and cannot fail. The
371 * @FLOATBITS_INIT@ macro is a suitable static initializer for
372 * performing the same task.
373 */
374
375void fltfmt_initbits(struct floatbits *x)
376{
377 x->f = FLTF_ZERO;
378 x->a = arena_global;
379 x->frac = 0; x->n = x->fracsz = 0;
380}
381
382/* --- @fltfmt_freebits@ --- *
383 *
384 * Arguments: @struct floatbits *x@ = pointer to structure to free
385 *
386 * Returns: ---
387 *
388 * Use: Releases the memory held by @x@. Afterwards, @x@ is a valid
389 * (positive) zero, but can safely be discarded.
390 */
391
392void fltfmt_freebits(struct floatbits *x)
393{
394 if (x->frac) x_free(x->a, x->frac);
395 x->f = FLTF_ZERO;
396 x->frac = 0; x->n = x->fracsz = 0;
397}
398
399/* --- @fltfmt_allocfrac@ --- *
400 *
401 * Arguments: @struct floatbits *x@ = structure to adjust
402 * @unsigned n@ = number of words required
403 *
404 * Returns: ---
405 *
406 * Use: Reallocate the @frac@ vector so that it has space for at
407 * least @n@ 32-bit words, and set @x->n@ equal to @n@. If the
408 * current size is already @n@ or greater, then just update the
409 * active length @n@ and return; otherwise, any existing vector
410 * is discarded and a fresh, larger one allocated.
411 */
412
413void fltfmt_allocfrac(struct floatbits *x, unsigned n)
414 { GROWBUF_REPLACEV(unsigned, x->a, x->frac, x->fracsz, n, 4); x->n = n; }
415
416/* --- @fltfmt_copybits@ --- *
417 *
418 * Arguments: @struct floatbits *z_out@ = where to leave the result
419 * @const struct floatbits *x@ = source to copy
420 *
421 * Returns: ---
422 *
423 * Use: Make @z_out@ be a copy of @x@. If @z_out@ is the same object
424 * as @x@ then do nothing.
425 */
426
427void fltfmt_copybits(struct floatbits *z_out, const struct floatbits *x)
428{
429 unsigned i;
430
431 if (z_out == x) return;
432 z_out->f = x->f;
433 if (!FINITEP(x)) z_out->exp = 0;
434 else z_out->exp = x->exp;
435 if ((x->f&(FLTF_ZERO | FLTF_INF)) || !x->n)
436 { z_out->n = 0; z_out->frac = 0; }
437 else {
438 fltfmt_allocfrac(z_out, x->n);
439 for (i = 0; i < x->n; i++) z_out->frac[i] = x->frac[i];
440 }
441}
442
443/* --- @fltfmt_round@ --- *
444 *
445 * Arguments: @struct floatbits *z_out@ = destination (may equal source)
446 * @const struct floatbits *x@ = source
447 * @unsigned r@ = rounding mode (@FLTRND_...@ code)
448 * @unsigned n@ = nonzero number of bits to leave
449 *
450 * Returns: A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if
451 * rounding discarded some nonzero value bits, or @FLTERR_OK@ if
452 * rounding was unnecessary.
453 *
454 * Use: Rounds a floating-point value to a given number of
455 * significant bits, using the given rounding rule.
456 */
457
458unsigned fltfmt_round(struct floatbits *z_out, const struct floatbits *x,
459 unsigned r, unsigned n)
460{
461 unsigned rf, i, uw, ub, hw, hb, rc = 0;
462 uint32 um, hm, w;
463 int exp;
464
465 /* Check that this is likely to work. We must have at least one bit
466 * remaining, so that we can inspect the last-place unit bit. And we
467 * mustn't round up if the current value is already exact, because that
468 * would be nonsensical (and inconvenient).
469 */
470 assert(n > 0); assert(!(r&~(FRPMASK_LOW | FRPMASK_HALF)));
471
472 /* Eliminate trivial cases. There's nothing to do if the value is infinite
473 * or zero, or if we don't have enough precision already.
474 *
475 * The caller will have set the rounding mode and length suitably for a
476 * NaN.
477 */
478 if (x->f&(FLTF_ZERO | FLTF_INF) || n >= 32*x->n)
479 { fltfmt_copybits(z_out, x); return (FLTERR_OK); }
480
481 /* Determine various indices.
482 *
483 * The quantities @uw@ and @ub@ are the word and bit number which will hold
484 * the unit bit when we've finished; @hw@ and @hb@ similarly index the
485 * `half' bit, which is the next less significant bit.
486 */
487 uw = (n - 1)/32; ub = -n&31;
488 if (!ub) { hw = uw + 1; hb = 31; }
489 else { hw = uw; hb = ub - 1; }
490
491 /* Determine the necessary predicates for the rounding decision. */
492 rf = 0;
493 if (x->f&FLTF_NEG) rf |= FRPF_NEG;
494 um = B32(ub); if (x->frac[uw]&um) rf |= FRPF_ODD;
495 hm = B32(hb); if (x->frac[hw]&hm) rf |= FRPF_HALF;
496 if (x->frac[hw]&(hm - 1)) rf |= FRPF_LOW;
497 for (i = hw + 1; i < x->n; i++) if (x->frac[i]) rf |= FRPF_LOW;
498 if (rf&(FRPF_LOW | FRPF_HALF)) rc |= FLTERR_INEXACT;
499
500 /* Allocate space for the result. */
501 fltfmt_allocfrac(z_out, uw + 1);
502
503 /* We start looking at the least significant word of the result. Clear the
504 * low bits.
505 */
506 i = uw; exp = x->exp; w = x->frac[i]&~(um - 1);
507
508 /* If the rounding function is true then we need to add one to the
509 * truncated fraction and propagate carries.
510 */
511 if ((r >> rf)&1) {
512 w = (w + um)&MASK32;
513 while (i && !w) {
514 z_out->frac[i] = w;
515 w = (x->frac[--i] + 1)&MASK32;
516 }
517 if (!w) { w = B31; exp++; }
518 }
519
520 /* Store, and copy the remaining words. */
521 for (;;) {
522 z_out->frac[i] = w;
523 if (!i) break;
524 w = x->frac[--i];
525 }
526
527 /* Done. */
528 z_out->f = x->f&(FLTF_NEG | FLTF_NANMASK);
529 if (x->f&FLTF_NANMASK) z_out->exp = 0;
530 else z_out->exp = exp;
531 return (rc);
532}
533
534/*----- IEEE formats ------------------------------------------------------*/
535
536/* IEEE (and related) format descriptions. */
537const struct fltfmt_ieeefmt
538 fltfmt_mini = { IEEEF_HIDDEN, 4, 4 },
539 fltfmt_bf16 = { IEEEF_HIDDEN, 8, 8 },
540 fltfmt_f16 = { IEEEF_HIDDEN, 5, 11 },
541 fltfmt_f32 = { IEEEF_HIDDEN, 8, 24 },
542 fltfmt_f64 = { IEEEF_HIDDEN, 11, 53 },
543 fltfmt_f128 = { IEEEF_HIDDEN, 15, 113 },
544 fltfmt_idblext80 = { 0, 15, 64 };
545
546/* --- @fltfmt_encieee@ ---
547 *
548 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
549 * @uint32 *z@ = output vector
550 * @const struct floatbits *x@ = value to encode
551 * @unsigned r@ = rounding mode
552 * @unsigned errmask@ = error mask
553 *
554 * Returns: Error flags (@FLTERR_...@).
555 *
556 * Use: Encode a floating-point value in an IEEE format. This is the
557 * machinery shared by the @fltfmt_enc...@ functions for
558 * encoding IEEE-format values. Most of the arguments and
559 * behaviour are as described for those functions.
560 *
561 * The encoded value is right-aligned and big-endian; i.e., the
562 * sign bit ends up in @z[0]@, and the least significant bit of
563 * the significand ends up in the least significant bit of
564 * @z[n - 1]@.
565 */
566
567unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *fmt,
568 uint32 *z, const struct floatbits *x,
569 unsigned r, unsigned errmask)
570{
571 struct floatbits y;
572 unsigned sigwd, fracwd, err = 0, f = x->f, rf;
573 unsigned i, j, n, nb, nw, mb, mw, esh, sh;
574 int exp, minexp, maxexp;
575 uint32 z0, t;
576
577#define ERR(e) do { err |= (e); if (err&~errmask) goto end; } while (0)
578
579 /* The following code assumes that the sign, biased exponent, unit, and
580 * quiet/signalling bits can all fit into the most significant 32 bits of
581 * the result.
582 */
583 assert(fmt->expwd + 3 <= 32);
584 esh = 31 - fmt->expwd;
585
586 /* Determine the output size. */
587 nb = fmt->prec + fmt->expwd + 1;
588 if (fmt->f&IEEEF_HIDDEN) nb--;
589 nw = (nb + 31)/32;
590
591 /* Determine the top bits. */
592 z0 = 0; i = 0;
593 if (x->f&FLTF_NEG) z0 |= B31;
594
595 /* And now for the main case analysis. */
596
597 if (f&FLTF_ZERO) {
598 /* Zero. There's very little to do here. */
599
600 } else if (f&FLTF_INF) {
601 /* Infinity. Set the exponent and, for non-hidden-bit formats, the unit
602 * bit.
603 */
604
605 z0 |= M32(fmt->expwd) << esh;
606 if (!(fmt->f&IEEEF_HIDDEN)) z0 |= B32(esh - 1);
607
608 } else if (f&FLTF_NANMASK) {
609 /* Not-a-number.
610 *
611 * We must check that we won't lose significant bits. We need a bit for
612 * the quiet/signalling flag, and enough space for the significant
613 * payload bits. The unit bit is not in play here, so the available
614 * space is always one less than the advertised precision. To put it
615 * another way, we need space for the payload, a bit for the
616 * quiet/signalling flag, and a bit for the unit place.
617 */
618
619 fracwd = sigbits(x);
620 if (fracwd + 2 > fmt->prec) ERR(FLTERR_INEXACT);
621
622 /* Copy the payload.
623 *
624 * If the payload is all-zero and we're meant to set a signalling NaN
625 * then report an exactness failure and set the low bit.
626 */
627 mb = fmt->prec - 2; mw = (mb + 31)/32; sh = -mb%32;
628 for (i = 0; i < nw - mw; i++) z[i] = 0;
629 n = x->n; if (n > mw) n = nw;
630 t = shr(z + i, x->frac, n, sh); i += n;
631 if (i < nw) z[i++] = t;
632 sh = esh - 2; if (fmt->f&IEEEF_HIDDEN) sh++;
633 if (f&FLTF_QNAN) z0 |= B32(sh);
634 else if (!fracwd) { ERR(FLTERR_INEXACT); z[nw - 1] |= 1; }
635
636 /* Set the exponent and, for non-hidden-bit formats, the unit bit. */
637 z0 |= M32(fmt->expwd) << esh;
638 if (!(fmt->f&IEEEF_HIDDEN)) z0 |= B32(esh - 1);
639
640 } else {
641 /* A finite value.
642 *
643 * Adjust the exponent by one place to compensate for the difference in
644 * significant conventions. Our significand lies between zero (in fact,
645 * a half, because we require normalization) and one, while an IEEE
646 * significand lies between zero (in fact, one) and two. Our exponent is
647 * therefore one larger than the IEEE exponent will be.
648 */
649
650 /* Determine the maximum true (unbiased) exponent. As noted above, this
651 * is also the bias.
652 */
653 exp = x->exp - 1;
654 maxexp = (1 << (fmt->expwd - 1)) - 1;
655 minexp = 1 - maxexp;
656
657 if (exp <= minexp - (int)fmt->prec) {
658 /* If the exponent is very small then we underflow. We have %$p - 1$%
659 * bits available to represent a subnormal significand, and therefore
660 * can represent at least one bit of a value as small as
661 * %$2^{e_{\text{min}}-p+1}$%.
662 *
663 * If the exponent is one short of the threshold, then we check to see
664 * whether the value will round up.
665 */
666
667 if ((minexp - exp == fmt->prec) &&
668 ((r >> (FRPF_HALF |
669 (sigbits(x) > 1 ? FRPF_LOW : 0) |
670 (f&FLTF_NEG ? FRPF_NEG : 0)))&1)) {
671 ERR(FLTERR_INEXACT);
672 for (i = 0; i < nw - 1; i++) z[i] = 0;
673 z[i++] = 1;
674 } else {
675 ERR(FLTERR_UFLOW | FLTERR_INEXACT);
676 /* Return (signed) zero. */
677 }
678
679 } else {
680 /* We can at least try to store some bits. */
681
682 /* Let's see how many we need to deal with and how much space we have.
683 * We might as well set the biased exponent here while we're at it.
684 *
685 * If %$e \ge e_{\text{min}}$% then we can store %$p$% bits of
686 * significand. Otherwise, we must make a subnormal and we can only
687 * store %$p + e - e_{\text{min}}$% bits. (Cross-check: if %$e \le
688 * e_{\text{min}} - p$% then we can store zero bits or fewer and have
689 * underflowed to zero, which matches the previous case.) In the
690 * subnormal case, we also `correct' the exponent so that we store the
691 * correct sentinel value later.
692 */
693 fracwd = sigbits(x);
694 if (exp >= minexp) sigwd = fmt->prec;
695 else { sigwd = fmt->prec + exp - minexp; exp = minexp - 1; }
696 mw = (sigwd + 31)/32; sh = -sigwd%32;
697
698 /* If we don't have enough significand bits then we must round. This
699 * might increase the exponent, so we must reload.
700 */
701 if (fracwd > sigwd) {
702 ERR(FLTERR_INEXACT);
703 y.frac = z + nw - mw; y.fracsz = mw; fltfmt_round(&y, x, r, sigwd);
704 x = &y; exp = y.exp - 1; fracwd = sigwd;
705 }
706
707 if (exp > maxexp) {
708 /* If the exponent is too large, then we overflow. If the error is
709 * masked, then we must produce a default value, choosing between
710 * infinity and the largest representable finite value according to
711 * the rounding mode.
712 */
713
714 rf = FRPF_ODD | FRPF_HALF | FRPF_LOW;
715 if (f&FLTF_NEG) rf |= FRPF_NEG;
716 if ((r >> rf)&1) {
717 ERR(FLTERR_OFLOW | FLTERR_INEXACT);
718 z0 |= M32(fmt->expwd) << esh;
719 } else {
720 ERR(FLTERR_INEXACT);
721 z0 |= (B32(fmt->expwd) - 2) << esh;
722 mb = fmt->prec; if (fmt->f&IEEEF_HIDDEN) mb--;
723 mw = (mb + 31)/32;
724 i = nw - mw;
725 z[i++] = M32(mb%32);
726 while (i < nw) z[i++] = MASK32;
727 }
728
729 } else {
730 /* The exponent is in range. Everything is ready. */
731
732 /* Store the significand. */
733 n = (fracwd + 31)/32; i = nw - mw;
734 t = shr(z + i, x->frac, n, sh); i += n;
735 if (i < nw) z[i++] = t;
736
737 /* Fill in the top end. */
738 for (j = nw - mw; j--; ) z[j] = 0;
739
740 /* Set the biased exponent. */
741 z0 |= (exp + maxexp) << esh;
742
743 /* Clear the unit bit if we're suppose to use a hidden-bit convention. */
744 if (fmt->f&IEEEF_HIDDEN) {
745 mb = fmt->prec - 1; mw = (mb + 31)/32; mb = mb%32;
746 z[nw - mw] &= ~B32(mb);
747 }
748 }
749 }
750 }
751
752 /* Clear the significand bits that we haven't set explicitly yet. */
753 while (i < nw) z[i++] = 0;
754
755 /* All that remains is to insert the top bits @z0@ in the right place.
756 * This will set the exponent, and the unit and quiet bits.
757 */
758 sh = -nb%32;
759 z[0] |= z0 >> sh;
760 if (sh && nb >= 32) z[1] |= z0 << (32 - sh);
761
762end:
763 return (err);
764
765#undef ERR
766}
767
768/* --- @fltfmt_encTY@ --- *
769 *
770 * Arguments: @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@,
771 * @kludge64 *z_out@ = where to put the encoded value
772 * @uint16 *se_out@, @kludge64 *m_out@ = where to put the
773 * encoded sign-and-exponent and significand
774 * @const struct floatbits *x@ = value to encode
775 * @unsigned r@ = rounding mode
776 * @unsigned errmask@ = error mask
777 *
778 * Returns: Error flags (@FLTERR_...@).
779 *
780 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
781 * format.
782 *
783 * If an error is encountered during the encoding, and the
784 * corresponding bit of @errmask@ is clear, then processing
785 * stops immediately and the error is returned; if the bit is
786 * set, then processing continues as described below.
787 *
788 * The @TY@ may be
789 *
790 * * @mini@ for the 8-bit `1.4.3 minifloat' format, with
791 * four-bit exponent and four-bit significand, represented
792 * as a single octet;
793 *
794 * * @bf16@ for the Google `bfloat16' format, with eight-bit
795 * exponent and eight-bit significand, represented as a
796 * @uint16@;
797 *
798 * * @f16@ for the IEEE `binary16' format, with five-bit
799 * exponent and eleven-bit significand, represented as a
800 * @uint16@;
801 *
802 * * @f32@ for the IEEE `binary32' format, with eight-bit
803 * exponent and 24-bit significand, represented as a
804 * @uint32@;
805 *
806 * * @f64@ for the IEEE `binary64' format, with eleven-bit
807 * exponent and 53-bit significand, represented as a
808 * @kludge64@;
809 *
810 * * @f128@ for the IEEE `binary128' format, with fifteen-bit
811 * exponent and 113-bit significand, represented as four
812 * @uint32@ limbs, most significant first; or
813 *
814 * * @idblext80@ for the Intel 80-bit `double extended'
815 * format, with fifteen-bit exponent and 64-bit significand
816 * with no hidden bit, represented as a @uint16 se@
817 * holding the sign and exponent, and a @kludge64 m@
818 * holding the significand.
819 *
820 * Positive and negative zero and infinity are representable
821 * exactly.
822 *
823 * Following IEEE recommendations (and most implementations),
824 * the most significant fraction bit of a quiet NaN is set; this
825 * bit is clear in a signalling NaN. The most significant
826 * payload bits of a NaN, held in the top bits of @x->frac[0]@,
827 * are encoded in the output significand following the `quiet'
828 * bit. If the chosen format's significand field is too small
829 * to accommodate all of the set payload bits then the
830 * @FLTERR_INEXACT@ error bit is set and, if masked, the
831 * excess payload bits are discarded. No rounding of NaN
832 * payloads is performed.
833 *
834 * Otherwise, the input value is finite and nonzero. If the
835 * significand cannot be represented exactly then the
836 * @FLTERR_INEXACT@ error bit is set, and, if masked, the value
837 * will be rounded (internally -- the input @x@ is not changed).
838 * If the (rounded) value's exponent is too large to represent,
839 * then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are
840 * set and, if masked, the result is either the (absolute)
841 * largest representable finite value or infinity, with the
842 * appropriate sign, chosen according to the rounding mode. If
843 * the exponent is too small to represent, then the
844 * @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and,
845 * if masked, the result is either the (absolute) smallest
846 * nonzero value or zero, with the appropriate sign, chosen
847 * according to the rounding mode.
848 */
849
850unsigned fltfmt_encmini(octet *z_out, const struct floatbits *x,
851 unsigned r, unsigned errmask)
852{
853 uint32 t[1];
854 unsigned rc;
855
856 rc = fltfmt_encieee(&fltfmt_mini, t, x, r, errmask);
857 if (!(rc&~errmask)) *z_out = t[0];
858 return (rc);
859}
860
861unsigned fltfmt_encbf16(uint16 *z_out, const struct floatbits *x,
862 unsigned r, unsigned errmask)
863{
864 uint32 t[1];
865 unsigned rc;
866
867 rc = fltfmt_encieee(&fltfmt_bf16, t, x, r, errmask);
868 if (!(rc&~errmask)) *z_out = t[0];
869 return (rc);
870}
871
872unsigned fltfmt_encf16(uint16 *z_out, const struct floatbits *x,
873 unsigned r, unsigned errmask)
874{
875 uint32 t[1];
876 unsigned rc;
877
878 rc = fltfmt_encieee(&fltfmt_f16, t, x, r, errmask);
879 if (!(rc&~errmask)) *z_out = t[0];
880 return (rc);
881}
882
883unsigned fltfmt_encf32(uint32 *z_out, const struct floatbits *x,
884 unsigned r, unsigned errmask)
885 { return (fltfmt_encieee(&fltfmt_f32, z_out, x, r, errmask)); }
886
887unsigned fltfmt_encf64(kludge64 *z_out, const struct floatbits *x,
888 unsigned r, unsigned errmask)
889{
890 uint32 t[2];
891 unsigned rc;
892
893 rc = fltfmt_encieee(&fltfmt_f64, t, x, r, errmask);
894 if (!(rc&~errmask)) SET64(*z_out, t[0], t[1]);
895 return (rc);
896}
897
898unsigned fltfmt_encf128(uint32 *z_out, const struct floatbits *x,
899 unsigned r, unsigned errmask)
900 { return (fltfmt_encieee(&fltfmt_f128, z_out, x, r, errmask)); }
901
902unsigned fltfmt_encidblext80(uint16 *se_out, kludge64 *m_out,
903 const struct floatbits *x,
904 unsigned r, unsigned errmask)
905{
906 uint32 t[3];
907 unsigned rc;
908
909 rc = fltfmt_encieee(&fltfmt_idblext80, t, x, r, errmask);
910 if (!(rc&~errmask)) { *se_out = t[0]; SET64(*m_out, t[1], t[2]); }
911 return (rc);
912}
913
914/* --- @fltfmt_decieee@ --- *
915 *
916 * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description
917 * @struct floatbits *z_out@ = output decoded representation
918 * @const uint32 *x@ = input encoding
919 *
920 * Returns: Error flags (@FLTERR_...@).
921 *
922 * Use: Decode a floating-point value in an IEEE format. This is the
923 * machinery shared by the @fltfmt_dec...@ functions for
924 * deccoding IEEE-format values. Most of the arguments and
925 * behaviour are as described for those functions.
926 *
927 * The encoded value should be right-aligned and big-endian;
928 * i.e., the sign bit ends up in @z[0]@, and the least
929 * significant bit of the significand ends up in the least
930 * significant bit of @z[n - 1]@.
931 */
932
933unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *fmt,
934 struct floatbits *z_out, const uint32 *x)
935{
936 unsigned sigwd, err = 0, f = 0;
937 unsigned i, nb, nw, mw, esh, sh;
938 int exp, minexp, maxexp;
939 uint32 x0, t, u, emask;
940
941 /* The following code assumes that the sign, biased exponent, unit, and
942 * quiet/signalling bits can all fit into the most significant 32 bits of
943 * the result.
944 */
945 assert(fmt->expwd + 3 <= 32);
946 esh = 31 - fmt->expwd; emask = M32(fmt->expwd);
947 sigwd = fmt->prec; if (fmt->f&IEEEF_HIDDEN) sigwd--;
948
949 /* Determine the input size. */
950 nb = sigwd + fmt->expwd + 1; nw = (nb + 31)/32;
951
952 /* Extract the sign, exponent, and top of the significand. */
953 sh = -nb%32;
954 x0 = x[0] << sh;
955 if (sh && nb >= 32) x0 |= x[1] >> (32 - sh);
956 if (x0&B31) f |= FLTF_NEG;
957 t = (x0 >> esh)&emask;
958
959 /* Time for a case analysis. */
960
961 if (t == emask) {
962 /* Infinity or NaN.
963 *
964 * Note that we don't include the quiet bit in our decoded payload.
965 */
966
967 if (!(fmt->f&IEEEF_HIDDEN)) {
968 /* No hidden bit, so we expect the unit bit to be set. If it isn't,
969 * that's technically invalid, and its absence won't survive a round
970 * trip, since the bit isn't considered part of a NaN payload -- or
971 * even to distinguish a NaN from an infinity. In any event, reduce
972 * the notional significand size to exclude this bit from further
973 * consideration.
974 */
975
976 if (!(x0&B32(esh - 1))) err = FLTERR_INVAL;
977 sigwd--;
978 }
979
980 if (ms_set_bit(x + nw, 0, sigwd) == ALLCLEAR)
981 f |= FLTF_INF;
982 else {
983 sh = esh - 2; if (fmt->f&IEEEF_HIDDEN) sh++;
984 if (x0&B32(sh)) f |= FLTF_QNAN;
985 else f |= FLTF_SNAN;
986 sigwd--; mw = (sigwd + 31)/32;
987 fltfmt_allocfrac(z_out, mw);
988 shl(z_out->frac, x + nw - mw, mw, -sigwd%32);
989 }
990 goto end;
991 }
992
993 /* Determine the exponent bounds. */
994 maxexp = (1 << (fmt->expwd - 1)) - 1;
995 minexp = 1 - maxexp;
996
997 /* Dispatch. If there's a hidden bit then everything is well defined.
998 * Otherwise, we'll normalize the incoming value regardless, but report
999 * settings of the unit bit which are inconsistent with the exponent.
1000 */
1001 if (fmt->f&IEEEF_HIDDEN) {
1002 if (!t) { exp = minexp; goto normalize; }
1003 else { exp = t - maxexp; goto hidden; }
1004 } else {
1005 u = x0&B32(esh - 1);
1006 if (!t) { exp = minexp; if (u) err |= FLTERR_INVAL; }
1007 else { exp = t - maxexp; if (!u) err |= FLTERR_INVAL; }
1008 goto normalize;
1009 }
1010
1011hidden:
1012 /* We have a normal real number with a hidden bit. */
1013
1014 mw = (sigwd + 31)/32;
1015
1016 if (!(sigwd%32)) {
1017 /* The bits we have occupy a whole number of words, but we need to shift
1018 * to make space for the unit bit.
1019 */
1020
1021 fltfmt_allocfrac(z_out, mw + 1);
1022 z_out->frac[mw] = shr(z_out->frac, x + nw - mw, mw, 1);
1023 } else {
1024 fltfmt_allocfrac(z_out, mw);
1025 shl(z_out->frac, x + nw - mw, mw, -(sigwd + 1)%32);
1026 }
1027 z_out->frac[0] |= B31;
1028 z_out->exp = exp + 1;
1029 goto end;
1030
1031normalize:
1032 /* We have, at least potentially, a subnormal number, with no hidden
1033 * bit.
1034 */
1035
1036 i = ms_set_bit(x + nw, 0, sigwd);
1037 if (i == ALLCLEAR) { f |= FLTF_ZERO; goto end; }
1038 mw = i/32 + 1; sh = 32*mw - i - 1;
1039 fltfmt_allocfrac(z_out, mw);
1040 shl(z_out->frac, x + nw - mw, mw, sh);
1041 z_out->exp = exp - fmt->prec + 2 + i;
1042 goto end;
1043
1044end:
1045 /* All done. */
1046 z_out->f = f; return (err);
1047}
1048
1049/* --- @fltfmt_decTY@ --- *
1050 *
1051 * Arguments: @const struct floatbits *z_out@ = storage for the result
1052 * @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ =
1053 * encoded input
1054 * @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and
1055 * significand
1056 *
1057 * Returns: Error flags (@FLTERR_...@).
1058 *
1059 * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent)
1060 * format.
1061 *
1062 * The options for @TY@ are as documented for the encoding
1063 * functions above.
1064 *
1065 * In formats without a hidden bit -- currently only @idblext80@
1066 * -- not all bit patterns are valid encodings. If the explicit
1067 * unit bit is set when the exponent field is all-bits-zero, or
1068 * clear when the exponent field is not all-bits-zero, then the
1069 * @FLTERR_INVAL@ error bit is set. If the exponent is all-
1070 * bits-set, denoting infinity or a NaN, then the unit bit is
1071 * otherwise ignored -- in particular, it does not affect the
1072 * NaN payload, or even whether the input encodes a NaN or
1073 * infinity. Otherwise, the unit bit is considered significant,
1074 * and the result is normalized as one would expect.
1075 * Consequently, biased exponent values 0 and 1 are distinct
1076 * only with respect to which bit patterns are considered valid,
1077 * and not with respect to the set of values denoted.
1078 */
1079
1080unsigned fltfmt_decmini(struct floatbits *z_out, octet x)
1081 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_mini, z_out, t)); }
1082
1083unsigned fltfmt_decbf16(struct floatbits *z_out, uint16 x)
1084 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_bf16, z_out, t)); }
1085
1086unsigned fltfmt_decf16(struct floatbits *z_out, uint16 x)
1087 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f16, z_out, t)); }
1088
1089unsigned fltfmt_decf32(struct floatbits *z_out, uint32 x)
1090 { uint32 t[1]; t[0] = x; return (fltfmt_decieee(&fltfmt_f32, z_out, t)); }
1091
1092unsigned fltfmt_decf64(struct floatbits *z_out, kludge64 x)
1093{
1094 uint32 t[2];
1095
1096 t[0] = HI64(x); t[1] = LO64(x);
1097 return (fltfmt_decieee(&fltfmt_f64, z_out, t));
1098}
1099
1100unsigned fltfmt_decf128(struct floatbits *z_out, const uint32 *x)
1101 { return (fltfmt_decieee(&fltfmt_f128, z_out, x)); }
1102
1103unsigned fltfmt_decidblext80(struct floatbits *z_out, uint16 se, kludge64 m)
1104{
1105 uint32 t[3];
1106
1107 t[0] = se; t[1] = HI64(m); t[2] = LO64(m);
1108 return (fltfmt_decieee(&fltfmt_idblext80, z_out, t));
1109}
1110
1111/*----- Native formats ----------------------------------------------------*/
1112
1113/* If the floating-point radix is a power of two, determine how many bits
1114 * there are in each digit. This isn't exhaustive, but it covers most of the
1115 * bases, so to speak.
1116 */
1117#if FLT_RADIX == 2
1118# define DIGIT_BITS 1
1119#elif FLT_RADIX == 4
1120# define DIGIT_BITS 2
1121#elif FLT_RADIX == 8
1122# define DIGIT_BITS 3
1123#elif FLT_RADIX == 16
1124# define DIGIT_BITS 4
1125#endif
1126
1127/* --- @ENCFLT@ --- *
1128 *
1129 * Arguments: @ty@ = the C type to encode
1130 * @TY@ = the uppercase prefix for macros
1131 * @ty (*ldexp)(ty, int)@ = function to scale a @ty@ value by a
1132 * power of two
1133 * @unsigned &rc@ = error code to set
1134 * @ty *z_out@ = storage for the result
1135 * @const struct floatbits *x@ = value to convert
1136 * @unsigned r@ = rounding mode
1137 *
1138 * Returns: ---
1139 *
1140 * Use: Encode a floating-point value @x@ as a native C object of
1141 * type @ty@. This is the machinery shared by the
1142 * @fltfmt_enc...@ functions for enccoding native-format values.
1143 * Most of the behaviour is as described for those functions.
1144 */
1145
1146/* Utilities based on conditional compilation, because we can't use
1147 * %|#ifdef|% directly in macros.
1148 */
1149
1150#ifdef NAN
1151 /* The C implementation acknowledges the existence of (quiet) NaN values,
1152 * but will neither let us set the payload in a useful way, nor
1153 * acknowledge the existence of signalling NaNs. We have no good way to
1154 * determine which NaN the @NAN@ macro produces, so report this conversion
1155 * as inexact.
1156 */
1157
1158# define SETNAN(rc, z) do { (z) = NAN; (rc) = FLTERR_INEXACT; } while (0)
1159#else
1160 /* This C implementation doesn't recognize NaNs. This value is totally
1161 * unrepresentable, so just report the error. (Maybe it's C89 and would
1162 * actually do the right thing with @0/0@. I'm not sure the requisite
1163 * compile-time configuration machinery is worth the effort.)
1164 */
1165
1166# define SETNAN(rc, z) do { (z) = 0; (rc) = FLTERR_REPR; } while (0)
1167#endif
1168
1169#ifdef INFINITY
1170 /* The C implementation supports infinities. This is a simple win. */
1171
1172# define SETINF(TY, rc, z) \
1173 do { (z) = INFINITY; (rc) = FLTERR_OK; } while (0)
1174#else
1175 /* The C implementation doesn't support infinities. Return the maximum
1176 * value and report it as an overflow; I think this is more useful than
1177 * reporting a complete representation failure. (Maybe it's C89 and would
1178 * actually do the right thing with @1/0@. Again, I'm not sure the
1179 * requisite compile-time configuration machinery is worth the effort.)
1180 */
1181
1182# define SETINF(TY, rc, z) \
1183 do { (z) = TY##_MAX; (rc) = FLTERR_OFLOW | FLTERR_INEXACT; } while (0)
1184#endif
1185
1186#ifdef DIGIT_BITS
1187 /* The floating point formats use a power-of-two radix. This means that
1188 * we can determine the correctly rounded value before we start building
1189 * the native floating-point value.
1190 */
1191
1192# define ENC_ROUND_DECLS struct floatbits _y;
1193# define ENC_ROUND(TY, rc, x, r) do { \
1194 (rc) |= fltfmt_round(&_y, (x), (r), DIGIT_BITS*TY##_MANT_DIG); \
1195 (x) = &_y; \
1196 } while (0)
1197#else
1198 /* The floating point formats use a non-power-of-two radix. This means
1199 * that conversion is inherently inexact.
1200 */
1201
1202# define ENC_ROUND_DECLS
1203# define ENC_ROUND(TY, rc, x, r) \
1204 do (rc) |= FLTERR_INEXACT; while (0)
1205# define ENC_FIXUP(...)
1206
1207#endif
1208
1209#define ENCFLT(ty, TY, ldexp, rc, z_out, x, r) do { \
1210 unsigned _rc = 0; \
1211 \
1212 /* See if the native format is one that we recognize. */ \
1213 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1214 \
1215 case FLTFMT_IEEE_F32: { \
1216 uint32 _t[1]; \
1217 unsigned char *_z = (unsigned char *)(z_out); \
1218 \
1219 (rc) = fltfmt_encieee(&fltfmt_f32, _t, (x), (r), FLTERR_ALLERRS); \
1220 FLTFMT__FROB_NAN_F32(_t, _rc); \
1221 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1222 case FLTFMT_BE: STORE32_B(_z, _t[0]); break; \
1223 case FLTFMT_LE: STORE32_L(_z, _t[0]); break; \
1224 default: assert(!"unimplemented byte order"); break; \
1225 } \
1226 } break; \
1227 \
1228 case FLTFMT_IEEE_F64: { \
1229 uint32 _t[2]; \
1230 unsigned char *_z = (unsigned char *)(z_out); \
1231 (rc) = fltfmt_encieee(&fltfmt_f64, _t, (x), (r), FLTERR_ALLERRS); \
1232 FLTFMT__FROB_NAN_F64(_t, _rc); \
1233 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1234 case FLTFMT_BE: \
1235 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1236 break; \
1237 case FLTFMT_LE: \
1238 STORE32_L(_z + 0, _t[1]); STORE32_L(_z + 4, _t[0]); \
1239 break; \
1240 case FLTFMT_ARME: \
1241 STORE32_L(_z + 0, _t[0]); STORE32_L(_z + 4, _t[1]); \
1242 break; \
1243 default: assert(!"unimplemented byte order"); break; \
1244 } \
1245 } break; \
1246 \
1247 case FLTFMT_IEEE_F128: { \
1248 uint32 _t[4]; \
1249 unsigned char *_z = (unsigned char *)(z_out); \
1250 \
1251 FLTFMT__FROB_NAN_F128(_t, _rc); \
1252 (rc) = fltfmt_encieee(&fltfmt_f128, _t, (x), (r), FLTERR_ALLERRS); \
1253 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1254 case FLTFMT_BE: \
1255 STORE32_B(_z + 0, _t[0]); STORE32_B(_z + 4, _t[1]); \
1256 STORE32_B(_z + 8, _t[0]); STORE32_B(_z + 12, _t[1]); \
1257 break; \
1258 case FLTFMT_LE: \
1259 STORE32_L(_z + 0, _t[3]); STORE32_L(_z + 4, _t[2]); \
1260 STORE32_L(_z + 8, _t[1]); STORE32_L(_z + 12, _t[0]); \
1261 break; \
1262 default: assert(!"unimplemented byte order"); break; \
1263 } \
1264 } break; \
1265 \
1266 case FLTFMT_INTEL_F80: { \
1267 uint32 _t[3]; \
1268 unsigned char *_z = (unsigned char *)(z_out); \
1269 \
1270 (rc) = fltfmt_encieee(&fltfmt_idblext80, _t, (x), (r), FLTERR_ALLERRS); \
1271 FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
1272 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1273 case FLTFMT_BE: \
1274 STORE16_B(_z + 0, _t[0]); \
1275 STORE32_B(_z + 2, _t[1]); STORE32_B(_z + 6, _t[2]); \
1276 break; \
1277 case FLTFMT_LE: \
1278 STORE32_L(_z + 0, _t[2]); STORE32_L(_z + 4, _t[1]); \
1279 STORE16_L(_z + 8, _t[0]); \
1280 break; \
1281 default: assert(!"unimplemented byte order"); break; \
1282 } \
1283 } break; \
1284 \
1285 default: { \
1286 /* We must do this the hard way. */ \
1287 \
1288 const struct floatbits *_x = (x); \
1289 ty _z; \
1290 unsigned _i; \
1291 ENC_ROUND_DECLS; \
1292 \
1293 /* Case analysis... */ \
1294 if (_x->f&FLTF_NANMASK) { \
1295 /* A NaN. Use the macro above. */ \
1296 \
1297 SETNAN(_rc, _z); \
1298 if (x->f&FLTF_NEG) _z = -_z; \
1299 } else if (_x->f&FLTF_INF) { \
1300 /* Infinity. Use the macro. */ \
1301 \
1302 SETINF(TY, _rc, _z); \
1303 if (_x->f&FLTF_NEG) _z = -_z; \
1304 } else if (_x->f&FLTF_ZERO) { \
1305 /* Zero. If we're asked for a negative zero then check that we \
1306 * produced one: if not, then report an exactness failure. \
1307 */ \
1308 \
1309 _z = 0.0; \
1310 if (_x->f&FLTF_NEG) \
1311 { _z = -_z; if (!NEGP(_z)) _rc |= FLTERR_INEXACT; } \
1312 } else { \
1313 /* A finite value. */ \
1314 \
1315 /* If the radix is a power of two, we can round to the correct \
1316 * precision, which will save inexactness later. \
1317 */ \
1318 ENC_ROUND(TY, _rc, _x, (r)); \
1319 \
1320 /* Insert the 32-bit pieces of the fraction one at a time, \
1321 * starting from the least-significant end. This minimizes the \
1322 * inaccuracy from the overall approach, but it's imperfect \
1323 * unless the value has already been rounded correctly. \
1324 */ \
1325 _z = 0.0; \
1326 for (_i = _x->n, _z = 0.0; _i--; ) \
1327 _z += ldexp(_x->frac[_i], _x->exp - 32*_i); \
1328 \
1329 /* Negate the value if we need to. */ \
1330 if (_x->f&FLTF_NEG) _z = -_z; \
1331 } \
1332 \
1333 /* All done. */ \
1334 *(z_out) = _z; \
1335 } break; \
1336 } \
1337 \
1338 /* Set the error code. */ \
1339 (rc) = _rc; \
1340} while (0)
1341
1342/* --- @fltfmt_encTY@ --- *
1343 *
1344 * Arguments: @ty *z_out@ = storage for the result
1345 * @const struct floatbits *x@ = value to encode
1346 * @unsigned r@ = rounding mode
1347 *
1348 * Returns: Error flags (@FLTERR_...@).
1349 *
1350 * Use: Encode the floating-point value @x@ as a native C object and
1351 * store the result in @z_out@.
1352 *
1353 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1354 * @double@, or (on C99 implementations) @ldbl@ to encode a
1355 * @long double@.
1356 *
1357 * In detail, conversion is performed as follows.
1358 *
1359 * * If a non-finite value cannot be represented by the
1360 * implementation then the @FLTERR_REPR@ error bit is set
1361 * and @*z_out@ is set to zero if @x@ is a NaN, or the
1362 * (absolute) largest representable value, with appropriate
1363 * sign, if @x@ is an infinity.
1364 *
1365 * * If the implementation can represent NaNs, but cannot set
1366 * NaN payloads, then the @FLTERR_INEXACT@ error bit is set,
1367 * and @*z_out@ is set to an arbitrary (quiet) NaN value.
1368 *
1369 * * If @x@ is negative zero, but the implementation does not
1370 * distinguish negative and positive zero, then the
1371 * @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to
1372 * zero.
1373 *
1374 * * If the implementation's floating-point radix is not a
1375 * power of two, and @x@ is a nonzero finite value, then
1376 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1377 * the value is rounded by the implementation using its
1378 * prevailing rounding policy. If the radix is a power of
1379 * two, then the @FLTERR_INEXACT@ error bit is set only if
1380 * rounding is necessary, and rounding is performed using
1381 * the rounding mode @r@.
1382 */
1383
1384unsigned fltfmt_encflt(float *z_out, const struct floatbits *x, unsigned r)
1385{
1386 unsigned rc;
1387
1388 ENCFLT(double, FLT, ldexp, rc, z_out, x, r);
1389 return (rc);
1390}
1391
1392unsigned fltfmt_encdbl(double *z_out, const struct floatbits *x, unsigned r)
1393{
1394 unsigned rc;
1395
1396 ENCFLT(double, DBL, ldexp, rc, z_out, x, r);
1397 return (rc);
1398}
1399
1400#if __STDC_VERSION__ >= 199001
1401unsigned fltfmt_encldbl(long double *z_out,
1402 const struct floatbits *x, unsigned r)
1403{
1404 unsigned rc;
1405
1406 ENCFLT(long double, LDBL, ldexpl, rc, z_out, x, r);
1407 return (rc);
1408}
1409#endif
1410
1411/* --- @DECFLT@ --- *
1412 *
1413 * Arguments: @ty@ = the C type to encode
1414 * @TY@ = the uppercase prefix for macros
1415 * @ty (*frexp)(ty, int *)@ = function to decompose a @ty@ value
1416 * into a binary exponent and normalized fraction
1417 * @unsigned &rc@ = error code to set
1418 * @struct floatbits *z_out@ = storage for the result
1419 * @ty x@ = value to convert
1420 * @unsigned r@ = rounding mode
1421 *
1422 * Returns: ---
1423 *
1424 * Use: Decode a C native floating-point object. This is the
1425 * machinery shared by the @fltfmt_dec...@ functions for
1426 * decoding native-format values. Most of the behaviour is as
1427 * described for those functions.
1428 */
1429
1430/* Define some utilities for decoding native floating-point formats.
1431 *
1432 * * @NFRAC(d)@ is the number of fraction limbs we'll need for @d@ native
1433 * digits.
1434 *
1435 * * @CONVFIX@ is a final fixup applied to the decoded value.
1436 */
1437#ifdef DIGIT_BITS
1438# define NFRAC(TY) ((DIGIT_BITS*TY##_MANT_DIG + 31)/32)
1439# define CONVFIX(ty, rc, z, x, n, f, r) do assert(!(x)); while (0)
1440#else
1441# define NFRAC(TY) \
1442 (ceil(log(pow(FLT_RADIX, TY##_MANT_DIG) - 1)/32.0*log(2.0)) + 1)
1443# define CONVFIX(ty, rc, z, x, n, f, r) do { \
1444 ty _x_ = (x); \
1445 struct floatbits *_z_ = (z); \
1446 uint32 _w_; \
1447 unsigned _i_, _n_ = (n), _f_; \
1448 \
1449 /* Round the result according to the remainder left in %$x$%. */ \
1450 _f_ = 0; _i_ = _n_ - 1; _w_ = _z_->frac[_i_]; \
1451 if ((f)&FLTF_NEG) _f_ |= FRPF_NEG; \
1452 if (_w_&1) _f_ |= FRPF_ODD; \
1453 if (_y_ >= 0.5) _f_ |= FRPF_HALF; \
1454 if (_y_ != 0 && _y_ != 0.5) _f_ |= FRPF_LOW; \
1455 if (((r) >> _f_)&1) { \
1456 for (;;) { \
1457 _w_ = (_w_ + 1)&MASK32; \
1458 if (_w_ || !_i_) break; \
1459 _z_->frac[_i_] = 0; _w_ = _z_->frac[--_i_]; \
1460 } \
1461 if (!_w_) { _z_->exp++; _w_ = B31; } \
1462 _z_->frac[_i_] = w; \
1463 } \
1464 \
1465 /* The result is not exact. */ \
1466 (rc) |= FLTERR_INEXACT; \
1467 } while (0)
1468#endif
1469
1470#define DECFLT(ty, TY, frexp, rc, z_out, x, r) do { \
1471 unsigned _rc = 0; \
1472 \
1473 switch (TY##_FORMAT&(FLTFMT_ORGMASK | FLTFMT_TYPEMASK)) { \
1474 \
1475 case FLTFMT_IEEE_F32: { \
1476 unsigned _t[1]; \
1477 unsigned char *_x = (unsigned char *)&(x); \
1478 \
1479 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1480 case FLTFMT_BE: _t[0] = LOAD32_B(_x); break; \
1481 case FLTFMT_LE: _t[0] = LOAD32_L(_x); break; \
1482 default: assert(!"unimplemented byte order"); break; \
1483 } \
1484 FLTFMT__FROB_NAN_F32(_t, _rc); \
1485 _rc |= fltfmt_decieee(&fltfmt_f32, (z_out), _t); \
1486 } break; \
1487 \
1488 case FLTFMT_IEEE_F64: { \
1489 unsigned _t[2]; \
1490 unsigned char *_x = (unsigned char *)&(x); \
1491 \
1492 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1493 case FLTFMT_BE: \
1494 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1495 break; \
1496 case FLTFMT_LE: \
1497 _t[1] = LOAD32_L(_x + 0); _t[0] = LOAD32_L(_x + 4); \
1498 break; \
1499 case FLTFMT_ARME: \
1500 _t[0] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1501 break; \
1502 default: assert(!"unimplemented byte order"); break; \
1503 } \
1504 FLTFMT__FROB_NAN_F64(_t, _rc); \
1505 _rc |= fltfmt_decieee(&fltfmt_f64, (z_out), _t); \
1506 } break; \
1507 \
1508 case FLTFMT_IEEE_F128: { \
1509 unsigned _t[4]; \
1510 unsigned char *_x = (unsigned char *)&(x); \
1511 \
1512 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1513 case FLTFMT_BE: \
1514 _t[0] = LOAD32_B(_x + 0); _t[1] = LOAD32_B(_x + 4); \
1515 _t[2] = LOAD32_B(_x + 8); _t[3] = LOAD32_B(_x + 12); \
1516 break; \
1517 case FLTFMT_LE: \
1518 _t[3] = LOAD32_L(_x + 0); _t[2] = LOAD32_L(_x + 4); \
1519 _t[1] = LOAD32_L(_x + 8); _t[0] = LOAD32_L(_x + 12); \
1520 break; \
1521 default: assert(!"unimplemented byte order"); break; \
1522 } \
1523 FLTFMT__FROB_NAN_F128(_t, _rc); \
1524 _rc |= fltfmt_decieee(&fltfmt_f128, (z_out), _t); \
1525 } break; \
1526 \
1527 case FLTFMT_INTEL_F80: { \
1528 unsigned _t[3]; \
1529 unsigned char *_x = (unsigned char *)&(x); \
1530 \
1531 switch (TY##_FORMAT&FLTFMT_ENDMASK) { \
1532 case FLTFMT_BE: \
1533 _t[0] = LOAD16_B(_x + 0); \
1534 _t[1] = LOAD32_B(_x + 2); _t[2] = LOAD32_B(_x + 6); \
1535 break; \
1536 case FLTFMT_LE: \
1537 _t[2] = LOAD32_L(_x + 0); _t[1] = LOAD32_L(_x + 4); \
1538 _t[0] = LOAD16_L(_x + 8); \
1539 break; \
1540 default: assert(!"unimplemented byte order"); break; \
1541 } \
1542 FLTFMT__FROB_NAN_IDBLEXT80(_t, _rc); \
1543 _rc |= fltfmt_decieee(&fltfmt_idblext80, (z_out), _t); \
1544 } break; \
1545 \
1546 default: { \
1547 struct floatbits *_z = (z_out); \
1548 ty _x = (x), _y; \
1549 unsigned _i, _n, _f = 0; \
1550 uint32 _t; \
1551 \
1552 /* If the value looks negative then negate it and set the sign \
1553 * flag. \
1554 */ \
1555 if (NEGP(_x)) { _f |= FLTF_NEG; _x = -_x; } \
1556 \
1557 /* Now for the case analysis. Infinities and zero are \
1558 * unproblematic. NaNs can't be decoded exactly using the \
1559 * portable machinery. \
1560 */ \
1561 if (INFP(_x)) _f |= FLTF_INF; \
1562 else if (_x == 0.0) _f |= FLTF_ZERO; \
1563 else if (NANP(_x)) { _f |= FLTF_QNAN; _rc |= FLTERR_INEXACT; } \
1564 else { \
1565 /* A finite value. Determine the number of fraction limbs \
1566 * we'll need based on the precision and radix and pull out \
1567 * 32-bit chunks one at a time. This will be unproblematic \
1568 * for power-of-two radices, requiring at most shifting the \
1569 * significand left by a few bits, but inherently inexact (for \
1570 * the most part) for others. \
1571 */ \
1572 \
1573 _n = NFRAC(TY); fltfmt_allocfrac(_z, _n); \
1574 _y = frexp(_x, &_z->exp); \
1575 for (_i = 0; _i < _n; _i++) \
1576 { _y *= SH32; _t = _y; _y -= _t; _z->frac[_i] = _t; } \
1577 CONVFIX(ty, _rc, _z, _y, _n, _f, (r)); \
1578 } \
1579 \
1580 /* Done. */ \
1581 _z->f = _f; \
1582 } break; \
1583 } \
1584 \
1585 /* Set the error code. */ \
1586 (rc) = _rc; \
1587} while (0)
1588
1589/* --- @fltfmt_decTY@ --- *
1590 *
1591 * Arguments: @struct floatbits *z_out@ = storage for the result
1592 * @ty x@ = value to decode
1593 * @unsigned r@ = rounding mode
1594 *
1595 * Returns: Error flags (@FLTERR_...@).
1596 *
1597 * Use: Decode the native C floatingpoint value @x@ and store the
1598 * result in @z_out@.
1599 *
1600 * The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
1601 * @double@, or (on C99 implementations) @ldbl@ to encode a
1602 * @long double@.
1603 *
1604 * In detail, conversion is performed as follows.
1605 *
1606 * * If the implementation supports negative zeros and/or
1607 * infinity, then these are recognized and decoded.
1608 *
1609 * * If the input as a NaN, but the implementation cannot
1610 * usefully report NaN payloads, then the @FLTERR_INEXACT@
1611 * error bit is set and the decoded payload is left empty.
1612 *
1613 * * If the implementation's floating-point radix is not a
1614 * power of two, and @x@ is a nonzero finite value, then
1615 * @FLTERR_INEXACT@ error bit is set (unconditionally), and
1616 * the rounded value (according to the rounding mode @r@) is
1617 * stored in as many fraction words as necessary to identify
1618 * the original value uniquely. If the radix is a power of
1619 * two, then the value is represented exactly.
1620 */
1621
1622unsigned fltfmt_decflt(struct floatbits *z_out, float x, unsigned r)
1623{
1624 unsigned rc;
1625
1626 DECFLT(double, FLT, frexp, rc, z_out, x, r);
1627 return (rc);
1628}
1629
1630unsigned fltfmt_decdbl(struct floatbits *z_out, double x, unsigned r)
1631{
1632 unsigned rc;
1633
1634 DECFLT(double, DBL, frexp, rc, z_out, x, r);
1635 return (rc);
1636}
1637
1638#if __STDC_VERSION__ >= 199001
1639unsigned fltfmt_decldbl(struct floatbits *z_out, long double x, unsigned r)
1640{
1641 unsigned rc;
1642
1643 DECFLT(long double, LDBL, frexpl, rc, z_out, x, r);
1644 return (rc);
1645}
1646#endif
1647
1648/*----- That's all, folks -------------------------------------------------*/