@@@ timeout wip
[mLib] / struct / buf-float.c
CommitLineData
e63124bc
MW
1/* -*-c-*-
2 *
3 * Encoding and decoding floating-point values
4 *
5 * (c) 2023 Straylight/Edgeware
6 */
7
8/*----- Licensing notice --------------------------------------------------*
9 *
10 * This file is part of the mLib utilities library.
11 *
12 * mLib is free software: you can redistribute it and/or modify it under
13 * the terms of the GNU Library General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or (at
15 * your option) any later version.
16 *
17 * mLib is distributed in the hope that it will be useful, but WITHOUT
18 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20 * License for more details.
21 *
22 * You should have received a copy of the GNU Library General Public
23 * License along with mLib. If not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25 * USA.
26 */
27
28/*----- Header files ------------------------------------------------------*/
29
30#include <float.h>
31#include <math.h>
32
33#include "bits.h"
34#include "buf.h"
67b5031e 35#include "maths.h"
e63124bc
MW
36
37/*----- Formatting primitives ---------------------------------------------*/
38
39/* We use the IEEE 754 `binary64' format. Briefly:
40 *
41 * * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
42 * %$s = -1$%.. The format is signed-magnitude, so everything else is
43 * the same for positive and negative numbers.
44 *
45 * * The next eleven bits are the biased exponent %$e$%.
46 *
47 * * The remaining 52 bits are the significand %$m$%.
48 *
49 * If %$0 < e < 2047$% then the encoding represents the normal number
50 * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
51 *
52 * If %$e = 0$% and %$m = 0$% then the encoding represents positive or
53 * negative zero.
54 *
55 * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
56 * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
57 *
58 * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
59 * negative infinity.
60 *
61 * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If
62 * the most significant bit of %$m$% is set then this is a quiet NaN;
63 * otherwise it's a signalling NaN.
64 */
65
31d0247c
MW
66/* --- @k64_to_f64@ --- *
67 *
68 * Arguments: @double *x_out@ = where to put the result
69 * @kludge64 k@ = a 64-bit encoding of a floating-point value
70 *
71 * Returns: Zero on success, @-1@ on failure.
72 *
73 * Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the
74 * caveats.
75 */
76
77static int k64_to_f64(double *x_out, kludge64 k)
78{
79 uint32 lo, hi, t;
80 int s, e; double x;
81
82 /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */
83
84 /* Pick the encoded number apart. */
85 hi = HI64(k); lo = LO64(k);
86 s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;
87
88 /* Deal with various special cases. */
89 if (e == 2047) {
90 /* Maximum exponent indicates (positive or negative) infinity or NaN. */
91
92 if (t || lo) {
93 /* It's a NaN. We're not going to be picky about which one. If we
94 * can't represent it then we'll just have to fail.
95 */
96
97#ifdef NAN
98 x = NAN;
99#else
100 return (-1);
101#endif
102 } else {
103 /* It's an infinity. If we don't have one of those to hand, then pick
104 * something really big.
105 */
106
107#ifdef INFINITY
108 x = s ? -INFINITY : INFINITY;
109#else
110 x = s ? -DBL_MAX : DBL_MAX;
111#endif
112 }
113 } else {
114 /* It's a finite number, though maybe it's weird in some way. */
115
116 if (e == 0) {
117 /* Minimum exponent indicates zero or a subnormal number. The
118 * subnormal exponent is a sentinel value that shouldn't be taken
119 * literally, so we should fix that. If the number is actually zero
120 * then the exponent won't matter much so don't bother checking.
121 */
122
123 e = 1;
124 } else {
125 /* It's a normal number. In which case there's an implicit bit which
126 * we can now set.
127 */
128
129 t |= 0x00100000;
130 }
131
132 /* All that remains is to stuff the significant and exponent into a
133 * floating point number. We'll have to do this in pieces, and we'll
134 * lean on the floating-point machinery to do rounding correctly.
135 */
136 x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
137 if (s) x = -x;
138 }
139
140 /* And we're done. */
141 *x_out = x; return (0);
142}
143
e63124bc
MW
144/* --- @f64_to_k64@ --- *
145 *
146 * Arguments: @double x@ = a floating-point number
147 *
148 * Returns: A 64-bit encoding of @x@.
149 *
150 * Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the
151 * caveats.
152 */
153
154static kludge64 f64_to_k64(double x)
155{
156 kludge64 k;
157 uint32 lo, hi, t;
158 int e; double m;
159
160 /* Some machinery before we start. */
161
e63124bc
MW
162 if (NANP(x)) {
163 /* A NaN. */
164 hi = 0x7ff80000; lo = 0;
165 } else if (INFP(x)) {
166 /* Positive or negative infinity. */
167 hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
168 } else if (x == 0) {
169 /* Positive or negative zero. */
170 hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
171 } else {
172 /* A normal or subnormal number. Now we have to do some actual work. */
173
174 /* Let's get the sign dealt with so we don't have to worry about it any
175 * more.
176 */
177 if (!NEGP(x)) hi = 0;
178 else { x = -x; hi = 0x80000000; }
179
180 /* Now we start on the value. The first thing to do is to split off the
181 * exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
182 */
183 m = frexp(x, &e);
184
185 /* If our number is too big, we'll round it to infinity. This will
186 * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
187 */
188 if (e > 1024)
189 { hi |= 0x7ff00000; lo = 0; }
190 else {
191 /* Our number is sufficiently small that we can represent it at least
192 * approximately (though maybe we'll have to flush it to zero). The
193 * next step, then, is to pull the significand bits out.
194 */
195
196 /* Determine the correct exponent to store. We're not going to bias it
197 * yet, but this is where we deal with subnormal numbers. Our number
198 * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this
199 * case, there's an implicit bit which we'll clear. Otherwise, if it's
200 * subnormal, we'll scale our floating-point number so that the
201 * significand will look right when we extract it, and adjust the
202 * exponent so that, when we're finally done, it will have the correct
203 * sentinel value.
204 */
205 if (e > -1022) m -= 0.5;
206 else { m = ldexp(m, 1021 + e); e = -1022; }
207
208 /* Now we pull out the 53 bits of the significand. This will, in
209 * general, leave a tail which we address through rounding. Scale it
210 * up so that we end up with %$0 \le m' < 2$%; then we round up if
211 * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
212 * set.
213 */
214 t = ldexp(m, 21); m -= ldexp(t, -21);
215 lo = ldexp(m, 53); m -= ldexp(lo, -53);
216 m = ldexp(m, 54);
217
218 /* Round the number if necessary. */
219 if (lo&1 ? m >= 1.0 : m > 1)
220 { lo = U32(lo + 1); if (!lo) t++; }
221
222 /* Now we just put the pieces together. Note that our %$e$% is one
223 * greater than it should be, because our implicit bit should have
224 * been the unit bit not the 1/2 bit.
225 */
226 hi |= ((uint32)(e + 1022) << 20) | t;
227 }
228 }
229
230 /* Convert to external format and go home. */
231 SET64(k, hi, lo); return (k);
e63124bc
MW
232}
233
e63124bc
MW
234/*----- External functions ------------------------------------------------*/
235
31d0247c 236/* --- @buf_getf64{,l,b} --- *
e63124bc
MW
237 *
238 * Arguments: @buf *b@ = a buffer to read from
239 * @double *x_out@ = where to put the result
240 *
241 * Returns: Zero on success, @-1@ on failure (and the buffer is broken).
242 *
243 * If the system supports NaNs, then any encoded NaN is returned
244 * as the value of @NAN@ in @<math.h>@; otherwise, this function
245 * reports failure.
246 *
247 * In general, values are rounded to the nearest available
248 * value, in the way that the system usually rounds. If the
249 * system doesn't support infinities, then any encoded infinity
250 * is reported as the largest-possible-magnitude finite value
251 * instead.
252 */
253
254int buf_getf64(buf *b, double *x_out)
255{
256 kludge64 k;
257
258 if (buf_getk64(b, &k)) return (-1);
259 if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
260 return (0);
261}
31d0247c
MW
262
263int buf_getf64l(buf *b, double *x_out)
e63124bc
MW
264{
265 kludge64 k;
266
31d0247c 267 if (buf_getk64l(b, &k)) return (-1);
e63124bc
MW
268 if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
269 return (0);
270}
31d0247c
MW
271
272int buf_getf64b(buf *b, double *x_out)
e63124bc
MW
273{
274 kludge64 k;
275
31d0247c 276 if (buf_getk64b(b, &k)) return (-1);
e63124bc
MW
277 if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
278 return (0);
279}
280
31d0247c
MW
281int (dbuf_getf64)(dbuf *db, double *x_out)
282 { return (dbuf_getf64(db, x_out)); }
283int (dbuf_getf64l)(dbuf *db, double *x_out)
284 { return (dbuf_getf64l(db, x_out)); }
285int (dbuf_getf64b)(dbuf *db, double *x_out)
286 { return (dbuf_getf64b(db, x_out)); }
287
288/* --- @buf_putf64{,l,b} --- *
289 *
290 * Arguments: @buf *b@ = a buffer to write to
291 * @double x@ = a number to write
292 *
293 * Returns: Zero on success, @-1@ on failure (and the buffer is broken).
294 *
295 * On C89, this function can't detect negative zero so these
296 * will be silently written as positive zero.
297 *
298 * This function doesn't distinguish NaNs. Any NaN is written
299 * as a quiet NaN with all payload bits zero.
300 *
301 * A finite value with too large a magnitude to be represented
302 * is rounded to the appropriate infinity. Other finite values
303 * are rounded as necessary, in the usual IEEE 754 round-to-
304 * nearest-or-even way.
305 */
306
307int buf_putf64(buf *b, double x)
308 { return (buf_putk64(b, f64_to_k64(x))); }
309int buf_putf64l(buf *b, double x)
310 { return (buf_putk64l(b, f64_to_k64(x))); }
311int buf_putf64b(buf *b, double x)
312 { return (buf_putk64b(b, f64_to_k64(x))); }
313
314int (dbuf_putf64)(dbuf *db, double x)
315 { return (dbuf_putf64(db, x)); }
316int (dbuf_putf64l)(dbuf *db, double x)
317 { return (dbuf_putf64l(db, x)); }
318int (dbuf_putf64b)(dbuf *db, double x)
319 { return (dbuf_putf64b(db, x)); }
320
e63124bc 321/*----- That's all, folks -------------------------------------------------*/