Commit | Line | Data |
---|---|---|
e63124bc MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Encoding and decoding floating-point values | |
4 | * | |
5 | * (c) 2023 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include <float.h> | |
31 | #include <math.h> | |
32 | ||
33 | #include "bits.h" | |
34 | #include "buf.h" | |
67b5031e | 35 | #include "maths.h" |
e63124bc MW |
36 | |
37 | /*----- Formatting primitives ---------------------------------------------*/ | |
38 | ||
39 | /* We use the IEEE 754 `binary64' format. Briefly: | |
40 | * | |
41 | * * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes | |
42 | * %$s = -1$%.. The format is signed-magnitude, so everything else is | |
43 | * the same for positive and negative numbers. | |
44 | * | |
45 | * * The next eleven bits are the biased exponent %$e$%. | |
46 | * | |
47 | * * The remaining 52 bits are the significand %$m$%. | |
48 | * | |
49 | * If %$0 < e < 2047$% then the encoding represents the normal number | |
50 | * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%. | |
51 | * | |
52 | * If %$e = 0$% and %$m = 0$% then the encoding represents positive or | |
53 | * negative zero. | |
54 | * | |
55 | * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal | |
56 | * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%. | |
57 | * | |
58 | * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or | |
59 | * negative infinity. | |
60 | * | |
61 | * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If | |
62 | * the most significant bit of %$m$% is set then this is a quiet NaN; | |
63 | * otherwise it's a signalling NaN. | |
64 | */ | |
65 | ||
31d0247c MW |
66 | /* --- @k64_to_f64@ --- * |
67 | * | |
68 | * Arguments: @double *x_out@ = where to put the result | |
69 | * @kludge64 k@ = a 64-bit encoding of a floating-point value | |
70 | * | |
71 | * Returns: Zero on success, @-1@ on failure. | |
72 | * | |
73 | * Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the | |
74 | * caveats. | |
75 | */ | |
76 | ||
77 | static int k64_to_f64(double *x_out, kludge64 k) | |
78 | { | |
79 | uint32 lo, hi, t; | |
80 | int s, e; double x; | |
81 | ||
82 | /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */ | |
83 | ||
84 | /* Pick the encoded number apart. */ | |
85 | hi = HI64(k); lo = LO64(k); | |
86 | s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff; | |
87 | ||
88 | /* Deal with various special cases. */ | |
89 | if (e == 2047) { | |
90 | /* Maximum exponent indicates (positive or negative) infinity or NaN. */ | |
91 | ||
92 | if (t || lo) { | |
93 | /* It's a NaN. We're not going to be picky about which one. If we | |
94 | * can't represent it then we'll just have to fail. | |
95 | */ | |
96 | ||
97 | #ifdef NAN | |
98 | x = NAN; | |
99 | #else | |
100 | return (-1); | |
101 | #endif | |
102 | } else { | |
103 | /* It's an infinity. If we don't have one of those to hand, then pick | |
104 | * something really big. | |
105 | */ | |
106 | ||
107 | #ifdef INFINITY | |
108 | x = s ? -INFINITY : INFINITY; | |
109 | #else | |
110 | x = s ? -DBL_MAX : DBL_MAX; | |
111 | #endif | |
112 | } | |
113 | } else { | |
114 | /* It's a finite number, though maybe it's weird in some way. */ | |
115 | ||
116 | if (e == 0) { | |
117 | /* Minimum exponent indicates zero or a subnormal number. The | |
118 | * subnormal exponent is a sentinel value that shouldn't be taken | |
119 | * literally, so we should fix that. If the number is actually zero | |
120 | * then the exponent won't matter much so don't bother checking. | |
121 | */ | |
122 | ||
123 | e = 1; | |
124 | } else { | |
125 | /* It's a normal number. In which case there's an implicit bit which | |
126 | * we can now set. | |
127 | */ | |
128 | ||
129 | t |= 0x00100000; | |
130 | } | |
131 | ||
132 | /* All that remains is to stuff the significant and exponent into a | |
133 | * floating point number. We'll have to do this in pieces, and we'll | |
134 | * lean on the floating-point machinery to do rounding correctly. | |
135 | */ | |
136 | x = ldexp(t, e - 1043) + ldexp(lo, e - 1075); | |
137 | if (s) x = -x; | |
138 | } | |
139 | ||
140 | /* And we're done. */ | |
141 | *x_out = x; return (0); | |
142 | } | |
143 | ||
e63124bc MW |
144 | /* --- @f64_to_k64@ --- * |
145 | * | |
146 | * Arguments: @double x@ = a floating-point number | |
147 | * | |
148 | * Returns: A 64-bit encoding of @x@. | |
149 | * | |
150 | * Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the | |
151 | * caveats. | |
152 | */ | |
153 | ||
154 | static kludge64 f64_to_k64(double x) | |
155 | { | |
156 | kludge64 k; | |
157 | uint32 lo, hi, t; | |
158 | int e; double m; | |
159 | ||
160 | /* Some machinery before we start. */ | |
161 | ||
e63124bc MW |
162 | if (NANP(x)) { |
163 | /* A NaN. */ | |
164 | hi = 0x7ff80000; lo = 0; | |
165 | } else if (INFP(x)) { | |
166 | /* Positive or negative infinity. */ | |
167 | hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0; | |
168 | } else if (x == 0) { | |
169 | /* Positive or negative zero. */ | |
170 | hi = NEGP(x) ? 0x80000000 : 0; lo = 0; | |
171 | } else { | |
172 | /* A normal or subnormal number. Now we have to do some actual work. */ | |
173 | ||
174 | /* Let's get the sign dealt with so we don't have to worry about it any | |
175 | * more. | |
176 | */ | |
177 | if (!NEGP(x)) hi = 0; | |
178 | else { x = -x; hi = 0x80000000; } | |
179 | ||
180 | /* Now we start on the value. The first thing to do is to split off the | |
181 | * exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%. | |
182 | */ | |
183 | m = frexp(x, &e); | |
184 | ||
185 | /* If our number is too big, we'll round it to infinity. This will | |
186 | * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%. | |
187 | */ | |
188 | if (e > 1024) | |
189 | { hi |= 0x7ff00000; lo = 0; } | |
190 | else { | |
191 | /* Our number is sufficiently small that we can represent it at least | |
192 | * approximately (though maybe we'll have to flush it to zero). The | |
193 | * next step, then, is to pull the significand bits out. | |
194 | */ | |
195 | ||
196 | /* Determine the correct exponent to store. We're not going to bias it | |
197 | * yet, but this is where we deal with subnormal numbers. Our number | |
198 | * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this | |
199 | * case, there's an implicit bit which we'll clear. Otherwise, if it's | |
200 | * subnormal, we'll scale our floating-point number so that the | |
201 | * significand will look right when we extract it, and adjust the | |
202 | * exponent so that, when we're finally done, it will have the correct | |
203 | * sentinel value. | |
204 | */ | |
205 | if (e > -1022) m -= 0.5; | |
206 | else { m = ldexp(m, 1021 + e); e = -1022; } | |
207 | ||
208 | /* Now we pull out the 53 bits of the significand. This will, in | |
209 | * general, leave a tail which we address through rounding. Scale it | |
210 | * up so that we end up with %$0 \le m' < 2$%; then we round up if | |
211 | * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is | |
212 | * set. | |
213 | */ | |
214 | t = ldexp(m, 21); m -= ldexp(t, -21); | |
215 | lo = ldexp(m, 53); m -= ldexp(lo, -53); | |
216 | m = ldexp(m, 54); | |
217 | ||
218 | /* Round the number if necessary. */ | |
219 | if (lo&1 ? m >= 1.0 : m > 1) | |
220 | { lo = U32(lo + 1); if (!lo) t++; } | |
221 | ||
222 | /* Now we just put the pieces together. Note that our %$e$% is one | |
223 | * greater than it should be, because our implicit bit should have | |
224 | * been the unit bit not the 1/2 bit. | |
225 | */ | |
226 | hi |= ((uint32)(e + 1022) << 20) | t; | |
227 | } | |
228 | } | |
229 | ||
230 | /* Convert to external format and go home. */ | |
231 | SET64(k, hi, lo); return (k); | |
e63124bc MW |
232 | } |
233 | ||
e63124bc MW |
234 | /*----- External functions ------------------------------------------------*/ |
235 | ||
31d0247c | 236 | /* --- @buf_getf64{,l,b} --- * |
e63124bc MW |
237 | * |
238 | * Arguments: @buf *b@ = a buffer to read from | |
239 | * @double *x_out@ = where to put the result | |
240 | * | |
241 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). | |
242 | * | |
243 | * If the system supports NaNs, then any encoded NaN is returned | |
244 | * as the value of @NAN@ in @<math.h>@; otherwise, this function | |
245 | * reports failure. | |
246 | * | |
247 | * In general, values are rounded to the nearest available | |
248 | * value, in the way that the system usually rounds. If the | |
249 | * system doesn't support infinities, then any encoded infinity | |
250 | * is reported as the largest-possible-magnitude finite value | |
251 | * instead. | |
252 | */ | |
253 | ||
254 | int buf_getf64(buf *b, double *x_out) | |
255 | { | |
256 | kludge64 k; | |
257 | ||
258 | if (buf_getk64(b, &k)) return (-1); | |
259 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } | |
260 | return (0); | |
261 | } | |
31d0247c MW |
262 | |
263 | int buf_getf64l(buf *b, double *x_out) | |
e63124bc MW |
264 | { |
265 | kludge64 k; | |
266 | ||
31d0247c | 267 | if (buf_getk64l(b, &k)) return (-1); |
e63124bc MW |
268 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } |
269 | return (0); | |
270 | } | |
31d0247c MW |
271 | |
272 | int buf_getf64b(buf *b, double *x_out) | |
e63124bc MW |
273 | { |
274 | kludge64 k; | |
275 | ||
31d0247c | 276 | if (buf_getk64b(b, &k)) return (-1); |
e63124bc MW |
277 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } |
278 | return (0); | |
279 | } | |
280 | ||
31d0247c MW |
281 | int (dbuf_getf64)(dbuf *db, double *x_out) |
282 | { return (dbuf_getf64(db, x_out)); } | |
283 | int (dbuf_getf64l)(dbuf *db, double *x_out) | |
284 | { return (dbuf_getf64l(db, x_out)); } | |
285 | int (dbuf_getf64b)(dbuf *db, double *x_out) | |
286 | { return (dbuf_getf64b(db, x_out)); } | |
287 | ||
288 | /* --- @buf_putf64{,l,b} --- * | |
289 | * | |
290 | * Arguments: @buf *b@ = a buffer to write to | |
291 | * @double x@ = a number to write | |
292 | * | |
293 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). | |
294 | * | |
295 | * On C89, this function can't detect negative zero so these | |
296 | * will be silently written as positive zero. | |
297 | * | |
298 | * This function doesn't distinguish NaNs. Any NaN is written | |
299 | * as a quiet NaN with all payload bits zero. | |
300 | * | |
301 | * A finite value with too large a magnitude to be represented | |
302 | * is rounded to the appropriate infinity. Other finite values | |
303 | * are rounded as necessary, in the usual IEEE 754 round-to- | |
304 | * nearest-or-even way. | |
305 | */ | |
306 | ||
307 | int buf_putf64(buf *b, double x) | |
308 | { return (buf_putk64(b, f64_to_k64(x))); } | |
309 | int buf_putf64l(buf *b, double x) | |
310 | { return (buf_putk64l(b, f64_to_k64(x))); } | |
311 | int buf_putf64b(buf *b, double x) | |
312 | { return (buf_putk64b(b, f64_to_k64(x))); } | |
313 | ||
314 | int (dbuf_putf64)(dbuf *db, double x) | |
315 | { return (dbuf_putf64(db, x)); } | |
316 | int (dbuf_putf64l)(dbuf *db, double x) | |
317 | { return (dbuf_putf64l(db, x)); } | |
318 | int (dbuf_putf64b)(dbuf *db, double x) | |
319 | { return (dbuf_putf64b(db, x)); } | |
320 | ||
e63124bc | 321 | /*----- That's all, folks -------------------------------------------------*/ |