Commit | Line | Data |
---|---|---|
e63124bc MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Encoding and decoding floating-point values | |
4 | * | |
5 | * (c) 2023 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include <float.h> | |
31 | #include <math.h> | |
32 | ||
33 | #include "bits.h" | |
34 | #include "buf.h" | |
35 | ||
36 | /*----- Formatting primitives ---------------------------------------------*/ | |
37 | ||
38 | /* We use the IEEE 754 `binary64' format. Briefly: | |
39 | * | |
40 | * * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes | |
41 | * %$s = -1$%.. The format is signed-magnitude, so everything else is | |
42 | * the same for positive and negative numbers. | |
43 | * | |
44 | * * The next eleven bits are the biased exponent %$e$%. | |
45 | * | |
46 | * * The remaining 52 bits are the significand %$m$%. | |
47 | * | |
48 | * If %$0 < e < 2047$% then the encoding represents the normal number | |
49 | * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%. | |
50 | * | |
51 | * If %$e = 0$% and %$m = 0$% then the encoding represents positive or | |
52 | * negative zero. | |
53 | * | |
54 | * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal | |
55 | * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%. | |
56 | * | |
57 | * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or | |
58 | * negative infinity. | |
59 | * | |
60 | * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If | |
61 | * the most significant bit of %$m$% is set then this is a quiet NaN; | |
62 | * otherwise it's a signalling NaN. | |
63 | */ | |
64 | ||
65 | /* --- @f64_to_k64@ --- * | |
66 | * | |
67 | * Arguments: @double x@ = a floating-point number | |
68 | * | |
69 | * Returns: A 64-bit encoding of @x@. | |
70 | * | |
71 | * Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the | |
72 | * caveats. | |
73 | */ | |
74 | ||
75 | static kludge64 f64_to_k64(double x) | |
76 | { | |
77 | kludge64 k; | |
78 | uint32 lo, hi, t; | |
79 | int e; double m; | |
80 | ||
81 | /* Some machinery before we start. */ | |
82 | ||
83 | #ifdef isnan | |
84 | # define NANP(x) isnan(x) | |
85 | #else | |
86 | # define NANP(x) (!((x) == (x))) | |
87 | #endif | |
88 | ||
89 | #ifdef isinf | |
90 | # define INFP(x) isinf(x) | |
91 | #else | |
92 | # define INFP(x) ((x) > DBL_MAX || (x) < -DBL_MAX) | |
93 | #endif | |
94 | ||
95 | #ifdef signbit | |
96 | # define NEGP(x) signbit(x) | |
97 | #else | |
98 | # define NEGP(x) ((x) < 0) /* incorrect for negative zero! */ | |
99 | #endif | |
100 | ||
101 | if (NANP(x)) { | |
102 | /* A NaN. */ | |
103 | hi = 0x7ff80000; lo = 0; | |
104 | } else if (INFP(x)) { | |
105 | /* Positive or negative infinity. */ | |
106 | hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0; | |
107 | } else if (x == 0) { | |
108 | /* Positive or negative zero. */ | |
109 | hi = NEGP(x) ? 0x80000000 : 0; lo = 0; | |
110 | } else { | |
111 | /* A normal or subnormal number. Now we have to do some actual work. */ | |
112 | ||
113 | /* Let's get the sign dealt with so we don't have to worry about it any | |
114 | * more. | |
115 | */ | |
116 | if (!NEGP(x)) hi = 0; | |
117 | else { x = -x; hi = 0x80000000; } | |
118 | ||
119 | /* Now we start on the value. The first thing to do is to split off the | |
120 | * exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%. | |
121 | */ | |
122 | m = frexp(x, &e); | |
123 | ||
124 | /* If our number is too big, we'll round it to infinity. This will | |
125 | * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%. | |
126 | */ | |
127 | if (e > 1024) | |
128 | { hi |= 0x7ff00000; lo = 0; } | |
129 | else { | |
130 | /* Our number is sufficiently small that we can represent it at least | |
131 | * approximately (though maybe we'll have to flush it to zero). The | |
132 | * next step, then, is to pull the significand bits out. | |
133 | */ | |
134 | ||
135 | /* Determine the correct exponent to store. We're not going to bias it | |
136 | * yet, but this is where we deal with subnormal numbers. Our number | |
137 | * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this | |
138 | * case, there's an implicit bit which we'll clear. Otherwise, if it's | |
139 | * subnormal, we'll scale our floating-point number so that the | |
140 | * significand will look right when we extract it, and adjust the | |
141 | * exponent so that, when we're finally done, it will have the correct | |
142 | * sentinel value. | |
143 | */ | |
144 | if (e > -1022) m -= 0.5; | |
145 | else { m = ldexp(m, 1021 + e); e = -1022; } | |
146 | ||
147 | /* Now we pull out the 53 bits of the significand. This will, in | |
148 | * general, leave a tail which we address through rounding. Scale it | |
149 | * up so that we end up with %$0 \le m' < 2$%; then we round up if | |
150 | * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is | |
151 | * set. | |
152 | */ | |
153 | t = ldexp(m, 21); m -= ldexp(t, -21); | |
154 | lo = ldexp(m, 53); m -= ldexp(lo, -53); | |
155 | m = ldexp(m, 54); | |
156 | ||
157 | /* Round the number if necessary. */ | |
158 | if (lo&1 ? m >= 1.0 : m > 1) | |
159 | { lo = U32(lo + 1); if (!lo) t++; } | |
160 | ||
161 | /* Now we just put the pieces together. Note that our %$e$% is one | |
162 | * greater than it should be, because our implicit bit should have | |
163 | * been the unit bit not the 1/2 bit. | |
164 | */ | |
165 | hi |= ((uint32)(e + 1022) << 20) | t; | |
166 | } | |
167 | } | |
168 | ||
169 | /* Convert to external format and go home. */ | |
170 | SET64(k, hi, lo); return (k); | |
171 | ||
172 | #undef NANP | |
173 | #undef INFP | |
174 | #undef NEGP | |
175 | } | |
176 | ||
177 | /* --- @k64_to_f64@ --- * | |
178 | * | |
179 | * Arguments: @double *x_out@ = where to put the result | |
180 | * @kludge64 k@ = a 64-bit encoding of a floating-point value | |
181 | * | |
182 | * Returns: Zero on success, @-1@ on failure. | |
183 | * | |
184 | * Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the | |
185 | * caveats. | |
186 | */ | |
187 | ||
188 | static int k64_to_f64(double *x_out, kludge64 k) | |
189 | { | |
190 | uint32 lo, hi, t; | |
191 | int s, e; double x; | |
192 | ||
193 | /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */ | |
194 | ||
195 | /* Pick the encoded number apart. */ | |
196 | hi = HI64(k); lo = LO64(k); | |
197 | s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff; | |
198 | ||
199 | /* Deal with various special cases. */ | |
200 | if (e == 2047) { | |
201 | /* Maximum exponent indicates (positive or negative) infinity or NaN. */ | |
202 | ||
203 | if (t || lo) { | |
204 | /* It's a NaN. We're not going to be picky about which one. If we | |
205 | * can't represent it then we'll just have to fail. | |
206 | */ | |
207 | ||
208 | #ifdef NAN | |
209 | x = NAN; | |
210 | #else | |
211 | return (-1); | |
212 | #endif | |
213 | } else { | |
214 | /* It's an infinity. If we don't have one of those to hand, then pick | |
215 | * something really big. | |
216 | */ | |
217 | ||
218 | #ifdef INFINITY | |
219 | x = s ? -INFINITY : INFINITY; | |
220 | #else | |
221 | x = s ? -DBL_MAX : DBL_MAX; | |
222 | #endif | |
223 | } | |
224 | } else { | |
225 | /* It's a finite number, though maybe it's weird in some way. */ | |
226 | ||
227 | if (e == 0) { | |
228 | /* Minimum exponent indicates zero or a subnormal number. The | |
229 | * subnormal exponent is a sentinel value that shouldn't be taken | |
230 | * literally, so we should fix that. If the number is actually zero | |
231 | * then the exponent won't matter much so don't bother checking. | |
232 | */ | |
233 | ||
234 | e = 1; | |
235 | } else { | |
236 | /* It's a normal number. In which case there's an implicit bit which | |
237 | * we can now set. | |
238 | */ | |
239 | ||
240 | t |= 0x00100000; | |
241 | } | |
242 | ||
243 | /* All that remains is to stuff the significant and exponent into a | |
244 | * floating point number. We'll have to do this in pieces, and we'll | |
245 | * lean on the floating-point machinery to do rounding correctly. | |
246 | */ | |
247 | x = ldexp(t, e - 1043) + ldexp(lo, e - 1075); | |
248 | if (s) x = -x; | |
249 | } | |
250 | ||
251 | /* And we're done. */ | |
252 | *x_out = x; return (0); | |
253 | } | |
254 | ||
255 | /*----- External functions ------------------------------------------------*/ | |
256 | ||
257 | /* --- @buf_putf64{,b,l} --- * | |
258 | * | |
259 | * Arguments: @buf *b@ = a buffer to write to | |
260 | * @double x@ = a number to write | |
261 | * | |
262 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). | |
263 | * | |
264 | * On C89, this function can't detect negative zero so these | |
265 | * will be silently written as positive zero. | |
266 | * | |
267 | * This function doesn't distinguish NaNs. Any NaN is written | |
268 | * as a quiet NaN with all payload bits zero. | |
269 | * | |
270 | * A finite value with too large a magnitude to be represented | |
271 | * is rounded to the appropriate infinity. Other finite values | |
272 | * are rounded as necessary, in the usual IEEE 754 round-to- | |
273 | * nearest-or-even way. | |
274 | */ | |
275 | ||
276 | int buf_putf64(buf *b, double x) | |
277 | { return (buf_putk64(b, f64_to_k64(x))); } | |
278 | int buf_putf64b(buf *b, double x) | |
279 | { return (buf_putk64b(b, f64_to_k64(x))); } | |
280 | int buf_putf64l(buf *b, double x) | |
281 | { return (buf_putk64l(b, f64_to_k64(x))); } | |
282 | ||
283 | /* --- @buf_getf64{,b,l} --- * | |
284 | * | |
285 | * Arguments: @buf *b@ = a buffer to read from | |
286 | * @double *x_out@ = where to put the result | |
287 | * | |
288 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). | |
289 | * | |
290 | * If the system supports NaNs, then any encoded NaN is returned | |
291 | * as the value of @NAN@ in @<math.h>@; otherwise, this function | |
292 | * reports failure. | |
293 | * | |
294 | * In general, values are rounded to the nearest available | |
295 | * value, in the way that the system usually rounds. If the | |
296 | * system doesn't support infinities, then any encoded infinity | |
297 | * is reported as the largest-possible-magnitude finite value | |
298 | * instead. | |
299 | */ | |
300 | ||
301 | int buf_getf64(buf *b, double *x_out) | |
302 | { | |
303 | kludge64 k; | |
304 | ||
305 | if (buf_getk64(b, &k)) return (-1); | |
306 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } | |
307 | return (0); | |
308 | } | |
309 | int buf_getf64b(buf *b, double *x_out) | |
310 | { | |
311 | kludge64 k; | |
312 | ||
313 | if (buf_getk64b(b, &k)) return (-1); | |
314 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } | |
315 | return (0); | |
316 | } | |
317 | int buf_getf64l(buf *b, double *x_out) | |
318 | { | |
319 | kludge64 k; | |
320 | ||
321 | if (buf_getk64l(b, &k)) return (-1); | |
322 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } | |
323 | return (0); | |
324 | } | |
325 | ||
326 | /*----- That's all, folks -------------------------------------------------*/ |