Commit | Line | Data |
---|---|---|
e63124bc MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Encoding and decoding floating-point values | |
4 | * | |
5 | * (c) 2023 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | /*----- Header files ------------------------------------------------------*/ | |
29 | ||
30 | #include <float.h> | |
31 | #include <math.h> | |
32 | ||
33 | #include "bits.h" | |
34 | #include "buf.h" | |
67b5031e | 35 | #include "maths.h" |
e63124bc MW |
36 | |
37 | /*----- Formatting primitives ---------------------------------------------*/ | |
38 | ||
39 | /* We use the IEEE 754 `binary64' format. Briefly: | |
40 | * | |
41 | * * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes | |
42 | * %$s = -1$%.. The format is signed-magnitude, so everything else is | |
43 | * the same for positive and negative numbers. | |
44 | * | |
45 | * * The next eleven bits are the biased exponent %$e$%. | |
46 | * | |
47 | * * The remaining 52 bits are the significand %$m$%. | |
48 | * | |
49 | * If %$0 < e < 2047$% then the encoding represents the normal number | |
50 | * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%. | |
51 | * | |
52 | * If %$e = 0$% and %$m = 0$% then the encoding represents positive or | |
53 | * negative zero. | |
54 | * | |
55 | * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal | |
56 | * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%. | |
57 | * | |
58 | * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or | |
59 | * negative infinity. | |
60 | * | |
61 | * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If | |
62 | * the most significant bit of %$m$% is set then this is a quiet NaN; | |
63 | * otherwise it's a signalling NaN. | |
64 | */ | |
65 | ||
66 | /* --- @f64_to_k64@ --- * | |
67 | * | |
68 | * Arguments: @double x@ = a floating-point number | |
69 | * | |
70 | * Returns: A 64-bit encoding of @x@. | |
71 | * | |
72 | * Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the | |
73 | * caveats. | |
74 | */ | |
75 | ||
76 | static kludge64 f64_to_k64(double x) | |
77 | { | |
78 | kludge64 k; | |
79 | uint32 lo, hi, t; | |
80 | int e; double m; | |
81 | ||
82 | /* Some machinery before we start. */ | |
83 | ||
e63124bc MW |
84 | if (NANP(x)) { |
85 | /* A NaN. */ | |
86 | hi = 0x7ff80000; lo = 0; | |
87 | } else if (INFP(x)) { | |
88 | /* Positive or negative infinity. */ | |
89 | hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0; | |
90 | } else if (x == 0) { | |
91 | /* Positive or negative zero. */ | |
92 | hi = NEGP(x) ? 0x80000000 : 0; lo = 0; | |
93 | } else { | |
94 | /* A normal or subnormal number. Now we have to do some actual work. */ | |
95 | ||
96 | /* Let's get the sign dealt with so we don't have to worry about it any | |
97 | * more. | |
98 | */ | |
99 | if (!NEGP(x)) hi = 0; | |
100 | else { x = -x; hi = 0x80000000; } | |
101 | ||
102 | /* Now we start on the value. The first thing to do is to split off the | |
103 | * exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%. | |
104 | */ | |
105 | m = frexp(x, &e); | |
106 | ||
107 | /* If our number is too big, we'll round it to infinity. This will | |
108 | * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%. | |
109 | */ | |
110 | if (e > 1024) | |
111 | { hi |= 0x7ff00000; lo = 0; } | |
112 | else { | |
113 | /* Our number is sufficiently small that we can represent it at least | |
114 | * approximately (though maybe we'll have to flush it to zero). The | |
115 | * next step, then, is to pull the significand bits out. | |
116 | */ | |
117 | ||
118 | /* Determine the correct exponent to store. We're not going to bias it | |
119 | * yet, but this is where we deal with subnormal numbers. Our number | |
120 | * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this | |
121 | * case, there's an implicit bit which we'll clear. Otherwise, if it's | |
122 | * subnormal, we'll scale our floating-point number so that the | |
123 | * significand will look right when we extract it, and adjust the | |
124 | * exponent so that, when we're finally done, it will have the correct | |
125 | * sentinel value. | |
126 | */ | |
127 | if (e > -1022) m -= 0.5; | |
128 | else { m = ldexp(m, 1021 + e); e = -1022; } | |
129 | ||
130 | /* Now we pull out the 53 bits of the significand. This will, in | |
131 | * general, leave a tail which we address through rounding. Scale it | |
132 | * up so that we end up with %$0 \le m' < 2$%; then we round up if | |
133 | * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is | |
134 | * set. | |
135 | */ | |
136 | t = ldexp(m, 21); m -= ldexp(t, -21); | |
137 | lo = ldexp(m, 53); m -= ldexp(lo, -53); | |
138 | m = ldexp(m, 54); | |
139 | ||
140 | /* Round the number if necessary. */ | |
141 | if (lo&1 ? m >= 1.0 : m > 1) | |
142 | { lo = U32(lo + 1); if (!lo) t++; } | |
143 | ||
144 | /* Now we just put the pieces together. Note that our %$e$% is one | |
145 | * greater than it should be, because our implicit bit should have | |
146 | * been the unit bit not the 1/2 bit. | |
147 | */ | |
148 | hi |= ((uint32)(e + 1022) << 20) | t; | |
149 | } | |
150 | } | |
151 | ||
152 | /* Convert to external format and go home. */ | |
153 | SET64(k, hi, lo); return (k); | |
e63124bc MW |
154 | } |
155 | ||
156 | /* --- @k64_to_f64@ --- * | |
157 | * | |
158 | * Arguments: @double *x_out@ = where to put the result | |
159 | * @kludge64 k@ = a 64-bit encoding of a floating-point value | |
160 | * | |
161 | * Returns: Zero on success, @-1@ on failure. | |
162 | * | |
163 | * Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the | |
164 | * caveats. | |
165 | */ | |
166 | ||
167 | static int k64_to_f64(double *x_out, kludge64 k) | |
168 | { | |
169 | uint32 lo, hi, t; | |
170 | int s, e; double x; | |
171 | ||
172 | /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */ | |
173 | ||
174 | /* Pick the encoded number apart. */ | |
175 | hi = HI64(k); lo = LO64(k); | |
176 | s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff; | |
177 | ||
178 | /* Deal with various special cases. */ | |
179 | if (e == 2047) { | |
180 | /* Maximum exponent indicates (positive or negative) infinity or NaN. */ | |
181 | ||
182 | if (t || lo) { | |
183 | /* It's a NaN. We're not going to be picky about which one. If we | |
184 | * can't represent it then we'll just have to fail. | |
185 | */ | |
186 | ||
187 | #ifdef NAN | |
188 | x = NAN; | |
189 | #else | |
190 | return (-1); | |
191 | #endif | |
192 | } else { | |
193 | /* It's an infinity. If we don't have one of those to hand, then pick | |
194 | * something really big. | |
195 | */ | |
196 | ||
197 | #ifdef INFINITY | |
198 | x = s ? -INFINITY : INFINITY; | |
199 | #else | |
200 | x = s ? -DBL_MAX : DBL_MAX; | |
201 | #endif | |
202 | } | |
203 | } else { | |
204 | /* It's a finite number, though maybe it's weird in some way. */ | |
205 | ||
206 | if (e == 0) { | |
207 | /* Minimum exponent indicates zero or a subnormal number. The | |
208 | * subnormal exponent is a sentinel value that shouldn't be taken | |
209 | * literally, so we should fix that. If the number is actually zero | |
210 | * then the exponent won't matter much so don't bother checking. | |
211 | */ | |
212 | ||
213 | e = 1; | |
214 | } else { | |
215 | /* It's a normal number. In which case there's an implicit bit which | |
216 | * we can now set. | |
217 | */ | |
218 | ||
219 | t |= 0x00100000; | |
220 | } | |
221 | ||
222 | /* All that remains is to stuff the significant and exponent into a | |
223 | * floating point number. We'll have to do this in pieces, and we'll | |
224 | * lean on the floating-point machinery to do rounding correctly. | |
225 | */ | |
226 | x = ldexp(t, e - 1043) + ldexp(lo, e - 1075); | |
227 | if (s) x = -x; | |
228 | } | |
229 | ||
230 | /* And we're done. */ | |
231 | *x_out = x; return (0); | |
232 | } | |
233 | ||
234 | /*----- External functions ------------------------------------------------*/ | |
235 | ||
236 | /* --- @buf_putf64{,b,l} --- * | |
237 | * | |
238 | * Arguments: @buf *b@ = a buffer to write to | |
239 | * @double x@ = a number to write | |
240 | * | |
241 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). | |
242 | * | |
243 | * On C89, this function can't detect negative zero so these | |
244 | * will be silently written as positive zero. | |
245 | * | |
246 | * This function doesn't distinguish NaNs. Any NaN is written | |
247 | * as a quiet NaN with all payload bits zero. | |
248 | * | |
249 | * A finite value with too large a magnitude to be represented | |
250 | * is rounded to the appropriate infinity. Other finite values | |
251 | * are rounded as necessary, in the usual IEEE 754 round-to- | |
252 | * nearest-or-even way. | |
253 | */ | |
254 | ||
255 | int buf_putf64(buf *b, double x) | |
256 | { return (buf_putk64(b, f64_to_k64(x))); } | |
257 | int buf_putf64b(buf *b, double x) | |
258 | { return (buf_putk64b(b, f64_to_k64(x))); } | |
259 | int buf_putf64l(buf *b, double x) | |
260 | { return (buf_putk64l(b, f64_to_k64(x))); } | |
261 | ||
262 | /* --- @buf_getf64{,b,l} --- * | |
263 | * | |
264 | * Arguments: @buf *b@ = a buffer to read from | |
265 | * @double *x_out@ = where to put the result | |
266 | * | |
267 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). | |
268 | * | |
269 | * If the system supports NaNs, then any encoded NaN is returned | |
270 | * as the value of @NAN@ in @<math.h>@; otherwise, this function | |
271 | * reports failure. | |
272 | * | |
273 | * In general, values are rounded to the nearest available | |
274 | * value, in the way that the system usually rounds. If the | |
275 | * system doesn't support infinities, then any encoded infinity | |
276 | * is reported as the largest-possible-magnitude finite value | |
277 | * instead. | |
278 | */ | |
279 | ||
280 | int buf_getf64(buf *b, double *x_out) | |
281 | { | |
282 | kludge64 k; | |
283 | ||
284 | if (buf_getk64(b, &k)) return (-1); | |
285 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } | |
286 | return (0); | |
287 | } | |
288 | int buf_getf64b(buf *b, double *x_out) | |
289 | { | |
290 | kludge64 k; | |
291 | ||
292 | if (buf_getk64b(b, &k)) return (-1); | |
293 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } | |
294 | return (0); | |
295 | } | |
296 | int buf_getf64l(buf *b, double *x_out) | |
297 | { | |
298 | kludge64 k; | |
299 | ||
300 | if (buf_getk64l(b, &k)) return (-1); | |
301 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } | |
302 | return (0); | |
303 | } | |
304 | ||
305 | /*----- That's all, folks -------------------------------------------------*/ |