| 1 | /* -*-c-*- |
| 2 | * |
| 3 | * Encoding and decoding floating-point values |
| 4 | * |
| 5 | * (c) 2023 Straylight/Edgeware |
| 6 | */ |
| 7 | |
| 8 | /*----- Licensing notice --------------------------------------------------* |
| 9 | * |
| 10 | * This file is part of the mLib utilities library. |
| 11 | * |
| 12 | * mLib is free software: you can redistribute it and/or modify it under |
| 13 | * the terms of the GNU Library General Public License as published by |
| 14 | * the Free Software Foundation; either version 2 of the License, or (at |
| 15 | * your option) any later version. |
| 16 | * |
| 17 | * mLib is distributed in the hope that it will be useful, but WITHOUT |
| 18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public |
| 20 | * License for more details. |
| 21 | * |
| 22 | * You should have received a copy of the GNU Library General Public |
| 23 | * License along with mLib. If not, write to the Free Software |
| 24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, |
| 25 | * USA. |
| 26 | */ |
| 27 | |
| 28 | /*----- Header files ------------------------------------------------------*/ |
| 29 | |
| 30 | #include <float.h> |
| 31 | #include <math.h> |
| 32 | |
| 33 | #include "bits.h" |
| 34 | #include "buf.h" |
| 35 | #include "maths.h" |
| 36 | |
| 37 | /*----- Formatting primitives ---------------------------------------------*/ |
| 38 | |
| 39 | /* We use the IEEE 754 `binary64' format. Briefly: |
| 40 | * |
| 41 | * * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes |
| 42 | * %$s = -1$%.. The format is signed-magnitude, so everything else is |
| 43 | * the same for positive and negative numbers. |
| 44 | * |
| 45 | * * The next eleven bits are the biased exponent %$e$%. |
| 46 | * |
| 47 | * * The remaining 52 bits are the significand %$m$%. |
| 48 | * |
| 49 | * If %$0 < e < 2047$% then the encoding represents the normal number |
| 50 | * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%. |
| 51 | * |
| 52 | * If %$e = 0$% and %$m = 0$% then the encoding represents positive or |
| 53 | * negative zero. |
| 54 | * |
| 55 | * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal |
| 56 | * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%. |
| 57 | * |
| 58 | * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or |
| 59 | * negative infinity. |
| 60 | * |
| 61 | * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If |
| 62 | * the most significant bit of %$m$% is set then this is a quiet NaN; |
| 63 | * otherwise it's a signalling NaN. |
| 64 | */ |
| 65 | |
| 66 | /* --- @f64_to_k64@ --- * |
| 67 | * |
| 68 | * Arguments: @double x@ = a floating-point number |
| 69 | * |
| 70 | * Returns: A 64-bit encoding of @x@. |
| 71 | * |
| 72 | * Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the |
| 73 | * caveats. |
| 74 | */ |
| 75 | |
| 76 | static kludge64 f64_to_k64(double x) |
| 77 | { |
| 78 | kludge64 k; |
| 79 | uint32 lo, hi, t; |
| 80 | int e; double m; |
| 81 | |
| 82 | /* Some machinery before we start. */ |
| 83 | |
| 84 | if (NANP(x)) { |
| 85 | /* A NaN. */ |
| 86 | hi = 0x7ff80000; lo = 0; |
| 87 | } else if (INFP(x)) { |
| 88 | /* Positive or negative infinity. */ |
| 89 | hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0; |
| 90 | } else if (x == 0) { |
| 91 | /* Positive or negative zero. */ |
| 92 | hi = NEGP(x) ? 0x80000000 : 0; lo = 0; |
| 93 | } else { |
| 94 | /* A normal or subnormal number. Now we have to do some actual work. */ |
| 95 | |
| 96 | /* Let's get the sign dealt with so we don't have to worry about it any |
| 97 | * more. |
| 98 | */ |
| 99 | if (!NEGP(x)) hi = 0; |
| 100 | else { x = -x; hi = 0x80000000; } |
| 101 | |
| 102 | /* Now we start on the value. The first thing to do is to split off the |
| 103 | * exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%. |
| 104 | */ |
| 105 | m = frexp(x, &e); |
| 106 | |
| 107 | /* If our number is too big, we'll round it to infinity. This will |
| 108 | * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%. |
| 109 | */ |
| 110 | if (e > 1024) |
| 111 | { hi |= 0x7ff00000; lo = 0; } |
| 112 | else { |
| 113 | /* Our number is sufficiently small that we can represent it at least |
| 114 | * approximately (though maybe we'll have to flush it to zero). The |
| 115 | * next step, then, is to pull the significand bits out. |
| 116 | */ |
| 117 | |
| 118 | /* Determine the correct exponent to store. We're not going to bias it |
| 119 | * yet, but this is where we deal with subnormal numbers. Our number |
| 120 | * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this |
| 121 | * case, there's an implicit bit which we'll clear. Otherwise, if it's |
| 122 | * subnormal, we'll scale our floating-point number so that the |
| 123 | * significand will look right when we extract it, and adjust the |
| 124 | * exponent so that, when we're finally done, it will have the correct |
| 125 | * sentinel value. |
| 126 | */ |
| 127 | if (e > -1022) m -= 0.5; |
| 128 | else { m = ldexp(m, 1021 + e); e = -1022; } |
| 129 | |
| 130 | /* Now we pull out the 53 bits of the significand. This will, in |
| 131 | * general, leave a tail which we address through rounding. Scale it |
| 132 | * up so that we end up with %$0 \le m' < 2$%; then we round up if |
| 133 | * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is |
| 134 | * set. |
| 135 | */ |
| 136 | t = ldexp(m, 21); m -= ldexp(t, -21); |
| 137 | lo = ldexp(m, 53); m -= ldexp(lo, -53); |
| 138 | m = ldexp(m, 54); |
| 139 | |
| 140 | /* Round the number if necessary. */ |
| 141 | if (lo&1 ? m >= 1.0 : m > 1) |
| 142 | { lo = U32(lo + 1); if (!lo) t++; } |
| 143 | |
| 144 | /* Now we just put the pieces together. Note that our %$e$% is one |
| 145 | * greater than it should be, because our implicit bit should have |
| 146 | * been the unit bit not the 1/2 bit. |
| 147 | */ |
| 148 | hi |= ((uint32)(e + 1022) << 20) | t; |
| 149 | } |
| 150 | } |
| 151 | |
| 152 | /* Convert to external format and go home. */ |
| 153 | SET64(k, hi, lo); return (k); |
| 154 | } |
| 155 | |
| 156 | /* --- @k64_to_f64@ --- * |
| 157 | * |
| 158 | * Arguments: @double *x_out@ = where to put the result |
| 159 | * @kludge64 k@ = a 64-bit encoding of a floating-point value |
| 160 | * |
| 161 | * Returns: Zero on success, @-1@ on failure. |
| 162 | * |
| 163 | * Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the |
| 164 | * caveats. |
| 165 | */ |
| 166 | |
| 167 | static int k64_to_f64(double *x_out, kludge64 k) |
| 168 | { |
| 169 | uint32 lo, hi, t; |
| 170 | int s, e; double x; |
| 171 | |
| 172 | /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */ |
| 173 | |
| 174 | /* Pick the encoded number apart. */ |
| 175 | hi = HI64(k); lo = LO64(k); |
| 176 | s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff; |
| 177 | |
| 178 | /* Deal with various special cases. */ |
| 179 | if (e == 2047) { |
| 180 | /* Maximum exponent indicates (positive or negative) infinity or NaN. */ |
| 181 | |
| 182 | if (t || lo) { |
| 183 | /* It's a NaN. We're not going to be picky about which one. If we |
| 184 | * can't represent it then we'll just have to fail. |
| 185 | */ |
| 186 | |
| 187 | #ifdef NAN |
| 188 | x = NAN; |
| 189 | #else |
| 190 | return (-1); |
| 191 | #endif |
| 192 | } else { |
| 193 | /* It's an infinity. If we don't have one of those to hand, then pick |
| 194 | * something really big. |
| 195 | */ |
| 196 | |
| 197 | #ifdef INFINITY |
| 198 | x = s ? -INFINITY : INFINITY; |
| 199 | #else |
| 200 | x = s ? -DBL_MAX : DBL_MAX; |
| 201 | #endif |
| 202 | } |
| 203 | } else { |
| 204 | /* It's a finite number, though maybe it's weird in some way. */ |
| 205 | |
| 206 | if (e == 0) { |
| 207 | /* Minimum exponent indicates zero or a subnormal number. The |
| 208 | * subnormal exponent is a sentinel value that shouldn't be taken |
| 209 | * literally, so we should fix that. If the number is actually zero |
| 210 | * then the exponent won't matter much so don't bother checking. |
| 211 | */ |
| 212 | |
| 213 | e = 1; |
| 214 | } else { |
| 215 | /* It's a normal number. In which case there's an implicit bit which |
| 216 | * we can now set. |
| 217 | */ |
| 218 | |
| 219 | t |= 0x00100000; |
| 220 | } |
| 221 | |
| 222 | /* All that remains is to stuff the significant and exponent into a |
| 223 | * floating point number. We'll have to do this in pieces, and we'll |
| 224 | * lean on the floating-point machinery to do rounding correctly. |
| 225 | */ |
| 226 | x = ldexp(t, e - 1043) + ldexp(lo, e - 1075); |
| 227 | if (s) x = -x; |
| 228 | } |
| 229 | |
| 230 | /* And we're done. */ |
| 231 | *x_out = x; return (0); |
| 232 | } |
| 233 | |
| 234 | /*----- External functions ------------------------------------------------*/ |
| 235 | |
| 236 | /* --- @buf_putf64{,b,l} --- * |
| 237 | * |
| 238 | * Arguments: @buf *b@ = a buffer to write to |
| 239 | * @double x@ = a number to write |
| 240 | * |
| 241 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). |
| 242 | * |
| 243 | * On C89, this function can't detect negative zero so these |
| 244 | * will be silently written as positive zero. |
| 245 | * |
| 246 | * This function doesn't distinguish NaNs. Any NaN is written |
| 247 | * as a quiet NaN with all payload bits zero. |
| 248 | * |
| 249 | * A finite value with too large a magnitude to be represented |
| 250 | * is rounded to the appropriate infinity. Other finite values |
| 251 | * are rounded as necessary, in the usual IEEE 754 round-to- |
| 252 | * nearest-or-even way. |
| 253 | */ |
| 254 | |
| 255 | int buf_putf64(buf *b, double x) |
| 256 | { return (buf_putk64(b, f64_to_k64(x))); } |
| 257 | int buf_putf64b(buf *b, double x) |
| 258 | { return (buf_putk64b(b, f64_to_k64(x))); } |
| 259 | int buf_putf64l(buf *b, double x) |
| 260 | { return (buf_putk64l(b, f64_to_k64(x))); } |
| 261 | |
| 262 | /* --- @buf_getf64{,b,l} --- * |
| 263 | * |
| 264 | * Arguments: @buf *b@ = a buffer to read from |
| 265 | * @double *x_out@ = where to put the result |
| 266 | * |
| 267 | * Returns: Zero on success, @-1@ on failure (and the buffer is broken). |
| 268 | * |
| 269 | * If the system supports NaNs, then any encoded NaN is returned |
| 270 | * as the value of @NAN@ in @<math.h>@; otherwise, this function |
| 271 | * reports failure. |
| 272 | * |
| 273 | * In general, values are rounded to the nearest available |
| 274 | * value, in the way that the system usually rounds. If the |
| 275 | * system doesn't support infinities, then any encoded infinity |
| 276 | * is reported as the largest-possible-magnitude finite value |
| 277 | * instead. |
| 278 | */ |
| 279 | |
| 280 | int buf_getf64(buf *b, double *x_out) |
| 281 | { |
| 282 | kludge64 k; |
| 283 | |
| 284 | if (buf_getk64(b, &k)) return (-1); |
| 285 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } |
| 286 | return (0); |
| 287 | } |
| 288 | int buf_getf64b(buf *b, double *x_out) |
| 289 | { |
| 290 | kludge64 k; |
| 291 | |
| 292 | if (buf_getk64b(b, &k)) return (-1); |
| 293 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } |
| 294 | return (0); |
| 295 | } |
| 296 | int buf_getf64l(buf *b, double *x_out) |
| 297 | { |
| 298 | kludge64 k; |
| 299 | |
| 300 | if (buf_getk64l(b, &k)) return (-1); |
| 301 | if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); } |
| 302 | return (0); |
| 303 | } |
| 304 | |
| 305 | /*----- That's all, folks -------------------------------------------------*/ |