[mLib] / struct / buf-float.c

/* -*-c-*-
 *
 * Encoding and decoding floating-point values
 *
 * (c) 2023 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

/*----- Header files ------------------------------------------------------*/

#include <float.h>
#include <math.h>

#include "bits.h"
#include "buf.h"

/*----- Formatting primitives ---------------------------------------------*/

/* We use the IEEE 754 `binary64' format.  Briefly:
 *
 *   * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
 *     %$s = -1$%.  The format is signed-magnitude, so everything else is
 *     the same for positive and negative numbers.
 *
 *   * The next eleven bits are the biased exponent %$e$%.
 *
 *   * The remaining 52 bits are the significand %$m$%.
 *
 * If %$0 < e < 2047$% then the encoding represents the normal number
 * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
 *
 * If %$e = 0$% and %$m = 0$% then the encoding represents positive or
 * negative zero.
 *
 * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
 * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
 *
 * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
 * negative infinity.
 *
 * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN.  If
 * the most significant bit of %$m$% is set then this is a quiet NaN;
 * otherwise it's a signalling NaN.
 */

/* --- @f64_to_k64@ --- *
 *
 * Arguments:	@double x@ = a floating-point number
 *
 * Returns:	A 64-bit encoding of @x@.
 *
 * Use:		Encodes @x@ as a `binary64' value.  See `buf_putf64' for the
 *		caveats.
 *
 *		The conversion uses only @frexp@, @ldexp@ and ordinary
 *		arithmetic, so it doesn't depend on the host's own
 *		representation of @double@.  Any NaN is encoded as the quiet
 *		NaN @0x7ff8000000000000@.  Values too large for the format
 *		become infinities; excess precision is rounded to nearest,
 *		with ties going to the even significand.
 */

static kludge64 f64_to_k64(double x)
{
  kludge64 k;
  uint32 lo, hi, t;
  int e; double m;

  /* Some machinery before we start.  Use the classification macros if the
   * implementation provides them; otherwise fall back to comparisons.
   */

#ifdef isnan
#  define NANP(x) isnan(x)
#else
#  define NANP(x) (!((x) == (x)))
#endif

#ifdef isinf
#  define INFP(x) isinf(x)
#else
#  define INFP(x) ((x) > DBL_MAX || (x) < -DBL_MAX)
#endif

#ifdef signbit
#  define NEGP(x) signbit(x)
#else
#  define NEGP(x) ((x) < 0)		/* incorrect for negative zero! */
#endif

  if (NANP(x)) {
    /* A NaN: always the same quiet NaN, with no sign or payload. */
    hi = 0x7ff80000; lo = 0;
  } else if (INFP(x)) {
    /* Positive or negative infinity. */
    hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
  } else if (x == 0) {
    /* Positive or negative zero. */
    hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
  } else {
    /* A normal or subnormal number.  Now we have to do some actual work. */

    /* Let's get the sign dealt with so we don't have to worry about it any
     * more.  From here on, @x@ is strictly positive.
     */
    if (!NEGP(x)) hi = 0;
    else { x = -x; hi = 0x80000000; }

    /* Now we start on the value.  The first thing to do is to split off the
     * exponent.  Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
     */
    m = frexp(x, &e);

    /* If our number is too big, we'll round it to infinity.  This will
     * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
     */
    if (e > 1024)
      { hi |= 0x7ff00000; lo = 0; }
    else {
      /* Our number is sufficiently small that we can represent it at least
       * approximately (though maybe we'll have to flush it to zero).  The
       * next step, then, is to pull the significand bits out.
       */

      /* Determine the correct exponent to store.  We're not going to bias it
       * yet, but this is where we deal with subnormal numbers.  Our number
       * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%.  In this
       * case, there's an implicit bit which we'll clear.  Otherwise, if it's
       * subnormal, we'll scale our floating-point number so that the
       * significand will look right when we extract it, and adjust the
       * exponent so that, when we're finally done, it will have the correct
       * sentinel value.  Either way, we end up with %$0 \le m < 1/2$%.
       */
      if (e > -1022) m -= 0.5;
      else { m = ldexp(m, 1021 + e); e = -1022; }

      /* Now we pull out the 52 stored bits of the significand: the top 20
       * go in @t@ and the bottom 32 in @lo@.  This will, in general, leave
       * a tail which we address through rounding.  Scale the tail up so
       * that we end up with %$0 \le m' < 2$%, where %$m' = 1$% is exactly
       * half a unit in the last place.
       */
      t = ldexp(m, 21); m -= ldexp(t, -21);
      lo = ldexp(m, 53); m -= ldexp(lo, -53);
      m = ldexp(m, 54);

      /* Round the number if necessary: up if the tail is more than half a
       * unit in the last place, or if it's exactly half and the low bit of
       * the significand is set (so that ties go to even).  A carry out of
       * @lo@ propagates into @t@, which may then reach %$2^{20}$%.
       */
      if (lo&1 ? m >= 1.0 : m > 1)
	{ lo = U32(lo + 1); if (!lo) t++; }

      /* Now we just put the pieces together.  Note that our %$e$% is one
       * greater than it should be, because our implicit bit should have
       * been the unit bit not the 1/2 bit.
       *
       * The significand is %%\emph{added}%% to the exponent field rather
       * than OR-ed in.  If rounding carried all the way out of @t@ then bit
       * 20 is set and must increment the exponent; an OR would lose the
       * carry whenever the exponent field is odd.  The carry leaves a zero
       * significand, so this correctly produces the next power of two, the
       * smallest normal number from the largest subnormal, or infinity
       * from the largest finite value.  Without a carry, the two fields
       * don't overlap and the sum is the same as the OR.
       */
      hi |= ((uint32)(e + 1022) << 20) + t;
    }
  }

  /* Convert to external format and go home. */
  SET64(k, hi, lo); return (k);

#undef NANP
#undef INFP
#undef NEGP
}

/* --- @k64_to_f64@ --- *
 *
 * Arguments:	@double *x_out@ = where to put the result
 *		@kludge64 k@ = a 64-bit encoding of a floating-point value
 *
 * Returns:	Zero on success, @-1@ on failure.
 *
 * Use:		Decodes @k@ as a `binary64' value.  See `buf_getf64' for the
 *		caveats.  On failure, nothing is stored in @*x_out@.
 */

static int k64_to_f64(double *x_out, kludge64 k)
{
  /* We're using the IEEE 754 `binary64' format: see `f64_to_k64' above.
   * Start by picking the encoding apart into its sign, biased exponent, and
   * the top 20 bits of the significand; the low word holds the other 32.
   */
  uint32 w_hi = HI64(k), w_lo = LO64(k);
  uint32 sig_hi = w_hi&0x000fffff;
  int neg = (w_hi >> 31)&1;
  int bexp = (w_hi >> 20)&0x07ff;
  double v;

  if (bexp != 2047) {
    /* A finite number.  A zero exponent field marks a zero or subnormal
     * number: it's a sentinel, and the scale to use is really that of
     * exponent 1.  (For an actual zero, the scale makes no difference.)
     * Anything else is a normal number, with an implicit leading bit which
     * we must restore.
     */
    if (bexp) sig_hi |= 0x00100000;
    else bexp = 1;

    /* Stuff the significand and exponent into a floating-point number.
     * We do this in two pieces, and lean on the floating-point machinery
     * to do any rounding.
     */
    v = ldexp(sig_hi, bexp - 1043) + ldexp(w_lo, bexp - 1075);
    if (neg) v = -v;
  } else if (!sig_hi && !w_lo) {
    /* Maximum exponent and zero significand: an infinity.  If we don't
     * have one of those to hand, then pick something really big.
     */
#ifdef INFINITY
    v = neg ? -INFINITY : INFINITY;
#else
    v = neg ? -DBL_MAX : DBL_MAX;
#endif
  } else {
    /* Maximum exponent and nonzero significand: a NaN.  We're not picky
     * about which one.  If we can't represent it then we just have to
     * fail.
     */
#ifdef NAN
    v = NAN;
#else
    return (-1);
#endif
  }

  /* And we're done. */
  *x_out = v;
  return (0);
}

/*----- External functions ------------------------------------------------*/

/* --- @buf_putf64{,b,l}@ --- *
 *
 * Arguments:	@buf *b@ = a buffer to write to
 *		@double x@ = a number to write
 *
 * Returns:	Zero on success, @-1@ on failure (and the buffer is broken).
 *
 * Use:		Encodes @x@ in IEEE 754 `binary64' format and writes the
 *		resulting 64-bit value to the buffer using @buf_putk64@,
 *		@buf_putk64b@ or @buf_putk64l@ respectively.
 *
 *		On C89, this function can't detect negative zero so these
 *		will be silently written as positive zero.
 *
 *		This function doesn't distinguish NaNs.  Any NaN is written
 *		as a quiet NaN with all payload bits zero.
 *
 *		A finite value with too large a magnitude to be represented
 *		is rounded to the appropriate infinity.  Other finite values
 *		are rounded as necessary, in the usual IEEE 754 round-to-
 *		nearest-or-even way.
 */

int buf_putf64(buf *b, double x)
{
  kludge64 k = f64_to_k64(x);

  return (buf_putk64(b, k));
}
int buf_putf64b(buf *b, double x)
{
  /* Encode as `binary64', then let @buf_putk64b@ write the 64-bit value. */
  kludge64 k = f64_to_k64(x);

  return (buf_putk64b(b, k));
}
int buf_putf64l(buf *b, double x)
{
  /* Encode as `binary64', then let @buf_putk64l@ write the 64-bit value. */
  kludge64 k = f64_to_k64(x);

  return (buf_putk64l(b, k));
}

/* --- @buf_getf64{,b,l}@ --- *
 *
 * Arguments:	@buf *b@ = a buffer to read from
 *		@double *x_out@ = where to put the result
 *
 * Returns:	Zero on success, @-1@ on failure (and the buffer is broken).
 *
 * Use:		Reads a 64-bit value from the buffer using @buf_getk64@,
 *		@buf_getk64b@ or @buf_getk64l@ respectively, and decodes it
 *		as an IEEE 754 `binary64' number.  If the value was read
 *		but can't be decoded, the buffer is marked broken.
 *
 *		If the system supports NaNs, then any encoded NaN is returned
 *		as the value of @NAN@ in @<math.h>@; otherwise, this function
 *		reports failure.
 *
 *		In general, values are rounded to the nearest available
 *		value, in the way that the system usually rounds.  If the
 *		system doesn't support infinities, then any encoded infinity
 *		is reported as the largest-possible-magnitude finite value
 *		instead.
 */

int buf_getf64(buf *b, double *x_out)
{
  kludge64 k;
  int rc = -1;

  if (!buf_getk64(b, &k)) {
    if (!k64_to_f64(x_out, k)) rc = 0;
    else b->f |= BF_BROKEN;
  }
  return (rc);
}
int buf_getf64b(buf *b, double *x_out)
{
  kludge64 k;
  int rc = -1;

  /* Fetch the raw 64-bit value with @buf_getk64b@; if that works but the
   * value can't be decoded, mark the buffer as broken ourselves.
   */
  if (!buf_getk64b(b, &k)) {
    if (!k64_to_f64(x_out, k)) rc = 0;
    else b->f |= BF_BROKEN;
  }
  return (rc);
}
int buf_getf64l(buf *b, double *x_out)
{
  kludge64 k;
  int rc = -1;

  /* Fetch the raw 64-bit value with @buf_getk64l@; if that works but the
   * value can't be decoded, mark the buffer as broken ourselves.
   */
  if (!buf_getk64l(b, &k)) {
    if (!k64_to_f64(x_out, k)) rc = 0;
    else b->f |= BF_BROKEN;
  }
  return (rc);
}

/*----- That's all, folks -------------------------------------------------*/
Commit	Line	Data
e63124bc MW	1	/* --c--
	2	*
	3	* Encoding and decoding floating-point values
	4	*
	5	* (c) 2023 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of the mLib utilities library.
	11	*
	12	* mLib is free software: you can redistribute it and/or modify it under
	13	* the terms of the GNU Library General Public License as published by
	14	* the Free Software Foundation; either version 2 of the License, or (at
	15	* your option) any later version.
	16	*
	17	* mLib is distributed in the hope that it will be useful, but WITHOUT
	18	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	19	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
	20	* License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with mLib. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include <float.h>
	31	#include <math.h>
	32
	33	#include "bits.h"
	34	#include "buf.h"
	35
	36	/----- Formatting primitives ---------------------------------------------/
	37
	38	/* We use the IEEE 754 `binary64' format. Briefly:
	39	*
	40	* * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
	41	* %$s = -1$%.. The format is signed-magnitude, so everything else is
	42	* the same for positive and negative numbers.
	43	*
	44	* * The next eleven bits are the biased exponent %$e$%.
	45	*
	46	* * The remaining 52 bits are the significand %$m$%.
	47	*
	48	* If %$0 < e < 2047$% then the encoding represents the normal number
	49	* %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
	50	*
	51	* If %$e = 0$% and %$m = 0$% then the encoding represents positive or
	52	* negative zero.
	53	*
	54	* If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
	55	* number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
	56	*
	57	* If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
	58	* negative infinity.
	59	*
	60	* If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If
	61	* the most significant bit of %$m$% is set then this is a quiet NaN;
	62	* otherwise it's a signalling NaN.
	63	*/
	64
65	/* --- @f64_to_k64@ --- *
66	*
67	* Arguments: @double x@ = a floating-point number
68	*
69	* Returns: A 64-bit encoding of @x@.
70	*
71	* Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the
72	* caveats.
73	*/
74
75	static kludge64 f64_to_k64(double x)
76	{
77	kludge64 k;
78	uint32 lo, hi, t;
79	int e; double m;
80
81	/* Some machinery before we start. */
82
83	#ifdef isnan
84	# define NANP(x) isnan(x)
85	#else
86	# define NANP(x) (!((x) == (x)))
87	#endif
88
89	#ifdef isinf
90	# define INFP(x) isinf(x)
91	#else
92	# define INFP(x) ((x) > DBL_MAX \|\| (x) < -DBL_MAX)
93	#endif
94
95	#ifdef signbit
96	# define NEGP(x) signbit(x)
97	#else
98	# define NEGP(x) ((x) < 0) /* incorrect for negative zero! */
99	#endif
100
101	if (NANP(x)) {
102	/* A NaN. */
103	hi = 0x7ff80000; lo = 0;
104	} else if (INFP(x)) {
105	/* Positive or negative infinity. */
106	hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
107	} else if (x == 0) {
108	/* Positive or negative zero. */
109	hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
110	} else {
111	/* A normal or subnormal number. Now we have to do some actual work. */
112
113	/* Let's get the sign dealt with so we don't have to worry about it any
114	* more.
115	*/
116	if (!NEGP(x)) hi = 0;
117	else { x = -x; hi = 0x80000000; }
118
119	/* Now we start on the value. The first thing to do is to split off the
120	* exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
121	*/
122	m = frexp(x, &e);
123
124	/* If our number is too big, we'll round it to infinity. This will
125	* happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
126	*/
127	if (e > 1024)
128	{ hi \|= 0x7ff00000; lo = 0; }
129	else {
130	/* Our number is sufficiently small that we can represent it at least
131	* approximately (though maybe we'll have to flush it to zero). The
132	* next step, then, is to pull the significand bits out.
133	*/
134
135	/* Determine the correct exponent to store. We're not going to bias it
136	* yet, but this is where we deal with subnormal numbers. Our number
137	* is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this
138	* case, there's an implicit bit which we'll clear. Otherwise, if it's
139	* subnormal, we'll scale our floating-point number so that the
140	* significand will look right when we extract it, and adjust the
141	* exponent so that, when we're finally done, it will have the correct
142	* sentinel value.
143	*/
144	if (e > -1022) m -= 0.5;
145	else { m = ldexp(m, 1021 + e); e = -1022; }
146
147	/* Now we pull out the 53 bits of the significand. This will, in
148	* general, leave a tail which we address through rounding. Scale it
149	* up so that we end up with %$0 \le m' < 2$%; then we round up if
150	* %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
151	* set.
152	*/
153	t = ldexp(m, 21); m -= ldexp(t, -21);
154	lo = ldexp(m, 53); m -= ldexp(lo, -53);
155	m = ldexp(m, 54);
156
157	/* Round the number if necessary. */
158	if (lo&1 ? m >= 1.0 : m > 1)
159	{ lo = U32(lo + 1); if (!lo) t++; }
160
161	/* Now we just put the pieces together. Note that our %$e$% is one
162	* greater than it should be, because our implicit bit should have
163	* been the unit bit not the 1/2 bit.
164	*/
165	hi \|= ((uint32)(e + 1022) << 20) \| t;
166	}
167	}
168
169	/* Convert to external format and go home. */
170	SET64(k, hi, lo); return (k);
171
172	#undef NANP
173	#undef INFP
174	#undef NEGP
175	}
176
177	/* --- @k64_to_f64@ --- *
178	*
179	* Arguments: @double *x_out@ = where to put the result
180	* @kludge64 k@ = a 64-bit encoding of a floating-point value
181	*
182	* Returns: Zero on success, @-1@ on failure.
183	*
184	* Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the
185	* caveats.
186	*/
187
188	static int k64_to_f64(double *x_out, kludge64 k)
189	{
190	uint32 lo, hi, t;
191	int s, e; double x;
192
193	/* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */
194
195	/* Pick the encoded number apart. */
196	hi = HI64(k); lo = LO64(k);
197	s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;
198
199	/* Deal with various special cases. */
200	if (e == 2047) {
201	/* Maximum exponent indicates (positive or negative) infinity or NaN. */
202
203	if (t \|\| lo) {
204	/* It's a NaN. We're not going to be picky about which one. If we
205	* can't represent it then we'll just have to fail.
206	*/
207
208	#ifdef NAN
209	x = NAN;
210	#else
211	return (-1);
212	#endif
213	} else {
214	/* It's an infinity. If we don't have one of those to hand, then pick
215	* something really big.
216	*/
217
218	#ifdef INFINITY
219	x = s ? -INFINITY : INFINITY;
220	#else
221	x = s ? -DBL_MAX : DBL_MAX;
222	#endif
223	}
224	} else {
225	/* It's a finite number, though maybe it's weird in some way. */
226
227	if (e == 0) {
228	/* Minimum exponent indicates zero or a subnormal number. The
229	* subnormal exponent is a sentinel value that shouldn't be taken
230	* literally, so we should fix that. If the number is actually zero
231	* then the exponent won't matter much so don't bother checking.
232	*/
233
234	e = 1;
235	} else {
236	/* It's a normal number. In which case there's an implicit bit which
237	* we can now set.
238	*/
239
240	t \|= 0x00100000;
241	}
242
243	/* All that remains is to stuff the significant and exponent into a
244	* floating point number. We'll have to do this in pieces, and we'll
245	* lean on the floating-point machinery to do rounding correctly.
246	*/
247	x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
248	if (s) x = -x;
249	}
250
251	/* And we're done. */
252	*x_out = x; return (0);
253	}
254
255	/----- External functions ------------------------------------------------/
256
257	/* --- @buf_putf64{,b,l} --- *
258	*
259	* Arguments: @buf *b@ = a buffer to write to
260	* @double x@ = a number to write
261	*
262	* Returns: Zero on success, @-1@ on failure (and the buffer is broken).
263	*
264	* On C89, this function can't detect negative zero so these
265	* will be silently written as positive zero.
266	*
267	* This function doesn't distinguish NaNs. Any NaN is written
268	* as a quiet NaN with all payload bits zero.
269	*
270	* A finite value with too large a magnitude to be represented
271	* is rounded to the appropriate infinity. Other finite values
272	* are rounded as necessary, in the usual IEEE 754 round-to-
273	* nearest-or-even way.
274	*/
275
276	int buf_putf64(buf *b, double x)
277	{ return (buf_putk64(b, f64_to_k64(x))); }
278	int buf_putf64b(buf *b, double x)
279	{ return (buf_putk64b(b, f64_to_k64(x))); }
280	int buf_putf64l(buf *b, double x)
281	{ return (buf_putk64l(b, f64_to_k64(x))); }
282
283	/* --- @buf_getf64{,b,l} --- *
284	*
285	* Arguments: @buf *b@ = a buffer to read from
286	* @double *x_out@ = where to put the result
287	*
288	* Returns: Zero on success, @-1@ on failure (and the buffer is broken).
289	*
290	* If the system supports NaNs, then any encoded NaN is returned
291	* as the value of @NAN@ in @<math.h>@; otherwise, this function
292	* reports failure.
293	*
294	* In general, values are rounded to the nearest available
295	* value, in the way that the system usually rounds. If the
296	* system doesn't support infinities, then any encoded infinity
297	* is reported as the largest-possible-magnitude finite value
298	* instead.
299	*/
300
301	int buf_getf64(buf b, double x_out)
302	{
303	kludge64 k;
304
305	if (buf_getk64(b, &k)) return (-1);
306	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
307	return (0);
308	}
309	int buf_getf64b(buf b, double x_out)
310	{
311	kludge64 k;
312
313	if (buf_getk64b(b, &k)) return (-1);
314	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
315	return (0);
316	}
317	int buf_getf64l(buf b, double x_out)
318	{
319	kludge64 k;
320
321	if (buf_getk64l(b, &k)) return (-1);
322	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
323	return (0);
324	}
325
326	/----- That's all, folks -------------------------------------------------/