[mLib] / struct / buf-float.c

/* -*-c-*-
 *
 * Encoding and decoding floating-point values
 *
 * (c) 2023 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

/*----- Header files ------------------------------------------------------*/

#include <float.h>
#include <math.h>

#include "bits.h"
#include "buf.h"
#include "maths.h"

/*----- Formatting primitives ---------------------------------------------*/

/* We use the IEEE 754 `binary64' format.  Briefly:
 *
 *   * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
 *     %$s = -1$%..  The format is signed-magnitude, so everything else is
 *     the same for positive and negative numbers.
 *
 *   * The next eleven bits are the biased exponent %$e$%.
 *
 *   * The remaining 52 bits are the significand %$m$%.
 *
 * If %$0 < e < 2047$% then the encoding represents the normal number
 * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
 *
 * If %$e = 0$% and %$m = 0$% then the encoding represents positive or
 * negative zero.
 *
 * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
 * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
 *
 * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
 * negative infinity.
 *
 * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN.  If
 * the most significant bit of %$m$% is set then this is a quiet NaN;
 * otherwise it's a signalling NaN.
 */

/* --- @f64_to_k64@ --- *
 *
 * Arguments:	@double x@ = a floating-point number
 *
 * Returns:	A 64-bit encoding of @x@.
 *
 * Use:		Encodes @x@ as a `binary64' value.  See `buf_putf64' for the
 *		caveats.
 */

static kludge64 f64_to_k64(double x)
{
  kludge64 k;
  uint32 lo, hi, t;
  int e; double m;

  /* Some machinery before we start. */

  if (NANP(x)) {
    /* A NaN. */
    hi = 0x7ff80000; lo = 0;
  } else if (INFP(x)) {
    /* Positive or negative infinity. */
    hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
  } else if (x == 0) {
    /* Positive or negative zero. */
    hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
  } else {
    /* A normal or subnormal number.  Now we have to do some actual work. */

    /* Let's get the sign dealt with so we don't have to worry about it any
     * more.
     */
    if (!NEGP(x)) hi = 0;
    else { x = -x; hi = 0x80000000; }

    /* Now we start on the value.  The first thing to do is to split off the
     * exponent.  Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
     */
    m = frexp(x, &e);

    /* If our number is too big, we'll round it to infinity.  This will
     * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
     */
    if (e > 1024)
      { hi |= 0x7ff00000; lo = 0; }
    else {
      /* Our number is sufficiently small that we can represent it at least
       * approximately (though maybe we'll have to flush it to zero).  The
       * next step, then, is to pull the significand bits out.
       */

      /* Determine the correct exponent to store.  We're not going to bias it
       * yet, but this is where we deal with subnormal numbers.  Our number
       * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%.  In this
       * case, there's an implicit bit which we'll clear.  Otherwise, if it's
       * subnormal, we'll scale our floating-point number so that the
       * significand will look right when we extract it, and adjust the
       * exponent so that, when we're finally done, it will have the correct
       * sentinel value.
       */
      if (e > -1022) m -= 0.5;
      else { m = ldexp(m, 1021 + e); e = -1022; }

      /* Now we pull out the 53 bits of the significand.   This will, in
       * general, leave a tail which we address through rounding.  Scale it
       * up so that we end up with %$0 \le m' < 2$%; then we round up if
       * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
       * set.
       */
      t = ldexp(m, 21); m -= ldexp(t, -21);
      lo = ldexp(m, 53); m -= ldexp(lo, -53);
      m = ldexp(m, 54);

      /* Round the number if necessary. */
      if (lo&1 ? m >= 1.0 : m > 1)
	{ lo = U32(lo + 1); if (!lo) t++; }

      /* Now we just put the pieces together.  Note that our %$e$% is one
       * greater than it should be, because our implicit bit should have
       * been the unit bit not the 1/2 bit.
       */
      hi |= ((uint32)(e + 1022) << 20) | t;
    }
  }

  /* Convert to external format and go home. */
  SET64(k, hi, lo); return (k);
}

/* --- @k64_to_f64@ --- *
 *
 * Arguments:	@double *x_out@ = where to put the result
 *		@kludge64 k@ = a 64-bit encoding of a floating-point value
 *
 * Returns:	Zero on success, @-1@ on failure.
 *
 * Use:		Decodes @k@ as a `binary64' value.  See `buf_getf64' for the
 *		caveats.
 */

static int k64_to_f64(double *x_out, kludge64 k)
{
  uint32 lo, hi, t;
  int s, e; double x;

  /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */

  /* Pick the encoded number apart. */
  hi = HI64(k); lo = LO64(k);
  s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;

  /* Deal with various special cases. */
  if (e == 2047) {
    /* Maximum exponent indicates (positive or negative) infinity or NaN. */

    if (t || lo) {
      /* It's a NaN.  We're not going to be picky about which one.  If we
       * can't represent it then we'll just have to fail.
       */

#ifdef NAN
      x = NAN;
#else
      return (-1);
#endif
    } else {
      /* It's an infinity.  If we don't have one of those to hand, then pick
       * something really big.
       */

#ifdef INFINITY
    x = s ? -INFINITY : INFINITY;
#else
    x = s ? -DBL_MAX : DBL_MAX;
#endif
    }
  } else {
    /* It's a finite number, though maybe it's weird in some way. */

    if (e == 0) {
      /* Minimum exponent indicates zero or a subnormal number.  The
       * subnormal exponent is a sentinel value that shouldn't be taken
       * literally, so we should fix that.  If the number is actually zero
       * then the exponent won't matter much so don't bother checking.
       */

      e = 1;
    } else {
      /* It's a normal number.  In which case there's an implicit bit which
       * we can now set.
       */

      t |= 0x00100000;
    }

    /* All that remains is to stuff the significant and exponent into a
     * floating point number.  We'll have to do this in pieces, and we'll
     * lean on the floating-point machinery to do rounding correctly.
     */
    x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
    if (s) x = -x;
  }

  /* And we're done. */
  *x_out = x; return (0);
}

/*----- External functions ------------------------------------------------*/

/* --- @buf_putf64{,b,l} --- *
 *
 * Arguments:	@buf *b@ = a buffer to write to
 *		@double x@ = a number to write
 *
 * Returns:	Zero on success, @-1@ on failure (and the buffer is broken).
 *
 *		On C89, this function can't detect negative zero so these
 *		will be silently written as positive zero.
 *
 *		This function doesn't distinguish NaNs.  Any NaN is written
 *		as a quiet NaN with all payload bits zero.
 *
 *		A finite value with too large a magnitude to be represented
 *		is rounded to the appropriate infinity.  Other finite values
 *		are rounded as necessary, in the usual IEEE 754 round-to-
 *		nearest-or-even way.
 */

int buf_putf64(buf *b, double x)
  { return (buf_putk64(b, f64_to_k64(x))); }
int buf_putf64b(buf *b, double x)
  { return (buf_putk64b(b, f64_to_k64(x))); }
int buf_putf64l(buf *b, double x)
  { return (buf_putk64l(b, f64_to_k64(x))); }

/* --- @buf_getf64{,b,l} --- *
 *
 * Arguments:	@buf *b@ = a buffer to read from
 *		@double *x_out@ = where to put the result
 *
 * Returns:	Zero on success, @-1@ on failure (and the buffer is broken).
 *
 *		If the system supports NaNs, then any encoded NaN is returned
 *		as the value of @NAN@ in @<math.h>@; otherwise, this function
 *		reports failure.
 *
 *		In general, values are rounded to the nearest available
 *		value, in the way that the system usually rounds.  If the
 *		system doesn't support infinities, then any encoded infinity
 *		is reported as the largest-possible-magnitude finite value
 *		instead.
 */

int buf_getf64(buf *b, double *x_out)
{
  kludge64 k;

  if (buf_getk64(b, &k)) return (-1);
  if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
  return (0);
}
int buf_getf64b(buf *b, double *x_out)
{
  kludge64 k;

  if (buf_getk64b(b, &k)) return (-1);
  if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
  return (0);
}
int buf_getf64l(buf *b, double *x_out)
{
  kludge64 k;

  if (buf_getk64l(b, &k)) return (-1);
  if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
  return (0);
}

/*----- That's all, folks -------------------------------------------------*/
Commit	Line	Data
e63124bc MW	1	/* --c--
	2	*
	3	* Encoding and decoding floating-point values
	4	*
	5	* (c) 2023 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of the mLib utilities library.
	11	*
	12	* mLib is free software: you can redistribute it and/or modify it under
	13	* the terms of the GNU Library General Public License as published by
	14	* the Free Software Foundation; either version 2 of the License, or (at
	15	* your option) any later version.
	16	*
	17	* mLib is distributed in the hope that it will be useful, but WITHOUT
	18	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	19	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
	20	* License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with mLib. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include <float.h>
	31	#include <math.h>
	32
	33	#include "bits.h"
	34	#include "buf.h"
67b5031e	35	#include "maths.h"
e63124bc MW	36
	37	/----- Formatting primitives ---------------------------------------------/
	38
	39	/* We use the IEEE 754 `binary64' format. Briefly:
	40	*
	41	* * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
	42	* %$s = -1$%.. The format is signed-magnitude, so everything else is
	43	* the same for positive and negative numbers.
	44	*
	45	* * The next eleven bits are the biased exponent %$e$%.
	46	*
	47	* * The remaining 52 bits are the significand %$m$%.
	48	*
	49	* If %$0 < e < 2047$% then the encoding represents the normal number
	50	* %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
	51	*
	52	* If %$e = 0$% and %$m = 0$% then the encoding represents positive or
	53	* negative zero.
	54	*
	55	* If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
	56	* number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
	57	*
	58	* If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
	59	* negative infinity.
	60	*
	61	* If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If
	62	* the most significant bit of %$m$% is set then this is a quiet NaN;
	63	* otherwise it's a signalling NaN.
	64	*/
	65
	66	/* --- @f64_to_k64@ --- *
	67	*
	68	* Arguments: @double x@ = a floating-point number
	69	*
	70	* Returns: A 64-bit encoding of @x@.
	71	*
	72	* Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the
	73	* caveats.
	74	*/
	75
	76	static kludge64 f64_to_k64(double x)
	77	{
	78	kludge64 k;
	79	uint32 lo, hi, t;
	80	int e; double m;
	81
	82	/* Some machinery before we start. */
	83
e63124bc MW	84	if (NANP(x)) {
	85	/* A NaN. */
	86	hi = 0x7ff80000; lo = 0;
	87	} else if (INFP(x)) {
	88	/* Positive or negative infinity. */
	89	hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
	90	} else if (x == 0) {
	91	/* Positive or negative zero. */
	92	hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
	93	} else {
	94	/* A normal or subnormal number. Now we have to do some actual work. */
	95
	96	/* Let's get the sign dealt with so we don't have to worry about it any
	97	* more.
	98	*/
	99	if (!NEGP(x)) hi = 0;
	100	else { x = -x; hi = 0x80000000; }
	101
	102	/* Now we start on the value. The first thing to do is to split off the
	103	* exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
	104	*/
	105	m = frexp(x, &e);
	106
	107	/* If our number is too big, we'll round it to infinity. This will
	108	* happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
	109	*/
	110	if (e > 1024)
	111	{ hi \|= 0x7ff00000; lo = 0; }
	112	else {
	113	/* Our number is sufficiently small that we can represent it at least
	114	* approximately (though maybe we'll have to flush it to zero). The
	115	* next step, then, is to pull the significand bits out.
	116	*/
	117
	118	/* Determine the correct exponent to store. We're not going to bias it
	119	* yet, but this is where we deal with subnormal numbers. Our number
	120	* is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this
	121	* case, there's an implicit bit which we'll clear. Otherwise, if it's
	122	* subnormal, we'll scale our floating-point number so that the
	123	* significand will look right when we extract it, and adjust the
	124	* exponent so that, when we're finally done, it will have the correct
	125	* sentinel value.
	126	*/
	127	if (e > -1022) m -= 0.5;
	128	else { m = ldexp(m, 1021 + e); e = -1022; }
	129
	130	/* Now we pull out the 53 bits of the significand. This will, in
	131	* general, leave a tail which we address through rounding. Scale it
	132	* up so that we end up with %$0 \le m' < 2$%; then we round up if
	133	* %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
	134	* set.
	135	*/
	136	t = ldexp(m, 21); m -= ldexp(t, -21);
	137	lo = ldexp(m, 53); m -= ldexp(lo, -53);
	138	m = ldexp(m, 54);
	139
	140	/* Round the number if necessary. */
	141	if (lo&1 ? m >= 1.0 : m > 1)
	142	{ lo = U32(lo + 1); if (!lo) t++; }
	143
	144	/* Now we just put the pieces together. Note that our %$e$% is one
	145	* greater than it should be, because our implicit bit should have
	146	* been the unit bit not the 1/2 bit.
	147	*/
148	hi \|= ((uint32)(e + 1022) << 20) \| t;
149	}
150	}
151
152	/* Convert to external format and go home. */
153	SET64(k, hi, lo); return (k);
e63124bc MW	154	}
	155
	156	/* --- @k64_to_f64@ --- *
	157	*
	158	* Arguments: @double *x_out@ = where to put the result
	159	* @kludge64 k@ = a 64-bit encoding of a floating-point value
	160	*
	161	* Returns: Zero on success, @-1@ on failure.
	162	*
	163	* Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the
	164	* caveats.
	165	*/
	166
	167	static int k64_to_f64(double *x_out, kludge64 k)
	168	{
	169	uint32 lo, hi, t;
	170	int s, e; double x;
	171
	172	/* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */
	173
	174	/* Pick the encoded number apart. */
	175	hi = HI64(k); lo = LO64(k);
	176	s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;
	177
	178	/* Deal with various special cases. */
	179	if (e == 2047) {
	180	/* Maximum exponent indicates (positive or negative) infinity or NaN. */
	181
	182	if (t \|\| lo) {
	183	/* It's a NaN. We're not going to be picky about which one. If we
	184	* can't represent it then we'll just have to fail.
	185	*/
	186
	187	#ifdef NAN
	188	x = NAN;
	189	#else
	190	return (-1);
	191	#endif
	192	} else {
	193	/* It's an infinity. If we don't have one of those to hand, then pick
	194	* something really big.
	195	*/
	196
	197	#ifdef INFINITY
	198	x = s ? -INFINITY : INFINITY;
	199	#else
	200	x = s ? -DBL_MAX : DBL_MAX;
	201	#endif
	202	}
	203	} else {
	204	/* It's a finite number, though maybe it's weird in some way. */
	205
	206	if (e == 0) {
	207	/* Minimum exponent indicates zero or a subnormal number. The
	208	* subnormal exponent is a sentinel value that shouldn't be taken
	209	* literally, so we should fix that. If the number is actually zero
	210	* then the exponent won't matter much so don't bother checking.
	211	*/
	212
	213	e = 1;
	214	} else {
	215	/* It's a normal number. In which case there's an implicit bit which
	216	* we can now set.
	217	*/
218
219	t \|= 0x00100000;
220	}
221
222	/* All that remains is to stuff the significant and exponent into a
223	* floating point number. We'll have to do this in pieces, and we'll
224	* lean on the floating-point machinery to do rounding correctly.
225	*/
226	x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
227	if (s) x = -x;
228	}
229
230	/* And we're done. */
231	*x_out = x; return (0);
232	}
233
234	/----- External functions ------------------------------------------------/
235
236	/* --- @buf_putf64{,b,l} --- *
237	*
238	* Arguments: @buf *b@ = a buffer to write to
239	* @double x@ = a number to write
240	*
241	* Returns: Zero on success, @-1@ on failure (and the buffer is broken).
242	*
243	* On C89, this function can't detect negative zero so these
244	* will be silently written as positive zero.
245	*
246	* This function doesn't distinguish NaNs. Any NaN is written
247	* as a quiet NaN with all payload bits zero.
248	*
249	* A finite value with too large a magnitude to be represented
250	* is rounded to the appropriate infinity. Other finite values
251	* are rounded as necessary, in the usual IEEE 754 round-to-
252	* nearest-or-even way.
253	*/
254
255	int buf_putf64(buf *b, double x)
256	{ return (buf_putk64(b, f64_to_k64(x))); }
257	int buf_putf64b(buf *b, double x)
258	{ return (buf_putk64b(b, f64_to_k64(x))); }
259	int buf_putf64l(buf *b, double x)
260	{ return (buf_putk64l(b, f64_to_k64(x))); }
261
262	/* --- @buf_getf64{,b,l} --- *
263	*
264	* Arguments: @buf *b@ = a buffer to read from
265	* @double *x_out@ = where to put the result
266	*
267	* Returns: Zero on success, @-1@ on failure (and the buffer is broken).
268	*
269	* If the system supports NaNs, then any encoded NaN is returned
270	* as the value of @NAN@ in @<math.h>@; otherwise, this function
271	* reports failure.
272	*
273	* In general, values are rounded to the nearest available
274	* value, in the way that the system usually rounds. If the
275	* system doesn't support infinities, then any encoded infinity
276	* is reported as the largest-possible-magnitude finite value
277	* instead.
278	*/
279
280	int buf_getf64(buf b, double x_out)
281	{
282	kludge64 k;
283
284	if (buf_getk64(b, &k)) return (-1);
285	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
286	return (0);
287	}
288	int buf_getf64b(buf b, double x_out)
289	{
290	kludge64 k;
291
292	if (buf_getk64b(b, &k)) return (-1);
293	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
294	return (0);
295	}
296	int buf_getf64l(buf b, double x_out)
297	{
298	kludge64 k;
299
300	if (buf_getk64l(b, &k)) return (-1);
301	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
302	return (0);
303	}
304
305	/----- That's all, folks -------------------------------------------------/