[mLib] / struct / buf-float.c

/* -*-c-*-
 *
 * Encoding and decoding floating-point values
 *
 * (c) 2023 Straylight/Edgeware
 */

/*----- Licensing notice --------------------------------------------------*
 *
 * This file is part of the mLib utilities library.
 *
 * mLib is free software: you can redistribute it and/or modify it under
 * the terms of the GNU Library General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * mLib is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
 * License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with mLib.  If not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 * USA.
 */

/*----- Header files ------------------------------------------------------*/

#include <float.h>
#include <math.h>

#include "bits.h"
#include "buf.h"
#include "maths.h"

/*----- Formatting primitives ---------------------------------------------*/

/* We use the IEEE 754 `binary64' format.  Briefly:
 *
 *   * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
 *     %$s = -1$%..  The format is signed-magnitude, so everything else is
 *     the same for positive and negative numbers.
 *
 *   * The next eleven bits are the biased exponent %$e$%.
 *
 *   * The remaining 52 bits are the significand %$m$%.
 *
 * If %$0 < e < 2047$% then the encoding represents the normal number
 * %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
 *
 * If %$e = 0$% and %$m = 0$% then the encoding represents positive or
 * negative zero.
 *
 * If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
 * number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
 *
 * If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
 * negative infinity.
 *
 * If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN.  If
 * the most significant bit of %$m$% is set then this is a quiet NaN;
 * otherwise it's a signalling NaN.
 */

/* --- @k64_to_f64@ --- *
 *
 * Arguments:	@double *x_out@ = where to put the result
 *		@kludge64 k@ = a 64-bit encoding of a floating-point value
 *
 * Returns:	Zero on success, @-1@ on failure.
 *
 * Use:		Decodes @k@ as a `binary64' value.  See `buf_getf64' for the
 *		caveats.
 */

static int k64_to_f64(double *x_out, kludge64 k)
{
  uint32 lo, hi, t;
  int s, e; double x;

  /* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */

  /* Pick the encoded number apart. */
  hi = HI64(k); lo = LO64(k);
  s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;

  /* Deal with various special cases. */
  if (e == 2047) {
    /* Maximum exponent indicates (positive or negative) infinity or NaN. */

    if (t || lo) {
      /* It's a NaN.  We're not going to be picky about which one.  If we
       * can't represent it then we'll just have to fail.
       */

#ifdef NAN
      x = NAN;
#else
      return (-1);
#endif
    } else {
      /* It's an infinity.  If we don't have one of those to hand, then pick
       * something really big.
       */

#ifdef INFINITY
    x = s ? -INFINITY : INFINITY;
#else
    x = s ? -DBL_MAX : DBL_MAX;
#endif
    }
  } else {
    /* It's a finite number, though maybe it's weird in some way. */

    if (e == 0) {
      /* Minimum exponent indicates zero or a subnormal number.  The
       * subnormal exponent is a sentinel value that shouldn't be taken
       * literally, so we should fix that.  If the number is actually zero
       * then the exponent won't matter much so don't bother checking.
       */

      e = 1;
    } else {
      /* It's a normal number.  In which case there's an implicit bit which
       * we can now set.
       */

      t |= 0x00100000;
    }

    /* All that remains is to stuff the significant and exponent into a
     * floating point number.  We'll have to do this in pieces, and we'll
     * lean on the floating-point machinery to do rounding correctly.
     */
    x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
    if (s) x = -x;
  }

  /* And we're done. */
  *x_out = x; return (0);
}

/* --- @f64_to_k64@ --- *
 *
 * Arguments:	@double x@ = a floating-point number
 *
 * Returns:	A 64-bit encoding of @x@.
 *
 * Use:		Encodes @x@ as a `binary64' value.  See `buf_putf64' for the
 *		caveats.
 */

static kludge64 f64_to_k64(double x)
{
  kludge64 k;
  uint32 lo, hi, t;
  int e; double m;

  /* Some machinery before we start. */

  if (NANP(x)) {
    /* A NaN. */
    hi = 0x7ff80000; lo = 0;
  } else if (INFP(x)) {
    /* Positive or negative infinity. */
    hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
  } else if (x == 0) {
    /* Positive or negative zero. */
    hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
  } else {
    /* A normal or subnormal number.  Now we have to do some actual work. */

    /* Let's get the sign dealt with so we don't have to worry about it any
     * more.
     */
    if (!NEGP(x)) hi = 0;
    else { x = -x; hi = 0x80000000; }

    /* Now we start on the value.  The first thing to do is to split off the
     * exponent.  Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
     */
    m = frexp(x, &e);

    /* If our number is too big, we'll round it to infinity.  This will
     * happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
     */
    if (e > 1024)
      { hi |= 0x7ff00000; lo = 0; }
    else {
      /* Our number is sufficiently small that we can represent it at least
       * approximately (though maybe we'll have to flush it to zero).  The
       * next step, then, is to pull the significand bits out.
       */

      /* Determine the correct exponent to store.  We're not going to bias it
       * yet, but this is where we deal with subnormal numbers.  Our number
       * is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%.  In this
       * case, there's an implicit bit which we'll clear.  Otherwise, if it's
       * subnormal, we'll scale our floating-point number so that the
       * significand will look right when we extract it, and adjust the
       * exponent so that, when we're finally done, it will have the correct
       * sentinel value.
       */
      if (e > -1022) m -= 0.5;
      else { m = ldexp(m, 1021 + e); e = -1022; }

      /* Now we pull out the 53 bits of the significand.   This will, in
       * general, leave a tail which we address through rounding.  Scale it
       * up so that we end up with %$0 \le m' < 2$%; then we round up if
       * %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
       * set.
       */
      t = ldexp(m, 21); m -= ldexp(t, -21);
      lo = ldexp(m, 53); m -= ldexp(lo, -53);
      m = ldexp(m, 54);

      /* Round the number if necessary. */
      if (lo&1 ? m >= 1.0 : m > 1)
	{ lo = U32(lo + 1); if (!lo) t++; }

      /* Now we just put the pieces together.  Note that our %$e$% is one
       * greater than it should be, because our implicit bit should have
       * been the unit bit not the 1/2 bit.
       */
      hi |= ((uint32)(e + 1022) << 20) | t;
    }
  }

  /* Convert to external format and go home. */
  SET64(k, hi, lo); return (k);
}

/*----- External functions ------------------------------------------------*/

/* --- @buf_getf64{,l,b} --- *
 *
 * Arguments:	@buf *b@ = a buffer to read from
 *		@double *x_out@ = where to put the result
 *
 * Returns:	Zero on success, @-1@ on failure (and the buffer is broken).
 *
 *		If the system supports NaNs, then any encoded NaN is returned
 *		as the value of @NAN@ in @<math.h>@; otherwise, this function
 *		reports failure.
 *
 *		In general, values are rounded to the nearest available
 *		value, in the way that the system usually rounds.  If the
 *		system doesn't support infinities, then any encoded infinity
 *		is reported as the largest-possible-magnitude finite value
 *		instead.
 */

int buf_getf64(buf *b, double *x_out)
{
  kludge64 k;

  if (buf_getk64(b, &k)) return (-1);
  if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
  return (0);
}

int buf_getf64l(buf *b, double *x_out)
{
  kludge64 k;

  if (buf_getk64l(b, &k)) return (-1);
  if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
  return (0);
}

int buf_getf64b(buf *b, double *x_out)
{
  kludge64 k;

  if (buf_getk64b(b, &k)) return (-1);
  if (k64_to_f64(x_out, k)) { b->f |= BF_BROKEN; return (-1); }
  return (0);
}

int (dbuf_getf64)(dbuf *db, double *x_out)
  { return (dbuf_getf64(db, x_out)); }
int (dbuf_getf64l)(dbuf *db, double *x_out)
  { return (dbuf_getf64l(db, x_out)); }
int (dbuf_getf64b)(dbuf *db, double *x_out)
  { return (dbuf_getf64b(db, x_out)); }

/* --- @buf_putf64{,l,b} --- *
 *
 * Arguments:	@buf *b@ = a buffer to write to
 *		@double x@ = a number to write
 *
 * Returns:	Zero on success, @-1@ on failure (and the buffer is broken).
 *
 *		On C89, this function can't detect negative zero so these
 *		will be silently written as positive zero.
 *
 *		This function doesn't distinguish NaNs.  Any NaN is written
 *		as a quiet NaN with all payload bits zero.
 *
 *		A finite value with too large a magnitude to be represented
 *		is rounded to the appropriate infinity.  Other finite values
 *		are rounded as necessary, in the usual IEEE 754 round-to-
 *		nearest-or-even way.
 */

int buf_putf64(buf *b, double x)
  { return (buf_putk64(b, f64_to_k64(x))); }
int buf_putf64l(buf *b, double x)
  { return (buf_putk64l(b, f64_to_k64(x))); }
int buf_putf64b(buf *b, double x)
  { return (buf_putk64b(b, f64_to_k64(x))); }

int (dbuf_putf64)(dbuf *db, double x)
  { return (dbuf_putf64(db, x)); }
int (dbuf_putf64l)(dbuf *db, double x)
  { return (dbuf_putf64l(db, x)); }
int (dbuf_putf64b)(dbuf *db, double x)
  { return (dbuf_putf64b(db, x)); }

/*----- That's all, folks -------------------------------------------------*/
Commit	Line	Data
e63124bc MW	1	/* --c--
	2	*
	3	* Encoding and decoding floating-point values
	4	*
	5	* (c) 2023 Straylight/Edgeware
	6	*/
	7
	8	/----- Licensing notice --------------------------------------------------
	9	*
	10	* This file is part of the mLib utilities library.
	11	*
	12	* mLib is free software: you can redistribute it and/or modify it under
	13	* the terms of the GNU Library General Public License as published by
	14	* the Free Software Foundation; either version 2 of the License, or (at
	15	* your option) any later version.
	16	*
	17	* mLib is distributed in the hope that it will be useful, but WITHOUT
	18	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	19	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
	20	* License for more details.
	21	*
	22	* You should have received a copy of the GNU Library General Public
	23	* License along with mLib. If not, write to the Free Software
	24	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	25	* USA.
	26	*/
	27
	28	/----- Header files ------------------------------------------------------/
	29
	30	#include <float.h>
	31	#include <math.h>
	32
	33	#include "bits.h"
	34	#include "buf.h"
67b5031e	35	#include "maths.h"
e63124bc MW	36
	37	/----- Formatting primitives ---------------------------------------------/
	38
	39	/* We use the IEEE 754 `binary64' format. Briefly:
	40	*
	41	* * The top bit is the sign %$s$%: 0 encodes %$s = +1$%, and 1 encodes
	42	* %$s = -1$%.. The format is signed-magnitude, so everything else is
	43	* the same for positive and negative numbers.
	44	*
	45	* * The next eleven bits are the biased exponent %$e$%.
	46	*
	47	* * The remaining 52 bits are the significand %$m$%.
	48	*
	49	* If %$0 < e < 2047$% then the encoding represents the normal number
	50	* %$s \cdot (1 + m/2^{52}) \cdot 2^{e-1023}$%.
	51	*
	52	* If %$e = 0$% and %$m = 0$% then the encoding represents positive or
	53	* negative zero.
	54	*
	55	* If %$e = 0$% and %$m \ne 0$% then the encoding represents a subnormal
	56	* number %$s \cdot m/2^{52} \cdot 2^{-1022}$%.
	57	*
	58	* If %$e = 2047$% and %$m = 0$% then the encoding represents positive or
	59	* negative infinity.
	60	*
	61	* If %$e = 2047$% and %$m \ne 0$% then the encoding represents a NaN. If
	62	* the most significant bit of %$m$% is set then this is a quiet NaN;
	63	* otherwise it's a signalling NaN.
	64	*/
	65
31d0247c MW	66	/* --- @k64_to_f64@ --- *
	67	*
	68	* Arguments: @double *x_out@ = where to put the result
	69	* @kludge64 k@ = a 64-bit encoding of a floating-point value
	70	*
	71	* Returns: Zero on success, @-1@ on failure.
	72	*
	73	* Use: Decodes @k@ as a `binary64' value. See `buf_getf64' for the
	74	* caveats.
	75	*/
	76
	77	static int k64_to_f64(double *x_out, kludge64 k)
	78	{
	79	uint32 lo, hi, t;
	80	int s, e; double x;
	81
	82	/* We're using the IEEE 754 `binary64' format: see `float_to_k64' above. */
	83
	84	/* Pick the encoded number apart. */
	85	hi = HI64(k); lo = LO64(k);
	86	s = (hi >> 31)&1; e = (hi >> 20)&0x07ff; t = hi&0x000fffff;
	87
	88	/* Deal with various special cases. */
	89	if (e == 2047) {
	90	/* Maximum exponent indicates (positive or negative) infinity or NaN. */
	91
	92	if (t \|\| lo) {
	93	/* It's a NaN. We're not going to be picky about which one. If we
	94	* can't represent it then we'll just have to fail.
	95	*/
	96
	97	#ifdef NAN
	98	x = NAN;
	99	#else
	100	return (-1);
	101	#endif
	102	} else {
	103	/* It's an infinity. If we don't have one of those to hand, then pick
	104	* something really big.
	105	*/
	106
	107	#ifdef INFINITY
	108	x = s ? -INFINITY : INFINITY;
	109	#else
	110	x = s ? -DBL_MAX : DBL_MAX;
	111	#endif
	112	}
	113	} else {
	114	/* It's a finite number, though maybe it's weird in some way. */
	115
	116	if (e == 0) {
	117	/* Minimum exponent indicates zero or a subnormal number. The
	118	* subnormal exponent is a sentinel value that shouldn't be taken
	119	* literally, so we should fix that. If the number is actually zero
	120	* then the exponent won't matter much so don't bother checking.
	121	*/
	122
	123	e = 1;
	124	} else {
	125	/* It's a normal number. In which case there's an implicit bit which
	126	* we can now set.
	127	*/
	128
	129	t \|= 0x00100000;
130	}
131
132	/* All that remains is to stuff the significant and exponent into a
133	* floating point number. We'll have to do this in pieces, and we'll
134	* lean on the floating-point machinery to do rounding correctly.
135	*/
136	x = ldexp(t, e - 1043) + ldexp(lo, e - 1075);
137	if (s) x = -x;
138	}
139
140	/* And we're done. */
141	*x_out = x; return (0);
142	}
143
e63124bc MW	144	/* --- @f64_to_k64@ --- *
	145	*
	146	* Arguments: @double x@ = a floating-point number
	147	*
	148	* Returns: A 64-bit encoding of @x@.
	149	*
	150	* Use: Encodes @x@ as a `binary64' value. See `buf_putf64' for the
	151	* caveats.
	152	*/
	153
	154	static kludge64 f64_to_k64(double x)
	155	{
	156	kludge64 k;
	157	uint32 lo, hi, t;
	158	int e; double m;
	159
	160	/* Some machinery before we start. */
	161
e63124bc MW	162	if (NANP(x)) {
	163	/* A NaN. */
	164	hi = 0x7ff80000; lo = 0;
	165	} else if (INFP(x)) {
	166	/* Positive or negative infinity. */
	167	hi = NEGP(x) ? 0xfff00000 : 0x7ff00000; lo = 0;
	168	} else if (x == 0) {
	169	/* Positive or negative zero. */
	170	hi = NEGP(x) ? 0x80000000 : 0; lo = 0;
	171	} else {
	172	/* A normal or subnormal number. Now we have to do some actual work. */
	173
	174	/* Let's get the sign dealt with so we don't have to worry about it any
	175	* more.
	176	*/
	177	if (!NEGP(x)) hi = 0;
	178	else { x = -x; hi = 0x80000000; }
	179
	180	/* Now we start on the value. The first thing to do is to split off the
	181	* exponent. Our number will be %$m \cdot 2^e$%, with %$1/2 \le m < 1$%.
	182	*/
	183	m = frexp(x, &e);
	184
	185	/* If our number is too big, we'll round it to infinity. This will
	186	* happen if %$x \ge 2^{1024}$%, i.e., if %$e > 1024$%.
	187	*/
	188	if (e > 1024)
	189	{ hi \|= 0x7ff00000; lo = 0; }
	190	else {
	191	/* Our number is sufficiently small that we can represent it at least
	192	* approximately (though maybe we'll have to flush it to zero). The
	193	* next step, then, is to pull the significand bits out.
	194	*/
	195
	196	/* Determine the correct exponent to store. We're not going to bias it
	197	* yet, but this is where we deal with subnormal numbers. Our number
	198	* is normal if %$x \ge 2^{-1022}$%, i.e., %$e > -1022$%. In this
	199	* case, there's an implicit bit which we'll clear. Otherwise, if it's
	200	* subnormal, we'll scale our floating-point number so that the
	201	* significand will look right when we extract it, and adjust the
	202	* exponent so that, when we're finally done, it will have the correct
	203	* sentinel value.
	204	*/
	205	if (e > -1022) m -= 0.5;
	206	else { m = ldexp(m, 1021 + e); e = -1022; }
	207
	208	/* Now we pull out the 53 bits of the significand. This will, in
	209	* general, leave a tail which we address through rounding. Scale it
	210	* up so that we end up with %$0 \le m' < 2$%; then we round up if
	211	* %$m > 1$%, or if %$m = 1$% and the low bit of the significand is
	212	* set.
	213	*/
	214	t = ldexp(m, 21); m -= ldexp(t, -21);
	215	lo = ldexp(m, 53); m -= ldexp(lo, -53);
	216	m = ldexp(m, 54);
	217
	218	/* Round the number if necessary. */
	219	if (lo&1 ? m >= 1.0 : m > 1)
	220	{ lo = U32(lo + 1); if (!lo) t++; }
	221
	222	/* Now we just put the pieces together. Note that our %$e$% is one
	223	* greater than it should be, because our implicit bit should have
	224	* been the unit bit not the 1/2 bit.
	225	*/
226	hi \|= ((uint32)(e + 1022) << 20) \| t;
227	}
228	}
229
230	/* Convert to external format and go home. */
231	SET64(k, hi, lo); return (k);
e63124bc MW	232	}
e63124bc MW	233
e63124bc MW	234	/----- External functions ------------------------------------------------/
e63124bc MW	235
31d0247c	236	/* --- @buf_getf64{,l,b} --- *
e63124bc MW	237	*
	238	* Arguments: @buf *b@ = a buffer to read from
	239	* @double *x_out@ = where to put the result
	240	*
	241	* Returns: Zero on success, @-1@ on failure (and the buffer is broken).
	242	*
	243	* If the system supports NaNs, then any encoded NaN is returned
	244	* as the value of @NAN@ in @<math.h>@; otherwise, this function
	245	* reports failure.
	246	*
	247	* In general, values are rounded to the nearest available
	248	* value, in the way that the system usually rounds. If the
	249	* system doesn't support infinities, then any encoded infinity
	250	* is reported as the largest-possible-magnitude finite value
	251	* instead.
	252	*/
	253
	254	int buf_getf64(buf b, double x_out)
	255	{
	256	kludge64 k;
	257
	258	if (buf_getk64(b, &k)) return (-1);
	259	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
	260	return (0);
	261	}
31d0247c MW	262
31d0247c MW	263	int buf_getf64l(buf b, double x_out)
e63124bc MW	264	{
	265	kludge64 k;
	266
31d0247c	267	if (buf_getk64l(b, &k)) return (-1);
e63124bc MW	268	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
	269	return (0);
	270	}
31d0247c MW	271
31d0247c MW	272	int buf_getf64b(buf b, double x_out)
e63124bc MW	273	{
	274	kludge64 k;
	275
31d0247c	276	if (buf_getk64b(b, &k)) return (-1);
e63124bc MW	277	if (k64_to_f64(x_out, k)) { b->f \|= BF_BROKEN; return (-1); }
	278	return (0);
	279	}
	280
31d0247c MW	281	int (dbuf_getf64)(dbuf db, double x_out)
	282	{ return (dbuf_getf64(db, x_out)); }
	283	int (dbuf_getf64l)(dbuf db, double x_out)
	284	{ return (dbuf_getf64l(db, x_out)); }
	285	int (dbuf_getf64b)(dbuf db, double x_out)
	286	{ return (dbuf_getf64b(db, x_out)); }
	287
	288	/* --- @buf_putf64{,l,b} --- *
	289	*
	290	* Arguments: @buf *b@ = a buffer to write to
	291	* @double x@ = a number to write
	292	*
	293	* Returns: Zero on success, @-1@ on failure (and the buffer is broken).
	294	*
	295	* On C89, this function can't detect negative zero so these
	296	* will be silently written as positive zero.
	297	*
	298	* This function doesn't distinguish NaNs. Any NaN is written
	299	* as a quiet NaN with all payload bits zero.
	300	*
	301	* A finite value with too large a magnitude to be represented
	302	* is rounded to the appropriate infinity. Other finite values
	303	* are rounded as necessary, in the usual IEEE 754 round-to-
	304	* nearest-or-even way.
	305	*/
	306
	307	int buf_putf64(buf *b, double x)
	308	{ return (buf_putk64(b, f64_to_k64(x))); }
	309	int buf_putf64l(buf *b, double x)
	310	{ return (buf_putk64l(b, f64_to_k64(x))); }
	311	int buf_putf64b(buf *b, double x)
	312	{ return (buf_putk64b(b, f64_to_k64(x))); }
	313
	314	int (dbuf_putf64)(dbuf *db, double x)
	315	{ return (dbuf_putf64(db, x)); }
	316	int (dbuf_putf64l)(dbuf *db, double x)
	317	{ return (dbuf_putf64l(db, x)); }
	318	int (dbuf_putf64b)(dbuf *db, double x)
	319	{ return (dbuf_putf64b(db, x)); }
	320
e63124bc	321	/----- That's all, folks -------------------------------------------------/