@@@ doc wip

[mLib] / utils / fltfmt.3.in
diff --git a/utils/fltfmt.3.in b/utils/fltfmt.3.in

new file mode 100644 (file)

index 0000000..e5ae258
--- /dev/null
+++ b/utils/fltfmt.3.in
@@ -0,0 +1,988 @@
+.\" -*-nroff-*-
+.\"
+.\" Manual for floating-point format conversions
+.\"
+.\" (c) 2024 Straylight/Edgeware
+.\"
+.
+.\"----- Licensing notice ---------------------------------------------------
+.\"
+.\" This file is part of the mLib utilities library.
+.\"
+.\" mLib is free software: you can redistribute it and/or modify it under
+.\" the terms of the GNU Library General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or (at
+.\" your option) any later version.
+.\"
+.\" mLib is distributed in the hope that it will be useful, but WITHOUT
+.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+.\" FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Library General Public
+.\" License for more details.
+.\"
+.\" You should have received a copy of the GNU Library General Public
+.\" License along with mLib.  If not, write to the Free Software
+.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+.\" USA.
+.
+.\"--------------------------------------------------------------------------
+.so ../defs.man \" @@@PRE@@@
+.
+.\"--------------------------------------------------------------------------
+.TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library"
+.\" @FLTERR_OK
+.\" @FLTERR_INVAL
+.\" @FLTERR_INEXACT
+.\" @FLTERR_UFLOW
+.\" @FLTERR_OFLOW
+.\" @FLTERR_REPR
+.\" @FLTERR_ALLERRS
+.
+.\" @FRPF_LOW
+.\" @FRPF_HALF
+.\" @FRPF_ODD
+.\" @FRPF_NEG
+.\" @FRPMASK_LOW
+.\" @FRPMASK_HALF
+.\" @FRPMASK_ODD
+.\" @FRPMASK_NEG
+.\" @FRPMASK_INEXACT
+.\" @FRPMASK_NEAR
+.\" @FLTRND_ZERO
+.\" @FLTRND_PROJINF
+.\" @FLTRND_NEGINF
+.\" @FLTRND_POSINF
+.\" @FLTRND_EVEN
+.\" @FLTRND_ODD
+.\" @FLTRND_NEAREVEN
+.\" @FLTRND_NEARODD
+.\" @FLTRND_NEARZERO
+.\" @FLTRND_NEARINF
+.\" @FLTRND_NEARNEG
+.\" @FLTRND_NEARPOS
+.
+.\" @FLTF_NEG
+.\" @FLTF_INF
+.\" @FLTF_QNAN
+.\" @FLTF_SNAN
+.\" @FLTF_ZERO
+.\" @FLTF_NANMASK
+.\" @FLOATBITS_INIT
+.\" @fltfmt_initbits
+.\" @fltfmt_freebits
+.\" @fltfmt_allocfrac
+.\" @fltfmt_copybits
+.\" @fltfmt_round
+.
+.\" @FLTIF_HIDDEN
+.\" @fltfmt_f16
+.\" @fltfmt_f32
+.\" @fltfmt_f64
+.\" @fltfmt_f128
+.\" @fltfmt_mini
+.\" @fltfmt_bf16
+.\" @fltfmt_idblext80
+.
+.\" @fltfmt_encieee
+.\" @fltfmt_encf16
+.\" @fltfmt_encf32
+.\" @fltfmt_encf64
+.\" @fltfmt_encf128
+.\" @fltfmt_encmini
+.\" @fltfmt_encbf16
+.\" @fltfmt_encidblext80
+.\" @fltfmt_decieee
+.\" @fltfmt_decf16
+.\" @fltfmt_decf32
+.\" @fltfmt_decf64
+.\" @fltfmt_decf128
+.\" @fltfmt_decmini
+.\" @fltfmt_decbf16
+.\" @fltfmt_decidblext80
+.
+.\" @fltfmt_encflt
+.\" @fltfmt_encdbl
+.\" @fltfmt_encldbl
+.\" @fltfmt_decflt
+.\" @fltfmt_decdbl
+.\" @fltfmt_decldbl
+.
+.\" @fltfmt_flttof32l
+.\" @fltfmt_flttof32b
+.\" @fltfmt_dbltof64l
+.\" @fltfmt_dbltof64b
+.\" @fltfmt_f32ltoflt
+.\" @fltfmt_f32btoflt
+.\" @fltfmt_f64ltodbl
+.\" @fltfmt_f64btodbl
+.
+.\"--------------------------------------------------------------------------
+.SH NAME
+fltfmt \- floating-point format conversions
+.
+.\"--------------------------------------------------------------------------
+.SH SYNOPSIS
+.
+.nf
+.B "#define FLTERR_OK 0"
+.B "#define FLTERR_INVAL ..."
+.B "#define FLTERR_INEXACT ..."
+.B "#define FLTERR_UFLOW ..."
+.B "#define FLTERR_OFLOW ..."
+.B "#define FLTERR_REPR ..."
+.B "#define FLTERR_ALLERRS ..."
+.PP
+.ta 40n
+.B "#define FRPF_LOW 1u"
+.B "#define FRPF_HALF 2u"
+.B "#define FRPF_ODD 4u"
+.B "#define FRPF_NEG 8u"
+.B "#define FRPMASK_LOW 0xaaaau"
+.B "#define FRPMASK_HALF 0xccccu"
+.B "#define FRPMASK_ODD 0xf0f0u"
+.B "#define FRPMASK_NEG 0xff00u"
+.B "#define FRPMASK_INEXACT ...        /* LOW | HALF */"
+.BI "unsigned FRPMASK_NEAR(unsigned " dir ");  /* HALF&(LOW | " dir ") */"
+.B "#define FLTRND_ZERO ...    /* 0 */"
+.B "#define FLTRND_PROJINF ... /* INEXACT */"
+.B "#define FLTRND_NEGINF ...  /* INEXACT&NEG */"
+.B "#define FLTRND_POSINF ...  /* INEXACT&~NEG */"
+.B "#define FLTRND_EVEN ...    /* INEXACT&ODD */"
+.B "#define FLTRND_ODD ...     /* INEXACT&~ODD */"
+.B "#define FLTRND_NEAREVEN ...        /* HALF&(LOW | ODD) */"
+.B "#define FLTRND_NEARODD ... /* HALF&(LOW | ~ODD) */"
+.B "#define FLTRND_NEARZERO ...        /* HALF&LOW */"
+.B "#define FLTRND_NEARINF ... /* HALF */"
+.B "#define FLTRND_NEARNEG ... /* HALF&(LOW | NEG) */"
+.B "#define FLTRND_NEARPOS ... /* HALF&(LOW | ~NEG) */"
+.PP
+.ta 2n
+.B "#define FLTF_NEG ..."
+.B "#define FLTF_INF ..."
+.B "#define FLTF_QNAN ..."
+.B "#define FLTF_SNAN ..."
+.B "#define FLTF_ZERO ..."
+.B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)"
+.B "struct floatbits {"
+.B "   unsigned f;"
+.B "   int exp;"
+.B "   arena *a;"
+.B "   uint32 *frac;"
+.B "   unsigned n, fracsz;"
+.B "};"
+.B "#define FLOATBITS_INIT { ...\& }"
+.PP
+.BI "void fltfmt_initbits(struct floatbits *" x );
+.BI "void fltfmt_freebits(struct floatbits *" x );
+.BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n );
+.ta \w'\fBvoid fltfmt_copybits('u
+.BI "void fltfmt_copybits(struct floatbits *" z_out ,
+.BI "  const struct floatbits *" x );
+.ta \w'\fBunsigned fltfmt_round('u
+.BI "unsigned fltfmt_round(struct floatbits *" z_out ,
+.BI "  const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " n );
+.PP
+.
+.ta 2n
+.B "#define FLTIF_HIDDEN ..."
+.B "struct fltfmt_ieeefmt {"
+.B "   unsigned f;"
+.B "   unsigned expwd;"
+.B "   unsigned prec;"
+.B "};"
+.B "const struct fltfmt_ieeefmt fltfmt_f16;"
+.B "const struct fltfmt_ieeefmt fltfmt_f32;"
+.B "const struct fltfmt_ieeefmt fltfmt_f64;"
+.B "const struct fltfmt_ieeefmt fltfmt_f128;"
+.B "const struct fltfmt_ieeefmt fltfmt_mini;"
+.B "const struct fltfmt_ieeefmt fltfmt_bf16;"
+.B "const struct fltfmt_ieeefmt fltfmt_idblext80;"
+.PP
+.ta \w'\fBunsigned fltfmt_encieee('u
+.BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt ,
+.BI "  uint32 *" z ", const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf16('u
+.BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf32('u
+.BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf64('u
+.BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encf128('u
+.BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encmini('u
+.BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encbf16('u
+.BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.ta \w'\fBunsigned fltfmt_encidblext80('u
+.BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out ,
+.BI "  const struct floatbits *" x ,
+.BI "  unsigned " r ", unsigned " errmask );
+.PP
+.ta \w'\fBunsigned fltfmt_decieee('u
+.BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt ,
+.BI "  struct floatbits *" z_out ", const uint32 *" x );
+.BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x );
+.BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x );
+.BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x );
+.BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x );
+.BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x );
+.BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x );
+.ta \w'\fBunsigned fltfmt_decidblext80('u
+.BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out ,
+.BI "  uint16 " se ", kludge64 " m );
+.PP
+.ta \w'\fBunsigned fltfmt_encflt('u
+.BI "unsigned fltfmt_encflt(float *" z_out ,
+.BI "  const struct floatbits *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_encdbl('u
+.BI "unsigned fltfmt_encdbl(double *" z_out ,
+.BI "  const struct floatbits *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_encldbl('u
+.BI "unsigned fltfmt_encldbl(long double *" z_out ,
+.BI "  const struct floatbits *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_decflt('u
+.BI "unsigned fltfmt_decflt(struct floatbits *" z_out ,
+.BI "  float *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_decdbl('u
+.BI "unsigned fltfmt_decdbl(struct floatbits *" z_out ,
+.BI "  double *" x ", unsigned " r );
+.ta \w'\fBunsigned fltfmt_decldbl('u
+.BI "unsigned fltfmt_decldbl(struct floatbits *" z_out ,
+.BI "  long double *" x ", unsigned " r );
+.PP
+.BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r );
+.BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r );
+.BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r );
+.BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r );
+.BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r );
+.BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r );
+.BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r );
+.BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r );
+.
+.\"--------------------------------------------------------------------------
+.SH DESCRIPTION
+.
+The
+.B "<mLib/fltfmt.h>"
+header file defines structures, macros, and functions
+for converting floating-point values between various formats,
+including the native floating-point formats
+and IEEE\ 754 and related formats.
+.
+.SS Error conditions
+Most of the functions in this module return an unsigned integer.
+A return value of zero means that no error occurred;
+set bits indicate various error conditions.
+.TP
+.B FLTERR_INVAL
+A binary input to be decoded contained an invalid bit pattern,
+e.g., an unnormalized input value with a nonminimal exponent.
+The function will have produced a reasonable output anyway,
+but the original value will not be recoverable from the result.
+.TP
+.B FLTERR_INEXACT
+The conversion was inexact.
+Converting the output back into the format of the input
+may not reproduce the original input value.
+This error flag is sometimes set conservatively.
+.TP
+.B FLTERR_UFLOW
+The conversion underflowed:
+a nonzero input was too tiny (in absolute value) to represent,
+and a zero result was returned.
+.TP
+.B FLTERR_OFLOW
+The conversion overflowed:
+a finite input was too huge (in absolute value) to represent,
+and either the appropriately signed infinity
+or largest-magnitude finite value
+was returned, determined by the requested rounding mode.
+.TP
+.B FLTERR_REPR
+The output format failed entirely to represent the input value.
+The result is zero if the input was a NaN,
+or the appropriately signed largest-magnitude finite value
+if the input was an infinity.
+.
+.SS Rounding modes
+The rounding system works as follows.
+There are four
+.I rounding predicates
+considered when a rounding decision is taken.
+These are determined from the unrounded input value
+.IR x ,
+and the two nearest rounded values
+.RI | u "|\ \*(<=\ |" x |
+and
+.RI | v "|\ >\ |" x |.
+The predicates are as follows.
+.TP
+.B FRPF_LOW
+If
+.IR x "\ \*(/=\ " u
+and
+.IR x "\ \*(/=\ (" u "\ +\ " v )/2,
+i.e.,
+.I x
+is neither equal to a rounded value,
+nor exactly halfway between two rounded values.
+This predicate is sometimes referred to as a `sticky bit'.
+.TP
+.B FRPF_HALF
+If
+.RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|,
+i.e.,
+.I x
+is halfway or more towards its larger rounded neighbour.
+.TP
+.B FRPF_ODD
+If the least significant digit of
+.I u
+is odd.
+In binary floating-point formats,
+this is just the least significant bit of
+.IR u .
+.TP
+.B FRPF_NEG
+If
+.I x
+is negative.
+.PP
+These four predicates are packed into a four-bit mask value
+.I rf
+between 0 and 15.
+A
+.I rounding mode
+is simply a 16-bit mask:
+if bit
+.I rf
+of the rounding-mode mask is set,
+then
+.I x
+is rounded to
+.IR v ;
+otherwise it is rounded to
+.IR u .
+That is, the rounding-mode mask is essentially a truth table.
+Rounding modes with
+.I set
+bits corresponding to situations where both
+.B FRPF_LOW
+and
+.B FRPF_HALF
+are false,
+i.e., where
+.I x
+is already a rounded value,
+are forbidden.
+.PP
+Some useful machinery is provided
+for constructing rounding-mode masks.
+.BR FRPMASK_LOW ,
+.BR FRPMASK_HALF ,
+.BR FRPMASK_ODD ,
+and
+.B FRPMASK_NEG
+are masks with set bits corresponding to their respective predicates.
+Bitwise boolean logic can be applied to these masks
+in order to calculate the masks corresponding to
+the same logical expression applied to the individual predicates.
+.B FRPMASK_INEXACT
+holds if
+.B LOW
+or
+.B HALF
+holds;
+i.e., if
+.IR x "\ \*(/=\ " u ;
+as mentioned above, only these bits may be set
+in a valid rounding-mode mask.
+.BI FRPMASK_NEAR( dir )
+is the mask for rounding to nearest with ties broken according to
+.IR dir ,
+which is another rounding-mode mask.
+The complete set of predefined masks is listed above in the synopsis,
+together with their description in terms of the basic predicates.
+The usual IEEE rounding mode is
+round-to-nearest/ties-to-even,
+denoted
+.BR FLTRND_NEAREVEN .
+This is likely a good option
+if there is no compelling argument for a different specific choice.
+.
+.SS The floatbits structure
+In order to avoid a combinatorial explosion in conversion operations,
+all the basic conversions involve,
+as source or target,
+a `common currency' format represented by the type
+.BR "struct floatbits" .
+.PP
+This structure consists of
+a set of flags
+.BR f ;
+a signed exponent
+.BR exp ;
+an
+.B arena
+pointer
+.BR a ;
+a pointer
+.B frac
+to a vector of
+.B uint32
+values;
+the length
+.B n
+of the
+.B frac
+vector; and
+the currently allocated size
+.B fracsz
+of the vector.
+Both
+.B n
+and
+.B fracsz
+count elements, not bytes.
+.PP
+Storage for
+.B frac
+comes from the arena
+.BR a .
+Only the first
+.B n
+words of
+.B frac
+are significant;
+.B frac[0]
+is the most significant word.
+The value represented by a
+.B struct floatbits
+is never changed by adding or removing zero-valued words
+at the end of the
+.B frac
+vector.
+It is always the case that
+.BR n "\ \*(<=\ " fracsz ;
+if
+.B fracsz
+is zero then
+.B frac
+may be a null pointer.
+.PP
+The interpretation of the
+.B exp
+and
+.B frac
+members depends on the flags set in
+.BR f .
+Apart from
+.BR FLTF_NEG ,
+the flags are
+.IR "mutually exclusive" :
+at most one flag may be set.
+.TP
+.B FLTF_NEG
+The value is negative.
+.TP
+.B FLTF_INF
+The value is positive or negative infinity.
+The
+.B exp
+and
+.B frac
+are ignored.
+.TP
+.BR FLTF_QNAN " and " FLTF_SNAN
+The value is a quiet or signalling not-a-number, respectively.
+The
+.B exp
+is ignored.
+The payload is stored in
+.BR frac ;
+the payload does not include the `quiet' bit.
+.TP
+.B FLTF_ZERO
+The number is zero.
+Negative zero is distinct from positive zero.
+The
+.B exp
+and
+.B frac
+are ignored.
+.IP "All non-sign bits clear"
+The value is a finite nonzero number.
+The
+.B frac
+holds the significand.
+The most significant significand bit must be set, so
+(a)\ the number must be nonzero, and
+(b)\ the significand is normalized.
+The significand is interpreted as a fraction
+.RI "1/2\ \*(<=\ " m "\ <\ 1."
+If
+.I e
+is the value of the
+.B exp
+member,
+and
+.I s
+is \-1 if
+.B FLTF_NEG
+is set
+or +1 if
+.B FLTF_NEG
+is clear,
+then the number represented is
+.IR s "\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
+.PP
+A
+.B struct floatbits
+can be initialized statically by
+.BR FLOATBITS_INIT ,
+or dynamically using the function
+.BR fltfmt_initbits .
+These are not quite the same:
+.B FLOATBITS_INIT
+initializes
+.B a
+to
+.BR &arena_stdlib ,
+while
+.B fltfmt_initbits
+sets it to the runtime value of
+.BR arena_global .
+With this exception,
+both forms of initialization set the value to (positive) zero;
+neither allocates any storage or other resources,
+leaving
+.B frac
+null.
+.PP
+The
+.B fltfmt_allocfrac
+function is given a pointer
+.I x
+to a
+.B struct floatbits
+and a length
+.IR n :
+it ensures that there is enough storage at
+.IB x ->frac
+for at least
+.I n
+words:
+if the current size is too small,
+then any existing buffer is discarded and a new one allocated
+from the arena
+.IB x ->a \fR;
+any existing contents of the buffer are lost.
+On exit,
+.IB x ->n
+is set to
+.IR n .
+.PP
+The
+.B fltfmt_freebits
+function
+frees a
+.B struct floatbits
+structure, releasing the storage held by
+.BR frac .
+.PP
+The
+.B fltfmt_copybits
+function simply copies its input
+.I x
+to its output
+.IR z_out ;
+both must refer to initialized
+.B struct floatbits
+structures.
+If
+.I z_out
+and
+.I x
+are equal, then nothing happens.
+.PP
+Finally, the
+.B fltfmt_round
+function rounds the value in the
+.B struct floatbits
+structure
+.I x
+to
+.I n
+bits using the rounding mode
+.IR r ;
+the result is written to
+.IR z_out ;
+it is permitted for
+.I z_out
+to be equal to
+.IR x .
+If
+.I x
+is a zero or infinity,
+then the output is equal to the input,
+as if
+.B fltfmt_copybits
+had been called instead.
+If
+.I x
+is a NaN,
+then the payload is simply truncated to
+.I n
+bits, without regard to the rounding mode.
+Otherwise, the input is nonzero and finite;
+the significand is rounded to
+.I n
+bits according to the rounding mode.
+In all cases, the return value is
+zero if the output is equal to the input,
+or
+.B FLTERR_INEXACT
+if the rounded result is not equal to the input.
+.
+.SS IEEE and related formats
+An IEEE floating-point format is characterized by three parameters:
+the
+.I "exponent width"
+.IR w ,
+the
+.I "precision"
+.IR p ,
+and
+the
+.I "unit width"
+.IR h .
+.PP
+The encoded value consists of
+.IR p "\ +\ " w "\ +\ " h "\ \-\ 1"
+bits.
+This is divided, from the most significant bit downwards,
+into a
+.I "sign bit"
+.IR s ,
+a
+.IR w -bit
+.I "biased exponent"
+.IR e \*',
+a
+.IR h -bit
+.I "unit bit"
+.IR u ,
+and a
+.RI ( p "\ \-\ " h )-bit
+.I fraction
+.IR f .
+The
+.I "exponent bias"
+is
+.IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;"
+the true exponent
+.I e
+is calculated from the biased exponent by
+.IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue.
+The unit and fraction fields are usually interpreted as denoting
+a significand
+.IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se
+with
+.RI "0\ \*(<=\ " m "\ <\ 2."
+If
+.I h
+is zero,
+the value of the unit bit
+.I u
+is implied by the exponent as described below.
+The encoded value is interpreted as follows.
+.hP \*o
+If
+.IR e "\ =\ \-" e \*(us0\*(ue
+then the value is zero or a subnormal,
+with the value
+.RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e +1\*(se.
+In particular,
+if
+.IR m "\ =\ 0"
+then the value is positive or negative zero,
+according to the sign bit
+.IR s .
+If
+.I h
+is zero then
+.IR u "\ =\ 0;"
+if
+.I h
+is nonzero
+but
+.IR u "\ \*(/=\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will be as described.
+.hP \*o
+If
+.RI "1\ \-\ " e "\*(us0\*(ue\ \*(<=\ " e "\ <\ " e "\*(us0\*(ue\ +\ 1"
+then the value is a (supposedly) normal number
+.RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
+If
+.I h
+is zero then
+.IR u "\ =\ 1;"
+if
+.I h
+is nonzero
+but
+.IR u "\ =\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will be as described.
+.hP \*o
+If
+.IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
+and
+.IR f "\ =\ 0"
+then the value is positive or negative infinity,
+according to the sign bit
+.IR s .
+If
+.I h
+is nonzero and
+.IR u "\ =\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will still be infinity.
+.hP \*o
+If
+.IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
+and
+.IR f "\ \*(/=\ 0"
+then the value is not-a-number (NaN).
+The most significant bit of
+.I f
+is the `quiet bit':
+if the bit is set, the value is a `quiet NaN';
+if the bit is clear, the value is a `signalling NaN'.
+(This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1,
+it has the advantage that a signalling NaN can be `quieted'
+by setting the most significant fraction bit;
+HP-PA and older MIPS processors use the opposite convention
+for distinguishing quiet and signalling NaNs,
+but a signalling NaN with all but the most significant
+fraction bit zero cannot be `quieted' by clearing the
+most significant bit, since the resulting encoding denotes
+an infinity, not a QNaN.)
+The remaining bits of
+.I f
+form the
+.IR payload .
+Positive and negative NaN values are distinguished,
+with sign determined by the sign bit.
+If
+.I h
+is nonzero and
+.IR u "\ =\ 0"
+then the encoding is invalid:
+decoding returns
+.BR FLTERR_INVAL ,
+but the result will still be a NaN;
+the unit bit does not affect the NaN payload.
+.PP
+An IEEE format is described by the type
+.BR "struct fltfmt_ieeefmt" .
+This has three members:
+.TP
+.B f
+A flags word.
+If
+.B FLTIF_HIDDEN
+is set, then the format uses a `hidden bit' convention:
+in the notation above
+.IR h "\ =\ 0;"
+if the flag is clear,
+the format has an explicit unit bit, and
+.IR h "\ =\ 1."
+.TP
+.B expwd
+The exponent width;
+in the notation above this is
+.IR w .
+.TP
+.B prec
+The precision;
+in the notation above this is
+.IR p .
+.PP
+The following IEEE format descriptions are predefined.
+.TP
+.B "fltfmt_f16"
+The IEEE\ 754 Binary16 format, with
+.IR w "\ =\ 5,"
+.IR p "\ =\ 11,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_f32"
+The IEEE\ 754 Binary32 (`single precision') format, with
+.IR w "\ =\ 8,"
+.IR p "\ =\ 24,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_f64"
+The IEEE\ 754 Binary64 (`double precision') format, with
+.IR w "\ =\ 11,"
+.IR p "\ =\ 53,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_f128"
+The IEEE\ 754 Binary128 (`quad precision') format, with
+.IR w "\ =\ 15,"
+.IR p "\ =\ 113,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_mini"
+An eight-bit `minifloat' format, with
+.IR w "\ =\ 4,"
+.IR p "\ =\ 4,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_bf16"
+The Google `BFloat16' format, with
+.IR w "\ =\ 8,"
+.IR p "\ =\ 8,"
+and
+.IR h "\ =\ 0."
+.TP
+.B "fltfmt_idblext80"
+The Intel 8087 80-bit `double extended' format, with
+.IR w "\ =\ 15,"
+.IR p "\ =\ 64,"
+and
+.IR h "\ =\ 1."
+.PP
+The
+.B fltfmt_encieee
+and
+.B fltfmt_decieee
+functions convert between IEEE and related formats
+and the internal
+.B struct floatbits
+representation.
+They respectively encode or decode an IEEE-format value,
+as described above,
+from a vector of
+.B uint32
+words,
+most-significant word first
+\(en so the sign bit is in the first word.
+For formats whose size is not a multiple of 32,
+the encoding is
+.IR right-aligned :
+the least significant bit of the fraction
+is in the least significant bit of the last word in the vector.
+.PP
+The
+.B fltfmt_encieee
+function encodes an IEEE-format value.
+The function is given five arguments:
+a pointer
+.I fmt
+to the IEEE format description,
+a pointer
+.I z
+to a sufficiently long vector of 32-bit words
+in which to store the encoded value,
+a pointer
+.I x
+to the
+.B struct floatbits
+holding the value to encode,
+a rounding mode
+.IR r ,
+and an error mask
+.IR errmask .
+If the input is a NaN,
+then the payload is truncated to fit
+regardless of the rounding mode,
+discarding the least significant bits;
+if the input is a finite value,
+then the significand is rounded to fit
+according to the requested rounding mode.
+The possible errors are
+.B FLTERR_UFLOW
+if the value is unrepresentably tiny,
+.B FLTERR_OFLOW
+if the value is unrepresentably huge,
+and
+.B FLTERR_INEXACT
+if the encoding fails to preserve the input value exactly;
+hence
+.B FLTERR_INEXACT
+is set whenever
+.B FLTERR_OFLOW
+or
+.B FLTERR_UFLOW
+is set,
+or if bits are lost due to NaN-payload truncation or rounding.
+If, during encoding,
+an error is encountered,
+processing stops immediately
+unless the corresponding bit of
+.I errmask
+is set.
+.PP
+The
+.B fltfmt_decieee
+function decodes an IEEE-format value.
+The function is given three arguments:
+a pointer
+.I fmt
+to the IEEE format description,
+a pointer
+.I z_out
+to the initialized
+.B struct floatbits
+to fill in, and
+a pointer
+.I x
+to the IEEE-encoded value to decode,
+in a vector of 32-bit words as described above.
+The only error that can occur during decoding is
+.BR FLTERR_INVAL :
+as described above,
+this occurs in non-hidden-bit formats
+when the unit bit does not match that implied by the exponent;
+the result is returned anyway,
+with the unit bit interpreted as encoded in finite numbers,
+and discarded in infinities and NaNs.
+.PP
+For each of the format
+
+
+
+.
+.\"--------------------------------------------------------------------------
+.SH "SEE ALSO"
+.
+.BR bits (3),
+.BR mLib (3).
+.
+.\"--------------------------------------------------------------------------
+.SH AUTHOR
+.
+Mark Wooding, <mdw@distorted.org.uk>
+.
+.\"----- That's all, folks --------------------------------------------------