@@@ doc wip
[mLib] / utils / fltfmt.3.in
CommitLineData
c752173d
MW
1.\" -*-nroff-*-
2.\"
3.\" Manual for floating-point format conversions
4.\"
5.\" (c) 2024 Straylight/Edgeware
6.\"
7.
8.\"----- Licensing notice ---------------------------------------------------
9.\"
10.\" This file is part of the mLib utilities library.
11.\"
12.\" mLib is free software: you can redistribute it and/or modify it under
13.\" the terms of the GNU Library General Public License as published by
14.\" the Free Software Foundation; either version 2 of the License, or (at
15.\" your option) any later version.
16.\"
17.\" mLib is distributed in the hope that it will be useful, but WITHOUT
18.\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19.\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public
20.\" License for more details.
21.\"
22.\" You should have received a copy of the GNU Library General Public
23.\" License along with mLib. If not, write to the Free Software
24.\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
25.\" USA.
26.
27.\"--------------------------------------------------------------------------
28.so ../defs.man \" @@@PRE@@@
29.
30.\"--------------------------------------------------------------------------
31.TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library"
32.\" @FLTERR_OK
33.\" @FLTERR_INVAL
34.\" @FLTERR_INEXACT
35.\" @FLTERR_UFLOW
36.\" @FLTERR_OFLOW
37.\" @FLTERR_REPR
38.\" @FLTERR_ALLERRS
39.
40.\" @FRPF_LOW
41.\" @FRPF_HALF
42.\" @FRPF_ODD
43.\" @FRPF_NEG
44.\" @FRPMASK_LOW
45.\" @FRPMASK_HALF
46.\" @FRPMASK_ODD
47.\" @FRPMASK_NEG
48.\" @FRPMASK_INEXACT
49.\" @FRPMASK_NEAR
50.\" @FLTRND_ZERO
51.\" @FLTRND_PROJINF
52.\" @FLTRND_NEGINF
53.\" @FLTRND_POSINF
54.\" @FLTRND_EVEN
55.\" @FLTRND_ODD
56.\" @FLTRND_NEAREVEN
57.\" @FLTRND_NEARODD
58.\" @FLTRND_NEARZERO
59.\" @FLTRND_NEARINF
60.\" @FLTRND_NEARNEG
61.\" @FLTRND_NEARPOS
62.
63.\" @FLTF_NEG
64.\" @FLTF_INF
65.\" @FLTF_QNAN
66.\" @FLTF_SNAN
67.\" @FLTF_ZERO
68.\" @FLTF_NANMASK
69.\" @FLOATBITS_INIT
70.\" @fltfmt_initbits
71.\" @fltfmt_freebits
72.\" @fltfmt_allocfrac
73.\" @fltfmt_copybits
74.\" @fltfmt_round
75.
76.\" @FLTIF_HIDDEN
77.\" @fltfmt_f16
78.\" @fltfmt_f32
79.\" @fltfmt_f64
80.\" @fltfmt_f128
81.\" @fltfmt_mini
82.\" @fltfmt_bf16
83.\" @fltfmt_idblext80
84.
85.\" @fltfmt_encieee
86.\" @fltfmt_encf16
87.\" @fltfmt_encf32
88.\" @fltfmt_encf64
89.\" @fltfmt_encf128
90.\" @fltfmt_encmini
91.\" @fltfmt_encbf16
92.\" @fltfmt_encidblext80
93.\" @fltfmt_decieee
94.\" @fltfmt_decf16
95.\" @fltfmt_decf32
96.\" @fltfmt_decf64
97.\" @fltfmt_decf128
98.\" @fltfmt_decmini
99.\" @fltfmt_decbf16
100.\" @fltfmt_decidblext80
101.
102.\" @fltfmt_encflt
103.\" @fltfmt_encdbl
104.\" @fltfmt_encldbl
105.\" @fltfmt_decflt
106.\" @fltfmt_decdbl
107.\" @fltfmt_decldbl
108.
109.\" @fltfmt_flttof32l
110.\" @fltfmt_flttof32b
111.\" @fltfmt_dbltof64l
112.\" @fltfmt_dbltof64b
113.\" @fltfmt_f32ltoflt
114.\" @fltfmt_f32btoflt
115.\" @fltfmt_f64ltodbl
116.\" @fltfmt_f64btodbl
117.
118.\"--------------------------------------------------------------------------
119.SH NAME
120fltfmt \- floating-point format conversions
121.
122.\"--------------------------------------------------------------------------
123.SH SYNOPSIS
124.
125.nf
126.B "#define FLTERR_OK 0"
127.B "#define FLTERR_INVAL ..."
128.B "#define FLTERR_INEXACT ..."
129.B "#define FLTERR_UFLOW ..."
130.B "#define FLTERR_OFLOW ..."
131.B "#define FLTERR_REPR ..."
132.B "#define FLTERR_ALLERRS ..."
133.PP
134.ta 40n
135.B "#define FRPF_LOW 1u"
136.B "#define FRPF_HALF 2u"
137.B "#define FRPF_ODD 4u"
138.B "#define FRPF_NEG 8u"
139.B "#define FRPMASK_LOW 0xaaaau"
140.B "#define FRPMASK_HALF 0xccccu"
141.B "#define FRPMASK_ODD 0xf0f0u"
142.B "#define FRPMASK_NEG 0xff00u"
143.B "#define FRPMASK_INEXACT ... /* LOW | HALF */"
144.BI "unsigned FRPMASK_NEAR(unsigned " dir "); /* HALF&(LOW | " dir ") */"
145.B "#define FLTRND_ZERO ... /* 0 */"
146.B "#define FLTRND_PROJINF ... /* INEXACT */"
147.B "#define FLTRND_NEGINF ... /* INEXACT&NEG */"
148.B "#define FLTRND_POSINF ... /* INEXACT&~NEG */"
149.B "#define FLTRND_EVEN ... /* INEXACT&ODD */"
150.B "#define FLTRND_ODD ... /* INEXACT&~ODD */"
151.B "#define FLTRND_NEAREVEN ... /* HALF&(LOW | ODD) */"
152.B "#define FLTRND_NEARODD ... /* HALF&(LOW | ~ODD) */"
153.B "#define FLTRND_NEARZERO ... /* HALF&LOW */"
154.B "#define FLTRND_NEARINF ... /* HALF */"
155.B "#define FLTRND_NEARNEG ... /* HALF&(LOW | NEG) */"
156.B "#define FLTRND_NEARPOS ... /* HALF&(LOW | ~NEG) */"
157.PP
158.ta 2n
159.B "#define FLTF_NEG ..."
160.B "#define FLTF_INF ..."
161.B "#define FLTF_QNAN ..."
162.B "#define FLTF_SNAN ..."
163.B "#define FLTF_ZERO ..."
164.B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)"
165.B "struct floatbits {"
166.B " unsigned f;"
167.B " int exp;"
168.B " arena *a;"
169.B " uint32 *frac;"
170.B " unsigned n, fracsz;"
171.B "};"
172.B "#define FLOATBITS_INIT { ...\& };"
173.PP
174.BI "void fltfmt_initbits(struct floatbits *" x );
175.BI "void fltfmt_freebits(struct floatbits *" x );
176.BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n );
177.ta \w'\fBvoid fltfmt_copybits('u
178.BI "void fltfmt_copybits(struct floatbits *" z_out ,
179.BI " const struct floatbits *" x );
180.ta \w'\fBvoid fltfmt_round('u
181.BI "void fltfmt_round(struct floatbits *" z_out ,
182.BI " const struct floatbits *" x ,
183.BI " unsigned " r ", unsigned " n );
184.PP
185.
186.ta 2n
187.B "#define FLTIF_HIDDEN ..."
188.B "struct fltfmt_ieeefmt {"
189.B " unsigned f;"
190.B " unsigned expwd;"
191.B " unsigned prec;"
192.B "};"
193.B "const struct fltfmt_ieeefmt fltfmt_f16;"
194.B "const struct fltfmt_ieeefmt fltfmt_f32;"
195.B "const struct fltfmt_ieeefmt fltfmt_f64;"
196.B "const struct fltfmt_ieeefmt fltfmt_f128;"
197.B "const struct fltfmt_ieeefmt fltfmt_mini;"
198.B "const struct fltfmt_ieeefmt fltfmt_bf16;"
199.B "const struct fltfmt_ieeefmt fltfmt_idblext80;"
200.PP
201.ta \w'\fBunsigned fltfmt_encieee('u
202.BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt ,
203.BI " uint32 *" z ", const struct floatbits *" x ,
204.BI " unsigned " r ", unsigned " errmask );
205.ta \w'\fBunsigned fltfmt_encf16('u
206.BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x ,
207.BI " unsigned " r ", unsigned " errmask );
208.ta \w'\fBunsigned fltfmt_encf32('u
209.BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x ,
210.BI " unsigned " r ", unsigned " errmask );
211.ta \w'\fBunsigned fltfmt_encf64('u
212.BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x ,
213.BI " unsigned " r ", unsigned " errmask );
214.ta \w'\fBunsigned fltfmt_encf128('u
215.BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x ,
216.BI " unsigned " r ", unsigned " errmask );
217.ta \w'\fBunsigned fltfmt_encmini('u
218.BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x ,
219.BI " unsigned " r ", unsigned " errmask );
220.ta \w'\fBunsigned fltfmt_encbf16('u
221.BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x ,
222.BI " unsigned " r ", unsigned " errmask );
223.ta \w'\fBunsigned fltfmt_encidblext80('u
224.BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out ,
225.BI " const struct floatbits *" x ,
226.BI " unsigned " r ", unsigned " errmask );
227.PP
228.ta \w'\fBunsigned fltfmt_decieee('u
229.BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt ,
230.BI " struct floatbits *" z_out ", const uint32 *" x );
231.BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x );
232.BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x );
233.BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x );
234.BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x );
235.BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x );
236.BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x );
237.ta \w'\fBunsigned fltfmt_decidblext80('u
238.BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out ,
239.BI " uint16 " se ", kludge64 " m );
240.PP
241.ta \w'\fBunsigned fltfmt_encflt('u
242.BI "unsigned fltfmt_encflt(float *" z_out ,
243.BI " const struct floatbits *" x ", unsigned " r );
244.ta \w'\fBunsigned fltfmt_encdbl('u
245.BI "unsigned fltfmt_encdbl(double *" z_out ,
246.BI " const struct floatbits *" x ", unsigned " r );
247.ta \w'\fBunsigned fltfmt_encldbl('u
248.BI "unsigned fltfmt_encldbl(long double *" z_out ,
249.BI " const struct floatbits *" x ", unsigned " r );
250.ta \w'\fBunsigned fltfmt_decflt('u
251.BI "unsigned fltfmt_decflt(struct floatbits *" z_out ,
252.BI " float *" x ", unsigned " r );
253.ta \w'\fBunsigned fltfmt_decdbl('u
254.BI "unsigned fltfmt_decdbl(struct floatbits *" z_out ,
255.BI " double *" x ", unsigned " r );
256.ta \w'\fBunsigned fltfmt_decldbl('u
257.BI "unsigned fltfmt_decldbl(struct floatbits *" z_out ,
258.BI " long double *" x ", unsigned " r );
259.PP
260.BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r );
261.BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r );
262.BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r );
263.BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r );
264.BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r );
265.BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r );
266.BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r );
267.BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r );
268.
269.\"--------------------------------------------------------------------------
270.SH DESCRIPTION
271.
272The
273.B "<mLib/fltfmt.h>"
274header file defines structures, macros, and functions
275for converting floating-point values between various formats,
276including the native floating-point formats
277and IEEE\ 754 and related formats.
278.
279.SS Error conditions
280Most of the functions in this module return an unsigned integer.
281A return value of zero means that no error occurred;
282set bits indicate various error conditions.
283.TP
284.B FLTERR_INVAL
285A binary input to be decoded contained an invalid bit pattern,
286e.g., an unnormalized input value with a nonminimal exponent.
287The function will have produced a reasonable output anyway,
288but the original value will not be recoverable from the result.
289.TP
290.B FLTERR_INEXACT
291The conversion was inexact.
292Converting the output back into the format of the input
293may not reproduce the original input value.
294This error flag is sometimes set conservatively.
295.TP
296.B FLTERR_UFLOW
297The conversion underflowed:
298a nonzero input was too tiny (in absolute value) to represent,
299and a zero result was returned.
300.TP
301.B FLTERR_OFLOW
302The conversion overflowed:
303a finite input was too huge (in absolute value) to represent,
304and either the appropriately signed infinity
305or largest-magnitude finite value
306was returned, determined by the requested rounding mode.
307.TP
308.B FLTERR_REPR
309The output format failed entirely to represent the input value.
310The result is zero if the input was a NaN,
311or the appropriately signed largest-magnitude finite value
312if the input was an infinity.
313.
314.SS Rounding modes
315The rounding system works as follows.
316There are four
317.I rounding predicates
318considered when a rounding decision is taken.
319These are determined from the unrounded input value
320.IR x ,
321and the two nearest rounded values
322.RI | u "|\ \*(<=\ |" x |
323and
324.RI | v "|\ >\ |" x |.
325The predicates are as follows.
326.TP
327.B FRPF_LOW
328If
329.IR x "\ \*(/=\ " u
330and
331.IR x "\ \*(/=\ (" u "\ +\ " v )/2,
332i.e.,
333.I x
334is neither equal to a rounded value,
335nor exactly halfway between two rounded values.
336This predicate is sometimes referred to as a `sticky bit'.
337.TP
338.B FRPF_HALF
339If
340.RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|,
341i.e.,
342.I x
343is halfway or more towards its larger rounded neighbour.
344.TP
345.B FRPF_ODD
346If the least significant digit of
347.I u
348is odd.
349In binary floating-point formats,
350this is just the least significant bit of
351.IR u .
352.TP
353.B FRPF_NEG
354If
355.I x
356is negative.
357.PP
358These four predicates are packed into a four-bit mask value
359.I rf
360between 0 and 15.
361A
362.I rounding mode
363is simply a 16-bit mask:
364if bit
365.I rf
366of the rounding-mode mask is set,
367then
368.I x
369is rounded to
370.IR v ;
371otherwise it is rounded to
372.IR u .
373That is, the rounding-mode mask is essentially a truth table.
374Rounding modes with
375.I set
376bits corresponding to situations where both
377.B FRPF_LOW
378and
379.B FRPF_HALF
380are false,
381i.e., where
382.I x
383is already a rounded value,
384are forbidden.
385.PP
386Some useful machinery is provided
387for constructing rounding-mode masks.
388.BR FRPMASK_LOW ,
389.BR FRPMASK_HALF ,
390.BR FRPMASK_ODD ,
391and
392.B FRPMASK_NEG
393are masks with set bits corresponding to their respective predicates.
394Bitwise boolean logic can be applied to these masks
395in order to calculate the masks corresponding to
396the same logical expression applied to the individual predicates.
397.BR FRPMASK_INEXACT " holds if"
398.B LOW
399or
400.B HALF
401holds;
402i.e., if
403.IR x "\ \*(/=\ " u ;
404as mentioned above, only these bits may be set
405in a valid rounding-mode mask.
406.BI FRPMASK_NEAR( dir )
407is the mask for rounding to nearest with ties broken according to
408.IR dir ,
409which is another rounding-mode mask.
410The complete set of predefined masks is listed above in the synopsis,
411together with their description in terms of the basic predicates.
412The usual IEEE rounding mode is
413round-to-nearest/ties-to-even,
414denoted
415.BR FLTRND_NEAREVEN .
416This is likely a good option
417if there is no compelling argument for a different specific choice.
418.
419.SS The floatbits structure
420In order to avoid a combinatorial explosion in conversion operations,
421all the basic conversions involve,
422as source or target,
423a `common currency' format represented by the type
424.BR "struct floatbits" .
425.PP
426This structure consists of
427a set of flags
428.BR f ;
429a signed exponent
430.BR exp ;
431an
432.B arena
433pointer
434.BR a ;
435a pointer
436.B frac
437to a vector of
438.B uint32
439values;
440the length
441.B n
442of the
443.B frac
444vector; and
445the currently allocated size
446.B fracsz
447of the vector.
448Both
449.B n
450and
451.B fracsz
452count elements, not bytes.
453.PP
454Storage for
455.B frac
456comes from the arena
457.BR a .
458Only the first
459.B n
460words of
461.B frac
462are significant;
463.B frac[0]
464is the most significant word.
465The value represented by a
466.B struct floatbits
467is never changed by adding or removing zero-valued words
468at the end of the
469.B frac
470vector.
471It is always the case that
472.BR n "\ \*(<=\ " fracsz ;
473if
474.B fracsz
475is zero then
476.B frac
477may be a null pointer.
478.PP
479The interpretation of the
480.B exp
481and
482.B frac
483members depends on the flags set in
484.BR f .
485Apart from
486.BR FLTF_NEG ,
487the flags are
488.IR "mutually exclusive" :
489at most one flag may be set.
490.TP
491.B FLTF_NEG
492The value is negative.
493.TP
494.B FLTF_INF
495The value is positive or negative infinity.
496The
497.B exp
498and
499.B frac
500are ignored.
501.TP
502.BR FLTF_QNAN " and " FLTF_SNAN
503The value is a quiet or signalling not-a-number, respectively.
504The
505.B exp
506is ignored.
507The payload is stored in
508.BR frac ;
509the payload does not include the `quiet' bit.
510.TP
511.B FLTF_ZERO
512The number is zero.
513Negative zero is distinct from positive zero.
514The
515.B exp
516and
517.B frac
518are ignored.
519.IP "All non-sign bits clear"
520The value is a finite nonzero number.
521The
522.B frac
523holds the significand.
524The most significant significand bit must be set, so
525(a)\ the number must be nonzero, and
526(b)\ the significand is normalized.
527The significand is interpreted as a fraction
528.RI "1/2\ \*(<=\ " m "\ <\ 1."
529If
530.I e
531is the value of the
532.B exp
533member,
534and
535.I s
536is \-1 if
537.B FLTF_NEG
538is set
539or +1 if
540.B FLTF_NEG
541is clear,
542then the number represented is
543.IR s "\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
544.PP
545A
546.B struct floatbits
547can be initialized statically by
548.BR FLOATBITS_INIT ,
549or dynamically using the function
550.BR fltfmt_initbits .
551These are not quite the same:
552.B FLOATBITS_INIT
553initializes
554.B a
555to
556.BR &arena_stdlib ,
557while
558.B fltfmt_initbits
559sets it to the runtime value of
560.BR arena_global .
561With this exception,
562both forms of initialization set the value to (positive) zero;
563neither allocates any storage or other resources,
564leaving
565.B frac
566null.
567.PP
568The
569.B fltfmt_allocfrac
570function is given a pointer
571.I x
572to a
573.B struct floatbits
574and a length
575.IR n :
576it ensures that there is enough storage at
577.IB x ->frac
578for at least
579.I n
580words:
581if the current size is too small,
582then any existing buffer is discarded and a new one allocated
583from the arena
584.IB x ->a \fR;
585any existing contents of the buffer are lost.
586On exit,
587.IB x ->n
588is set to
589.IR n .
590.PP
591The
592.B fltfmt_freebits
593function
594frees a
595.B struct floatbits
596structure, releasing the storage held by
597.BR frac .
598.PP
599The
600.B fltfmt_copybits
601function simply copies its input
602.I x
603to its output
604.IR z_out ;
605both must refer to initialized
606.B struct floatbits
607structures.
608If
609.I z_out
610and
611.I x
612are equal, then nothing happens.
613.PP
614Finally, the
615.B fltfmt_round
616function rounds the value in the
617.B struct floatbits
618structure
619.I x
620to
621.I n
622bits using the rounding mode
623.IR r ;
624the result is written to
625.IR z_out ;
626it is permitted for
627.I z_out
628to be equal to
629.IR x .
630If
631.I x
632is a zero or infinity,
633then the output is equal to the input,
634as if
635.B fltfmt_copybits
636had been called instead.
637If
638.I x
639is a NaN,
640then the payload is simply truncated to
641.I n
642bits, without regard to the rounding mode.
643Otherwise, the input is nonzero and finite;
644the significand is rounded to
645.I n
646bits according to the rounding mode.
647In all cases, the return value is
648zero if the output is equal to the input,
649or
650.B FLTERR_INEXACT
651if the rounded result is not equal to the input.
652.
653.SS IEEE and related formats
654An IEEE floating-point format is characterized by three parameters:
655the
656.I "exponent width"
657.IR w ,
658the
659.I "precision"
660.IR p ,
661and
662the
663.I "unit width"
664.IR h .
665.PP
666The encoded value consists of
667.IR p "\ +\ " w "\ +\ " h "\ \-\ 1"
668bits.
669This is divided, from the most significant bit downwards,
670into a
671.I "sign bit"
672.IR s ,
673a
674.IR w -bit
675.I "biased exponent"
676.IR e \*',
677a
678.IR h -bit
679.I "unit bit"
680.IR u ,
681and a
682.RI ( p "\ \-\ " h )-bit
683.I fraction
684.IR f .
685The
686.I "exponent bias"
687is
688.IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;"
689the true exponent
690.I e
691is calculated from the biased exponent by
692.IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue.
693The unit and fraction fields are usually interpreted as denoting
694a significand
695.IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se
696with
697.RI "0\ \*(<=\ " m "\ <\ 2."
698If
699.I h
700is zero,
701the value of the unit bit
702.I u
703is implied by the exponent as described below.
704The encoded value is interpreted as follows.
705.hP \*o
706If
707.IR e "\ =\ \-" e \*(us0\*(ue
708then the value is zero or a subnormal,
709with the value
710.RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e +1\*(se.
711In particular,
712if
713.IR m "\ =\ 0"
714then the value is positive or negative zero,
715according to the sign bit
716.IR s .
717If
718.I h
719is zero then
720.IR u "\ =\ 0;"
721if
722.I h
723is nonzero
724but
725.IR u "\ \*(/=\ 0"
726then the encoding is invalid:
727decoding returns
728.BR FLTERR_INVAL ,
729but the result will be as described.
730.hP \*o
731If
732.RI "1\ \-\ " e "\*(us0\*(ue\ \*(<=\ " e "\ <\ " e "\*(us0\*(ue\ +\ 1"
733then the value is a (supposedly) normal number
734.RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se.
735If
736.I h
737is zero then
738.IR u "\ =\ 1;"
739if
740.I h
741is nonzero
742but
743.IR u "\ \*(/=\ 1"
744then the encoding is invalid:
745decoding returns
746.BR FLTERR_INVAL ,
747but the result will be as described.
748.hP \*o
749If
750.IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
751and
752.IR f "\ =\ 0"
753then the value is positive or negative infinity,
754according to the sign bit
755.IR s .
756If
757.I h
758is nonzero and
759.IR u "\ =\ 0"
760then the encoding is invalid:
761decoding returns
762.BR FLTERR_INVAL ,
763but the result will still be infinity.
764.hP \*o
765If
766.IR e "\ =\ " e "\*(us0\*(ue\ +\ 1"
767and
768.IR f "\ \*(/=\ 0"
769then the value is not-a-number (NaN).
770The most significant bit of
771.I f
772is the `quiet bit':
773if the bit is set, the value is a `quiet NaN';
774if the bit is clear, the value is a `signalling NaN'.
775(This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1;
776it has the advantage that a signalling NaN can be `quieted'
777by setting the most significant fraction bit;
778HP-PA and older MIPS processors use the opposite convention
779for distinguishing quiet and signalling NaNs,
780but a signalling NaN with all but the most significant
781fraction bit zero cannot be `quieted' by clearing the
782most significant bit, since the resulting encoding denotes
783an infinity, not a QNaN.)
784The remaining bits of
785.I f
786form the
787.IR payload .
788Positive and negative NaN values are distinguished,
789with sign determined by the sign bit.
790If
791.I h
792is nonzero and
793.IR u "\ =\ 0"
794then the encoding is invalid:
795decoding returns
796.BR FLTERR_INVAL ,
797but the result will still be a NaN;
798the unit bit does not affect the NaN payload.
799.PP
800An IEEE format is described by the type
801.BR "struct fltfmt_ieeefmt" .
802This has three members:
803.TP
804.B f
805A flags word.
806If
807.B FLTIF_HIDDEN
808is set, then the format uses a `hidden bit' convention:
809in the notation above
810.IR h "\ =\ 0;"
811if the flag is clear,
812the format has an explicit unit bit, and
813.IR h "\ =\ 1."
814.TP
815.B expwd
816The exponent width;
817in the notation above this is
818.IR w .
819.TP
820.B prec
821The precision;
822in the notation above this is
823.IR p .
824.PP
825The following IEEE format descriptions are already defined.
826.TP
827.B "fltfmt_f16"
828The IEEE\ 754 Binary16 format, with
829.IR w "\ =\ 5,"
830.IR p "\ =\ 11,"
831and
832.IR h "\ =\ 0."
833.TP
834.B "fltfmt_f32"
835The IEEE\ 754 Binary32 (`single precision') format, with
836.IR w "\ =\ 8,"
837.IR p "\ =\ 24,"
838and
839.IR h "\ =\ 0."
840.TP
841.B "fltfmt_f64"
842The IEEE\ 754 Binary64 (`double precision') format, with
843.IR w "\ =\ 11,"
844.IR p "\ =\ 53,"
845and
846.IR h "\ =\ 0."
847.TP
848.B "fltfmt_f128"
849The IEEE\ 754 Binary128 (`quad precision') format, with
850.IR w "\ =\ 15,"
851.IR p "\ =\ 113,"
852and
853.IR h "\ =\ 0."
854.TP
855.B "fltfmt_mini"
856An eight-bit `minifloat' format, with
857.IR w "\ =\ 4,"
858.IR p "\ =\ 4,"
859and
860.IR h "\ =\ 0."
861.TP
862.B "fltfmt_bf16"
863The Google `BFloat16' format, with
864.IR w "\ =\ 8,"
865.IR p "\ =\ 8,"
866and
867.IR h "\ =\ 0."
868.TP
869.B "fltfmt_idblext80"
870The Intel 8087 80-bit `double extended' format, with
871.IR w "\ =\ 15,"
872.IR p "\ =\ 64,"
873and
874.IR h "\ =\ 1."
875.PP
876The
877.B fltfmt_encieee
878and
879.B fltfmt_decieee
880functions convert between IEEE and related formats
881and the internal
882.B struct floatbits
883representation.
884They respectively encode or decode an IEEE-format value,
885as described above,
886from a vector of
887.B uint32
888words,
889most-significant word first
890\(en so the sign bit is in the first word.
891For formats whose size is not a multiple of 32,
892the encoding is
893.IR right-aligned :
894the least significant bit of the fraction
895is in the least significant bit of the last word in the vector.
896.PP
897The
898.B fltfmt_encieee
899function encodes an IEEE-format value.
900The function is given five arguments:
901a pointer
902.I fmt
903to the IEEE format description,
904a pointer
905.I z
906to a sufficiently long vector of 32-bit words
907in which to store the encoded value,
908a pointer
909.I x
910to the
911.B struct floatbits
912holding the value to encode,
913a rounding mode
914.IR r ,
915and an error mask
916.IR errmask .
917If the input is a NaN,
918then the payload is truncated to fit
919regardless of the rounding mode,
920discarding the least significant bits;
921if the input is a finite value,
922then the significand is rounded to fit
923according to the requested rounding mode.
924The possible errors are
925.B FLTERR_UFLOW
926if the value is unrepresentably tiny,
927.B FLTERR_OFLOW
928if the value is unrepresentably huge,
929and
930.B FLTERR_INEXACT
931if the encoding fails to preserve the input value exactly;
932hence
933.B FLTERR_INEXACT
934is set whenever
935.B FLTERR_OFLOW
936or
937.B FLTERR_UFLOW
938is set,
939or if bits are lost due to NaN-payload truncation or rounding.
940If, during encoding,
941an error is encountered,
942processing stops immediately
943unless the corresponding bit of
944.I errmask
945is set.
946.PP
947The
948.B fltfmt_decieee
949function decodes an IEEE-format value.
950The function is given three arguments:
951a pointer
952.I fmt
953to the IEEE format description,
954a pointer
955.I z_out
956to the initialized
957.B struct floatbits
958to fill in, and
959a pointer
960.I x
961to the IEEE-encoded value to decode,
962in a vector of 32-bit words as described above.
963The only error that can occur during decoding is
964.BR FLTERR_INVAL :
965as described above,
966this occurs in non-hidden-bit formats
967when the unit bit does not match that implied by the exponent;
968the result is returned anyway,
969with the unit bit interpreted as encoded in finite numbers,
970and discarded in infinities and NaNs.
971.PP
972For each of the format
973
974
975
976.
977.\"--------------------------------------------------------------------------
978.SH "SEE ALSO"
979.
980.BR bits (3),
981.BR mLib (3).
982.
983.\"--------------------------------------------------------------------------
984.SH AUTHOR
985.
986Mark Wooding, <mdw@distorted.org.uk>
987.
988.\"----- That's all, folks --------------------------------------------------