Commit | Line | Data |
---|---|---|
c752173d MW |
1 | .\" -*-nroff-*- |
2 | .\" | |
3 | .\" Manual for floating-point format conversions | |
4 | .\" | |
5 | .\" (c) 2024 Straylight/Edgeware | |
6 | .\" | |
7 | . | |
8 | .\"----- Licensing notice --------------------------------------------------- | |
9 | .\" | |
10 | .\" This file is part of the mLib utilities library. | |
11 | .\" | |
12 | .\" mLib is free software: you can redistribute it and/or modify it under | |
13 | .\" the terms of the GNU Library General Public License as published by | |
14 | .\" the Free Software Foundation; either version 2 of the License, or (at | |
15 | .\" your option) any later version. | |
16 | .\" | |
17 | .\" mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | .\" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | .\" FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | .\" License for more details. | |
21 | .\" | |
22 | .\" You should have received a copy of the GNU Library General Public | |
23 | .\" License along with mLib. If not, write to the Free Software | |
24 | .\" Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | .\" USA. | |
26 | . | |
27 | .\"-------------------------------------------------------------------------- | |
28 | .so ../defs.man \" @@@PRE@@@ | |
29 | . | |
30 | .\"-------------------------------------------------------------------------- | |
31 | .TH fltfmt 3mLib "22 April 2024" "Straylight/Edgeware" "mLib utilities library" | |
32 | .\" @FLTERR_OK | |
33 | .\" @FLTERR_INVAL | |
34 | .\" @FLTERR_INEXACT | |
35 | .\" @FLTERR_UFLOW | |
36 | .\" @FLTERR_OFLOW | |
37 | .\" @FLTERR_REPR | |
38 | .\" @FLTERR_ALLERRS | |
39 | . | |
40 | .\" @FRPF_LOW | |
41 | .\" @FRPF_HALF | |
42 | .\" @FRPF_ODD | |
43 | .\" @FRPF_NEG | |
44 | .\" @FRPMASK_LOW | |
45 | .\" @FRPMASK_HALF | |
46 | .\" @FRPMASK_ODD | |
47 | .\" @FRPMASK_NEG | |
48 | .\" @FRPMASK_INEXACT | |
49 | .\" @FRPMASK_NEAR | |
50 | .\" @FLTRND_ZERO | |
51 | .\" @FLTRND_PROJINF | |
52 | .\" @FLTRND_NEGINF | |
53 | .\" @FLTRND_POSINF | |
54 | .\" @FLTRND_EVEN | |
55 | .\" @FLTRND_ODD | |
56 | .\" @FLTRND_NEAREVEN | |
57 | .\" @FLTRND_NEARODD | |
58 | .\" @FLTRND_NEARZERO | |
59 | .\" @FLTRND_NEARINF | |
60 | .\" @FLTRND_NEARNEG | |
61 | .\" @FLTRND_NEARPOS | |
62 | . | |
63 | .\" @FLTF_NEG | |
64 | .\" @FLTF_INF | |
65 | .\" @FLTF_QNAN | |
66 | .\" @FLTF_SNAN | |
67 | .\" @FLTF_ZERO | |
68 | .\" @FLTF_NANMASK | |
69 | .\" @FLOATBITS_INIT | |
70 | .\" @fltfmt_initbits | |
71 | .\" @fltfmt_freebits | |
72 | .\" @fltfmt_allocfrac | |
73 | .\" @fltfmt_copybits | |
74 | .\" @fltfmt_round | |
75 | . | |
76 | .\" @FLTIF_HIDDEN | |
77 | .\" @fltfmt_f16 | |
78 | .\" @fltfmt_f32 | |
79 | .\" @fltfmt_f64 | |
80 | .\" @fltfmt_f128 | |
81 | .\" @fltfmt_mini | |
82 | .\" @fltfmt_bf16 | |
83 | .\" @fltfmt_idblext80 | |
84 | . | |
85 | .\" @fltfmt_encieee | |
86 | .\" @fltfmt_encf16 | |
87 | .\" @fltfmt_encf32 | |
88 | .\" @fltfmt_encf64 | |
89 | .\" @fltfmt_encf128 | |
90 | .\" @fltfmt_encmini | |
91 | .\" @fltfmt_encbf16 | |
92 | .\" @fltfmt_encidblext80 | |
93 | .\" @fltfmt_decieee | |
94 | .\" @fltfmt_decf16 | |
95 | .\" @fltfmt_decf32 | |
96 | .\" @fltfmt_decf64 | |
97 | .\" @fltfmt_decf128 | |
98 | .\" @fltfmt_decmini | |
99 | .\" @fltfmt_decbf16 | |
100 | .\" @fltfmt_decidblext80 | |
101 | . | |
102 | .\" @fltfmt_encflt | |
103 | .\" @fltfmt_encdbl | |
104 | .\" @fltfmt_encldbl | |
105 | .\" @fltfmt_decflt | |
106 | .\" @fltfmt_decdbl | |
107 | .\" @fltfmt_decldbl | |
108 | . | |
109 | .\" @fltfmt_flttof32l | |
110 | .\" @fltfmt_flttof32b | |
111 | .\" @fltfmt_dbltof64l | |
112 | .\" @fltfmt_dbltof64b | |
113 | .\" @fltfmt_f32ltoflt | |
114 | .\" @fltfmt_f32btoflt | |
115 | .\" @fltfmt_f64ltodbl | |
116 | .\" @fltfmt_f64btodbl | |
117 | . | |
118 | .\"-------------------------------------------------------------------------- | |
119 | .SH NAME | |
120 | fltfmt \- floating-point format conversions | |
121 | . | |
122 | .\"-------------------------------------------------------------------------- | |
123 | .SH SYNOPSIS | |
124 | . | |
125 | .nf | |
126 | .B "#define FLTERR_OK 0" | |
127 | .B "#define FLTERR_INVAL ..." | |
128 | .B "#define FLTERR_INEXACT ..." | |
129 | .B "#define FLTERR_UFLOW ..." | |
130 | .B "#define FLTERR_OFLOW ..." | |
131 | .B "#define FLTERR_REPR ..." | |
132 | .B "#define FLTERR_ALLERRS ..." | |
133 | .PP | |
134 | .ta 40n | |
135 | .B "#define FRPF_LOW 1u" | |
136 | .B "#define FRPF_HALF 2u" | |
137 | .B "#define FRPF_ODD 4u" | |
138 | .B "#define FRPF_NEG 8u" | |
139 | .B "#define FRPMASK_LOW 0xaaaau" | |
140 | .B "#define FRPMASK_HALF 0xccccu" | |
141 | .B "#define FRPMASK_ODD 0xf0f0u" | |
142 | .B "#define FRPMASK_NEG 0xff00u" | |
143 | .B "#define FRPMASK_INEXACT ... /* LOW | HALF */" | |
144 | .BI "unsigned FRPMASK_NEAR(unsigned " dir "); /* HALF&(LOW | " dir ") */" | |
145 | .B "#define FLTRND_ZERO ... /* 0 */" | |
146 | .B "#define FLTRND_PROJINF ... /* INEXACT */" | |
147 | .B "#define FLTRND_NEGINF ... /* INEXACT&NEG */" | |
148 | .B "#define FLTRND_POSINF ... /* INEXACT&~NEG */" | |
149 | .B "#define FLTRND_EVEN ... /* INEXACT&ODD */" | |
150 | .B "#define FLTRND_ODD ... /* INEXACT&~ODD */" | |
151 | .B "#define FLTRND_NEAREVEN ... /* HALF&(LOW | ODD) */" | |
152 | .B "#define FLTRND_NEARODD ... /* HALF&(LOW | ~ODD) */" | |
153 | .B "#define FLTRND_NEARZERO ... /* HALF&LOW */" | |
154 | .B "#define FLTRND_NEARINF ... /* HALF */" | |
155 | .B "#define FLTRND_NEARNEG ... /* HALF&(LOW | NEG) */" | |
156 | .B "#define FLTRND_NEARPOS ... /* HALF&(LOW | ~NEG) */" | |
157 | .PP | |
158 | .ta 2n | |
159 | .B "#define FLTF_NEG ..." | |
160 | .B "#define FLTF_INF ..." | |
161 | .B "#define FLTF_QNAN ..." | |
162 | .B "#define FLTF_SNAN ..." | |
163 | .B "#define FLTF_ZERO ..." | |
164 | .B "#define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN)" | |
165 | .B "struct floatbits {" | |
166 | .B " unsigned f;" | |
167 | .B " int exp;" | |
168 | .B " arena *a;" | |
169 | .B " uint32 *frac;" | |
170 | .B " unsigned n, fracsz;" | |
171 | .B "};" | |
172 | .B "#define FLOATBITS_INIT { ...\& };" | |
173 | .PP | |
174 | .BI "void fltfmt_initbits(struct floatbits *" x ); | |
175 | .BI "void fltfmt_freebits(struct floatbits *" x ); | |
176 | .BI "void fltfmt_allocfrac(struct floatbits *" x ", unsigned " n ); | |
177 | .ta \w'\fBvoid fltfmt_copybits('u | |
178 | .BI "void fltfmt_copybits(struct floatbits *" z_out , | |
179 | .BI " const struct floatbits *" x ); | |
180 | .ta \w'\fBunsigned fltfmt_round('u | |
181 | .BI "unsigned fltfmt_round(struct floatbits *" z_out , | |
182 | .BI " const struct floatbits *" x , | |
183 | .BI " unsigned " r ", unsigned " n ); | |
184 | .PP | |
185 | . | |
186 | .ta 2n | |
187 | .B "#define FLTIF_HIDDEN ..." | |
188 | .B "struct fltfmt_ieeefmt {" | |
189 | .B " unsigned f;" | |
190 | .B " unsigned expwd;" | |
191 | .B " unsigned prec;" | |
192 | .B "};" | |
193 | .B "const struct fltfmt_ieeefmt fltfmt_f16;" | |
194 | .B "const struct fltfmt_ieeefmt fltfmt_f32;" | |
195 | .B "const struct fltfmt_ieeefmt fltfmt_f64;" | |
196 | .B "const struct fltfmt_ieeefmt fltfmt_f128;" | |
197 | .B "const struct fltfmt_ieeefmt fltfmt_mini;" | |
198 | .B "const struct fltfmt_ieeefmt fltfmt_bf16;" | |
199 | .B "const struct fltfmt_ieeefmt fltfmt_idblext80;" | |
200 | .PP | |
201 | .ta \w'\fBunsigned fltfmt_encieee('u | |
202 | .BI "unsigned fltfmt_encieee(const struct fltfmt_ieeefmt *" fmt , | |
203 | .BI " uint32 *" z ", const struct floatbits *" x , | |
204 | .BI " unsigned " r ", unsigned " errmask ); | |
205 | .ta \w'\fBunsigned fltfmt_encf16('u | |
206 | .BI "unsigned fltfmt_encf16(uint16 *" z_out ", const struct floatbits *" x , | |
207 | .BI " unsigned " r ", unsigned " errmask ); | |
208 | .ta \w'\fBunsigned fltfmt_encf32('u | |
209 | .BI "unsigned fltfmt_encf32(uint32 *" z_out ", const struct floatbits *" x , | |
210 | .BI " unsigned " r ", unsigned " errmask ); | |
211 | .ta \w'\fBunsigned fltfmt_encf64('u | |
212 | .BI "unsigned fltfmt_encf64(kludge64 *" z_out ", const struct floatbits *" x , | |
213 | .BI " unsigned " r ", unsigned " errmask ); | |
214 | .ta \w'\fBunsigned fltfmt_encf128('u | |
215 | .BI "unsigned fltfmt_encf128(uint32 *" z_out ", const struct floatbits *" x , | |
216 | .BI " unsigned " r ", unsigned " errmask ); | |
217 | .ta \w'\fBunsigned fltfmt_encmini('u | |
218 | .BI "unsigned fltfmt_encmini(octet *" z_out ", const struct floatbits *" x , | |
219 | .BI " unsigned " r ", unsigned " errmask ); | |
220 | .ta \w'\fBunsigned fltfmt_encbf16('u | |
221 | .BI "unsigned fltfmt_encbf16(uint16 *" z_out ", const struct floatbits *" x , | |
222 | .BI " unsigned " r ", unsigned " errmask ); | |
223 | .ta \w'\fBunsigned fltfmt_encidblext80('u | |
224 | .BI "unsigned fltfmt_encidblext80(uint16 *" se_out ", kludge64 *" m_out , | |
225 | .BI " const struct floatbits *" x , | |
226 | .BI " unsigned " r ", unsigned " errmask ); | |
227 | .PP | |
228 | .ta \w'\fBunsigned fltfmt_decieee('u | |
229 | .BI "unsigned fltfmt_decieee(const struct fltfmt_ieeefmt *" fmt , | |
230 | .BI " struct floatbits *" z_out ", const uint32 *" x ); | |
231 | .BI "unsigned fltfmt_decf16(struct floatbits *" z_out ", uint16 " x ); | |
232 | .BI "unsigned fltfmt_decf32(struct floatbits *" z_out ", uint32 " x ); | |
233 | .BI "unsigned fltfmt_decf64(struct floatbits *" z_out ", kludge64 " x ); | |
234 | .BI "unsigned fltfmt_decf128(struct floatbits *" z_out ", const uint32 *" x ); | |
235 | .BI "unsigned fltfmt_decmini(struct floatbits *" z_out ", octet " x ); | |
236 | .BI "unsigned fltfmt_decbf16(struct floatbits *" z_out ", uint16 " x ); | |
237 | .ta \w'\fBunsigned fltfmt_decidblext80('u | |
238 | .BI "unsigned fltfmt_decidblext80(struct floatbits *" z_out , | |
239 | .BI " uint16 " se ", kludge64 " m ); | |
240 | .PP | |
241 | .ta \w'\fBunsigned fltfmt_encflt('u | |
242 | .BI "unsigned fltfmt_encflt(float *" z_out , | |
243 | .BI " const struct floatbits *" x ", unsigned " r ); | |
244 | .ta \w'\fBunsigned fltfmt_encdbl('u | |
245 | .BI "unsigned fltfmt_encdbl(double *" z_out , | |
246 | .BI " const struct floatbits *" x ", unsigned " r ); | |
247 | .ta \w'\fBunsigned fltfmt_encldbl('u | |
248 | .BI "unsigned fltfmt_encldbl(long double *" z_out , | |
249 | .BI " const struct floatbits *" x ", unsigned " r ); | |
250 | .ta \w'\fBunsigned fltfmt_decflt('u | |
251 | .BI "unsigned fltfmt_decflt(struct floatbits *" z_out , | |
252 | .BI " float *" x ", unsigned " r ); | |
253 | .ta \w'\fBunsigned fltfmt_decdbl('u | |
254 | .BI "unsigned fltfmt_decdbl(struct floatbits *" z_out , | |
255 | .BI " double *" x ", unsigned " r ); | |
256 | .ta \w'\fBunsigned fltfmt_decldbl('u | |
257 | .BI "unsigned fltfmt_decldbl(struct floatbits *" z_out , | |
258 | .BI " long double *" x ", unsigned " r ); | |
259 | .PP | |
260 | .BI "unsigned fltfmt_flttof32l(octet *" p ", float " x ", unsigned " r ); | |
261 | .BI "unsigned fltfmt_flttof32b(octet *" p ", float " x ", unsigned " r ); | |
262 | .BI "unsigned fltfmt_dbltof64l(octet *" p ", double " x ", unsigned " r ); | |
263 | .BI "unsigned fltfmt_dbltof64b(octet *" p ", double " x ", unsigned " r ); | |
264 | .BI "unsigned fltfmt_f32ltoflt(float *" z_out ", const octet *" p ", unsigned " r ); | |
265 | .BI "unsigned fltfmt_f32btoflt(float *" z_out ", const octet *" p ", unsigned " r ); | |
266 | .BI "unsigned fltfmt_f64ltodbl(double *" z_out ", const octet *" p ", unsigned " r ); | |
267 | .BI "unsigned fltfmt_f64btodbl(double *" z_out ", const octet *" p ", unsigned " r ); | |
268 | . | |
269 | .\"-------------------------------------------------------------------------- | |
270 | .SH DESCRIPTION | |
271 | . | |
272 | The | |
273 | .B "<mLib/fltfmt.h>" | |
274 | header file defines structures, macros, and functions | |
275 | for converting floating-point values between various formats, | |
276 | including the native floating-point formats | |
277 | and IEEE\ 754 and related formats. | |
278 | . | |
279 | .SS Error conditions | |
280 | Most of the functions in this module return an unsigned integer. | |
281 | A return value of zero means that no error occurred; | |
282 | set bits indicate various error conditions. | |
283 | .TP | |
284 | .B FLTERR_INVAL | |
285 | A binary input to be decoded contained an invalid bit pattern, | |
286 | e.g., an unnormalized input value with a nonminimal exponent. | |
287 | The function will have produced a reasonable output anyway, | |
288 | but the original value will not be recoverable from the result. | |
289 | .TP | |
290 | .B FLTERR_INEXACT | |
291 | The conversion was inexact. | |
292 | Converting the output back into the format of the input | |
293 | may not reproduce the original input value. | |
294 | This error flag is sometimes set conservatively. | |
295 | .TP | |
296 | .B FLTERR_UFLOW | |
297 | The conversion underflowed: | |
298 | a nonzero input was too tiny (in absolute value) to represent, | |
299 | and a zero result was returned. | |
300 | .TP | |
301 | .B FLTERR_OFLOW | |
302 | The conversion overflowed: | |
303 | a finite input was too huge (in absolute value) to represent, | |
304 | and either the appropriately signed infinity | |
305 | or largest-magnitude finite value | |
306 | was returned, determined by the requested rounding mode. | |
307 | .TP | |
308 | .B FLTERR_REPR | |
309 | The output format failed entirely to represent the input value. | |
310 | The result is zero if the input was a NaN, | |
311 | or the appropriately signed largest-magnitude finite value | |
312 | if the input was an infinity. | |
313 | . | |
314 | .SS Rounding modes | |
315 | The rounding system works as follows. | |
316 | There are four | |
317 | .I rounding predicates | |
318 | considered when a rounding decision is taken. | |
319 | These are determined from the unrounded input value | |
320 | .IR x , | |
321 | and the two nearest rounded values | |
322 | .RI | u "|\ \*(<=\ |" x | | |
323 | and | |
324 | .RI | v "|\ >\ |" x |. | |
325 | The predicates are as follows. | |
326 | .TP | |
327 | .B FRPF_LOW | |
328 | If | |
329 | .IR x "\ \*(/=\ " u | |
330 | and | |
331 | .IR x "\ \*(/=\ (" u "\ +\ " v )/2, | |
332 | i.e., | |
333 | .I x | |
334 | is neither equal to a rounded value, | |
335 | nor exactly halfway between two rounded values. | |
336 | This predicate is sometimes referred to as a `sticky bit'. | |
337 | .TP | |
338 | .B FRPF_HALF | |
339 | If | |
340 | .RI | x "|\ \*(>=\ |(" u "\ +\ " v )/2|, | |
341 | i.e., | |
342 | .I x | |
343 | is halfway or more towards its larger rounded neighbour. | |
344 | .TP | |
345 | .B FRPF_ODD | |
346 | If the least significant digit of | |
347 | .I u | |
348 | is odd. | |
349 | In binary floating-point formats, | |
350 | this is just the least significant bit of | |
351 | .IR u . | |
352 | .TP | |
353 | .B FRPF_NEG | |
354 | If | |
355 | .I x | |
356 | is negative. | |
357 | .PP | |
358 | These four predicates are packed into a four-bit mask value | |
359 | .I rf | |
360 | between 0 and 15. | |
361 | A | |
362 | .I rounding mode | |
363 | is simply a 16-bit mask: | |
364 | if bit | |
365 | .I rf | |
366 | of the rounding-mode mask is set, | |
367 | then | |
368 | .I x | |
369 | is rounded to | |
370 | .IR v ; | |
371 | otherwise it is rounded to | |
372 | .IR u . | |
373 | That is, the rounding-mode mask is essentially a truth table. | |
374 | Rounding modes with | |
375 | .I set | |
376 | bits corresponding to situations where both | |
377 | .B FRPF_LOW | |
378 | and | |
379 | .B FRPF_HALF | |
380 | are false, | |
381 | i.e., where | |
382 | .I x | |
383 | is already a rounded value, | |
384 | are forbidden. | |
385 | .PP | |
386 | Some useful machinery is provided | |
387 | for constructing rounding-mode masks. | |
388 | .BR FRPMASK_LOW , | |
389 | .BR FRPMASK_HALF , | |
390 | .BR FRPMASK_ODD , | |
391 | and | |
392 | .B FRPMASK_NEG | |
393 | are masks with set bits corresponding to their respective predicates. | |
394 | Bitwise boolean logic can be applied to these masks | |
395 | in order to calculate the masks corresponding to | |
396 | the same logical expression applied to the individual predicates. | |
397 | .BR FRPMASK_INEXACT " holds if" | |
398 | .B LOW | |
399 | or | |
400 | .B HALF | |
401 | holds; | |
402 | i.e., if | |
403 | .IR x "\ \*(/=\ " u ; | |
404 | as mentioned above, only these bits may be set | |
405 | in a valid rounding-mode mask. | |
406 | .BI FRPMASK_NEAR( dir ) | |
407 | is the mask for rounding to nearest with ties broken according to | |
408 | .IR dir , | |
409 | which is another rounding-mode mask. | |
410 | The complete set of predefined masks is listed above in the synopsis, | |
411 | together with their description in terms of the basic predicates. | |
412 | The usual IEEE rounding mode is | |
413 | round-to-nearest/ties-to-even, | |
414 | denoted | |
415 | .BR FLTRND_NEAREVEN . | |
416 | This is likely a good option | |
417 | if there is no compelling argument for a different specific choice. | |
418 | . | |
419 | .SS The floatbits structure | |
420 | In order to avoid a combinatorial explosion in conversion operations, | |
421 | all the basic conversions involve, | |
422 | as source or target, | |
423 | a `common currency' format represented by the type | |
424 | .BR "struct floatbits" . | |
425 | .PP | |
426 | This structure consists of | |
427 | a set of flags | |
428 | .BR f ; | |
429 | a signed exponent | |
430 | .BR exp ; | |
431 | an | |
432 | .B arena | |
433 | pointer | |
434 | .BR a ; | |
435 | a pointer | |
436 | .B frac | |
437 | to a vector of | |
438 | .B uint32 | |
439 | values; | |
440 | the length | |
441 | .B n | |
442 | of the | |
443 | .B frac | |
444 | vector; and | |
445 | the currently allocated size | |
446 | .B fracsz | |
447 | of the vector. | |
448 | Both | |
449 | .B n | |
450 | and | |
451 | .B fracsz | |
452 | count elements, not bytes. | |
453 | .PP | |
454 | Storage for | |
455 | .B frac | |
456 | comes from the arena | |
457 | .BR a . | |
458 | Only the first | |
459 | .B n | |
460 | words of | |
461 | .B frac | |
462 | are significant; | |
463 | .B frac[0] | |
464 | is the most significant word. | |
465 | The value represented by a | |
466 | .B struct floatbits | |
467 | is never changed by adding or removing zero-valued words | |
468 | at the end of the | |
469 | .B frac | |
470 | vector. | |
471 | It is always the case that | |
472 | .BR n "\ \*(<=\ " fracsz ; | |
473 | if | |
474 | .B fracsz | |
475 | is zero then | |
476 | .B frac | |
477 | may be a null pointer. | |
478 | .PP | |
479 | The interpretation of the | |
480 | .B exp | |
481 | and | |
482 | .B frac | |
483 | members depends on the flags set in | |
484 | .BR f . | |
485 | Apart from | |
486 | .BR FLTF_NEG , | |
487 | the flags are | |
488 | .IR "mutually exclusive" : | |
489 | at most one flag may be set. | |
490 | .TP | |
491 | .B FLTF_NEG | |
492 | The value is negative. | |
493 | .TP | |
494 | .B FLTF_INF | |
495 | The value is positive or negative infinity. | |
496 | The | |
497 | .B exp | |
498 | and | |
499 | .B frac | |
500 | are ignored. | |
501 | .TP | |
502 | .BR FLTF_QNAN " and " FLTF_SNAN | |
503 | The value is a quiet or signalling not-a-number, respectively. | |
504 | The | |
505 | .B exp | |
506 | is ignored. | |
507 | The payload is stored in | |
508 | .BR frac ; | |
509 | the payload does not include the `quiet' bit. | |
510 | .TP | |
511 | .B FLTF_ZERO | |
512 | The number is zero. | |
513 | Negative zero is distinct from positive zero. | |
514 | The | |
515 | .B exp | |
516 | and | |
517 | .B frac | |
518 | are ignored. | |
519 | .IP "All non-sign bits clear" | |
520 | The value is a finite nonzero number. | |
521 | The | |
522 | .B frac | |
523 | holds the significand. | |
524 | The most significant significand bit must be set, so | |
525 | (a)\ the number must be nonzero, and | |
526 | (b)\ the significand is normalized. | |
527 | The significand is interpreted as a fraction | |
528 | .RI "1/2\ \*(<=\ " m "\ <\ 1." | |
529 | If | |
530 | .I e | |
531 | is the value of the | |
532 | .B exp | |
533 | member, | |
534 | and | |
535 | .I s | |
536 | is \-1 if | |
537 | .B FLTF_NEG | |
538 | is set | |
539 | or +1 if | |
540 | .B FLTF_NEG | |
541 | is clear, | |
542 | then the number represented is | |
543 | .IR s "\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se. | |
544 | .PP | |
545 | A | |
546 | .B struct floatbits | |
547 | can be initialized statically by | |
548 | .BR FLOATBITS_INIT , | |
549 | or dynamically using the function | |
550 | .BR fltfmt_initbits . | |
551 | These are not quite the same: | |
552 | .B FLOATBITS_INIT | |
553 | initializes | |
554 | .B a | |
555 | to | |
556 | .BR &arena_stdlib , | |
557 | while | |
558 | .B fltfmt_initbits | |
559 | sets it to the runtime value of | |
560 | .BR arena_global . | |
561 | With this exception, | |
562 | both forms of initialization set the value to (positive) zero; | |
563 | neither allocates any storage or other resources, | |
564 | leaving | |
565 | .B frac | |
566 | null. | |
567 | .PP | |
568 | The | |
569 | .B fltfmt_allocfrac | |
570 | function is given a pointer | |
571 | .I x | |
572 | to a | |
573 | .B struct floatbits | |
574 | and a length | |
575 | .IR n : | |
576 | it ensures that there is enough storage at | |
577 | .IB x ->frac | |
578 | for at least | |
579 | .I n | |
580 | words: | |
581 | if the current size is too small, | |
582 | then any existing buffer is discarded and a new one allocated | |
583 | from the arena | |
584 | .IB x ->a \fR; | |
585 | any existing contents of the buffer are lost. | |
586 | On exit, | |
587 | .IB x ->n | |
588 | is set to | |
589 | .IR n . | |
590 | .PP | |
591 | The | |
592 | .B fltfmt_freebits | |
593 | function | |
594 | frees a | |
595 | .B struct floatbits | |
596 | structure, releasing the storage held by | |
597 | .BR frac . | |
598 | .PP | |
599 | The | |
600 | .B fltfmt_copybits | |
601 | function simply copies its input | |
602 | .I x | |
603 | to its output | |
604 | .IR z_out ; | |
605 | both must refer to initialized | |
606 | .B struct floatbits | |
607 | structures. | |
608 | If | |
609 | .I z_out | |
610 | and | |
611 | .I x | |
612 | are equal, then nothing happens. | |
613 | .PP | |
614 | Finally, the | |
615 | .B fltfmt_round | |
616 | function rounds the value in the | |
617 | .B struct floatbits | |
618 | structure | |
619 | .I x | |
620 | to | |
621 | .I n | |
622 | bits using the rounding mode | |
623 | .IR r ; | |
624 | the result is written to | |
625 | .IR z_out ; | |
626 | it is permitted for | |
627 | .I z_out | |
628 | to be equal to | |
629 | .IR x . | |
630 | If | |
631 | .I x | |
632 | is a zero or infinity, | |
633 | then the output is equal to the input, | |
634 | as if | |
635 | .B fltfmt_copybits | |
636 | had been called instead. | |
637 | If | |
638 | .I x | |
639 | is a NaN, | |
640 | then the payload is simply truncated to | |
641 | .I n | |
642 | bits, without regard to the rounding mode. | |
643 | Otherwise, the input is nonzero and finite; | |
644 | the significand is rounded to | |
645 | .I n | |
646 | bits according to the rounding mode. | |
647 | In all cases, the return value is | |
648 | zero if the output is equal to the input, | |
649 | or | |
650 | .B FLTERR_INEXACT | |
651 | if the rounded result is not equal to the input. | |
652 | . | |
653 | .SS IEEE and related formats | |
654 | An IEEE floating-point format is characterized by three parameters: | |
655 | the | |
656 | .I "exponent width" | |
657 | .IR w , | |
658 | the | |
659 | .I "precision" | |
660 | .IR p , | |
661 | and | |
662 | the | |
663 | .I "unit width" | |
664 | .IR h . | |
665 | .PP | |
666 | The encoded value consists of | |
667 | .IR p "\ +\ " w "\ +\ " h | |
668 | bits. | |
669 | This is divided, from the most significant bit downwards, | |
670 | into a | |
671 | .I "sign bit" | |
672 | .IR s , | |
673 | a | |
674 | .IR w -bit | |
675 | .I "biased exponent" | |
676 | .IR e \*', | |
677 | a | |
678 | .IR h -bit | |
679 | .I "unit bit" | |
680 | .IR u , | |
681 | and a | |
682 | .RI ( p "\ \-\ 1)-bit" | |
683 | .I fraction | |
684 | .IR f . | |
685 | The | |
686 | .I "exponent bias" | |
687 | is | |
688 | .IR e "\*(us0\*(ue\ =\ 2\*(ss" w "\-1\*(se\ \-\ 1;" | |
689 | the true exponent | |
690 | .I e | |
691 | is calculated from the biased exponent by | |
692 | .IR e "\ =\ " e "\*'\ \-\ " e \*(us0\*(ue. | |
693 | The unit and fraction fields are usually interpreted as denoting | |
694 | a significand | |
695 | .IR m "\ =\ " u "\ +\ " f /2\*(ss p \-1\*(se | |
696 | with | |
697 | .RI "0\ \*(<=\ " m "\ <\ 2." | |
698 | If | |
699 | .I h | |
700 | is zero, | |
701 | the value of the unit bit | |
702 | .I u | |
703 | is implied by the exponent as described below. | |
704 | The encoded value is interpreted as follows. | |
705 | .hP \*o | |
706 | If | |
707 | .IR e "\ =\ \-" e \*(us0\*(ue | |
708 | then the value is zero or a subnormal, | |
709 | with the value | |
710 | .RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e +1\*(se. | |
711 | In particular, | |
712 | if | |
713 | .IR m "\ =\ 0" | |
714 | then the value is positive or negative zero, | |
715 | according to the sign bit | |
716 | .IR s . | |
717 | If | |
718 | .I h | |
719 | is zero then | |
720 | .IR u "\ =\ 0;" | |
721 | if | |
722 | .I h | |
723 | is nonzero | |
724 | but | |
725 | .IR u "\ \*(/=\ 0" | |
726 | then the encoding is invalid: | |
727 | decoding returns | |
728 | .BR FLTERR_INVAL , | |
729 | but the result will be as described. | |
730 | .hP \*o | |
731 | If | |
732 | .RI "1\ \-\ " e "\*(us0\*(ue\ \*(<=\ " e "\ <\ " e "\*(us0\*(ue\ +\ 1" | |
733 | then the value is a (supposedly) normal number | |
734 | .RI (\-1)\*(ss s "\*(se\ \(mu\ " m "\ \(mu\ 2\*(ss" e \*(se. | |
735 | If | |
736 | .I h | |
737 | is zero then | |
738 | .IR u "\ =\ 1;" | |
739 | if | |
740 | .I h | |
741 | is nonzero | |
742 | but | |
743 | .IR u "\ \*(/=\ 1" | |
744 | then the encoding is invalid: | |
745 | decoding returns | |
746 | .BR FLTERR_INVAL , | |
747 | but the result will be as described. | |
748 | .hP \*o | |
749 | If | |
750 | .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1" | |
751 | and | |
752 | .IR f "\ =\ 0" | |
753 | then the value is positive or negative infinity, | |
754 | according to the sign bit | |
755 | .IR s . | |
756 | If | |
757 | .I h | |
758 | is nonzero and | |
759 | .IR u "\ =\ 0" | |
760 | then the encoding is invalid: | |
761 | decoding returns | |
762 | .BR FLTERR_INVAL , | |
763 | but the result will still be infinity. | |
764 | .hP \*o | |
765 | If | |
766 | .IR e "\ =\ " e "\*(us0\*(ue\ +\ 1" | |
767 | and | |
768 | .IR f "\ \*(/=\ 0" | |
769 | then the value is not-a-number (NaN). | |
770 | The most significant bit of | |
771 | .I f | |
772 | is the `quiet bit': | |
773 | if the bit is set, the value is a `quiet NaN'; | |
774 | if the bit is clear, the value is a `signalling NaN'. | |
775 | (This is the convention recommended by IEEE\ 754-2008 \(sc6.2.1; | |
776 | it has the advantage that a signalling NaN can be `quieted' | |
777 | by setting the most significant fraction bit; | |
778 | HP-PA and older MIPS processors use the opposite convention | |
779 | for distinguishing quiet and signalling NaNs, | |
780 | but a signalling NaN with all but the most significant | |
781 | fraction bit zero cannot be `quieted' by clearing the | |
782 | most significant bit, since the resulting encoding denotes | |
783 | an infinity, not a QNaN.) | |
784 | The remaining bits of | |
785 | .I f | |
786 | form the | |
787 | .IR payload . | |
788 | Positive and negative NaN values are distinguished, | |
789 | with sign determined by the sign bit. | |
790 | If | |
791 | .I h | |
792 | is nonzero and | |
793 | .IR u "\ =\ 0" | |
794 | then the encoding is invalid: | |
795 | decoding returns | |
796 | .BR FLTERR_INVAL , | |
797 | but the result will still be a NaN; | |
798 | the unit bit does not affect the NaN payload. | |
799 | .PP | |
800 | An IEEE format is described by the type | |
801 | .BR "struct fltfmt_ieeefmt" . | |
802 | This has three members: | |
803 | .TP | |
804 | .B f | |
805 | A flags word. | |
806 | If | |
807 | .B FLTIF_HIDDEN | |
808 | is set, then the format uses a `hidden bit' convention: | |
809 | in the notation above | |
810 | .IR h "\ =\ 0;" | |
811 | if the flag is clear, | |
812 | the format has an explicit unit bit, and | |
813 | .IR h "\ =\ 1." | |
814 | .TP | |
815 | .B expwd | |
816 | The exponent width; | |
817 | in the notation above this is | |
818 | .IR w . | |
819 | .TP | |
820 | .B prec | |
821 | The precision; | |
822 | in the notation above this is | |
823 | .IR p . | |
824 | .PP | |
825 | The following IEEE format descriptions are already defined. | |
826 | .TP | |
827 | .B "fltfmt_f16" | |
828 | The IEEE\ 754 Binary16 format, with | |
829 | .IR w "\ =\ 5," | |
830 | .IR p "\ =\ 11," | |
831 | and | |
832 | .IR h "\ =\ 0." | |
833 | .TP | |
834 | .B "fltfmt_f32" | |
835 | The IEEE\ 754 Binary32 (`single precision') format, with | |
836 | .IR w "\ =\ 8," | |
837 | .IR p "\ =\ 24," | |
838 | and | |
839 | .IR h "\ =\ 0." | |
840 | .TP | |
841 | .B "fltfmt_f64" | |
842 | The IEEE\ 754 Binary64 (`double precision') format, with | |
843 | .IR w "\ =\ 11," | |
844 | .IR p "\ =\ 53," | |
845 | and | |
846 | .IR h "\ =\ 0." | |
847 | .TP | |
848 | .B "fltfmt_f128" | |
849 | The IEEE\ 754 Binary128 (`quad precision') format, with | |
850 | .IR w "\ =\ 15," | |
851 | .IR p "\ =\ 113," | |
852 | and | |
853 | .IR h "\ =\ 0." | |
854 | .TP | |
855 | .B "fltfmt_mini" | |
856 | An eight-bit `minifloat' format, with | |
857 | .IR w "\ =\ 4," | |
858 | .IR p "\ =\ 4," | |
859 | and | |
860 | .IR h "\ =\ 0." | |
861 | .TP | |
862 | .B "fltfmt_bf16" | |
863 | The Google `BFloat16' format, with | |
864 | .IR w "\ =\ 8," | |
865 | .IR p "\ =\ 8," | |
866 | and | |
867 | .IR h "\ =\ 0." | |
868 | .TP | |
869 | .B "fltfmt_idblext80" | |
870 | The Intel 8087 80-bit `double extended' format, with | |
871 | .IR w "\ =\ 15," | |
872 | .IR p "\ =\ 64," | |
873 | and | |
874 | .IR h "\ =\ 1." | |
875 | .PP | |
876 | The | |
877 | .B fltfmt_encieee | |
878 | and | |
879 | .B fltfmt_decieee | |
880 | functions convert between IEEE and related formats | |
881 | and the internal | |
882 | .B struct floatbits | |
883 | representation. | |
884 | They respectively encode or decode an IEEE-format value, | |
885 | as described above, | |
886 | from a vector of | |
887 | .B uint32 | |
888 | words, | |
889 | most-significant word first | |
890 | \(en so the sign bit is in the first word. | |
891 | For formats whose size is not a multiple of 32, | |
892 | the encoding is | |
893 | .IR right-aligned : | |
894 | the least significant bit of the fraction | |
895 | is in the least significant bit of the last word in the vector. | |
896 | .PP | |
897 | The | |
898 | .B fltfmt_encieee | |
899 | function encodes an IEEE-format value. | |
900 | The function is given five arguments: | |
901 | a pointer | |
902 | .I fmt | |
903 | to the IEEE format description, | |
904 | a pointer | |
905 | .I z | |
906 | to a sufficiently long vector of 32-bit words | |
907 | in which to store the encoded value, | |
908 | a pointer | |
909 | .I x | |
910 | to the | |
911 | .B struct floatbits | |
912 | holding the value to encode, | |
913 | a rounding mode | |
914 | .IR r , | |
915 | and an error mask | |
916 | .IR errmask . | |
917 | If the input is a NaN, | |
918 | then the payload is truncated to fit | |
919 | regardless of the rounding mode, | |
920 | discarding the least significant bits; | |
921 | if the input is a finite value, | |
922 | then the significand is rounded to fit | |
923 | according to the requested rounding mode. | |
924 | The possible errors are | |
925 | .B FLTERR_UFLOW | |
926 | if the value is unrepresentably tiny, | |
927 | .B FLTERR_OFLOW | |
928 | if the value is unrepresentably huge, | |
929 | and | |
930 | .B FLTERR_INEXACT | |
931 | if the encoding fails to preserve the input value exactly; | |
932 | hence | |
933 | .B FLTERR_INEXACT | |
934 | is set whenever | |
935 | .B FLTERR_OFLOW | |
936 | or | |
937 | .B FLTERR_UFLOW | |
938 | is set, | |
939 | or if bits are lost due to NaN-payload truncation or rounding. | |
940 | If, during encoding, | |
941 | an error is encountered, | |
942 | processing stops immediately | |
943 | unless the corresponding bit of | |
944 | .I errmask | |
945 | is set. | |
946 | .PP | |
947 | The | |
948 | .B fltfmt_decieee | |
949 | function decodes an IEEE-format value. | |
950 | The function is given three arguments: | |
951 | a pointer | |
952 | .I fmt | |
953 | to the IEEE format description, | |
954 | a pointer | |
955 | .I z_out | |
956 | to the initialized | |
957 | .B struct floatbits | |
958 | to fill in, and | |
959 | a pointer | |
960 | .I x | |
961 | to the IEEE-encoded value to decode, | |
962 | in a vector of 32-bit words as described above. | |
963 | The only error that can occur during decoding is | |
964 | .BR FLTERR_INVAL : | |
965 | as described above, | |
966 | this occurs in non-hidden-bit formats | |
967 | when the unit bit does not match that implied by the exponent; | |
968 | the result is returned anyway, | |
969 | with the unit bit interpreted as encoded in finite numbers, | |
970 | and discarded in infinities and NaNs. | |
971 | .PP | |
972 | For each of the format | |
973 | ||
974 | ||
975 | ||
976 | . | |
977 | .\"-------------------------------------------------------------------------- | |
978 | .SH "SEE ALSO" | |
979 | . | |
980 | .BR bits (3), | |
981 | .BR mLib (3). | |
982 | . | |
983 | .\"-------------------------------------------------------------------------- | |
984 | .SH AUTHOR | |
985 | . | |
986 | Mark Wooding, <mdw@distorted.org.uk> | |
987 | . | |
988 | .\"----- That's all, folks -------------------------------------------------- |