Commit | Line | Data |
---|---|---|
b1a20bee MW |
1 | /* -*-c-*- |
2 | * | |
3 | * Floating-point format conversions | |
4 | * | |
5 | * (c) 2024 Straylight/Edgeware | |
6 | */ | |
7 | ||
8 | /*----- Licensing notice --------------------------------------------------* | |
9 | * | |
10 | * This file is part of the mLib utilities library. | |
11 | * | |
12 | * mLib is free software: you can redistribute it and/or modify it under | |
13 | * the terms of the GNU Library General Public License as published by | |
14 | * the Free Software Foundation; either version 2 of the License, or (at | |
15 | * your option) any later version. | |
16 | * | |
17 | * mLib is distributed in the hope that it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public | |
20 | * License for more details. | |
21 | * | |
22 | * You should have received a copy of the GNU Library General Public | |
23 | * License along with mLib. If not, write to the Free Software | |
24 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | |
25 | * USA. | |
26 | */ | |
27 | ||
28 | #ifndef MLIB_FLTFMT_H | |
29 | #define MLIB_FLTFMT_H | |
30 | ||
31 | #ifdef __cplusplus | |
32 | extern "C" { | |
33 | #endif | |
34 | ||
35 | /*----- Header files ------------------------------------------------------*/ | |
36 | ||
37 | #ifndef MLIB_ARENA_H | |
38 | # include "arena.h" | |
39 | #endif | |
40 | ||
41 | #ifndef MLIB_BITS_H | |
42 | # include "bits.h" | |
43 | #endif | |
44 | ||
45 | /*----- Data structures ---------------------------------------------------*/ | |
46 | ||
c752173d MW |
/* Error codes.  These are independent flag bits: functions return the
 * bitwise OR of every condition which arose.  All of them are unsigned
 * constants, so that masks can safely be built with `~' and `|'.
 */
#define FLTERR_OK 0x0000u               /* no trouble */
#define FLTERR_INVAL 0x0001u            /* technically invalid encoding */
#define FLTERR_INEXACT 0x0002u          /* result is inexact */
#define FLTERR_UFLOW 0x0004u            /* underflowed to zero */
#define FLTERR_OFLOW 0x0008u            /* overflowed to ±∞ or max finite */
#define FLTERR_REPR 0x0010u             /* not representable */
#define FLTERR_ALLERRS 0xffffu          /* all errors */
55 | ||
56 | /* Predicates considered for rounding. */ | |
57 | #define FRPF_LOW 0x0001u /* lost bits not exactly zero or half */ | |
58 | #define FRPF_HALF 0x0002u /* lost a half or more */ | |
59 | #define FRPF_ODD 0x0004u /* final place is currently odd */ | |
60 | #define FRPF_NEG 0x0008u /* number is negative */ | |
61 | ||
62 | /* Rounding policies. These are represented as a 16-bit truth table applied | |
63 | * to the predicate bits listed above. The following are the mask values | |
64 | * corresponding to the predicate bits being set; a set bit means that the | |
65 | * number should be rounded away from zero. | |
66 | */ | |
67 | #define FRPMASK_LOW 0xaaaau /* lost bits below half */ | |
68 | #define FRPMASK_HALF 0xccccu /* lost a half or more */ | |
69 | #define FRPMASK_ODD 0xf0f0u /* final place is odd */ | |
70 | #define FRPMASK_NEG 0xff00u /* number is negative */ | |
71 | ||
72 | /* Useful constructed masks from the above. */ | |
73 | #define FRPMASK_INEXACT (FRPMASK_LOW | FRPMASK_HALF) /* lost nonzero bits */ | |
74 | #define FRPMASK_NEAR(dir) (FRPMASK_HALF&(FRPMASK_LOW | (dir))) /* to nearest; exact ties are decided by policy dir */ | |
75 | ||
76 | /* Generally useful rounding criteria. */ | |
77 | #define FLTRND_ZERO 0 /* towards zero (truncate) */ | |
78 | #define FLTRND_PROJINF FRPMASK_INEXACT /* towards (projective) ±∞ */ | |
79 | #define FLTRND_NEGINF (FRPMASK_INEXACT&FRPMASK_NEG) /* down, towards -∞ */ | |
80 | #define FLTRND_POSINF (FRPMASK_INEXACT&~FRPMASK_NEG) /* up, towards +∞ */ | |
81 | #define FLTRND_EVEN (FRPMASK_INEXACT&FRPMASK_ODD) /* to even */ | |
82 | #define FLTRND_ODD (FRPMASK_INEXACT&~FRPMASK_ODD) /* to odd */ | |
83 | #define FLTRND_NEAREVEN FRPMASK_NEAR(FLTRND_EVEN) /* nearest, ties to even */ | |
84 | #define FLTRND_NEARODD FRPMASK_NEAR(FLTRND_ODD) /* nearest, ties to odd */ | |
85 | #define FLTRND_NEARZERO FRPMASK_NEAR(FLTRND_ZERO) /* nearest, ties to zero */ | |
86 | #define FLTRND_NEARINF FRPMASK_NEAR(FLTRND_PROJINF) /* nearest, ties to ±∞ */ | |
87 | #define FLTRND_NEARNEG FRPMASK_NEAR(FLTRND_NEGINF) /* nearest, ties to -∞ */ | |
88 | #define FLTRND_NEARPOS FRPMASK_NEAR(FLTRND_POSINF) /* nearest, ties to +∞ */ | |
89 | ||
b1a20bee MW |
90 | struct floatbits { |
91 | /* A decoded floating-point number. | |
92 | * | |
93 | * The flags do most of the heavy lifting here. | |
94 | * | |
95 | * * @FLTF_ZERO@ is set if the number is zero. The @frac@ and @exp@ are | |
96 | * ignored. | |
97 | * | |
98 | * * @FLTF_NEG@ is set if the number is negative. The representation is | |
99 | * signed magnitude, because that seems basically universal among | |
100 | * floating-point formats. Negative zero is a thing. | |
101 | * | |
102 | * * @FLTF_SNAN@ and @FLTF_QNAN@ are set if the value is, respectively, a | |
103 | * signalling or quiet not-a-number. The @frac@ holds the payload, | |
104 | * left-aligned, excluding the quiet bit; @exp@ is ignored. | |
105 | * | |
106 | * * @FLTF_INF@ is set if the number is positive or negative infinity. | |
107 | * Projective infinity is not representable. The @frac@ and @exp@ are | |
108 | * ignored. | |
109 | * | |
110 | * The @frac@ field contains the fractional significand, big-end first; | |
111 | * either the number is identically (positive or negative) zero, or the | |
112 | * most significant bit of @frac[0]@ is set, and the significand lies | |
113 | * between a half (inclusive) and one (exclusive). The @exp@ is the power | |
114 | * of two by which the significand is to be scaled. | |
115 | * | |
116 | * The essential convention for @frac@ is that the value is unchanged if | |
117 | * zero-valued words are added or removed at the end. | |
118 | */ | |
119 | ||
120 | unsigned f; /* flags */ | |
121 | #define FLTF_NEG 0x0001u /* number is negative */ | |
122 | #define FLTF_INF 0x0002u /* number is infinite */ | |
123 | #define FLTF_QNAN 0x0004u /* quiet not-a-number */ | |
124 | #define FLTF_SNAN 0x0008u /* signalling not-a-number */ | |
125 | #define FLTF_ZERO 0x0010u /* number is zero */ | |
126 | #define FLTF_NANMASK (FLTF_QNAN | FLTF_SNAN) /* any kind of NaN */ | |
127 | int exp; /* exponent, base 2 */ | |
128 | arena *a; /* memory arena */ | |
129 | uint32 *frac; /* fraction */ | |
130 | unsigned n, fracsz; /* fraction limbs used/allocated */ | |
131 | }; | |
132 | #define FLOATBITS_INIT { FLTF_ZERO, 0, &arena_stdlib, 0, 0, 0 } | |
133 | ||
b1a20bee MW |
134 | /*----- General floating-point hacking ------------------------------------*/ |
135 | ||
136 | /* --- @fltfmt_initbits@ --- * | |
137 | * | |
138 | * Arguments: @struct floatbits *x@ = pointer to structure to initialize | |
139 | * | |
140 | * Returns: --- | |
141 | * | |
142 | * Use: Dynamically initialize @x@ to (positive) zero so that it can | |
143 | * be used as the destination operand by other operations. This | |
144 | * doesn't allocate resources and cannot fail. The | |
145 | * @FLOATBITS_INIT@ macro is a suitable static initializer for | |
146 | * performing the same task. | |
147 | */ | |
148 | ||
149 | extern void fltfmt_initbits(struct floatbits */*x*/); | |
150 | ||
151 | /* --- @fltfmt_freebits@ --- * | |
152 | * | |
153 | * Arguments: @struct floatbits *x@ = pointer to structure to free | |
154 | * | |
155 | * Returns: --- | |
156 | * | |
157 | * Use: Releases the memory held by @x@. Afterwards, @x@ is a valid | |
158 | * (positive) zero, but can safely be discarded. | |
159 | */ | |
160 | ||
161 | extern void fltfmt_freebits(struct floatbits */*x*/); | |
162 | ||
163 | /* --- @fltfmt_allocfrac@ --- * | |
164 | * | |
165 | * Arguments: @struct floatbits *x@ = structure to adjust | |
166 | * @unsigned n@ = number of words required | |
167 | * | |
168 | * Returns: --- | |
169 | * | |
170 | * Use: Reallocate the @frac@ vector so that it has space for at | |
171 | * least @n@ 32-bit words, and set @x->n@ equal to @n@. If the | |
172 | * current size is already @n@ or greater, then just update the | |
173 | * active length @x->n@ and return; otherwise, any existing vector | |
174 | * is discarded and a fresh, larger one allocated. | |
175 | */ | |
176 | ||
177 | extern void fltfmt_allocfrac(struct floatbits */*x*/, unsigned /*n*/); | |
178 | ||
179 | /* --- @fltfmt_copybits@ --- * | |
180 | * | |
181 | * Arguments: @struct floatbits *z_out@ = where to leave the result | |
182 | * @const struct floatbits *x@ = source to copy | |
183 | * | |
184 | * Returns: --- | |
185 | * | |
186 | * Use: Make @z_out@ be a copy of @x@. If @z_out@ is the same object | |
187 | * as @x@ then do nothing. | |
188 | */ | |
189 | ||
190 | extern void fltfmt_copybits(struct floatbits */*z_out*/, | |
191 | const struct floatbits */*x*/); | |
192 | ||
193 | /* --- @fltfmt_round@ --- * | |
194 | * | |
195 | * Arguments: @struct floatbits *z_out@ = destination (may equal source) | |
196 | * @const struct floatbits *x@ = source | |
197 | * @unsigned r@ = rounding mode (@FLTRND_...@ code) | |
198 | * @unsigned n@ = nonzero number of bits to leave | |
199 | * | |
200 | * Returns: A @FLTERR_...@ code, specifically either @FLTERR_INEXACT@ if | |
201 | * rounding discarded some nonzero value bits, or @FLTERR_OK@ if | |
202 | * rounding was unnecessary. | |
203 | * | |
204 | * Use: Rounds a floating-point value to a given number of | |
205 | * significant bits, using the given rounding rule. | |
206 | */ | |
207 | ||
208 | extern unsigned fltfmt_round(struct floatbits */*z_out*/, | |
209 | const struct floatbits */*x*/, | |
210 | unsigned /*r*/, unsigned /*n*/); | |
211 | ||
212 | /*----- IEEE formats ------------------------------------------------------*/ | |
213 | ||
214 | struct fltfmt_ieeefmt { | |
215 | /* Description of a binary IEEE floating-point format. | |
216 | * | |
217 | * An IEEE binary floating-point encoding is split into three fields, | |
218 | * called %$\sigma$%, %$e'$%, and %$m$%. | |
219 | * | |
220 | * The %$\sigma$% field encodes the sign as a single bit: if %$\sigma = 0$% | |
221 | * then the value is nonnegative; if %$\sigma = 1$% then the value is | |
222 | * negative. Signed-magnitude encoding is used: if the rest of the | |
223 | * encoding represents a (necessarily nonnegative) value %$x$% then the | |
224 | * signed value is %$(-1)^\sigma \cdot x$%. | |
225 | * | |
226 | * The %$e'$% field encodes the exponent in a field of %$w$% bits. The | |
227 | * true exponent %$e = e' - e_0$%, where %$e_0 = 2^{w-1} - 1$% is the | |
228 | * %%\emph{exponent bias}%%. The maximum exponent for finite values is | |
229 | * %$e_{\text{max}} = 2^w - 2 - e_0 = 2^{w-1} - 1$%, which is | |
230 | * coincidentally equal to %$e_0$%; and the minimum exponent for | |
231 | * %%\emph{normal}%% finite values is %$e_{\text{min}} = 1 - e_0 = {}$% | |
232 | * %$2 - 2^{w-1}$%. The maximum exponent value %$2^w - 1$% denotes | |
233 | * infinities and NaN values, while the minimum value denotes zeros and | |
234 | * subnormal values. | |
235 | * | |
236 | * If a `hidden-bit' convention is used (@FLTIF_HIDDEN@ is set in @f@), | |
237 | * then %$h = 1$%; otherwise, %$h = 0$%. | |
238 | * | |
239 | * The %$m$% field encodes the %$p$%-bit %%\emph{significand}%%. If a | |
240 | * `hidden-bit' convention is used then the %$m$% field is actually %$p - | |
241 | * 1$% bits wide; otherwise, it is %$p$% bits. | |
242 | * | |
243 | * * If %$e_{\text{min}} \le e \le e_{\text{max}}$% then the encoding | |
244 | * represents a %%\emph{normal}%% value, specifically the value | |
245 | * %$x = (-1)^\sigma \cdot (h + m/2^{p-1}) \cdot 2^e$%. In formats | |
246 | * which do not use the hidden-bit convention, the most significant bit | |
247 | * of %$m$% must be set; we return @FLTERR_INVAL@ for other | |
248 | * encodings, and interpret the `unnormal' value as encoded. | |
249 | * | |
250 | * * If %$e = e_{\text{min}} - 1$% then the encoding represents (signed) | |
251 | * zero if %$m = 0$%, or a %%\emph{subnormal}%% value %$x = (-1)^\sigma | |
252 | * \cdot m/2^{p-1} \cdot 2^{e_{\text{min}}}$%. Note that, in formats | |
253 | * which do not use the hidden-bit convention, the unit bit should be | |
254 | * clear; we return @FLTERR_INVAL@ for other encodings, and interpret | |
255 | * the `pseudo-denormal' value as encoded. | |
256 | * | |
257 | * * If %$e = e_{\text{max}} + 1$% then the encoding represents | |
258 | * %$(-1)^\sigma \cdot \infty$% if %$m = 0$%, or a not-a-number value | |
259 | * (NaN) with payload %$m \ne 0$%. A %%\emph{quiet}%% NaN has bit | |
260 | * %$p - 2$% set in %$m$%; a signalling NaN has this bit reset. Note | |
261 | * that some platforms' native formats reverse this convention, but | |
262 | * this is handled in code which deals with native formats: the | |
263 | * interchange formats described here always indicate quiet NaNs by | |
264 | * setting the bit. In formats which do not use the hidden-bit convention, the | |
265 | * unit bit %$p - 1$% is ignored. | |
266 | */ | |
267 | ||
268 | unsigned f; /* flags */ | |
c752173d | 269 | #define FLTIF_HIDDEN 1u /* unit bit is implicit */ |
b1a20bee MW |
270 | unsigned expwd; /* exponent field width %$w$% */ |
271 | unsigned prec; /* precision %$p$% */ | |
272 | }; | |
273 | ||
274 | /* IEEE (and related) format descriptions. */ | |
275 | extern const struct fltfmt_ieeefmt | |
276 | fltfmt_f16, fltfmt_f32, fltfmt_f64, fltfmt_f128, | |
277 | fltfmt_mini, fltfmt_bf16, fltfmt_idblext80; | |
278 | ||
279 | /* --- @fltfmt_encieee@ --- * | |
280 | * | |
281 | * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description | |
282 | * @uint32 *z@ = output vector | |
283 | * @const struct floatbits *x@ = value to encode | |
284 | * @unsigned r@ = rounding mode | |
285 | * @unsigned errmask@ = error mask | |
286 | * | |
287 | * Returns: Error flags (@FLTERR_...@). | |
288 | * | |
289 | * Use: Encode a floating-point value in an IEEE format. This is the | |
290 | * machinery shared by the @fltfmt_enc...@ functions for | |
291 | * encoding IEEE-format values. Most of the arguments and | |
292 | * behaviour are as described for those functions. | |
293 | * | |
294 | * The encoded value is right-aligned and big-endian; i.e., the | |
295 | * sign bit ends up in @z[0]@, and the least significant bit of | |
296 | * the significand ends up in the least significant bit of | |
297 | * @z[n - 1]@. | |
298 | */ | |
299 | ||
300 | extern unsigned fltfmt_encieee(const struct fltfmt_ieeefmt */*fmt*/, | |
301 | uint32 */*z*/, const struct floatbits */*x*/, | |
302 | unsigned /*r*/, unsigned /*errmask*/); | |
303 | ||
304 | /* --- @fltfmt_encTY@ --- * | |
305 | * | |
306 | * Arguments: @octet *z_out@, @uint16 *z_out@, @uint32 *z_out@, | |
307 | * @kludge64 *z_out@ = where to put the encoded value | |
308 | * @uint16 *se_out@, @kludge64 *m_out@ = where to put the | |
309 | * encoded sign-and-exponent and significand | |
310 | * @const struct floatbits *x@ = value to encode | |
311 | * @unsigned r@ = rounding mode | |
312 | * @unsigned errmask@ = error mask | |
313 | * | |
314 | * Returns: Error flags (@FLTERR_...@). | |
315 | * | |
316 | * Use: Encode a floating-point value in an IEEE (or IEEE-adjacent) | |
317 | * format. | |
318 | * | |
319 | * If an error is encountered during the encoding, and the | |
320 | * corresponding bit of @errmask@ is clear, then processing | |
321 | * stops immediately and the error is returned; if the bit is | |
322 | * set, then processing continues as described below. | |
323 | * | |
324 | * The @TY@ may be | |
325 | * | |
326 | * * @mini@ for the 8-bit `1.4.3 minifloat' format, with | |
327 | * four-bit exponent and four-bit significand, represented | |
328 | * as a single octet; | |
329 | * | |
330 | * * @bf16@ for the Google `bfloat16' format, with eight-bit | |
331 | * exponent and eight-bit significand, represented as a | |
332 | * @uint16@; | |
333 | * | |
334 | * * @f16@ for the IEEE `binary16' format, with five-bit | |
335 | * exponent and eleven-bit significand, represented as a | |
336 | * @uint16@; | |
337 | * | |
338 | * * @f32@ for the IEEE `binary32' format, with eight-bit | |
339 | * exponent and 24-bit significand, represented as a | |
340 | * @uint32@; | |
341 | * | |
342 | * * @f64@ for the IEEE `binary64' format, with eleven-bit | |
343 | * exponent and 53-bit significand, represented as a | |
344 | * @kludge64@; | |
345 | * | |
346 | * * @f128@ for the IEEE `binary128' format, with fifteen-bit | |
347 | * exponent and 113-bit significand, represented as four | |
348 | * @uint32@ limbs, most significant first; or | |
349 | * | |
350 | * * @idblext80@ for the Intel 80-bit `double extended' | |
351 | * format, with fifteen-bit exponent and 64-bit significand | |
352 | * with no hidden bit, represented as a @uint16 se@ | |
353 | * holding the sign and exponent, and a @kludge64 m@ | |
354 | * holding the significand. | |
355 | * | |
356 | * Positive and negative zero and infinity are representable | |
357 | * exactly. | |
358 | * | |
359 | * Following IEEE recommendations (and most implementations), | |
360 | * the most significant fraction bit of a quiet NaN is set; this | |
361 | * bit is clear in a signalling NaN. The most significant | |
362 | * payload bits of a NaN, held in the top bits of @x->frac[0]@, | |
363 | * are encoded in the output significand following the `quiet' | |
364 | * bit. If the chosen format's significand field is too small | |
365 | * to accommodate all of the set payload bits then the | |
366 | * @FLTERR_INEXACT@ error bit is set and, if masked, the | |
367 | * excess payload bits are discarded. No rounding of NaN | |
368 | * payloads is performed. | |
369 | * | |
370 | * Otherwise, the input value is finite and nonzero. If the | |
371 | * significand cannot be represented exactly then the | |
372 | * @FLTERR_INEXACT@ error bit is set, and, if masked, the value | |
373 | * will be rounded (internally -- the input @x@ is not changed). | |
374 | * If the (rounded) value's exponent is too large to represent, | |
375 | * then the @FLTERR_OFLOW@ and @FLTERR_INEXACT@ error bits are | |
376 | * set and, if masked, the result is either the (absolute) | |
377 | * largest representable finite value or infinity, with the | |
378 | * appropriate sign, chosen according to the rounding mode. If | |
379 | * the exponent is too small to represent, then the | |
380 | * @FLTERR_UFLOW@ and @FLTERR_INEXACT@ error bits are set and, | |
381 | * if masked, the result is either the (absolute) smallest | |
382 | * nonzero value or zero, with the appropriate sign, chosen | |
383 | * according to the rounding mode. | |
384 | */ | |
385 | ||
386 | extern unsigned fltfmt_encmini(octet */*z_out*/, | |
387 | const struct floatbits */*x*/, | |
388 | unsigned /*r*/, unsigned /*errmask*/); | |
389 | ||
390 | extern unsigned fltfmt_encbf16(uint16 */*z_out*/, | |
391 | const struct floatbits */*x*/, | |
392 | unsigned /*r*/, unsigned /*errmask*/); | |
393 | ||
394 | extern unsigned fltfmt_encf16(uint16 */*z_out*/, | |
395 | const struct floatbits */*x*/, | |
396 | unsigned /*r*/, unsigned /*errmask*/); | |
397 | ||
398 | extern unsigned fltfmt_encf32(uint32 */*z_out*/, | |
399 | const struct floatbits */*x*/, | |
400 | unsigned /*r*/, unsigned /*errmask*/); | |
401 | ||
402 | extern unsigned fltfmt_encf64(kludge64 */*z_out*/, | |
403 | const struct floatbits */*x*/, | |
404 | unsigned /*r*/, unsigned /*errmask*/); | |
405 | ||
406 | extern unsigned fltfmt_encf128(uint32 */*z_out*/, | |
407 | const struct floatbits */*x*/, | |
408 | unsigned /*r*/, unsigned /*errmask*/); | |
409 | ||
410 | extern unsigned fltfmt_encidblext80(uint16 */*se_out*/, kludge64 */*m_out*/, | |
411 | const struct floatbits */*x*/, | |
412 | unsigned /*r*/, unsigned /*errmask*/); | |
413 | ||
414 | /* --- @fltfmt_decieee@ --- * | |
415 | * | |
416 | * Arguments: @const struct fltfmt_ieeefmt *fmt@ = format description | |
417 | * @struct floatbits *z_out@ = output decoded representation | |
418 | * @const uint32 *x@ = input encoding | |
419 | * | |
420 | * Returns: Error flags (@FLTERR_...@). | |
421 | * | |
422 | * Use: Decode a floating-point value in an IEEE format. This is the | |
423 | * machinery shared by the @fltfmt_dec...@ functions for | |
424 | * decoding IEEE-format values. Most of the arguments and | |
425 | * behaviour are as described for those functions. | |
426 | * | |
427 | * The encoded value should be right-aligned and big-endian; | |
428 | * i.e., the sign bit is found in @x[0]@, and the least | |
429 | * significant bit of the significand is found in the least | |
430 | * significant bit of @x[n - 1]@. | |
431 | */ | |
432 | ||
433 | extern unsigned fltfmt_decieee(const struct fltfmt_ieeefmt */*fmt*/, | |
434 | struct floatbits */*z_out*/, | |
435 | const uint32 */*x*/); | |
436 | ||
437 | /* --- @fltfmt_decTY@ --- * | |
438 | * | |
439 | * Arguments: @struct floatbits *z_out@ = storage for the result | |
440 | * @octet x@, @uint16 x@, @uint32 x@, @kludge64 x@ = | |
441 | * encoded input | |
442 | * @uint16 se@, @kludge64 m@ = encoded sign-and-exponent and | |
443 | * significand | |
444 | * | |
445 | * Returns: Error flags (@FLTERR_...@). | |
446 | * | |
447 | * Use: Decode a floating-point value in an IEEE (or IEEE-adjacent) | |
448 | * format. | |
449 | * | |
450 | * The options for @TY@ are as documented for the encoding | |
451 | * functions above. | |
452 | * | |
453 | * In formats without a hidden bit -- currently only @idblext80@ | |
454 | * -- not all bit patterns are valid encodings. If the explicit | |
455 | * unit bit is set when the exponent field is all-bits-zero, or | |
456 | * clear when the exponent field is not all-bits-zero, then the | |
457 | * @FLTERR_INVAL@ error bit is set. If the exponent is all- | |
458 | * bits-set, denoting infinity or a NaN, then the unit bit is | |
459 | * otherwise ignored -- in particular, it does not affect the | |
460 | * NaN payload, or even whether the input encodes a NaN or | |
461 | * infinity. Otherwise, the unit bit is considered significant, | |
462 | * and the result is normalized as one would expect. | |
463 | * Consequently, biased exponent values 0 and 1 are distinct | |
464 | * only with respect to which bit patterns are considered valid, | |
465 | * and not with respect to the set of values denoted. | |
466 | */ | |
467 | ||
468 | extern unsigned fltfmt_decmini(struct floatbits */*z_out*/, octet /*x*/); | |
469 | ||
470 | extern unsigned fltfmt_decbf16(struct floatbits */*z_out*/, uint16 /*x*/); | |
471 | ||
472 | extern unsigned fltfmt_decf16(struct floatbits */*z_out*/, uint16 /*x*/); | |
473 | ||
474 | extern unsigned fltfmt_decf32(struct floatbits */*z_out*/, uint32 /*x*/); | |
475 | ||
476 | extern unsigned fltfmt_decf64(struct floatbits */*z_out*/, kludge64 /*x*/); | |
477 | ||
478 | extern unsigned fltfmt_decf128(struct floatbits */*z_out*/, | |
479 | const uint32 */*x*/); | |
480 | ||
481 | extern unsigned fltfmt_decidblext80(struct floatbits */*z_out*/, | |
482 | uint16 /*se*/, kludge64 /*m*/); | |
483 | ||
484 | /*----- Native formats ----------------------------------------------------*/ | |
485 | ||
486 | /* Hacking for platforms which ill-advisedly have the opposite sense for the | |
487 | * quiet NaN bit. | |
488 | * | |
489 | * Obviously we toggle the quiet bit, but there's a problem: if the quiet bit | |
490 | * is the only one set, then if we toggle it, the fraction will become zero | |
491 | * and we'll be left with an infinity. Follow MIPS and set all of the bits. | |
492 | * | |
493 | * This is all internal machinery and shouldn't be relied on by applications. | |
494 | */ | |
495 | #if defined(__hppa__) || (defined(__mips__) && !defined(__mips_nan2008)) | |
496 | # define FLTFMT__MUST_FROB_NANS /* defined iff the macros below do real work */ | |
497 | ||
498 | # define FLTFMT__FROB_NAN_F32(x_inout, rc) do { \ | |
499 | uint32 *_x_inout_ = (x_inout), _x0_ = _x_inout_[0]; \ | |
500 | \ | |
501 | if ((_x0_&0x7f800000) != 0x7f800000 || !(_x0_&0x007fffff)) \ | |
502 | ; /* finite or infinite: nothing to do */ \ | |
503 | else if (_x0_&0x003fffff) /* other fraction bits set: flip quiet bit */ \ | |
504 | _x_inout_[0] = _x0_ ^ 0x00400000; \ | |
505 | else { /* only the quiet bit set: flipping it would make an infinity */ \ | |
506 | _x_inout_[0] = (_x0_&0x80000000) | 0x7fffffff; \ | |
507 | (rc) |= FLTERR_INEXACT; \ | |
508 | } \ | |
509 | } while (0) | |
510 | ||
511 | # define FLTFMT__FROB_NAN_F64(x_inout, rc) do { \ | |
512 | uint32 *_x_inout_ = (x_inout), \ | |
513 | _x0_ = _x_inout_[0], _x1_ = _x_inout_[1]; \ | |
514 | \ | |
515 | if ((_x0_&0x7ff00000) != 0x7ff00000 || (!(_x0_&0x000fffff) && !_x1_)) \ | |
516 | ; /* finite or infinite: nothing to do */ \ | |
517 | else if ((_x0_&0x0007ffff) || _x1_) /* other fraction bits set: flip quiet bit */ \ | |
518 | _x_inout_[0] = _x0_ ^ 0x00080000; \ | |
519 | else { /* only the quiet bit set: flipping it would make an infinity */ \ | |
520 | _x_inout_[0] = (_x0_&0x80000000) | 0x7fffffff; \ | |
521 | _x_inout_[1] = 0xffffffff; \ | |
522 | (rc) |= FLTERR_INEXACT; \ | |
523 | } \ | |
524 | } while (0) | |
525 | ||
/* Convert a binary128 encoding between quiet-NaN conventions, in place.
 *
 * @x_inout@ addresses four @uint32@ limbs, most significant first, and @rc@
 * is an lvalue which accumulates @FLTERR_...@ bits.  Limb 0 holds the sign
 * (bit 31), the fifteen-bit exponent (bits 16--30), and the top sixteen
 * fraction bits (bits 0--15), of which bit 15 is the quiet bit; limbs 1--3
 * hold the rest of the fraction.
 *
 * Finite values and infinities are left alone.  A NaN with fraction bits
 * set other than the quiet bit just has its quiet bit toggled.  A NaN with
 * only the quiet bit set would turn into an infinity if toggled, so instead
 * all of the bits below the sign are set, and @FLTERR_INEXACT@ is reported.
 */
# define FLTFMT__FROB_NAN_F128(x_inout, rc) do {                        \
    uint32 *_x_inout_ = (x_inout),                                      \
      _x0_ = _x_inout_[0], _x1_ = _x_inout_[1],                         \
      _x2_ = _x_inout_[2], _x3_ = _x_inout_[3];                         \
                                                                        \
    /* The fraction mask must cover only bits 0--15 of limb 0: a wider  \
     * mask would overlap the (all-ones) exponent and never see zero.   \
     */                                                                 \
    if ((_x0_&0x7fff0000) != 0x7fff0000 ||                              \
        (!(_x0_&0x0000ffff) && !_x1_ && !_x2_ && !_x3_))                \
      ;                                                                 \
    else if ((_x0_&0x00007fff) || _x1_ || _x2_ || _x3_)                 \
      _x_inout_[0] = _x0_ ^ 0x00008000;                                 \
    else {                                                              \
      _x_inout_[0] = (_x0_&0x80000000) | 0x7fffffff;                    \
      _x_inout_[1] = _x_inout_[2] = _x_inout_[3] = 0xffffffff;          \
      (rc) |= FLTERR_INEXACT;                                           \
    }                                                                   \
  } while (0)
542 | ||
/* Convert an Intel double-extended encoding between quiet-NaN conventions,
 * in place.
 *
 * @x_inout@ addresses three @uint32@ words, and @rc@ is an lvalue which
 * accumulates @FLTERR_...@ bits.  Word 0 holds the fifteen-bit exponent in
 * its low bits; words 1 and 2 hold the significand, most significant first.
 * Bit 31 of word 1 is the explicit unit bit, which is always preserved and
 * plays no part in deciding whether the value is a NaN; bit 30 is the quiet
 * bit.
 *
 * Finite values and infinities are left alone.  A NaN with fraction bits
 * set other than the quiet bit just has its quiet bit toggled.  A NaN with
 * only the quiet bit set would turn into an infinity if toggled, so instead
 * the rest of the fraction is filled in and @FLTERR_INEXACT@ is reported,
 * as for the other formats.
 *
 * NOTE(review): unlike the other formats, the fill value leaves the quiet
 * bit itself clear (%$\texttt{3fffffff}$% rather than
 * %$\texttt{7fffffff}$%).  This is kept as found; confirm which is
 * intended.
 */
# define FLTFMT__FROB_NAN_IDBLEXT80(x_inout, rc) do {                   \
    uint32 *_x_inout_ = (x_inout),                                      \
      _x0_ = _x_inout_[0], _x1_ = _x_inout_[1], _x2_ = _x_inout_[2];    \
                                                                        \
    if ((_x0_&0x00007fff) != 0x00007fff ||                              \
        (!(_x1_&0x7fffffff) && !_x2_))                                  \
      ;                                                                 \
    else if ((_x1_&0x3fffffff) || _x2_)                                 \
      _x_inout_[1] = _x1_ ^ 0x40000000;                                 \
    else {                                                              \
      _x_inout_[1] = (_x1_&0x80000000) | 0x3fffffff; /* preserve unit */ \
      _x_inout_[2] = 0xffffffff;                                        \
      (rc) |= FLTERR_INEXACT;                                           \
    }                                                                   \
  } while (0)
556 | ||
557 | #else /* quiet bit already has the usual sense: these do nothing */ | |
558 | # define FLTFMT__FROB_NAN_F32(x_inout, rc) do ; while (0) | |
559 | # define FLTFMT__FROB_NAN_F64(x_inout, rc) do ; while (0) | |
560 | # define FLTFMT__FROB_NAN_F128(x_inout, rc) do ; while (0) | |
561 | # define FLTFMT__FROB_NAN_IDBLEXT80(x_inout, rc) do ; while (0) | |
562 | #endif /* platforms with the opposite quiet-bit sense */ | |
563 | ||
/* --- @fltfmt_encTY@ --- *
 *
 * Arguments:   @ty *z_out@ = storage for the result
 *              @const struct floatbits *x@ = value to encode
 *              @unsigned r@ = rounding mode
 *
 * Returns:     Error flags (@FLTERR_...@).
 *
 * Use:         Encode the floating-point value @x@ as a native C object and
 *              store the result in @z_out@.
 *
 *              The @TY@ may be @flt@ to encode a @float@, @dbl@ to encode a
 *              @double@, or (on C99 implementations) @ldbl@ to encode a
 *              @long double@.
 *
 *              In detail, conversion is performed as follows.
 *
 *                * If a non-finite value cannot be represented by the
 *                  implementation then the @FLTERR_REPR@ error bit is set
 *                  and @*z_out@ is set to zero if @x@ is a NaN, or the
 *                  (absolute) largest representable value, with appropriate
 *                  sign, if @x@ is an infinity.
 *
 *                * If the implementation can represent NaNs, but cannot set
 *                  NaN payloads, then the @FLTERR_INEXACT@ error bit is set,
 *                  and @*z_out@ is set to an arbitrary (quiet) NaN value.
 *
 *                * If @x@ is negative zero, but the implementation does not
 *                  distinguish negative and positive zero, then the
 *                  @FLTERR_INEXACT@ error bit is set and @*z_out@ is set to
 *                  zero.
 *
 *                * If the implementation's floating-point radix is not a
 *                  power of two, and @x@ is a nonzero finite value, then
 *                  the @FLTERR_INEXACT@ error bit is set (unconditionally),
 *                  and the value is rounded by the implementation using its
 *                  prevailing rounding policy.  If the radix is a power of
 *                  two, then the @FLTERR_INEXACT@ error bit is set only if
 *                  rounding is necessary, and rounding is performed using
 *                  the rounding mode @r@.
 */

extern unsigned fltfmt_encflt(float */*z_out*/,
                              const struct floatbits */*x*/,
                              unsigned /*r*/);

extern unsigned fltfmt_encdbl(double */*z_out*/,
                              const struct floatbits */*x*/,
                              unsigned /*r*/);

/* The @long double@ variant is only declared on C99 (or later)
 * implementations: C99 announces itself with @__STDC_VERSION__@ equal to
 * %$\texttt{199901L}$%.  Check that the macro is defined first, so that
 * older compilers (and `-Wundef') have nothing to complain about.
 */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
extern unsigned fltfmt_encldbl(long double */*z_out*/,
                               const struct floatbits */*x*/,
                               unsigned /*r*/);
#endif
619 | ||
/* --- @fltfmt_decTY@ --- *
 *
 * Arguments:   @struct floatbits *z_out@ = storage for the result
 *              @ty x@ = value to decode
 *              @unsigned r@ = rounding mode
 *
 * Returns:     Error flags (@FLTERR_...@).
 *
 * Use:         Decode the native C floating-point value @x@ and store the
 *              result in @z_out@.
 *
 *              The @TY@ may be @flt@ to decode a @float@, @dbl@ to decode a
 *              @double@, or (on C99 implementations) @ldbl@ to decode a
 *              @long double@.
 *
 *              In detail, conversion is performed as follows.
 *
 *                * If the implementation supports negative zeros and/or
 *                  infinity, then these are recognized and decoded.
 *
 *                * If the input is a NaN, but the implementation cannot
 *                  usefully report NaN payloads, then the @FLTERR_INEXACT@
 *                  error bit is set and the decoded payload is left empty.
 *
 *                * If the implementation's floating-point radix is not a
 *                  power of two, and @x@ is a nonzero finite value, then
 *                  the @FLTERR_INEXACT@ error bit is set (unconditionally),
 *                  and the rounded value (according to the rounding mode
 *                  @r@) is stored in as many fraction words as necessary to
 *                  identify the original value uniquely.  If the radix is a
 *                  power of two, then the value is represented exactly.
 */

extern unsigned fltfmt_decflt(struct floatbits */*z_out*/,
                              float /*x*/, unsigned /*r*/);

extern unsigned fltfmt_decdbl(struct floatbits */*z_out*/,
                              double /*x*/, unsigned /*r*/);

/* The @long double@ variant is only declared on C99 (or later)
 * implementations: C99 announces itself with @__STDC_VERSION__@ equal to
 * %$\texttt{199901L}$%.  Check that the macro is defined first, so that
 * older compilers (and `-Wundef') have nothing to complain about.
 */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
extern unsigned fltfmt_decldbl(struct floatbits */*z_out*/,
                               long double /*x*/, unsigned /*r*/);
#endif
663 | ||
664 | /*----- Some common conversions packaged up -------------------------------*/ | |
665 | ||
666 | /* --- @fltfmt_CTYtoFTYE@ --- * | |
667 | * | |
668 | * Arguments: @octet *p@ = output pointer | |
669 | * @float x@, @double x@ = value to convert | |
670 | * @unsigned r@ = rounding mode | |
671 | * | |
672 | * Returns: Error flags (@FLTERR_...@). | |
673 | * | |
674 | * Use: Encode a native C floating-point value in an external format. | |
675 | * | |
676 | * The @CTY@ is an abbreviation for a C type: @flt@ for @float@, | |
677 | * or @dbl@ for @double@; @FTY@ is an abbreviation for the | |
678 | * external format, @f32@ for IEEE Binary32, or @f64@ for IEEE | |
679 | * Binary64; and @E@ is @l@ for little-endian or @b@ for | |
680 | * big-endian byte order. Not all combinations are currently | |
681 | * supported. | |
682 | * | |
683 | * On platforms where the external format is used natively, | |
684 | * these functions are simple data copies. | |
685 | */ | |
686 | ||
687 | extern unsigned fltfmt_flttof32l(octet */*p*/, float /*x*/, unsigned /*r*/); | |
688 | extern unsigned fltfmt_flttof32b(octet */*p*/, float /*x*/, unsigned /*r*/); | |
689 | extern unsigned fltfmt_dbltof64l(octet */*p*/, double /*x*/, unsigned /*r*/); | |
690 | extern unsigned fltfmt_dbltof64b(octet */*p*/, double /*x*/, unsigned /*r*/); | |
691 | ||
692 | /* --- @fltfmt_FTYEtoCTY@ --- * | |
693 | * | |
694 | * Arguments: @float *z_out@, @double *z_out@ = storage for output | |
695 | * @const octet *p@ = input pointer | |
696 | * @unsigned r@ = rounding mode | |
697 | * | |
698 | * Returns: Error flags (@FLTERR_...@). | |
699 | * | |
700 | * Use: Decode a floating-point value in an external format into a | |
701 | * native value. | |
702 | * | |
703 | * The naming conventions are the same as for @fltfmt_dbltof64b@ | |
704 | * above. | |
705 | * | |
706 | * On platforms where the external format is used natively, | |
707 | * these functions are simple data copies. | |
708 | */ | |
709 | ||
710 | extern unsigned fltfmt_f32ltoflt(float */*z_out*/, const octet */*p*/, | |
711 | unsigned /*r*/); | |
712 | extern unsigned fltfmt_f32btoflt(float */*z_out*/, const octet */*p*/, | |
713 | unsigned /*r*/); | |
714 | extern unsigned fltfmt_f64ltodbl(double */*z_out*/, const octet */*p*/, | |
715 | unsigned /*r*/); | |
716 | extern unsigned fltfmt_f64btodbl(double */*z_out*/, const octet */*p*/, | |
717 | unsigned /*r*/); | |
718 | ||
719 | /*----- That's all, folks -------------------------------------------------*/ | |
720 | ||
721 | #ifdef __cplusplus | |
722 | } | |
723 | #endif | |
724 | ||
725 | #endif |