Remove building with NOCRYPTO option
[minix.git] / lib / libc / softfloat / bits64 / softfloat.c
blobaed0c6ab12d95c71dba4bd03b714741e64088328
1 /* $NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $ */
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
9 /*
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $");
50 #endif /* LIBC_SCCS and not lint */
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
54 #endif
56 #include "milieu.h"
57 #include "softfloat.h"
/*
 * Conversions between floats as stored in memory and floats as
 * SoftFloat uses them.  Both default to the identity unless the including
 * environment has already defined them.
 */
#ifndef FLOAT64_DEMANGLE
#define FLOAT64_DEMANGLE(a)	(a)
#endif
#ifndef FLOAT64_MANGLE
#define FLOAT64_MANGLE(a)	(a)
#endif
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
73 and exception flags.
74 -------------------------------------------------------------------------------
76 #ifndef set_float_rounding_mode
77 fp_rnd float_rounding_mode = float_round_nearest_even;
78 fp_except float_exception_flags = 0;
79 #endif
80 #ifndef set_float_exception_inexact_flag
81 #define set_float_exception_inexact_flag() \
82 ((void)(float_exception_flags |= float_flag_inexact))
83 #endif
84 #ifdef FLOATX80
85 int8 floatx80_rounding_precision = 80;
86 #endif
89 -------------------------------------------------------------------------------
90 Primitive arithmetic functions, including multi-word arithmetic, and
91 division and square root approximations. (Can be specialized to target if
92 desired.)
93 -------------------------------------------------------------------------------
95 #include "softfloat-macros"
98 -------------------------------------------------------------------------------
99 Functions and definitions to determine: (1) whether tininess for underflow
100 is detected before or after rounding by default, (2) what (if anything)
101 happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 are propagated from function inputs to output. These details are target-
104 specific.
105 -------------------------------------------------------------------------------
107 #include "softfloat-specialize"
109 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
111 -------------------------------------------------------------------------------
112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
113 and 7, and returns the properly rounded 32-bit integer corresponding to the
114 input. If `zSign' is 1, the input is negated before being converted to an
115 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
116 is simply rounded to an integer, with the inexact exception raised if the
117 input cannot be represented exactly as an integer. However, if the fixed-
118 point input is too large, the invalid exception is raised and the largest
119 positive or negative integer is returned.
120 -------------------------------------------------------------------------------
122 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
124 int8 roundingMode;
125 flag roundNearestEven;
126 int8 roundIncrement, roundBits;
127 int32 z;
129 roundingMode = float_rounding_mode;
130 roundNearestEven = ( roundingMode == float_round_nearest_even );
131 roundIncrement = 0x40;
132 if ( ! roundNearestEven ) {
133 if ( roundingMode == float_round_to_zero ) {
134 roundIncrement = 0;
136 else {
137 roundIncrement = 0x7F;
138 if ( zSign ) {
139 if ( roundingMode == float_round_up ) roundIncrement = 0;
141 else {
142 if ( roundingMode == float_round_down ) roundIncrement = 0;
146 roundBits = (int8)(absZ & 0x7F);
147 absZ = ( absZ + roundIncrement )>>7;
148 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
149 z = (int32)absZ;
150 if ( zSign ) z = - z;
151 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
152 float_raise( float_flag_invalid );
153 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
155 if ( roundBits ) set_float_exception_inexact_flag();
156 return z;
161 -------------------------------------------------------------------------------
162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
163 `absZ1', with binary point between bits 63 and 64 (between the input words),
164 and returns the properly rounded 64-bit integer corresponding to the input.
165 If `zSign' is 1, the input is negated before being converted to an integer.
166 Ordinarily, the fixed-point input is simply rounded to an integer, with
167 the inexact exception raised if the input cannot be represented exactly as
168 an integer. However, if the fixed-point input is too large, the invalid
169 exception is raised and the largest positive or negative integer is
170 returned.
171 -------------------------------------------------------------------------------
173 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
175 int8 roundingMode;
176 flag roundNearestEven, increment;
177 int64 z;
179 roundingMode = float_rounding_mode;
180 roundNearestEven = ( roundingMode == float_round_nearest_even );
181 increment = ( (sbits64) absZ1 < 0 );
182 if ( ! roundNearestEven ) {
183 if ( roundingMode == float_round_to_zero ) {
184 increment = 0;
186 else {
187 if ( zSign ) {
188 increment = ( roundingMode == float_round_down ) && absZ1;
190 else {
191 increment = ( roundingMode == float_round_up ) && absZ1;
195 if ( increment ) {
196 ++absZ0;
197 if ( absZ0 == 0 ) goto overflow;
198 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
200 z = absZ0;
201 if ( zSign ) z = - z;
202 if ( z && ( ( z < 0 ) ^ zSign ) ) {
203 overflow:
204 float_raise( float_flag_invalid );
205 return
206 zSign ? (sbits64) LIT64( 0x8000000000000000 )
207 : LIT64( 0x7FFFFFFFFFFFFFFF );
209 if ( absZ1 ) set_float_exception_inexact_flag();
210 return z;
213 #endif
216 -------------------------------------------------------------------------------
217 Returns the fraction bits of the single-precision floating-point value `a'.
218 -------------------------------------------------------------------------------
220 INLINE bits32 extractFloat32Frac( float32 a )
223 return a & 0x007FFFFF;
228 -------------------------------------------------------------------------------
229 Returns the exponent bits of the single-precision floating-point value `a'.
230 -------------------------------------------------------------------------------
232 INLINE int16 extractFloat32Exp( float32 a )
235 return ( a>>23 ) & 0xFF;
240 -------------------------------------------------------------------------------
241 Returns the sign bit of the single-precision floating-point value `a'.
242 -------------------------------------------------------------------------------
244 INLINE flag extractFloat32Sign( float32 a )
247 return a>>31;
252 -------------------------------------------------------------------------------
253 Normalizes the subnormal single-precision floating-point value represented
254 by the denormalized significand `aSig'. The normalized exponent and
255 significand are stored at the locations pointed to by `zExpPtr' and
256 `zSigPtr', respectively.
257 -------------------------------------------------------------------------------
259 static void
260 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
262 int8 shiftCount;
264 shiftCount = countLeadingZeros32( aSig ) - 8;
265 *zSigPtr = aSig<<shiftCount;
266 *zExpPtr = 1 - shiftCount;
271 -------------------------------------------------------------------------------
272 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
273 single-precision floating-point value, returning the result. After being
274 shifted into the proper positions, the three fields are simply added
275 together to form the result. This means that any integer portion of `zSig'
276 will be added into the exponent. Since a properly normalized significand
277 will have an integer portion equal to 1, the `zExp' input should be 1 less
278 than the desired result exponent whenever `zSig' is a complete, normalized
279 significand.
280 -------------------------------------------------------------------------------
282 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
285 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
290 -------------------------------------------------------------------------------
291 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
292 and significand `zSig', and returns the proper single-precision floating-
293 point value corresponding to the abstract input. Ordinarily, the abstract
294 value is simply rounded and packed into the single-precision format, with
295 the inexact exception raised if the abstract input cannot be represented
296 exactly. However, if the abstract value is too large, the overflow and
297 inexact exceptions are raised and an infinity or maximal finite value is
298 returned. If the abstract value is too small, the input value is rounded to
299 a subnormal number, and the underflow and inexact exceptions are raised if
300 the abstract input cannot be represented exactly as a subnormal single-
301 precision floating-point number.
302 The input significand `zSig' has its binary point between bits 30
303 and 29, which is 7 bits to the left of the usual location. This shifted
304 significand must be normalized or smaller. If `zSig' is not normalized,
305 `zExp' must be 0; in that case, the result returned is a subnormal number,
306 and it must not require rounding. In the usual case that `zSig' is
307 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
308 The handling of underflow and overflow follows the IEC/IEEE Standard for
309 Binary Floating-Point Arithmetic.
310 -------------------------------------------------------------------------------
312 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
314 int8 roundingMode;
315 flag roundNearestEven;
316 int8 roundIncrement, roundBits;
317 flag isTiny;
319 roundingMode = float_rounding_mode;
320 roundNearestEven = ( roundingMode == float_round_nearest_even );
321 roundIncrement = 0x40;
322 if ( ! roundNearestEven ) {
323 if ( roundingMode == float_round_to_zero ) {
324 roundIncrement = 0;
326 else {
327 roundIncrement = 0x7F;
328 if ( zSign ) {
329 if ( roundingMode == float_round_up ) roundIncrement = 0;
331 else {
332 if ( roundingMode == float_round_down ) roundIncrement = 0;
336 roundBits = zSig & 0x7F;
337 if ( 0xFD <= (bits16) zExp ) {
338 if ( ( 0xFD < zExp )
339 || ( ( zExp == 0xFD )
340 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
342 float_raise( float_flag_overflow | float_flag_inexact );
343 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
345 if ( zExp < 0 ) {
346 isTiny =
347 ( float_detect_tininess == float_tininess_before_rounding )
348 || ( zExp < -1 )
349 || ( zSig + roundIncrement < 0x80000000U );
350 shift32RightJamming( zSig, - zExp, &zSig );
351 zExp = 0;
352 roundBits = zSig & 0x7F;
353 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
356 if ( roundBits ) set_float_exception_inexact_flag();
357 zSig = ( zSig + roundIncrement )>>7;
358 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
359 if ( zSig == 0 ) zExp = 0;
360 return packFloat32( zSign, zExp, zSig );
365 -------------------------------------------------------------------------------
366 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
367 and significand `zSig', and returns the proper single-precision floating-
368 point value corresponding to the abstract input. This routine is just like
369 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
370 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
371 floating-point exponent.
372 -------------------------------------------------------------------------------
374 static float32
375 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
377 int8 shiftCount;
379 shiftCount = countLeadingZeros32( zSig ) - 1;
380 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
385 -------------------------------------------------------------------------------
386 Returns the fraction bits of the double-precision floating-point value `a'.
387 -------------------------------------------------------------------------------
389 INLINE bits64 extractFloat64Frac( float64 a )
392 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
397 -------------------------------------------------------------------------------
398 Returns the exponent bits of the double-precision floating-point value `a'.
399 -------------------------------------------------------------------------------
401 INLINE int16 extractFloat64Exp( float64 a )
404 return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
409 -------------------------------------------------------------------------------
410 Returns the sign bit of the double-precision floating-point value `a'.
411 -------------------------------------------------------------------------------
413 INLINE flag extractFloat64Sign( float64 a )
416 return (flag)(FLOAT64_DEMANGLE(a) >> 63);
421 -------------------------------------------------------------------------------
422 Normalizes the subnormal double-precision floating-point value represented
423 by the denormalized significand `aSig'. The normalized exponent and
424 significand are stored at the locations pointed to by `zExpPtr' and
425 `zSigPtr', respectively.
426 -------------------------------------------------------------------------------
428 static void
429 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
431 int8 shiftCount;
433 shiftCount = countLeadingZeros64( aSig ) - 11;
434 *zSigPtr = aSig<<shiftCount;
435 *zExpPtr = 1 - shiftCount;
440 -------------------------------------------------------------------------------
441 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
442 double-precision floating-point value, returning the result. After being
443 shifted into the proper positions, the three fields are simply added
444 together to form the result. This means that any integer portion of `zSig'
445 will be added into the exponent. Since a properly normalized significand
446 will have an integer portion equal to 1, the `zExp' input should be 1 less
447 than the desired result exponent whenever `zSig' is a complete, normalized
448 significand.
449 -------------------------------------------------------------------------------
451 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
454 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
455 ( ( (bits64) zExp )<<52 ) + zSig );
460 -------------------------------------------------------------------------------
461 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
462 and significand `zSig', and returns the proper double-precision floating-
463 point value corresponding to the abstract input. Ordinarily, the abstract
464 value is simply rounded and packed into the double-precision format, with
465 the inexact exception raised if the abstract input cannot be represented
466 exactly. However, if the abstract value is too large, the overflow and
467 inexact exceptions are raised and an infinity or maximal finite value is
468 returned. If the abstract value is too small, the input value is rounded to
469 a subnormal number, and the underflow and inexact exceptions are raised if
470 the abstract input cannot be represented exactly as a subnormal double-
471 precision floating-point number.
472 The input significand `zSig' has its binary point between bits 62
473 and 61, which is 10 bits to the left of the usual location. This shifted
474 significand must be normalized or smaller. If `zSig' is not normalized,
475 `zExp' must be 0; in that case, the result returned is a subnormal number,
476 and it must not require rounding. In the usual case that `zSig' is
477 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
478 The handling of underflow and overflow follows the IEC/IEEE Standard for
479 Binary Floating-Point Arithmetic.
480 -------------------------------------------------------------------------------
482 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
484 int8 roundingMode;
485 flag roundNearestEven;
486 int16 roundIncrement, roundBits;
487 flag isTiny;
489 roundingMode = float_rounding_mode;
490 roundNearestEven = ( roundingMode == float_round_nearest_even );
491 roundIncrement = 0x200;
492 if ( ! roundNearestEven ) {
493 if ( roundingMode == float_round_to_zero ) {
494 roundIncrement = 0;
496 else {
497 roundIncrement = 0x3FF;
498 if ( zSign ) {
499 if ( roundingMode == float_round_up ) roundIncrement = 0;
501 else {
502 if ( roundingMode == float_round_down ) roundIncrement = 0;
506 roundBits = (int16)(zSig & 0x3FF);
507 if ( 0x7FD <= (bits16) zExp ) {
508 if ( ( 0x7FD < zExp )
509 || ( ( zExp == 0x7FD )
510 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
512 float_raise( float_flag_overflow | float_flag_inexact );
513 return FLOAT64_MANGLE(
514 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
515 ( roundIncrement == 0 ));
517 if ( zExp < 0 ) {
518 isTiny =
519 ( float_detect_tininess == float_tininess_before_rounding )
520 || ( zExp < -1 )
521 || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) );
522 shift64RightJamming( zSig, - zExp, &zSig );
523 zExp = 0;
524 roundBits = (int16)(zSig & 0x3FF);
525 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
528 if ( roundBits ) set_float_exception_inexact_flag();
529 zSig = ( zSig + roundIncrement )>>10;
530 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
531 if ( zSig == 0 ) zExp = 0;
532 return packFloat64( zSign, zExp, zSig );
537 -------------------------------------------------------------------------------
538 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 and significand `zSig', and returns the proper double-precision floating-
540 point value corresponding to the abstract input. This routine is just like
541 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 floating-point exponent.
544 -------------------------------------------------------------------------------
546 static float64
547 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
549 int8 shiftCount;
551 shiftCount = countLeadingZeros64( zSig ) - 1;
552 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
556 #ifdef FLOATX80
559 -------------------------------------------------------------------------------
560 Returns the fraction bits of the extended double-precision floating-point
561 value `a'.
562 -------------------------------------------------------------------------------
564 INLINE bits64 extractFloatx80Frac( floatx80 a )
567 return a.low;
572 -------------------------------------------------------------------------------
573 Returns the exponent bits of the extended double-precision floating-point
574 value `a'.
575 -------------------------------------------------------------------------------
577 INLINE int32 extractFloatx80Exp( floatx80 a )
580 return a.high & 0x7FFF;
585 -------------------------------------------------------------------------------
586 Returns the sign bit of the extended double-precision floating-point value
587 `a'.
588 -------------------------------------------------------------------------------
590 INLINE flag extractFloatx80Sign( floatx80 a )
593 return a.high>>15;
598 -------------------------------------------------------------------------------
599 Normalizes the subnormal extended double-precision floating-point value
600 represented by the denormalized significand `aSig'. The normalized exponent
601 and significand are stored at the locations pointed to by `zExpPtr' and
602 `zSigPtr', respectively.
603 -------------------------------------------------------------------------------
605 static void
606 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
608 int8 shiftCount;
610 shiftCount = countLeadingZeros64( aSig );
611 *zSigPtr = aSig<<shiftCount;
612 *zExpPtr = 1 - shiftCount;
617 -------------------------------------------------------------------------------
618 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
619 extended double-precision floating-point value, returning the result.
620 -------------------------------------------------------------------------------
622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
624 floatx80 z;
626 z.low = zSig;
627 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
628 return z;
633 -------------------------------------------------------------------------------
634 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
635 and extended significand formed by the concatenation of `zSig0' and `zSig1',
636 and returns the proper extended double-precision floating-point value
637 corresponding to the abstract input. Ordinarily, the abstract value is
638 rounded and packed into the extended double-precision format, with the
639 inexact exception raised if the abstract input cannot be represented
640 exactly. However, if the abstract value is too large, the overflow and
641 inexact exceptions are raised and an infinity or maximal finite value is
642 returned. If the abstract value is too small, the input value is rounded to
643 a subnormal number, and the underflow and inexact exceptions are raised if
644 the abstract input cannot be represented exactly as a subnormal extended
645 double-precision floating-point number.
646 If `roundingPrecision' is 32 or 64, the result is rounded to the same
647 number of bits as single or double precision, respectively. Otherwise, the
648 result is rounded to the full precision of the extended double-precision
649 format.
650 The input significand must be normalized or smaller. If the input
651 significand is not normalized, `zExp' must be 0; in that case, the result
652 returned is a subnormal number, and it must not require rounding. The
653 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
654 Floating-Point Arithmetic.
655 -------------------------------------------------------------------------------
657 static floatx80
658 roundAndPackFloatx80(
659 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
662 int8 roundingMode;
663 flag roundNearestEven, increment, isTiny;
664 int64 roundIncrement, roundMask, roundBits;
666 roundingMode = float_rounding_mode;
667 roundNearestEven = ( roundingMode == float_round_nearest_even );
668 if ( roundingPrecision == 80 ) goto precision80;
669 if ( roundingPrecision == 64 ) {
670 roundIncrement = LIT64( 0x0000000000000400 );
671 roundMask = LIT64( 0x00000000000007FF );
673 else if ( roundingPrecision == 32 ) {
674 roundIncrement = LIT64( 0x0000008000000000 );
675 roundMask = LIT64( 0x000000FFFFFFFFFF );
677 else {
678 goto precision80;
680 zSig0 |= ( zSig1 != 0 );
681 if ( ! roundNearestEven ) {
682 if ( roundingMode == float_round_to_zero ) {
683 roundIncrement = 0;
685 else {
686 roundIncrement = roundMask;
687 if ( zSign ) {
688 if ( roundingMode == float_round_up ) roundIncrement = 0;
690 else {
691 if ( roundingMode == float_round_down ) roundIncrement = 0;
695 roundBits = zSig0 & roundMask;
696 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
697 if ( ( 0x7FFE < zExp )
698 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
700 goto overflow;
702 if ( zExp <= 0 ) {
703 isTiny =
704 ( float_detect_tininess == float_tininess_before_rounding )
705 || ( zExp < 0 )
706 || ( zSig0 <= zSig0 + roundIncrement );
707 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
708 zExp = 0;
709 roundBits = zSig0 & roundMask;
710 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
711 if ( roundBits ) set_float_exception_inexact_flag();
712 zSig0 += roundIncrement;
713 if ( (sbits64) zSig0 < 0 ) zExp = 1;
714 roundIncrement = roundMask + 1;
715 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
716 roundMask |= roundIncrement;
718 zSig0 &= ~ roundMask;
719 return packFloatx80( zSign, zExp, zSig0 );
722 if ( roundBits ) set_float_exception_inexact_flag();
723 zSig0 += roundIncrement;
724 if ( zSig0 < roundIncrement ) {
725 ++zExp;
726 zSig0 = LIT64( 0x8000000000000000 );
728 roundIncrement = roundMask + 1;
729 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
730 roundMask |= roundIncrement;
732 zSig0 &= ~ roundMask;
733 if ( zSig0 == 0 ) zExp = 0;
734 return packFloatx80( zSign, zExp, zSig0 );
735 precision80:
736 increment = ( (sbits64) zSig1 < 0 );
737 if ( ! roundNearestEven ) {
738 if ( roundingMode == float_round_to_zero ) {
739 increment = 0;
741 else {
742 if ( zSign ) {
743 increment = ( roundingMode == float_round_down ) && zSig1;
745 else {
746 increment = ( roundingMode == float_round_up ) && zSig1;
750 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
751 if ( ( 0x7FFE < zExp )
752 || ( ( zExp == 0x7FFE )
753 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 && increment
757 roundMask = 0;
758 overflow:
759 float_raise( float_flag_overflow | float_flag_inexact );
760 if ( ( roundingMode == float_round_to_zero )
761 || ( zSign && ( roundingMode == float_round_up ) )
762 || ( ! zSign && ( roundingMode == float_round_down ) )
764 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
766 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
768 if ( zExp <= 0 ) {
769 isTiny =
770 ( float_detect_tininess == float_tininess_before_rounding )
771 || ( zExp < 0 )
772 || ! increment
773 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
774 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
775 zExp = 0;
776 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
777 if ( zSig1 ) set_float_exception_inexact_flag();
778 if ( roundNearestEven ) {
779 increment = ( (sbits64) zSig1 < 0 );
781 else {
782 if ( zSign ) {
783 increment = ( roundingMode == float_round_down ) && zSig1;
785 else {
786 increment = ( roundingMode == float_round_up ) && zSig1;
789 if ( increment ) {
790 ++zSig0;
791 zSig0 &=
792 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
793 if ( (sbits64) zSig0 < 0 ) zExp = 1;
795 return packFloatx80( zSign, zExp, zSig0 );
798 if ( zSig1 ) set_float_exception_inexact_flag();
799 if ( increment ) {
800 ++zSig0;
801 if ( zSig0 == 0 ) {
802 ++zExp;
803 zSig0 = LIT64( 0x8000000000000000 );
805 else {
806 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
809 else {
810 if ( zSig0 == 0 ) zExp = 0;
812 return packFloatx80( zSign, zExp, zSig0 );
817 -------------------------------------------------------------------------------
818 Takes an abstract floating-point value having sign `zSign', exponent
819 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
820 and returns the proper extended double-precision floating-point value
821 corresponding to the abstract input. This routine is just like
822 `roundAndPackFloatx80' except that the input significand does not have to be
823 normalized.
824 -------------------------------------------------------------------------------
826 static floatx80
827 normalizeRoundAndPackFloatx80(
828 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
831 int8 shiftCount;
833 if ( zSig0 == 0 ) {
834 zSig0 = zSig1;
835 zSig1 = 0;
836 zExp -= 64;
838 shiftCount = countLeadingZeros64( zSig0 );
839 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
840 zExp -= shiftCount;
841 return
842 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
846 #endif
848 #ifdef FLOAT128
851 -------------------------------------------------------------------------------
852 Returns the least-significant 64 fraction bits of the quadruple-precision
853 floating-point value `a'.
854 -------------------------------------------------------------------------------
856 INLINE bits64 extractFloat128Frac1( float128 a )
859 return a.low;
864 -------------------------------------------------------------------------------
865 Returns the most-significant 48 fraction bits of the quadruple-precision
866 floating-point value `a'.
867 -------------------------------------------------------------------------------
869 INLINE bits64 extractFloat128Frac0( float128 a )
872 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
877 -------------------------------------------------------------------------------
878 Returns the exponent bits of the quadruple-precision floating-point value
879 `a'.
880 -------------------------------------------------------------------------------
882 INLINE int32 extractFloat128Exp( float128 a )
885 return (int32)((a.high >> 48) & 0x7FFF);
890 -------------------------------------------------------------------------------
891 Returns the sign bit of the quadruple-precision floating-point value `a'.
892 -------------------------------------------------------------------------------
894 INLINE flag extractFloat128Sign( float128 a )
897 return (flag)(a.high >> 63);
902 -------------------------------------------------------------------------------
903 Normalizes the subnormal quadruple-precision floating-point value
904 represented by the denormalized significand formed by the concatenation of
905 `aSig0' and `aSig1'. The normalized exponent is stored at the location
906 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
907 significand are stored at the location pointed to by `zSig0Ptr', and the
908 least significant 64 bits of the normalized significand are stored at the
909 location pointed to by `zSig1Ptr'.
910 -------------------------------------------------------------------------------
912 static void
913 normalizeFloat128Subnormal(
914 bits64 aSig0,
915 bits64 aSig1,
916 int32 *zExpPtr,
917 bits64 *zSig0Ptr,
918 bits64 *zSig1Ptr
921 int8 shiftCount;
923 if ( aSig0 == 0 ) {
924 shiftCount = countLeadingZeros64( aSig1 ) - 15;
925 if ( shiftCount < 0 ) {
926 *zSig0Ptr = aSig1>>( - shiftCount );
927 *zSig1Ptr = aSig1<<( shiftCount & 63 );
929 else {
930 *zSig0Ptr = aSig1<<shiftCount;
931 *zSig1Ptr = 0;
933 *zExpPtr = - shiftCount - 63;
935 else {
936 shiftCount = countLeadingZeros64( aSig0 ) - 15;
937 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
938 *zExpPtr = 1 - shiftCount;
944 -------------------------------------------------------------------------------
945 Packs the sign `zSign', the exponent `zExp', and the significand formed
946 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
947 floating-point value, returning the result. After being shifted into the
948 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
949 added together to form the most significant 64 bits of the result. This
950 means that any integer portion of `zSig0' will be added into the exponent.
951 Since a properly normalized significand will have an integer portion equal
952 to 1, the `zExp' input should be 1 less than the desired result exponent
953 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
954 significand.
955 -------------------------------------------------------------------------------
957 INLINE float128
958 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
960 float128 z;
962 z.low = zSig1;
963 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
964 return z;
969 -------------------------------------------------------------------------------
970 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
971 and extended significand formed by the concatenation of `zSig0', `zSig1',
972 and `zSig2', and returns the proper quadruple-precision floating-point value
973 corresponding to the abstract input. Ordinarily, the abstract value is
974 simply rounded and packed into the quadruple-precision format, with the
975 inexact exception raised if the abstract input cannot be represented
976 exactly. However, if the abstract value is too large, the overflow and
977 inexact exceptions are raised and an infinity or maximal finite value is
978 returned. If the abstract value is too small, the input value is rounded to
979 a subnormal number, and the underflow and inexact exceptions are raised if
980 the abstract input cannot be represented exactly as a subnormal quadruple-
981 precision floating-point number.
982 The input significand must be normalized or smaller. If the input
983 significand is not normalized, `zExp' must be 0; in that case, the result
984 returned is a subnormal number, and it must not require rounding. In the
985 usual case that the input significand is normalized, `zExp' must be 1 less
986 than the ``true'' floating-point exponent. The handling of underflow and
987 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
988 -------------------------------------------------------------------------------
990 static float128
991 roundAndPackFloat128(
992 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
994 int8 roundingMode;
995 flag roundNearestEven, increment, isTiny;
997 roundingMode = float_rounding_mode;
998 roundNearestEven = ( roundingMode == float_round_nearest_even );
999 increment = ( (sbits64) zSig2 < 0 );
1000 if ( ! roundNearestEven ) {
1001 if ( roundingMode == float_round_to_zero ) {
1002 increment = 0;
1004 else {
1005 if ( zSign ) {
1006 increment = ( roundingMode == float_round_down ) && zSig2;
1008 else {
1009 increment = ( roundingMode == float_round_up ) && zSig2;
1013 if ( 0x7FFD <= (bits32) zExp ) {
1014 if ( ( 0x7FFD < zExp )
1015 || ( ( zExp == 0x7FFD )
1016 && eq128(
1017 LIT64( 0x0001FFFFFFFFFFFF ),
1018 LIT64( 0xFFFFFFFFFFFFFFFF ),
1019 zSig0,
1020 zSig1
1022 && increment
1025 float_raise( float_flag_overflow | float_flag_inexact );
1026 if ( ( roundingMode == float_round_to_zero )
1027 || ( zSign && ( roundingMode == float_round_up ) )
1028 || ( ! zSign && ( roundingMode == float_round_down ) )
1030 return
1031 packFloat128(
1032 zSign,
1033 0x7FFE,
1034 LIT64( 0x0000FFFFFFFFFFFF ),
1035 LIT64( 0xFFFFFFFFFFFFFFFF )
1038 return packFloat128( zSign, 0x7FFF, 0, 0 );
1040 if ( zExp < 0 ) {
1041 isTiny =
1042 ( float_detect_tininess == float_tininess_before_rounding )
1043 || ( zExp < -1 )
1044 || ! increment
1045 || lt128(
1046 zSig0,
1047 zSig1,
1048 LIT64( 0x0001FFFFFFFFFFFF ),
1049 LIT64( 0xFFFFFFFFFFFFFFFF )
1051 shift128ExtraRightJamming(
1052 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1053 zExp = 0;
1054 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1055 if ( roundNearestEven ) {
1056 increment = ( (sbits64) zSig2 < 0 );
1058 else {
1059 if ( zSign ) {
1060 increment = ( roundingMode == float_round_down ) && zSig2;
1062 else {
1063 increment = ( roundingMode == float_round_up ) && zSig2;
1068 if ( zSig2 ) set_float_exception_inexact_flag();
1069 if ( increment ) {
1070 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1071 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1073 else {
1074 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1076 return packFloat128( zSign, zExp, zSig0, zSig1 );
1081 -------------------------------------------------------------------------------
1082 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1083 and significand formed by the concatenation of `zSig0' and `zSig1', and
1084 returns the proper quadruple-precision floating-point value corresponding
1085 to the abstract input. This routine is just like `roundAndPackFloat128'
1086 except that the input significand has fewer bits and does not have to be
1087 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1088 point exponent.
1089 -------------------------------------------------------------------------------
1091 static float128
1092 normalizeRoundAndPackFloat128(
1093 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1095 int8 shiftCount;
1096 bits64 zSig2;
1098 if ( zSig0 == 0 ) {
1099 zSig0 = zSig1;
1100 zSig1 = 0;
1101 zExp -= 64;
1103 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1104 if ( 0 <= shiftCount ) {
1105 zSig2 = 0;
1106 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1108 else {
1109 shift128ExtraRightJamming(
1110 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1112 zExp -= shiftCount;
1113 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1117 #endif
1120 -------------------------------------------------------------------------------
1121 Returns the result of converting the 32-bit two's complement integer `a'
1122 to the single-precision floating-point format. The conversion is performed
1123 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1124 -------------------------------------------------------------------------------
1126 float32 int32_to_float32( int32 a )
1128 flag zSign;
1130 if ( a == 0 ) return 0;
1131 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1132 zSign = ( a < 0 );
1133 return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
1137 float32 uint32_to_float32( uint32 a )
1139 if ( a == 0 ) return 0;
1140 if ( a & (bits32) 0x80000000 )
1141 return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1142 return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1147 -------------------------------------------------------------------------------
1148 Returns the result of converting the 32-bit two's complement integer `a'
1149 to the double-precision floating-point format. The conversion is performed
1150 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151 -------------------------------------------------------------------------------
1153 float64 int32_to_float64( int32 a )
1155 flag zSign;
1156 uint32 absA;
1157 int8 shiftCount;
1158 bits64 zSig;
1160 if ( a == 0 ) return 0;
1161 zSign = ( a < 0 );
1162 absA = zSign ? - a : a;
1163 shiftCount = countLeadingZeros32( absA ) + 21;
1164 zSig = absA;
1165 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1169 float64 uint32_to_float64( uint32 a )
1171 int8 shiftCount;
1172 bits64 zSig = a;
1174 if ( a == 0 ) return 0;
1175 shiftCount = countLeadingZeros32( a ) + 21;
1176 return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1180 #ifdef FLOATX80
1183 -------------------------------------------------------------------------------
1184 Returns the result of converting the 32-bit two's complement integer `a'
1185 to the extended double-precision floating-point format. The conversion
1186 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187 Arithmetic.
1188 -------------------------------------------------------------------------------
1190 floatx80 int32_to_floatx80( int32 a )
1192 flag zSign;
1193 uint32 absA;
1194 int8 shiftCount;
1195 bits64 zSig;
1197 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1198 zSign = ( a < 0 );
1199 absA = zSign ? - a : a;
1200 shiftCount = countLeadingZeros32( absA ) + 32;
1201 zSig = absA;
1202 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1206 floatx80 uint32_to_floatx80( uint32 a )
1208 int8 shiftCount;
1209 bits64 zSig = a;
1211 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1212 shiftCount = countLeadingZeros32( a ) + 32;
1213 return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1217 #endif
1219 #ifdef FLOAT128
1222 -------------------------------------------------------------------------------
1223 Returns the result of converting the 32-bit two's complement integer `a' to
1224 the quadruple-precision floating-point format. The conversion is performed
1225 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1226 -------------------------------------------------------------------------------
1228 float128 int32_to_float128( int32 a )
1230 flag zSign;
1231 uint32 absA;
1232 int8 shiftCount;
1233 bits64 zSig0;
1235 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1236 zSign = ( a < 0 );
1237 absA = zSign ? - a : a;
1238 shiftCount = countLeadingZeros32( absA ) + 17;
1239 zSig0 = absA;
1240 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1244 float128 uint32_to_float128( uint32 a )
1246 int8 shiftCount;
1247 bits64 zSig0 = a;
1249 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1250 shiftCount = countLeadingZeros32( a ) + 17;
1251 return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1255 #endif
1257 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1259 -------------------------------------------------------------------------------
1260 Returns the result of converting the 64-bit two's complement integer `a'
1261 to the single-precision floating-point format. The conversion is performed
1262 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1263 -------------------------------------------------------------------------------
1265 float32 int64_to_float32( int64 a )
1267 flag zSign;
1268 uint64 absA;
1269 int8 shiftCount;
1271 if ( a == 0 ) return 0;
1272 zSign = ( a < 0 );
1273 absA = zSign ? - a : a;
1274 shiftCount = countLeadingZeros64( absA ) - 40;
1275 if ( 0 <= shiftCount ) {
1276 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1278 else {
1279 shiftCount += 7;
1280 if ( shiftCount < 0 ) {
1281 shift64RightJamming( absA, - shiftCount, &absA );
1283 else {
1284 absA <<= shiftCount;
1286 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1292 -------------------------------------------------------------------------------
1293 Returns the result of converting the 64-bit two's complement integer `a'
1294 to the double-precision floating-point format. The conversion is performed
1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 -------------------------------------------------------------------------------
1298 float64 int64_to_float64( int64 a )
1300 flag zSign;
1302 if ( a == 0 ) return 0;
1303 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1304 return packFloat64( 1, 0x43E, 0 );
1306 zSign = ( a < 0 );
1307 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1311 #ifdef FLOATX80
1314 -------------------------------------------------------------------------------
1315 Returns the result of converting the 64-bit two's complement integer `a'
1316 to the extended double-precision floating-point format. The conversion
1317 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 Arithmetic.
1319 -------------------------------------------------------------------------------
1321 floatx80 int64_to_floatx80( int64 a )
1323 flag zSign;
1324 uint64 absA;
1325 int8 shiftCount;
1327 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1328 zSign = ( a < 0 );
1329 absA = zSign ? - a : a;
1330 shiftCount = countLeadingZeros64( absA );
1331 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1335 #endif
1337 #endif /* !SOFTFLOAT_FOR_GCC */
1339 #ifdef FLOAT128
1342 -------------------------------------------------------------------------------
1343 Returns the result of converting the 64-bit two's complement integer `a' to
1344 the quadruple-precision floating-point format. The conversion is performed
1345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1346 -------------------------------------------------------------------------------
1348 float128 int64_to_float128( int64 a )
1350 flag zSign;
1351 uint64 absA;
1352 int8 shiftCount;
1353 int32 zExp;
1354 bits64 zSig0, zSig1;
1356 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1357 zSign = ( a < 0 );
1358 absA = zSign ? - a : a;
1359 shiftCount = countLeadingZeros64( absA ) + 49;
1360 zExp = 0x406E - shiftCount;
1361 if ( 64 <= shiftCount ) {
1362 zSig1 = 0;
1363 zSig0 = absA;
1364 shiftCount -= 64;
1366 else {
1367 zSig1 = absA;
1368 zSig0 = 0;
1370 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1371 return packFloat128( zSign, zExp, zSig0, zSig1 );
1375 #endif
1377 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1379 -------------------------------------------------------------------------------
1380 Returns the result of converting the single-precision floating-point value
1381 `a' to the 32-bit two's complement integer format. The conversion is
1382 performed according to the IEC/IEEE Standard for Binary Floating-Point
1383 Arithmetic---which means in particular that the conversion is rounded
1384 according to the current rounding mode. If `a' is a NaN, the largest
1385 positive integer is returned. Otherwise, if the conversion overflows, the
1386 largest integer with the same sign as `a' is returned.
1387 -------------------------------------------------------------------------------
1389 int32 float32_to_int32( float32 a )
1391 flag aSign;
1392 int16 aExp, shiftCount;
1393 bits32 aSig;
1394 bits64 aSig64;
1396 aSig = extractFloat32Frac( a );
1397 aExp = extractFloat32Exp( a );
1398 aSign = extractFloat32Sign( a );
1399 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1400 if ( aExp ) aSig |= 0x00800000;
1401 shiftCount = 0xAF - aExp;
1402 aSig64 = aSig;
1403 aSig64 <<= 32;
1404 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1405 return roundAndPackInt32( aSign, aSig64 );
1408 #endif /* !SOFTFLOAT_FOR_GCC */
1411 -------------------------------------------------------------------------------
1412 Returns the result of converting the single-precision floating-point value
1413 `a' to the 32-bit two's complement integer format. The conversion is
1414 performed according to the IEC/IEEE Standard for Binary Floating-Point
1415 Arithmetic, except that the conversion is always rounded toward zero.
1416 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1417 the conversion overflows, the largest integer with the same sign as `a' is
1418 returned.
1419 -------------------------------------------------------------------------------
1421 int32 float32_to_int32_round_to_zero( float32 a )
1423 flag aSign;
1424 int16 aExp, shiftCount;
1425 bits32 aSig;
1426 int32 z;
1428 aSig = extractFloat32Frac( a );
1429 aExp = extractFloat32Exp( a );
1430 aSign = extractFloat32Sign( a );
1431 shiftCount = aExp - 0x9E;
1432 if ( 0 <= shiftCount ) {
1433 if ( a != 0xCF000000 ) {
1434 float_raise( float_flag_invalid );
1435 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1437 return (sbits32) 0x80000000;
1439 else if ( aExp <= 0x7E ) {
1440 if ( aExp | aSig ) set_float_exception_inexact_flag();
1441 return 0;
1443 aSig = ( aSig | 0x00800000 )<<8;
1444 z = aSig>>( - shiftCount );
1445 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1446 set_float_exception_inexact_flag();
1448 if ( aSign ) z = - z;
1449 return z;
1453 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1455 -------------------------------------------------------------------------------
1456 Returns the result of converting the single-precision floating-point value
1457 `a' to the 64-bit two's complement integer format. The conversion is
1458 performed according to the IEC/IEEE Standard for Binary Floating-Point
1459 Arithmetic---which means in particular that the conversion is rounded
1460 according to the current rounding mode. If `a' is a NaN, the largest
1461 positive integer is returned. Otherwise, if the conversion overflows, the
1462 largest integer with the same sign as `a' is returned.
1463 -------------------------------------------------------------------------------
1465 int64 float32_to_int64( float32 a )
1467 flag aSign;
1468 int16 aExp, shiftCount;
1469 bits32 aSig;
1470 bits64 aSig64, aSigExtra;
1472 aSig = extractFloat32Frac( a );
1473 aExp = extractFloat32Exp( a );
1474 aSign = extractFloat32Sign( a );
1475 shiftCount = 0xBE - aExp;
1476 if ( shiftCount < 0 ) {
1477 float_raise( float_flag_invalid );
1478 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1479 return LIT64( 0x7FFFFFFFFFFFFFFF );
1481 return (sbits64) LIT64( 0x8000000000000000 );
1483 if ( aExp ) aSig |= 0x00800000;
1484 aSig64 = aSig;
1485 aSig64 <<= 40;
1486 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1487 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1492 -------------------------------------------------------------------------------
1493 Returns the result of converting the single-precision floating-point value
1494 `a' to the 64-bit two's complement integer format. The conversion is
1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
1496 Arithmetic, except that the conversion is always rounded toward zero. If
1497 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1498 conversion overflows, the largest integer with the same sign as `a' is
1499 returned.
1500 -------------------------------------------------------------------------------
1502 int64 float32_to_int64_round_to_zero( float32 a )
1504 flag aSign;
1505 int16 aExp, shiftCount;
1506 bits32 aSig;
1507 bits64 aSig64;
1508 int64 z;
1510 aSig = extractFloat32Frac( a );
1511 aExp = extractFloat32Exp( a );
1512 aSign = extractFloat32Sign( a );
1513 shiftCount = aExp - 0xBE;
1514 if ( 0 <= shiftCount ) {
1515 if ( a != 0xDF000000 ) {
1516 float_raise( float_flag_invalid );
1517 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1518 return LIT64( 0x7FFFFFFFFFFFFFFF );
1521 return (sbits64) LIT64( 0x8000000000000000 );
1523 else if ( aExp <= 0x7E ) {
1524 if ( aExp | aSig ) set_float_exception_inexact_flag();
1525 return 0;
1527 aSig64 = aSig | 0x00800000;
1528 aSig64 <<= 40;
1529 z = aSig64>>( - shiftCount );
1530 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1531 set_float_exception_inexact_flag();
1533 if ( aSign ) z = - z;
1534 return z;
1537 #endif /* !SOFTFLOAT_FOR_GCC */
1540 -------------------------------------------------------------------------------
1541 Returns the result of converting the single-precision floating-point value
1542 `a' to the double-precision floating-point format. The conversion is
1543 performed according to the IEC/IEEE Standard for Binary Floating-Point
1544 Arithmetic.
1545 -------------------------------------------------------------------------------
1547 float64 float32_to_float64( float32 a )
1549 flag aSign;
1550 int16 aExp;
1551 bits32 aSig;
1553 aSig = extractFloat32Frac( a );
1554 aExp = extractFloat32Exp( a );
1555 aSign = extractFloat32Sign( a );
1556 if ( aExp == 0xFF ) {
1557 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1558 return packFloat64( aSign, 0x7FF, 0 );
1560 if ( aExp == 0 ) {
1561 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1562 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1563 --aExp;
1565 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1569 #ifdef FLOATX80
1572 -------------------------------------------------------------------------------
1573 Returns the result of converting the single-precision floating-point value
1574 `a' to the extended double-precision floating-point format. The conversion
1575 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1576 Arithmetic.
1577 -------------------------------------------------------------------------------
1579 floatx80 float32_to_floatx80( float32 a )
1581 flag aSign;
1582 int16 aExp;
1583 bits32 aSig;
1585 aSig = extractFloat32Frac( a );
1586 aExp = extractFloat32Exp( a );
1587 aSign = extractFloat32Sign( a );
1588 if ( aExp == 0xFF ) {
1589 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1590 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1592 if ( aExp == 0 ) {
1593 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1594 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1596 aSig |= 0x00800000;
1597 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1601 #endif
1603 #ifdef FLOAT128
1606 -------------------------------------------------------------------------------
1607 Returns the result of converting the single-precision floating-point value
1608 `a' to the quadruple-precision floating-point format. The conversion is
1609 performed according to the IEC/IEEE Standard for Binary Floating-Point
1610 Arithmetic.
1611 -------------------------------------------------------------------------------
1613 float128 float32_to_float128( float32 a )
1615 flag aSign;
1616 int16 aExp;
1617 bits32 aSig;
1619 aSig = extractFloat32Frac( a );
1620 aExp = extractFloat32Exp( a );
1621 aSign = extractFloat32Sign( a );
1622 if ( aExp == 0xFF ) {
1623 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1624 return packFloat128( aSign, 0x7FFF, 0, 0 );
1626 if ( aExp == 0 ) {
1627 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1628 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1629 --aExp;
1631 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1635 #endif
1637 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1639 -------------------------------------------------------------------------------
1640 Rounds the single-precision floating-point value `a' to an integer, and
1641 returns the result as a single-precision floating-point value. The
1642 operation is performed according to the IEC/IEEE Standard for Binary
1643 Floating-Point Arithmetic.
1644 -------------------------------------------------------------------------------
1646 float32 float32_round_to_int( float32 a )
1648 flag aSign;
1649 int16 aExp;
1650 bits32 lastBitMask, roundBitsMask;
1651 int8 roundingMode;
1652 float32 z;
1654 aExp = extractFloat32Exp( a );
1655 if ( 0x96 <= aExp ) {
1656 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1657 return propagateFloat32NaN( a, a );
1659 return a;
1661 if ( aExp <= 0x7E ) {
1662 if ( (bits32) ( a<<1 ) == 0 ) return a;
1663 set_float_exception_inexact_flag();
1664 aSign = extractFloat32Sign( a );
1665 switch ( float_rounding_mode ) {
1666 case float_round_nearest_even:
1667 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1668 return packFloat32( aSign, 0x7F, 0 );
1670 break;
1671 case float_round_to_zero:
1672 break;
1673 case float_round_down:
1674 return aSign ? 0xBF800000 : 0;
1675 case float_round_up:
1676 return aSign ? 0x80000000 : 0x3F800000;
1678 return packFloat32( aSign, 0, 0 );
1680 lastBitMask = 1;
1681 lastBitMask <<= 0x96 - aExp;
1682 roundBitsMask = lastBitMask - 1;
1683 z = a;
1684 roundingMode = float_rounding_mode;
1685 if ( roundingMode == float_round_nearest_even ) {
1686 z += lastBitMask>>1;
1687 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1689 else if ( roundingMode != float_round_to_zero ) {
1690 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1691 z += roundBitsMask;
1694 z &= ~ roundBitsMask;
1695 if ( z != a ) set_float_exception_inexact_flag();
1696 return z;
1699 #endif /* !SOFTFLOAT_FOR_GCC */
1702 -------------------------------------------------------------------------------
1703 Returns the result of adding the absolute values of the single-precision
1704 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1705 before being returned. `zSign' is ignored if the result is a NaN.
1706 The addition is performed according to the IEC/IEEE Standard for Binary
1707 Floating-Point Arithmetic.
1708 -------------------------------------------------------------------------------
1710 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1712 int16 aExp, bExp, zExp;
1713 bits32 aSig, bSig, zSig;
1714 int16 expDiff;
1716 aSig = extractFloat32Frac( a );
1717 aExp = extractFloat32Exp( a );
1718 bSig = extractFloat32Frac( b );
1719 bExp = extractFloat32Exp( b );
1720 expDiff = aExp - bExp;
1721 aSig <<= 6;
1722 bSig <<= 6;
1723 if ( 0 < expDiff ) {
1724 if ( aExp == 0xFF ) {
1725 if ( aSig ) return propagateFloat32NaN( a, b );
1726 return a;
1728 if ( bExp == 0 ) {
1729 --expDiff;
1731 else {
1732 bSig |= 0x20000000;
1734 shift32RightJamming( bSig, expDiff, &bSig );
1735 zExp = aExp;
1737 else if ( expDiff < 0 ) {
1738 if ( bExp == 0xFF ) {
1739 if ( bSig ) return propagateFloat32NaN( a, b );
1740 return packFloat32( zSign, 0xFF, 0 );
1742 if ( aExp == 0 ) {
1743 ++expDiff;
1745 else {
1746 aSig |= 0x20000000;
1748 shift32RightJamming( aSig, - expDiff, &aSig );
1749 zExp = bExp;
1751 else {
1752 if ( aExp == 0xFF ) {
1753 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1754 return a;
1756 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1757 zSig = 0x40000000 + aSig + bSig;
1758 zExp = aExp;
1759 goto roundAndPack;
1761 aSig |= 0x20000000;
1762 zSig = ( aSig + bSig )<<1;
1763 --zExp;
1764 if ( (sbits32) zSig < 0 ) {
1765 zSig = aSig + bSig;
1766 ++zExp;
1768 roundAndPack:
1769 return roundAndPackFloat32( zSign, zExp, zSig );
1774 -------------------------------------------------------------------------------
1775 Returns the result of subtracting the absolute values of the single-
1776 precision floating-point values `a' and `b'. If `zSign' is 1, the
1777 difference is negated before being returned. `zSign' is ignored if the
1778 result is a NaN. The subtraction is performed according to the IEC/IEEE
1779 Standard for Binary Floating-Point Arithmetic.
1780 -------------------------------------------------------------------------------
1782 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1784 int16 aExp, bExp, zExp;
1785 bits32 aSig, bSig, zSig;
1786 int16 expDiff;
1788 aSig = extractFloat32Frac( a );
1789 aExp = extractFloat32Exp( a );
1790 bSig = extractFloat32Frac( b );
1791 bExp = extractFloat32Exp( b );
1792 expDiff = aExp - bExp;
1793 aSig <<= 7;
1794 bSig <<= 7;
1795 if ( 0 < expDiff ) goto aExpBigger;
1796 if ( expDiff < 0 ) goto bExpBigger;
1797 if ( aExp == 0xFF ) {
1798 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1799 float_raise( float_flag_invalid );
1800 return float32_default_nan;
1802 if ( aExp == 0 ) {
1803 aExp = 1;
1804 bExp = 1;
1806 if ( bSig < aSig ) goto aBigger;
1807 if ( aSig < bSig ) goto bBigger;
1808 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1809 bExpBigger:
1810 if ( bExp == 0xFF ) {
1811 if ( bSig ) return propagateFloat32NaN( a, b );
1812 return packFloat32( zSign ^ 1, 0xFF, 0 );
1814 if ( aExp == 0 ) {
1815 ++expDiff;
1817 else {
1818 aSig |= 0x40000000;
1820 shift32RightJamming( aSig, - expDiff, &aSig );
1821 bSig |= 0x40000000;
1822 bBigger:
1823 zSig = bSig - aSig;
1824 zExp = bExp;
1825 zSign ^= 1;
1826 goto normalizeRoundAndPack;
1827 aExpBigger:
1828 if ( aExp == 0xFF ) {
1829 if ( aSig ) return propagateFloat32NaN( a, b );
1830 return a;
1832 if ( bExp == 0 ) {
1833 --expDiff;
1835 else {
1836 bSig |= 0x40000000;
1838 shift32RightJamming( bSig, expDiff, &bSig );
1839 aSig |= 0x40000000;
1840 aBigger:
1841 zSig = aSig - bSig;
1842 zExp = aExp;
1843 normalizeRoundAndPack:
1844 --zExp;
1845 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1850 -------------------------------------------------------------------------------
1851 Returns the result of adding the single-precision floating-point values `a'
1852 and `b'. The operation is performed according to the IEC/IEEE Standard for
1853 Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1856 float32 float32_add( float32 a, float32 b )
1858 flag aSign, bSign;
1860 aSign = extractFloat32Sign( a );
1861 bSign = extractFloat32Sign( b );
1862 if ( aSign == bSign ) {
1863 return addFloat32Sigs( a, b, aSign );
1865 else {
1866 return subFloat32Sigs( a, b, aSign );
1872 -------------------------------------------------------------------------------
1873 Returns the result of subtracting the single-precision floating-point values
1874 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1875 for Binary Floating-Point Arithmetic.
1876 -------------------------------------------------------------------------------
1878 float32 float32_sub( float32 a, float32 b )
1880 flag aSign, bSign;
1882 aSign = extractFloat32Sign( a );
1883 bSign = extractFloat32Sign( b );
1884 if ( aSign == bSign ) {
1885 return subFloat32Sigs( a, b, aSign );
1887 else {
1888 return addFloat32Sigs( a, b, aSign );
1894 -------------------------------------------------------------------------------
1895 Returns the result of multiplying the single-precision floating-point values
1896 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1897 for Binary Floating-Point Arithmetic.
1898 -------------------------------------------------------------------------------
1900 float32 float32_mul( float32 a, float32 b )
1902 flag aSign, bSign, zSign;
1903 int16 aExp, bExp, zExp;
1904 bits32 aSig, bSig;
1905 bits64 zSig64;
1906 bits32 zSig;
1908 aSig = extractFloat32Frac( a );
1909 aExp = extractFloat32Exp( a );
1910 aSign = extractFloat32Sign( a );
1911 bSig = extractFloat32Frac( b );
1912 bExp = extractFloat32Exp( b );
1913 bSign = extractFloat32Sign( b );
1914 zSign = aSign ^ bSign;
1915 if ( aExp == 0xFF ) {
1916 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1917 return propagateFloat32NaN( a, b );
1919 if ( ( bExp | bSig ) == 0 ) {
1920 float_raise( float_flag_invalid );
1921 return float32_default_nan;
1923 return packFloat32( zSign, 0xFF, 0 );
1925 if ( bExp == 0xFF ) {
1926 if ( bSig ) return propagateFloat32NaN( a, b );
1927 if ( ( aExp | aSig ) == 0 ) {
1928 float_raise( float_flag_invalid );
1929 return float32_default_nan;
1931 return packFloat32( zSign, 0xFF, 0 );
1933 if ( aExp == 0 ) {
1934 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1935 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1937 if ( bExp == 0 ) {
1938 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1939 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1941 zExp = aExp + bExp - 0x7F;
1942 aSig = ( aSig | 0x00800000 )<<7;
1943 bSig = ( bSig | 0x00800000 )<<8;
1944 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1945 zSig = (bits32)zSig64;
1946 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1947 zSig <<= 1;
1948 --zExp;
1950 return roundAndPackFloat32( zSign, zExp, zSig );
1955 -------------------------------------------------------------------------------
1956 Returns the result of dividing the single-precision floating-point value `a'
1957 by the corresponding value `b'. The operation is performed according to the
1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 -------------------------------------------------------------------------------
1961 float32 float32_div( float32 a, float32 b )
1963 flag aSign, bSign, zSign;
1964 int16 aExp, bExp, zExp;
1965 bits32 aSig, bSig, zSig;
1967 aSig = extractFloat32Frac( a );
1968 aExp = extractFloat32Exp( a );
1969 aSign = extractFloat32Sign( a );
1970 bSig = extractFloat32Frac( b );
1971 bExp = extractFloat32Exp( b );
1972 bSign = extractFloat32Sign( b );
1973 zSign = aSign ^ bSign;
1974 if ( aExp == 0xFF ) {
1975 if ( aSig ) return propagateFloat32NaN( a, b );
1976 if ( bExp == 0xFF ) {
1977 if ( bSig ) return propagateFloat32NaN( a, b );
1978 float_raise( float_flag_invalid );
1979 return float32_default_nan;
1981 return packFloat32( zSign, 0xFF, 0 );
1983 if ( bExp == 0xFF ) {
1984 if ( bSig ) return propagateFloat32NaN( a, b );
1985 return packFloat32( zSign, 0, 0 );
1987 if ( bExp == 0 ) {
1988 if ( bSig == 0 ) {
1989 if ( ( aExp | aSig ) == 0 ) {
1990 float_raise( float_flag_invalid );
1991 return float32_default_nan;
1993 float_raise( float_flag_divbyzero );
1994 return packFloat32( zSign, 0xFF, 0 );
1996 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1998 if ( aExp == 0 ) {
1999 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2000 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2002 zExp = aExp - bExp + 0x7D;
2003 aSig = ( aSig | 0x00800000 )<<7;
2004 bSig = ( bSig | 0x00800000 )<<8;
2005 if ( bSig <= ( aSig + aSig ) ) {
2006 aSig >>= 1;
2007 ++zExp;
2009 zSig = (bits32)((((bits64) aSig) << 32) / bSig);
2010 if ( ( zSig & 0x3F ) == 0 ) {
2011 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2013 return roundAndPackFloat32( zSign, zExp, zSig );
2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2019 -------------------------------------------------------------------------------
2020 Returns the remainder of the single-precision floating-point value `a'
2021 with respect to the corresponding value `b'. The operation is performed
2022 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 -------------------------------------------------------------------------------
2025 float32 float32_rem( float32 a, float32 b )
2027 flag aSign, bSign, zSign;
2028 int16 aExp, bExp, expDiff;
2029 bits32 aSig, bSig;
2030 bits32 q;
2031 bits64 aSig64, bSig64, q64;
2032 bits32 alternateASig;
2033 sbits32 sigMean;
2035 aSig = extractFloat32Frac( a );
2036 aExp = extractFloat32Exp( a );
2037 aSign = extractFloat32Sign( a );
2038 bSig = extractFloat32Frac( b );
2039 bExp = extractFloat32Exp( b );
2040 bSign = extractFloat32Sign( b );
2041 if ( aExp == 0xFF ) {
2042 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2043 return propagateFloat32NaN( a, b );
2045 float_raise( float_flag_invalid );
2046 return float32_default_nan;
2048 if ( bExp == 0xFF ) {
2049 if ( bSig ) return propagateFloat32NaN( a, b );
2050 return a;
2052 if ( bExp == 0 ) {
2053 if ( bSig == 0 ) {
2054 float_raise( float_flag_invalid );
2055 return float32_default_nan;
2057 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2059 if ( aExp == 0 ) {
2060 if ( aSig == 0 ) return a;
2061 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2063 expDiff = aExp - bExp;
2064 aSig |= 0x00800000;
2065 bSig |= 0x00800000;
2066 if ( expDiff < 32 ) {
2067 aSig <<= 8;
2068 bSig <<= 8;
2069 if ( expDiff < 0 ) {
2070 if ( expDiff < -1 ) return a;
2071 aSig >>= 1;
2073 q = ( bSig <= aSig );
2074 if ( q ) aSig -= bSig;
2075 if ( 0 < expDiff ) {
2076 q = ( ( (bits64) aSig )<<32 ) / bSig;
2077 q >>= 32 - expDiff;
2078 bSig >>= 2;
2079 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2081 else {
2082 aSig >>= 2;
2083 bSig >>= 2;
2086 else {
2087 if ( bSig <= aSig ) aSig -= bSig;
2088 aSig64 = ( (bits64) aSig )<<40;
2089 bSig64 = ( (bits64) bSig )<<40;
2090 expDiff -= 64;
2091 while ( 0 < expDiff ) {
2092 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2093 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2094 aSig64 = - ( ( bSig * q64 )<<38 );
2095 expDiff -= 62;
2097 expDiff += 64;
2098 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2099 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2100 q = q64>>( 64 - expDiff );
2101 bSig <<= 6;
2102 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2104 do {
2105 alternateASig = aSig;
2106 ++q;
2107 aSig -= bSig;
2108 } while ( 0 <= (sbits32) aSig );
2109 sigMean = aSig + alternateASig;
2110 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2111 aSig = alternateASig;
2113 zSign = ( (sbits32) aSig < 0 );
2114 if ( zSign ) aSig = - aSig;
2115 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2118 #endif /* !SOFTFLOAT_FOR_GCC */
2120 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2122 -------------------------------------------------------------------------------
2123 Returns the square root of the single-precision floating-point value `a'.
2124 The operation is performed according to the IEC/IEEE Standard for Binary
2125 Floating-Point Arithmetic.
2126 -------------------------------------------------------------------------------
2128 float32 float32_sqrt( float32 a )
2130 flag aSign;
2131 int16 aExp, zExp;
2132 bits32 aSig, zSig;
2133 bits64 rem, term;
2135 aSig = extractFloat32Frac( a );
2136 aExp = extractFloat32Exp( a );
2137 aSign = extractFloat32Sign( a );
2138 if ( aExp == 0xFF ) {
2139 if ( aSig ) return propagateFloat32NaN( a, 0 );
2140 if ( ! aSign ) return a;
2141 float_raise( float_flag_invalid );
2142 return float32_default_nan;
2144 if ( aSign ) {
2145 if ( ( aExp | aSig ) == 0 ) return a;
2146 float_raise( float_flag_invalid );
2147 return float32_default_nan;
2149 if ( aExp == 0 ) {
2150 if ( aSig == 0 ) return 0;
2151 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2153 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2154 aSig = ( aSig | 0x00800000 )<<8;
2155 zSig = estimateSqrt32( aExp, aSig ) + 2;
2156 if ( ( zSig & 0x7F ) <= 5 ) {
2157 if ( zSig < 2 ) {
2158 zSig = 0x7FFFFFFF;
2159 goto roundAndPack;
2161 aSig >>= aExp & 1;
2162 term = ( (bits64) zSig ) * zSig;
2163 rem = ( ( (bits64) aSig )<<32 ) - term;
2164 while ( (sbits64) rem < 0 ) {
2165 --zSig;
2166 rem += ( ( (bits64) zSig )<<1 ) | 1;
2168 zSig |= ( rem != 0 );
2170 shift32RightJamming( zSig, 1, &zSig );
2171 roundAndPack:
2172 return roundAndPackFloat32( 0, zExp, zSig );
2175 #endif /* !SOFTFLOAT_FOR_GCC */
2178 -------------------------------------------------------------------------------
2179 Returns 1 if the single-precision floating-point value `a' is equal to
2180 the corresponding value `b', and 0 otherwise. The comparison is performed
2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 -------------------------------------------------------------------------------
2184 flag float32_eq( float32 a, float32 b )
2187 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2188 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2190 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2191 float_raise( float_flag_invalid );
2193 return 0;
2195 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2200 -------------------------------------------------------------------------------
2201 Returns 1 if the single-precision floating-point value `a' is less than
2202 or equal to the corresponding value `b', and 0 otherwise. The comparison
2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2204 Arithmetic.
2205 -------------------------------------------------------------------------------
2207 flag float32_le( float32 a, float32 b )
2209 flag aSign, bSign;
2211 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2212 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2214 float_raise( float_flag_invalid );
2215 return 0;
2217 aSign = extractFloat32Sign( a );
2218 bSign = extractFloat32Sign( b );
2219 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2220 return ( a == b ) || ( aSign ^ ( a < b ) );
2225 -------------------------------------------------------------------------------
2226 Returns 1 if the single-precision floating-point value `a' is less than
2227 the corresponding value `b', and 0 otherwise. The comparison is performed
2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 -------------------------------------------------------------------------------
2231 flag float32_lt( float32 a, float32 b )
2233 flag aSign, bSign;
2235 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2236 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2238 float_raise( float_flag_invalid );
2239 return 0;
2241 aSign = extractFloat32Sign( a );
2242 bSign = extractFloat32Sign( b );
2243 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2244 return ( a != b ) && ( aSign ^ ( a < b ) );
2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is equal to
2252 the corresponding value `b', and 0 otherwise. The invalid exception is
2253 raised if either operand is a NaN. Otherwise, the comparison is performed
2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2257 flag float32_eq_signaling( float32 a, float32 b )
2260 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2261 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2263 float_raise( float_flag_invalid );
2264 return 0;
2266 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2271 -------------------------------------------------------------------------------
2272 Returns 1 if the single-precision floating-point value `a' is less than or
2273 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2274 cause an exception. Otherwise, the comparison is performed according to the
2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 -------------------------------------------------------------------------------
2278 flag float32_le_quiet( float32 a, float32 b )
2280 flag aSign, bSign;
2282 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2283 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2285 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2286 float_raise( float_flag_invalid );
2288 return 0;
2290 aSign = extractFloat32Sign( a );
2291 bSign = extractFloat32Sign( b );
2292 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2293 return ( a == b ) || ( aSign ^ ( a < b ) );
2298 -------------------------------------------------------------------------------
2299 Returns 1 if the single-precision floating-point value `a' is less than
2300 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2301 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2302 Standard for Binary Floating-Point Arithmetic.
2303 -------------------------------------------------------------------------------
2305 flag float32_lt_quiet( float32 a, float32 b )
2307 flag aSign, bSign;
2309 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2310 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2312 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2313 float_raise( float_flag_invalid );
2315 return 0;
2317 aSign = extractFloat32Sign( a );
2318 bSign = extractFloat32Sign( b );
2319 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2320 return ( a != b ) && ( aSign ^ ( a < b ) );
2323 #endif /* !SOFTFLOAT_FOR_GCC */
2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2327 -------------------------------------------------------------------------------
2328 Returns the result of converting the double-precision floating-point value
2329 `a' to the 32-bit two's complement integer format. The conversion is
2330 performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 Arithmetic---which means in particular that the conversion is rounded
2332 according to the current rounding mode. If `a' is a NaN, the largest
2333 positive integer is returned. Otherwise, if the conversion overflows, the
2334 largest integer with the same sign as `a' is returned.
2335 -------------------------------------------------------------------------------
2337 int32 float64_to_int32( float64 a )
2339 flag aSign;
2340 int16 aExp, shiftCount;
2341 bits64 aSig;
2343 aSig = extractFloat64Frac( a );
2344 aExp = extractFloat64Exp( a );
2345 aSign = extractFloat64Sign( a );
2346 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2347 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2348 shiftCount = 0x42C - aExp;
2349 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2350 return roundAndPackInt32( aSign, aSig );
2353 #endif /* !SOFTFLOAT_FOR_GCC */
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 32-bit two's complement integer format. The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic, except that the conversion is always rounded toward zero.
2361 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2362 the conversion overflows, the largest integer with the same sign as `a' is
2363 returned.
2364 -------------------------------------------------------------------------------
2366 int32 float64_to_int32_round_to_zero( float64 a )
2368 flag aSign;
2369 int16 aExp, shiftCount;
2370 bits64 aSig, savedASig;
2371 int32 z;
2373 aSig = extractFloat64Frac( a );
2374 aExp = extractFloat64Exp( a );
2375 aSign = extractFloat64Sign( a );
2376 if ( 0x41E < aExp ) {
2377 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2378 goto invalid;
2380 else if ( aExp < 0x3FF ) {
2381 if ( aExp || aSig ) set_float_exception_inexact_flag();
2382 return 0;
2384 aSig |= LIT64( 0x0010000000000000 );
2385 shiftCount = 0x433 - aExp;
2386 savedASig = aSig;
2387 aSig >>= shiftCount;
2388 z = (int32)aSig;
2389 if ( aSign ) z = - z;
2390 if ( ( z < 0 ) ^ aSign ) {
2391 invalid:
2392 float_raise( float_flag_invalid );
2393 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2395 if ( ( aSig<<shiftCount ) != savedASig ) {
2396 set_float_exception_inexact_flag();
2398 return z;
2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2404 -------------------------------------------------------------------------------
2405 Returns the result of converting the double-precision floating-point value
2406 `a' to the 64-bit two's complement integer format. The conversion is
2407 performed according to the IEC/IEEE Standard for Binary Floating-Point
2408 Arithmetic---which means in particular that the conversion is rounded
2409 according to the current rounding mode. If `a' is a NaN, the largest
2410 positive integer is returned. Otherwise, if the conversion overflows, the
2411 largest integer with the same sign as `a' is returned.
2412 -------------------------------------------------------------------------------
2414 int64 float64_to_int64( float64 a )
2416 flag aSign;
2417 int16 aExp, shiftCount;
2418 bits64 aSig, aSigExtra;
2420 aSig = extractFloat64Frac( a );
2421 aExp = extractFloat64Exp( a );
2422 aSign = extractFloat64Sign( a );
2423 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424 shiftCount = 0x433 - aExp;
2425 if ( shiftCount <= 0 ) {
2426 if ( 0x43E < aExp ) {
2427 float_raise( float_flag_invalid );
2428 if ( ! aSign
2429 || ( ( aExp == 0x7FF )
2430 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2432 return LIT64( 0x7FFFFFFFFFFFFFFF );
2434 return (sbits64) LIT64( 0x8000000000000000 );
2436 aSigExtra = 0;
2437 aSig <<= - shiftCount;
2439 else {
2440 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2442 return roundAndPackInt64( aSign, aSig, aSigExtra );
2447 -------------------------------------------------------------------------------
2448 Returns the result of converting the double-precision floating-point value
2449 `a' to the 64-bit two's complement integer format. The conversion is
2450 performed according to the IEC/IEEE Standard for Binary Floating-Point
2451 Arithmetic, except that the conversion is always rounded toward zero.
2452 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2453 the conversion overflows, the largest integer with the same sign as `a' is
2454 returned.
2455 -------------------------------------------------------------------------------
2457 int64 float64_to_int64_round_to_zero( float64 a )
2459 flag aSign;
2460 int16 aExp, shiftCount;
2461 bits64 aSig;
2462 int64 z;
2464 aSig = extractFloat64Frac( a );
2465 aExp = extractFloat64Exp( a );
2466 aSign = extractFloat64Sign( a );
2467 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2468 shiftCount = aExp - 0x433;
2469 if ( 0 <= shiftCount ) {
2470 if ( 0x43E <= aExp ) {
2471 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2472 float_raise( float_flag_invalid );
2473 if ( ! aSign
2474 || ( ( aExp == 0x7FF )
2475 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2477 return LIT64( 0x7FFFFFFFFFFFFFFF );
2480 return (sbits64) LIT64( 0x8000000000000000 );
2482 z = aSig<<shiftCount;
2484 else {
2485 if ( aExp < 0x3FE ) {
2486 if ( aExp | aSig ) set_float_exception_inexact_flag();
2487 return 0;
2489 z = aSig>>( - shiftCount );
2490 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2491 set_float_exception_inexact_flag();
2494 if ( aSign ) z = - z;
2495 return z;
2498 #endif /* !SOFTFLOAT_FOR_GCC */
2501 -------------------------------------------------------------------------------
2502 Returns the result of converting the double-precision floating-point value
2503 `a' to the single-precision floating-point format. The conversion is
2504 performed according to the IEC/IEEE Standard for Binary Floating-Point
2505 Arithmetic.
2506 -------------------------------------------------------------------------------
2508 float32 float64_to_float32( float64 a )
2510 flag aSign;
2511 int16 aExp;
2512 bits64 aSig;
2513 bits32 zSig;
2515 aSig = extractFloat64Frac( a );
2516 aExp = extractFloat64Exp( a );
2517 aSign = extractFloat64Sign( a );
2518 if ( aExp == 0x7FF ) {
2519 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2520 return packFloat32( aSign, 0xFF, 0 );
2522 shift64RightJamming( aSig, 22, &aSig );
2523 zSig = (bits32)aSig;
2524 if ( aExp || zSig ) {
2525 zSig |= 0x40000000;
2526 aExp -= 0x381;
2528 return roundAndPackFloat32( aSign, aExp, zSig );
2532 #ifdef FLOATX80
2535 -------------------------------------------------------------------------------
2536 Returns the result of converting the double-precision floating-point value
2537 `a' to the extended double-precision floating-point format. The conversion
2538 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2539 Arithmetic.
2540 -------------------------------------------------------------------------------
2542 floatx80 float64_to_floatx80( float64 a )
2544 flag aSign;
2545 int16 aExp;
2546 bits64 aSig;
2548 aSig = extractFloat64Frac( a );
2549 aExp = extractFloat64Exp( a );
2550 aSign = extractFloat64Sign( a );
2551 if ( aExp == 0x7FF ) {
2552 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2553 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2555 if ( aExp == 0 ) {
2556 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2557 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2559 return
2560 packFloatx80(
2561 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2565 #endif
2567 #ifdef FLOAT128
2570 -------------------------------------------------------------------------------
2571 Returns the result of converting the double-precision floating-point value
2572 `a' to the quadruple-precision floating-point format. The conversion is
2573 performed according to the IEC/IEEE Standard for Binary Floating-Point
2574 Arithmetic.
2575 -------------------------------------------------------------------------------
2577 float128 float64_to_float128( float64 a )
2579 flag aSign;
2580 int16 aExp;
2581 bits64 aSig, zSig0, zSig1;
2583 aSig = extractFloat64Frac( a );
2584 aExp = extractFloat64Exp( a );
2585 aSign = extractFloat64Sign( a );
2586 if ( aExp == 0x7FF ) {
2587 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2588 return packFloat128( aSign, 0x7FFF, 0, 0 );
2590 if ( aExp == 0 ) {
2591 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2592 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2593 --aExp;
2595 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2596 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2600 #endif
2602 #ifndef SOFTFLOAT_FOR_GCC
2604 -------------------------------------------------------------------------------
2605 Rounds the double-precision floating-point value `a' to an integer, and
2606 returns the result as a double-precision floating-point value. The
2607 operation is performed according to the IEC/IEEE Standard for Binary
2608 Floating-Point Arithmetic.
2609 -------------------------------------------------------------------------------
2611 float64 float64_round_to_int( float64 a )
2613 flag aSign;
2614 int16 aExp;
2615 bits64 lastBitMask, roundBitsMask;
2616 int8 roundingMode;
2617 float64 z;
2619 aExp = extractFloat64Exp( a );
2620 if ( 0x433 <= aExp ) {
2621 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2622 return propagateFloat64NaN( a, a );
2624 return a;
2626 if ( aExp < 0x3FF ) {
2627 if ( (bits64) ( a<<1 ) == 0 ) return a;
2628 set_float_exception_inexact_flag();
2629 aSign = extractFloat64Sign( a );
2630 switch ( float_rounding_mode ) {
2631 case float_round_nearest_even:
2632 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2633 return packFloat64( aSign, 0x3FF, 0 );
2635 break;
2636 case float_round_to_zero:
2637 break;
2638 case float_round_down:
2639 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2640 case float_round_up:
2641 return
2642 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2644 return packFloat64( aSign, 0, 0 );
2646 lastBitMask = 1;
2647 lastBitMask <<= 0x433 - aExp;
2648 roundBitsMask = lastBitMask - 1;
2649 z = a;
2650 roundingMode = float_rounding_mode;
2651 if ( roundingMode == float_round_nearest_even ) {
2652 z += lastBitMask>>1;
2653 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2655 else if ( roundingMode != float_round_to_zero ) {
2656 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2657 z += roundBitsMask;
2660 z &= ~ roundBitsMask;
2661 if ( z != a ) set_float_exception_inexact_flag();
2662 return z;
2665 #endif
2668 -------------------------------------------------------------------------------
2669 Returns the result of adding the absolute values of the double-precision
2670 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2671 before being returned. `zSign' is ignored if the result is a NaN.
2672 The addition is performed according to the IEC/IEEE Standard for Binary
2673 Floating-Point Arithmetic.
2674 -------------------------------------------------------------------------------
2676 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2678 int16 aExp, bExp, zExp;
2679 bits64 aSig, bSig, zSig;
2680 int16 expDiff;
2682 aSig = extractFloat64Frac( a );
2683 aExp = extractFloat64Exp( a );
2684 bSig = extractFloat64Frac( b );
2685 bExp = extractFloat64Exp( b );
2686 expDiff = aExp - bExp;
2687 aSig <<= 9;
2688 bSig <<= 9;
2689 if ( 0 < expDiff ) {
2690 if ( aExp == 0x7FF ) {
2691 if ( aSig ) return propagateFloat64NaN( a, b );
2692 return a;
2694 if ( bExp == 0 ) {
2695 --expDiff;
2697 else {
2698 bSig |= LIT64( 0x2000000000000000 );
2700 shift64RightJamming( bSig, expDiff, &bSig );
2701 zExp = aExp;
2703 else if ( expDiff < 0 ) {
2704 if ( bExp == 0x7FF ) {
2705 if ( bSig ) return propagateFloat64NaN( a, b );
2706 return packFloat64( zSign, 0x7FF, 0 );
2708 if ( aExp == 0 ) {
2709 ++expDiff;
2711 else {
2712 aSig |= LIT64( 0x2000000000000000 );
2714 shift64RightJamming( aSig, - expDiff, &aSig );
2715 zExp = bExp;
2717 else {
2718 if ( aExp == 0x7FF ) {
2719 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2720 return a;
2722 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2723 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2724 zExp = aExp;
2725 goto roundAndPack;
2727 aSig |= LIT64( 0x2000000000000000 );
2728 zSig = ( aSig + bSig )<<1;
2729 --zExp;
2730 if ( (sbits64) zSig < 0 ) {
2731 zSig = aSig + bSig;
2732 ++zExp;
2734 roundAndPack:
2735 return roundAndPackFloat64( zSign, zExp, zSig );
2740 -------------------------------------------------------------------------------
2741 Returns the result of subtracting the absolute values of the double-
2742 precision floating-point values `a' and `b'. If `zSign' is 1, the
2743 difference is negated before being returned. `zSign' is ignored if the
2744 result is a NaN. The subtraction is performed according to the IEC/IEEE
2745 Standard for Binary Floating-Point Arithmetic.
2746 -------------------------------------------------------------------------------
2748 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2750 int16 aExp, bExp, zExp;
2751 bits64 aSig, bSig, zSig;
2752 int16 expDiff;
2754 aSig = extractFloat64Frac( a );
2755 aExp = extractFloat64Exp( a );
2756 bSig = extractFloat64Frac( b );
2757 bExp = extractFloat64Exp( b );
2758 expDiff = aExp - bExp;
2759 aSig <<= 10;
2760 bSig <<= 10;
2761 if ( 0 < expDiff ) goto aExpBigger;
2762 if ( expDiff < 0 ) goto bExpBigger;
2763 if ( aExp == 0x7FF ) {
2764 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2765 float_raise( float_flag_invalid );
2766 return float64_default_nan;
2768 if ( aExp == 0 ) {
2769 aExp = 1;
2770 bExp = 1;
2772 if ( bSig < aSig ) goto aBigger;
2773 if ( aSig < bSig ) goto bBigger;
2774 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2775 bExpBigger:
2776 if ( bExp == 0x7FF ) {
2777 if ( bSig ) return propagateFloat64NaN( a, b );
2778 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2780 if ( aExp == 0 ) {
2781 ++expDiff;
2783 else {
2784 aSig |= LIT64( 0x4000000000000000 );
2786 shift64RightJamming( aSig, - expDiff, &aSig );
2787 bSig |= LIT64( 0x4000000000000000 );
2788 bBigger:
2789 zSig = bSig - aSig;
2790 zExp = bExp;
2791 zSign ^= 1;
2792 goto normalizeRoundAndPack;
2793 aExpBigger:
2794 if ( aExp == 0x7FF ) {
2795 if ( aSig ) return propagateFloat64NaN( a, b );
2796 return a;
2798 if ( bExp == 0 ) {
2799 --expDiff;
2801 else {
2802 bSig |= LIT64( 0x4000000000000000 );
2804 shift64RightJamming( bSig, expDiff, &bSig );
2805 aSig |= LIT64( 0x4000000000000000 );
2806 aBigger:
2807 zSig = aSig - bSig;
2808 zExp = aExp;
2809 normalizeRoundAndPack:
2810 --zExp;
2811 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2816 -------------------------------------------------------------------------------
2817 Returns the result of adding the double-precision floating-point values `a'
2818 and `b'. The operation is performed according to the IEC/IEEE Standard for
2819 Binary Floating-Point Arithmetic.
2820 -------------------------------------------------------------------------------
2822 float64 float64_add( float64 a, float64 b )
2824 flag aSign, bSign;
2826 aSign = extractFloat64Sign( a );
2827 bSign = extractFloat64Sign( b );
2828 if ( aSign == bSign ) {
2829 return addFloat64Sigs( a, b, aSign );
2831 else {
2832 return subFloat64Sigs( a, b, aSign );
2838 -------------------------------------------------------------------------------
2839 Returns the result of subtracting the double-precision floating-point values
2840 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2841 for Binary Floating-Point Arithmetic.
2842 -------------------------------------------------------------------------------
2844 float64 float64_sub( float64 a, float64 b )
2846 flag aSign, bSign;
2848 aSign = extractFloat64Sign( a );
2849 bSign = extractFloat64Sign( b );
2850 if ( aSign == bSign ) {
2851 return subFloat64Sigs( a, b, aSign );
2853 else {
2854 return addFloat64Sigs( a, b, aSign );
2860 -------------------------------------------------------------------------------
2861 Returns the result of multiplying the double-precision floating-point values
2862 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2863 for Binary Floating-Point Arithmetic.
2864 -------------------------------------------------------------------------------
2866 float64 float64_mul( float64 a, float64 b )
2868 flag aSign, bSign, zSign;
2869 int16 aExp, bExp, zExp;
2870 bits64 aSig, bSig, zSig0, zSig1;
2872 aSig = extractFloat64Frac( a );
2873 aExp = extractFloat64Exp( a );
2874 aSign = extractFloat64Sign( a );
2875 bSig = extractFloat64Frac( b );
2876 bExp = extractFloat64Exp( b );
2877 bSign = extractFloat64Sign( b );
2878 zSign = aSign ^ bSign;
2879 if ( aExp == 0x7FF ) {
2880 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2881 return propagateFloat64NaN( a, b );
2883 if ( ( bExp | bSig ) == 0 ) {
2884 float_raise( float_flag_invalid );
2885 return float64_default_nan;
2887 return packFloat64( zSign, 0x7FF, 0 );
2889 if ( bExp == 0x7FF ) {
2890 if ( bSig ) return propagateFloat64NaN( a, b );
2891 if ( ( aExp | aSig ) == 0 ) {
2892 float_raise( float_flag_invalid );
2893 return float64_default_nan;
2895 return packFloat64( zSign, 0x7FF, 0 );
2897 if ( aExp == 0 ) {
2898 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2899 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2901 if ( bExp == 0 ) {
2902 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2903 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2905 zExp = aExp + bExp - 0x3FF;
2906 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2907 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2908 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2909 zSig0 |= ( zSig1 != 0 );
2910 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2911 zSig0 <<= 1;
2912 --zExp;
2914 return roundAndPackFloat64( zSign, zExp, zSig0 );
2919 -------------------------------------------------------------------------------
2920 Returns the result of dividing the double-precision floating-point value `a'
2921 by the corresponding value `b'. The operation is performed according to
2922 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923 -------------------------------------------------------------------------------
2925 float64 float64_div( float64 a, float64 b )
2927 flag aSign, bSign, zSign;
2928 int16 aExp, bExp, zExp;
2929 bits64 aSig, bSig, zSig;
2930 bits64 rem0, rem1;
2931 bits64 term0, term1;
2933 aSig = extractFloat64Frac( a );
2934 aExp = extractFloat64Exp( a );
2935 aSign = extractFloat64Sign( a );
2936 bSig = extractFloat64Frac( b );
2937 bExp = extractFloat64Exp( b );
2938 bSign = extractFloat64Sign( b );
2939 zSign = aSign ^ bSign;
2940 if ( aExp == 0x7FF ) {
2941 if ( aSig ) return propagateFloat64NaN( a, b );
2942 if ( bExp == 0x7FF ) {
2943 if ( bSig ) return propagateFloat64NaN( a, b );
2944 float_raise( float_flag_invalid );
2945 return float64_default_nan;
2947 return packFloat64( zSign, 0x7FF, 0 );
2949 if ( bExp == 0x7FF ) {
2950 if ( bSig ) return propagateFloat64NaN( a, b );
2951 return packFloat64( zSign, 0, 0 );
2953 if ( bExp == 0 ) {
2954 if ( bSig == 0 ) {
2955 if ( ( aExp | aSig ) == 0 ) {
2956 float_raise( float_flag_invalid );
2957 return float64_default_nan;
2959 float_raise( float_flag_divbyzero );
2960 return packFloat64( zSign, 0x7FF, 0 );
2962 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2964 if ( aExp == 0 ) {
2965 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2966 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968 zExp = aExp - bExp + 0x3FD;
2969 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2970 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2971 if ( bSig <= ( aSig + aSig ) ) {
2972 aSig >>= 1;
2973 ++zExp;
2975 zSig = estimateDiv128To64( aSig, 0, bSig );
2976 if ( ( zSig & 0x1FF ) <= 2 ) {
2977 mul64To128( bSig, zSig, &term0, &term1 );
2978 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2979 while ( (sbits64) rem0 < 0 ) {
2980 --zSig;
2981 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2983 zSig |= ( rem1 != 0 );
2985 return roundAndPackFloat64( zSign, zExp, zSig );
2989 #ifndef SOFTFLOAT_FOR_GCC
2991 -------------------------------------------------------------------------------
2992 Returns the remainder of the double-precision floating-point value `a'
2993 with respect to the corresponding value `b'. The operation is performed
2994 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2995 -------------------------------------------------------------------------------
2997 float64 float64_rem( float64 a, float64 b )
2999 flag aSign, bSign, zSign;
3000 int16 aExp, bExp, expDiff;
3001 bits64 aSig, bSig;
3002 bits64 q, alternateASig;
3003 sbits64 sigMean;
3005 aSig = extractFloat64Frac( a );
3006 aExp = extractFloat64Exp( a );
3007 aSign = extractFloat64Sign( a );
3008 bSig = extractFloat64Frac( b );
3009 bExp = extractFloat64Exp( b );
3010 bSign = extractFloat64Sign( b );
3011 if ( aExp == 0x7FF ) {
3012 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3013 return propagateFloat64NaN( a, b );
3015 float_raise( float_flag_invalid );
3016 return float64_default_nan;
3018 if ( bExp == 0x7FF ) {
3019 if ( bSig ) return propagateFloat64NaN( a, b );
3020 return a;
3022 if ( bExp == 0 ) {
3023 if ( bSig == 0 ) {
3024 float_raise( float_flag_invalid );
3025 return float64_default_nan;
3027 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3029 if ( aExp == 0 ) {
3030 if ( aSig == 0 ) return a;
3031 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3033 expDiff = aExp - bExp;
3034 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3035 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3036 if ( expDiff < 0 ) {
3037 if ( expDiff < -1 ) return a;
3038 aSig >>= 1;
3040 q = ( bSig <= aSig );
3041 if ( q ) aSig -= bSig;
3042 expDiff -= 64;
3043 while ( 0 < expDiff ) {
3044 q = estimateDiv128To64( aSig, 0, bSig );
3045 q = ( 2 < q ) ? q - 2 : 0;
3046 aSig = - ( ( bSig>>2 ) * q );
3047 expDiff -= 62;
3049 expDiff += 64;
3050 if ( 0 < expDiff ) {
3051 q = estimateDiv128To64( aSig, 0, bSig );
3052 q = ( 2 < q ) ? q - 2 : 0;
3053 q >>= 64 - expDiff;
3054 bSig >>= 2;
3055 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3057 else {
3058 aSig >>= 2;
3059 bSig >>= 2;
3061 do {
3062 alternateASig = aSig;
3063 ++q;
3064 aSig -= bSig;
3065 } while ( 0 <= (sbits64) aSig );
3066 sigMean = aSig + alternateASig;
3067 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3068 aSig = alternateASig;
3070 zSign = ( (sbits64) aSig < 0 );
3071 if ( zSign ) aSig = - aSig;
3072 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3077 -------------------------------------------------------------------------------
3078 Returns the square root of the double-precision floating-point value `a'.
3079 The operation is performed according to the IEC/IEEE Standard for Binary
3080 Floating-Point Arithmetic.
3081 -------------------------------------------------------------------------------
3083 float64 float64_sqrt( float64 a )
3085 flag aSign;
3086 int16 aExp, zExp;
3087 bits64 aSig, zSig, doubleZSig;
3088 bits64 rem0, rem1, term0, term1;
3090 aSig = extractFloat64Frac( a );
3091 aExp = extractFloat64Exp( a );
3092 aSign = extractFloat64Sign( a );
3093 if ( aExp == 0x7FF ) {
3094 if ( aSig ) return propagateFloat64NaN( a, a );
3095 if ( ! aSign ) return a;
3096 float_raise( float_flag_invalid );
3097 return float64_default_nan;
3099 if ( aSign ) {
3100 if ( ( aExp | aSig ) == 0 ) return a;
3101 float_raise( float_flag_invalid );
3102 return float64_default_nan;
3104 if ( aExp == 0 ) {
3105 if ( aSig == 0 ) return 0;
3106 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3108 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3109 aSig |= LIT64( 0x0010000000000000 );
3110 zSig = estimateSqrt32( aExp, aSig>>21 );
3111 aSig <<= 9 - ( aExp & 1 );
3112 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3113 if ( ( zSig & 0x1FF ) <= 5 ) {
3114 doubleZSig = zSig<<1;
3115 mul64To128( zSig, zSig, &term0, &term1 );
3116 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3117 while ( (sbits64) rem0 < 0 ) {
3118 --zSig;
3119 doubleZSig -= 2;
3120 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3122 zSig |= ( ( rem0 | rem1 ) != 0 );
3124 return roundAndPackFloat64( 0, zExp, zSig );
3127 #endif
3130 -------------------------------------------------------------------------------
3131 Returns 1 if the double-precision floating-point value `a' is equal to the
3132 corresponding value `b', and 0 otherwise. The comparison is performed
3133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3134 -------------------------------------------------------------------------------
3136 flag float64_eq( float64 a, float64 b )
3139 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3140 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3142 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3143 float_raise( float_flag_invalid );
3145 return 0;
3147 return ( a == b ) ||
3148 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3153 -------------------------------------------------------------------------------
3154 Returns 1 if the double-precision floating-point value `a' is less than or
3155 equal to the corresponding value `b', and 0 otherwise. The comparison is
3156 performed according to the IEC/IEEE Standard for Binary Floating-Point
3157 Arithmetic.
3158 -------------------------------------------------------------------------------
3160 flag float64_le( float64 a, float64 b )
3162 flag aSign, bSign;
3164 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3165 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3167 float_raise( float_flag_invalid );
3168 return 0;
3170 aSign = extractFloat64Sign( a );
3171 bSign = extractFloat64Sign( b );
3172 if ( aSign != bSign )
3173 return aSign ||
3174 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3175 0 );
3176 return ( a == b ) ||
3177 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3182 -------------------------------------------------------------------------------
3183 Returns 1 if the double-precision floating-point value `a' is less than
3184 the corresponding value `b', and 0 otherwise. The comparison is performed
3185 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3186 -------------------------------------------------------------------------------
3188 flag float64_lt( float64 a, float64 b )
3190 flag aSign, bSign;
3192 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3193 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3195 float_raise( float_flag_invalid );
3196 return 0;
3198 aSign = extractFloat64Sign( a );
3199 bSign = extractFloat64Sign( b );
3200 if ( aSign != bSign )
3201 return aSign &&
3202 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3203 0 );
3204 return ( a != b ) &&
3205 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3209 #ifndef SOFTFLOAT_FOR_GCC
3211 -------------------------------------------------------------------------------
3212 Returns 1 if the double-precision floating-point value `a' is equal to the
3213 corresponding value `b', and 0 otherwise. The invalid exception is raised
3214 if either operand is a NaN. Otherwise, the comparison is performed
3215 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3216 -------------------------------------------------------------------------------
3218 flag float64_eq_signaling( float64 a, float64 b )
3221 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3222 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3224 float_raise( float_flag_invalid );
3225 return 0;
3227 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3232 -------------------------------------------------------------------------------
3233 Returns 1 if the double-precision floating-point value `a' is less than or
3234 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3235 cause an exception. Otherwise, the comparison is performed according to the
3236 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3237 -------------------------------------------------------------------------------
3239 flag float64_le_quiet( float64 a, float64 b )
3241 flag aSign, bSign;
3243 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3244 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3246 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3247 float_raise( float_flag_invalid );
3249 return 0;
3251 aSign = extractFloat64Sign( a );
3252 bSign = extractFloat64Sign( b );
3253 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3254 return ( a == b ) || ( aSign ^ ( a < b ) );
3259 -------------------------------------------------------------------------------
3260 Returns 1 if the double-precision floating-point value `a' is less than
3261 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3262 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3263 Standard for Binary Floating-Point Arithmetic.
3264 -------------------------------------------------------------------------------
3266 flag float64_lt_quiet( float64 a, float64 b )
3268 flag aSign, bSign;
3270 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3271 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3273 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3274 float_raise( float_flag_invalid );
3276 return 0;
3278 aSign = extractFloat64Sign( a );
3279 bSign = extractFloat64Sign( b );
3280 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3281 return ( a != b ) && ( aSign ^ ( a < b ) );
3284 #endif
3286 #ifdef FLOATX80
3289 -------------------------------------------------------------------------------
3290 Returns the result of converting the extended double-precision floating-
3291 point value `a' to the 32-bit two's complement integer format. The
3292 conversion is performed according to the IEC/IEEE Standard for Binary
3293 Floating-Point Arithmetic---which means in particular that the conversion
3294 is rounded according to the current rounding mode. If `a' is a NaN, the
3295 largest positive integer is returned. Otherwise, if the conversion
3296 overflows, the largest integer with the same sign as `a' is returned.
3297 -------------------------------------------------------------------------------
3299 int32 floatx80_to_int32( floatx80 a )
3301 flag aSign;
3302 int32 aExp, shiftCount;
3303 bits64 aSig;
3305 aSig = extractFloatx80Frac( a );
3306 aExp = extractFloatx80Exp( a );
3307 aSign = extractFloatx80Sign( a );
3308 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3309 shiftCount = 0x4037 - aExp;
3310 if ( shiftCount <= 0 ) shiftCount = 1;
3311 shift64RightJamming( aSig, shiftCount, &aSig );
3312 return roundAndPackInt32( aSign, aSig );
3317 -------------------------------------------------------------------------------
3318 Returns the result of converting the extended double-precision floating-
3319 point value `a' to the 32-bit two's complement integer format. The
3320 conversion is performed according to the IEC/IEEE Standard for Binary
3321 Floating-Point Arithmetic, except that the conversion is always rounded
3322 toward zero. If `a' is a NaN, the largest positive integer is returned.
3323 Otherwise, if the conversion overflows, the largest integer with the same
3324 sign as `a' is returned.
3325 -------------------------------------------------------------------------------
3327 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3329 flag aSign;
3330 int32 aExp, shiftCount;
3331 bits64 aSig, savedASig;
3332 int32 z;
3334 aSig = extractFloatx80Frac( a );
3335 aExp = extractFloatx80Exp( a );
3336 aSign = extractFloatx80Sign( a );
3337 if ( 0x401E < aExp ) {
3338 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3339 goto invalid;
3341 else if ( aExp < 0x3FFF ) {
3342 if ( aExp || aSig ) set_float_exception_inexact_flag();
3343 return 0;
3345 shiftCount = 0x403E - aExp;
3346 savedASig = aSig;
3347 aSig >>= shiftCount;
3348 z = aSig;
3349 if ( aSign ) z = - z;
3350 if ( ( z < 0 ) ^ aSign ) {
3351 invalid:
3352 float_raise( float_flag_invalid );
3353 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3355 if ( ( aSig<<shiftCount ) != savedASig ) {
3356 set_float_exception_inexact_flag();
3358 return z;
3363 -------------------------------------------------------------------------------
3364 Returns the result of converting the extended double-precision floating-
3365 point value `a' to the 64-bit two's complement integer format. The
3366 conversion is performed according to the IEC/IEEE Standard for Binary
3367 Floating-Point Arithmetic---which means in particular that the conversion
3368 is rounded according to the current rounding mode. If `a' is a NaN,
3369 the largest positive integer is returned. Otherwise, if the conversion
3370 overflows, the largest integer with the same sign as `a' is returned.
3371 -------------------------------------------------------------------------------
3373 int64 floatx80_to_int64( floatx80 a )
3375 flag aSign;
3376 int32 aExp, shiftCount;
3377 bits64 aSig, aSigExtra;
3379 aSig = extractFloatx80Frac( a );
3380 aExp = extractFloatx80Exp( a );
3381 aSign = extractFloatx80Sign( a );
3382 shiftCount = 0x403E - aExp;
3383 if ( shiftCount <= 0 ) {
3384 if ( shiftCount ) {
3385 float_raise( float_flag_invalid );
3386 if ( ! aSign
3387 || ( ( aExp == 0x7FFF )
3388 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3390 return LIT64( 0x7FFFFFFFFFFFFFFF );
3392 return (sbits64) LIT64( 0x8000000000000000 );
3394 aSigExtra = 0;
3396 else {
3397 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3399 return roundAndPackInt64( aSign, aSig, aSigExtra );
3404 -------------------------------------------------------------------------------
3405 Returns the result of converting the extended double-precision floating-
3406 point value `a' to the 64-bit two's complement integer format. The
3407 conversion is performed according to the IEC/IEEE Standard for Binary
3408 Floating-Point Arithmetic, except that the conversion is always rounded
3409 toward zero. If `a' is a NaN, the largest positive integer is returned.
3410 Otherwise, if the conversion overflows, the largest integer with the same
3411 sign as `a' is returned.
3412 -------------------------------------------------------------------------------
3414 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3416 flag aSign;
3417 int32 aExp, shiftCount;
3418 bits64 aSig;
3419 int64 z;
3421 aSig = extractFloatx80Frac( a );
3422 aExp = extractFloatx80Exp( a );
3423 aSign = extractFloatx80Sign( a );
3424 shiftCount = aExp - 0x403E;
3425 if ( 0 <= shiftCount ) {
3426 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3427 if ( ( a.high != 0xC03E ) || aSig ) {
3428 float_raise( float_flag_invalid );
3429 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3430 return LIT64( 0x7FFFFFFFFFFFFFFF );
3433 return (sbits64) LIT64( 0x8000000000000000 );
3435 else if ( aExp < 0x3FFF ) {
3436 if ( aExp | aSig ) set_float_exception_inexact_flag();
3437 return 0;
3439 z = aSig>>( - shiftCount );
3440 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3441 set_float_exception_inexact_flag();
3443 if ( aSign ) z = - z;
3444 return z;
3449 -------------------------------------------------------------------------------
3450 Returns the result of converting the extended double-precision floating-
3451 point value `a' to the single-precision floating-point format. The
3452 conversion is performed according to the IEC/IEEE Standard for Binary
3453 Floating-Point Arithmetic.
3454 -------------------------------------------------------------------------------
3456 float32 floatx80_to_float32( floatx80 a )
3458 flag aSign;
3459 int32 aExp;
3460 bits64 aSig;
3462 aSig = extractFloatx80Frac( a );
3463 aExp = extractFloatx80Exp( a );
3464 aSign = extractFloatx80Sign( a );
3465 if ( aExp == 0x7FFF ) {
3466 if ( (bits64) ( aSig<<1 ) ) {
3467 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3469 return packFloat32( aSign, 0xFF, 0 );
3471 shift64RightJamming( aSig, 33, &aSig );
3472 if ( aExp || aSig ) aExp -= 0x3F81;
3473 return roundAndPackFloat32( aSign, aExp, aSig );
3478 -------------------------------------------------------------------------------
3479 Returns the result of converting the extended double-precision floating-
3480 point value `a' to the double-precision floating-point format. The
3481 conversion is performed according to the IEC/IEEE Standard for Binary
3482 Floating-Point Arithmetic.
3483 -------------------------------------------------------------------------------
3485 float64 floatx80_to_float64( floatx80 a )
3487 flag aSign;
3488 int32 aExp;
3489 bits64 aSig, zSig;
3491 aSig = extractFloatx80Frac( a );
3492 aExp = extractFloatx80Exp( a );
3493 aSign = extractFloatx80Sign( a );
3494 if ( aExp == 0x7FFF ) {
3495 if ( (bits64) ( aSig<<1 ) ) {
3496 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3498 return packFloat64( aSign, 0x7FF, 0 );
3500 shift64RightJamming( aSig, 1, &zSig );
3501 if ( aExp || aSig ) aExp -= 0x3C01;
3502 return roundAndPackFloat64( aSign, aExp, zSig );
3506 #ifdef FLOAT128
3509 -------------------------------------------------------------------------------
3510 Returns the result of converting the extended double-precision floating-
3511 point value `a' to the quadruple-precision floating-point format. The
3512 conversion is performed according to the IEC/IEEE Standard for Binary
3513 Floating-Point Arithmetic.
3514 -------------------------------------------------------------------------------
3516 float128 floatx80_to_float128( floatx80 a )
3518 flag aSign;
3519 int16 aExp;
3520 bits64 aSig, zSig0, zSig1;
3522 aSig = extractFloatx80Frac( a );
3523 aExp = extractFloatx80Exp( a );
3524 aSign = extractFloatx80Sign( a );
3525 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3526 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3528 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3529 return packFloat128( aSign, aExp, zSig0, zSig1 );
3533 #endif
3536 -------------------------------------------------------------------------------
3537 Rounds the extended double-precision floating-point value `a' to an integer,
3538 and returns the result as an extended quadruple-precision floating-point
3539 value. The operation is performed according to the IEC/IEEE Standard for
3540 Binary Floating-Point Arithmetic.
3541 -------------------------------------------------------------------------------
3543 floatx80 floatx80_round_to_int( floatx80 a )
3545 flag aSign;
3546 int32 aExp;
3547 bits64 lastBitMask, roundBitsMask;
3548 int8 roundingMode;
3549 floatx80 z;
3551 aExp = extractFloatx80Exp( a );
3552 if ( 0x403E <= aExp ) {
3553 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3554 return propagateFloatx80NaN( a, a );
3556 return a;
3558 if ( aExp < 0x3FFF ) {
3559 if ( ( aExp == 0 )
3560 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3561 return a;
3563 set_float_exception_inexact_flag();
3564 aSign = extractFloatx80Sign( a );
3565 switch ( float_rounding_mode ) {
3566 case float_round_nearest_even:
3567 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3569 return
3570 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3572 break;
3573 case float_round_to_zero:
3574 break;
3575 case float_round_down:
3576 return
3577 aSign ?
3578 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3579 : packFloatx80( 0, 0, 0 );
3580 case float_round_up:
3581 return
3582 aSign ? packFloatx80( 1, 0, 0 )
3583 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3585 return packFloatx80( aSign, 0, 0 );
3587 lastBitMask = 1;
3588 lastBitMask <<= 0x403E - aExp;
3589 roundBitsMask = lastBitMask - 1;
3590 z = a;
3591 roundingMode = float_rounding_mode;
3592 if ( roundingMode == float_round_nearest_even ) {
3593 z.low += lastBitMask>>1;
3594 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3596 else if ( roundingMode != float_round_to_zero ) {
3597 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3598 z.low += roundBitsMask;
3601 z.low &= ~ roundBitsMask;
3602 if ( z.low == 0 ) {
3603 ++z.high;
3604 z.low = LIT64( 0x8000000000000000 );
3606 if ( z.low != a.low ) set_float_exception_inexact_flag();
3607 return z;
3612 -------------------------------------------------------------------------------
3613 Returns the result of adding the absolute values of the extended double-
3614 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3615 negated before being returned. `zSign' is ignored if the result is a NaN.
3616 The addition is performed according to the IEC/IEEE Standard for Binary
3617 Floating-Point Arithmetic.
3618 -------------------------------------------------------------------------------
3620 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3622 int32 aExp, bExp, zExp;
3623 bits64 aSig, bSig, zSig0, zSig1;
3624 int32 expDiff;
3626 aSig = extractFloatx80Frac( a );
3627 aExp = extractFloatx80Exp( a );
3628 bSig = extractFloatx80Frac( b );
3629 bExp = extractFloatx80Exp( b );
3630 expDiff = aExp - bExp;
3631 if ( 0 < expDiff ) {
3632 if ( aExp == 0x7FFF ) {
3633 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3634 return a;
3636 if ( bExp == 0 ) --expDiff;
3637 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3638 zExp = aExp;
3640 else if ( expDiff < 0 ) {
3641 if ( bExp == 0x7FFF ) {
3642 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3643 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3645 if ( aExp == 0 ) ++expDiff;
3646 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3647 zExp = bExp;
3649 else {
3650 if ( aExp == 0x7FFF ) {
3651 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3652 return propagateFloatx80NaN( a, b );
3654 return a;
3656 zSig1 = 0;
3657 zSig0 = aSig + bSig;
3658 if ( aExp == 0 ) {
3659 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3660 goto roundAndPack;
3662 zExp = aExp;
3663 goto shiftRight1;
3665 zSig0 = aSig + bSig;
3666 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3667 shiftRight1:
3668 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3669 zSig0 |= LIT64( 0x8000000000000000 );
3670 ++zExp;
3671 roundAndPack:
3672 return
3673 roundAndPackFloatx80(
3674 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3679 -------------------------------------------------------------------------------
3680 Returns the result of subtracting the absolute values of the extended
3681 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3682 difference is negated before being returned. `zSign' is ignored if the
3683 result is a NaN. The subtraction is performed according to the IEC/IEEE
3684 Standard for Binary Floating-Point Arithmetic.
3685 -------------------------------------------------------------------------------
3687 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3689 int32 aExp, bExp, zExp;
3690 bits64 aSig, bSig, zSig0, zSig1;
3691 int32 expDiff;
3692 floatx80 z;
3694 aSig = extractFloatx80Frac( a );
3695 aExp = extractFloatx80Exp( a );
3696 bSig = extractFloatx80Frac( b );
3697 bExp = extractFloatx80Exp( b );
3698 expDiff = aExp - bExp;
3699 if ( 0 < expDiff ) goto aExpBigger;
3700 if ( expDiff < 0 ) goto bExpBigger;
3701 if ( aExp == 0x7FFF ) {
3702 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3703 return propagateFloatx80NaN( a, b );
3705 float_raise( float_flag_invalid );
3706 z.low = floatx80_default_nan_low;
3707 z.high = floatx80_default_nan_high;
3708 return z;
3710 if ( aExp == 0 ) {
3711 aExp = 1;
3712 bExp = 1;
3714 zSig1 = 0;
3715 if ( bSig < aSig ) goto aBigger;
3716 if ( aSig < bSig ) goto bBigger;
3717 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3718 bExpBigger:
3719 if ( bExp == 0x7FFF ) {
3720 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3721 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3723 if ( aExp == 0 ) ++expDiff;
3724 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3725 bBigger:
3726 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3727 zExp = bExp;
3728 zSign ^= 1;
3729 goto normalizeRoundAndPack;
3730 aExpBigger:
3731 if ( aExp == 0x7FFF ) {
3732 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3733 return a;
3735 if ( bExp == 0 ) --expDiff;
3736 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3737 aBigger:
3738 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3739 zExp = aExp;
3740 normalizeRoundAndPack:
3741 return
3742 normalizeRoundAndPackFloatx80(
3743 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3748 -------------------------------------------------------------------------------
3749 Returns the result of adding the extended double-precision floating-point
3750 values `a' and `b'. The operation is performed according to the IEC/IEEE
3751 Standard for Binary Floating-Point Arithmetic.
3752 -------------------------------------------------------------------------------
3754 floatx80 floatx80_add( floatx80 a, floatx80 b )
3756 flag aSign, bSign;
3758 aSign = extractFloatx80Sign( a );
3759 bSign = extractFloatx80Sign( b );
3760 if ( aSign == bSign ) {
3761 return addFloatx80Sigs( a, b, aSign );
3763 else {
3764 return subFloatx80Sigs( a, b, aSign );
3770 -------------------------------------------------------------------------------
3771 Returns the result of subtracting the extended double-precision floating-
3772 point values `a' and `b'. The operation is performed according to the
3773 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3774 -------------------------------------------------------------------------------
3776 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3778 flag aSign, bSign;
3780 aSign = extractFloatx80Sign( a );
3781 bSign = extractFloatx80Sign( b );
3782 if ( aSign == bSign ) {
3783 return subFloatx80Sigs( a, b, aSign );
3785 else {
3786 return addFloatx80Sigs( a, b, aSign );
3792 -------------------------------------------------------------------------------
3793 Returns the result of multiplying the extended double-precision floating-
3794 point values `a' and `b'. The operation is performed according to the
3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3796 -------------------------------------------------------------------------------
3798 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3800 flag aSign, bSign, zSign;
3801 int32 aExp, bExp, zExp;
3802 bits64 aSig, bSig, zSig0, zSig1;
3803 floatx80 z;
3805 aSig = extractFloatx80Frac( a );
3806 aExp = extractFloatx80Exp( a );
3807 aSign = extractFloatx80Sign( a );
3808 bSig = extractFloatx80Frac( b );
3809 bExp = extractFloatx80Exp( b );
3810 bSign = extractFloatx80Sign( b );
3811 zSign = aSign ^ bSign;
3812 if ( aExp == 0x7FFF ) {
3813 if ( (bits64) ( aSig<<1 )
3814 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3815 return propagateFloatx80NaN( a, b );
3817 if ( ( bExp | bSig ) == 0 ) goto invalid;
3818 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3820 if ( bExp == 0x7FFF ) {
3821 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3822 if ( ( aExp | aSig ) == 0 ) {
3823 invalid:
3824 float_raise( float_flag_invalid );
3825 z.low = floatx80_default_nan_low;
3826 z.high = floatx80_default_nan_high;
3827 return z;
3829 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3831 if ( aExp == 0 ) {
3832 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3833 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3835 if ( bExp == 0 ) {
3836 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3837 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3839 zExp = aExp + bExp - 0x3FFE;
3840 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3841 if ( 0 < (sbits64) zSig0 ) {
3842 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3843 --zExp;
3845 return
3846 roundAndPackFloatx80(
3847 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3852 -------------------------------------------------------------------------------
3853 Returns the result of dividing the extended double-precision floating-point
3854 value `a' by the corresponding value `b'. The operation is performed
3855 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 -------------------------------------------------------------------------------
3858 floatx80 floatx80_div( floatx80 a, floatx80 b )
3860 flag aSign, bSign, zSign;
3861 int32 aExp, bExp, zExp;
3862 bits64 aSig, bSig, zSig0, zSig1;
3863 bits64 rem0, rem1, rem2, term0, term1, term2;
3864 floatx80 z;
3866 aSig = extractFloatx80Frac( a );
3867 aExp = extractFloatx80Exp( a );
3868 aSign = extractFloatx80Sign( a );
3869 bSig = extractFloatx80Frac( b );
3870 bExp = extractFloatx80Exp( b );
3871 bSign = extractFloatx80Sign( b );
3872 zSign = aSign ^ bSign;
3873 if ( aExp == 0x7FFF ) {
3874 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3875 if ( bExp == 0x7FFF ) {
3876 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3877 goto invalid;
3879 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3881 if ( bExp == 0x7FFF ) {
3882 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3883 return packFloatx80( zSign, 0, 0 );
3885 if ( bExp == 0 ) {
3886 if ( bSig == 0 ) {
3887 if ( ( aExp | aSig ) == 0 ) {
3888 invalid:
3889 float_raise( float_flag_invalid );
3890 z.low = floatx80_default_nan_low;
3891 z.high = floatx80_default_nan_high;
3892 return z;
3894 float_raise( float_flag_divbyzero );
3895 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3897 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3899 if ( aExp == 0 ) {
3900 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3901 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3903 zExp = aExp - bExp + 0x3FFE;
3904 rem1 = 0;
3905 if ( bSig <= aSig ) {
3906 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3907 ++zExp;
3909 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3910 mul64To128( bSig, zSig0, &term0, &term1 );
3911 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3912 while ( (sbits64) rem0 < 0 ) {
3913 --zSig0;
3914 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3916 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3917 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3918 mul64To128( bSig, zSig1, &term1, &term2 );
3919 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3920 while ( (sbits64) rem1 < 0 ) {
3921 --zSig1;
3922 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3924 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3926 return
3927 roundAndPackFloatx80(
3928 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3933 -------------------------------------------------------------------------------
3934 Returns the remainder of the extended double-precision floating-point value
3935 `a' with respect to the corresponding value `b'. The operation is performed
3936 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3937 -------------------------------------------------------------------------------
3939 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3941 flag aSign, bSign, zSign;
3942 int32 aExp, bExp, expDiff;
3943 bits64 aSig0, aSig1, bSig;
3944 bits64 q, term0, term1, alternateASig0, alternateASig1;
3945 floatx80 z;
3947 aSig0 = extractFloatx80Frac( a );
3948 aExp = extractFloatx80Exp( a );
3949 aSign = extractFloatx80Sign( a );
3950 bSig = extractFloatx80Frac( b );
3951 bExp = extractFloatx80Exp( b );
3952 bSign = extractFloatx80Sign( b );
3953 if ( aExp == 0x7FFF ) {
3954 if ( (bits64) ( aSig0<<1 )
3955 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3956 return propagateFloatx80NaN( a, b );
3958 goto invalid;
3960 if ( bExp == 0x7FFF ) {
3961 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3962 return a;
3964 if ( bExp == 0 ) {
3965 if ( bSig == 0 ) {
3966 invalid:
3967 float_raise( float_flag_invalid );
3968 z.low = floatx80_default_nan_low;
3969 z.high = floatx80_default_nan_high;
3970 return z;
3972 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3974 if ( aExp == 0 ) {
3975 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3976 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3978 bSig |= LIT64( 0x8000000000000000 );
3979 zSign = aSign;
3980 expDiff = aExp - bExp;
3981 aSig1 = 0;
3982 if ( expDiff < 0 ) {
3983 if ( expDiff < -1 ) return a;
3984 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3985 expDiff = 0;
3987 q = ( bSig <= aSig0 );
3988 if ( q ) aSig0 -= bSig;
3989 expDiff -= 64;
3990 while ( 0 < expDiff ) {
3991 q = estimateDiv128To64( aSig0, aSig1, bSig );
3992 q = ( 2 < q ) ? q - 2 : 0;
3993 mul64To128( bSig, q, &term0, &term1 );
3994 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3995 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3996 expDiff -= 62;
3998 expDiff += 64;
3999 if ( 0 < expDiff ) {
4000 q = estimateDiv128To64( aSig0, aSig1, bSig );
4001 q = ( 2 < q ) ? q - 2 : 0;
4002 q >>= 64 - expDiff;
4003 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4004 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4005 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4006 while ( le128( term0, term1, aSig0, aSig1 ) ) {
4007 ++q;
4008 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4011 else {
4012 term1 = 0;
4013 term0 = bSig;
4015 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4016 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4017 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4018 && ( q & 1 ) )
4020 aSig0 = alternateASig0;
4021 aSig1 = alternateASig1;
4022 zSign = ! zSign;
4024 return
4025 normalizeRoundAndPackFloatx80(
4026 80, zSign, bExp + expDiff, aSig0, aSig1 );
4031 -------------------------------------------------------------------------------
4032 Returns the square root of the extended double-precision floating-point
4033 value `a'. The operation is performed according to the IEC/IEEE Standard
4034 for Binary Floating-Point Arithmetic.
4035 -------------------------------------------------------------------------------
4037 floatx80 floatx80_sqrt( floatx80 a )
4039 flag aSign;
4040 int32 aExp, zExp;
4041 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4042 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4043 floatx80 z;
4045 aSig0 = extractFloatx80Frac( a );
4046 aExp = extractFloatx80Exp( a );
4047 aSign = extractFloatx80Sign( a );
4048 if ( aExp == 0x7FFF ) {
4049 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4050 if ( ! aSign ) return a;
4051 goto invalid;
4053 if ( aSign ) {
4054 if ( ( aExp | aSig0 ) == 0 ) return a;
4055 invalid:
4056 float_raise( float_flag_invalid );
4057 z.low = floatx80_default_nan_low;
4058 z.high = floatx80_default_nan_high;
4059 return z;
4061 if ( aExp == 0 ) {
4062 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4063 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4065 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4066 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4067 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4068 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4069 doubleZSig0 = zSig0<<1;
4070 mul64To128( zSig0, zSig0, &term0, &term1 );
4071 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4072 while ( (sbits64) rem0 < 0 ) {
4073 --zSig0;
4074 doubleZSig0 -= 2;
4075 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4077 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4078 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4079 if ( zSig1 == 0 ) zSig1 = 1;
4080 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4081 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4082 mul64To128( zSig1, zSig1, &term2, &term3 );
4083 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4084 while ( (sbits64) rem1 < 0 ) {
4085 --zSig1;
4086 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4087 term3 |= 1;
4088 term2 |= doubleZSig0;
4089 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4091 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4093 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4094 zSig0 |= doubleZSig0;
4095 return
4096 roundAndPackFloatx80(
4097 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4102 -------------------------------------------------------------------------------
4103 Returns 1 if the extended double-precision floating-point value `a' is
4104 equal to the corresponding value `b', and 0 otherwise. The comparison is
4105 performed according to the IEC/IEEE Standard for Binary Floating-Point
4106 Arithmetic.
4107 -------------------------------------------------------------------------------
4109 flag floatx80_eq( floatx80 a, floatx80 b )
4112 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4113 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4114 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4115 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4117 if ( floatx80_is_signaling_nan( a )
4118 || floatx80_is_signaling_nan( b ) ) {
4119 float_raise( float_flag_invalid );
4121 return 0;
4123 return
4124 ( a.low == b.low )
4125 && ( ( a.high == b.high )
4126 || ( ( a.low == 0 )
4127 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4133 -------------------------------------------------------------------------------
4134 Returns 1 if the extended double-precision floating-point value `a' is
4135 less than or equal to the corresponding value `b', and 0 otherwise. The
4136 comparison is performed according to the IEC/IEEE Standard for Binary
4137 Floating-Point Arithmetic.
4138 -------------------------------------------------------------------------------
4140 flag floatx80_le( floatx80 a, floatx80 b )
4142 flag aSign, bSign;
4144 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4145 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4146 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4147 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4149 float_raise( float_flag_invalid );
4150 return 0;
4152 aSign = extractFloatx80Sign( a );
4153 bSign = extractFloatx80Sign( b );
4154 if ( aSign != bSign ) {
4155 return
4156 aSign
4157 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4158 == 0 );
4160 return
4161 aSign ? le128( b.high, b.low, a.high, a.low )
4162 : le128( a.high, a.low, b.high, b.low );
4167 -------------------------------------------------------------------------------
4168 Returns 1 if the extended double-precision floating-point value `a' is
4169 less than the corresponding value `b', and 0 otherwise. The comparison
4170 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4171 Arithmetic.
4172 -------------------------------------------------------------------------------
4174 flag floatx80_lt( floatx80 a, floatx80 b )
4176 flag aSign, bSign;
4178 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4179 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4180 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4181 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4183 float_raise( float_flag_invalid );
4184 return 0;
4186 aSign = extractFloatx80Sign( a );
4187 bSign = extractFloatx80Sign( b );
4188 if ( aSign != bSign ) {
4189 return
4190 aSign
4191 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4192 != 0 );
4194 return
4195 aSign ? lt128( b.high, b.low, a.high, a.low )
4196 : lt128( a.high, a.low, b.high, b.low );
4201 -------------------------------------------------------------------------------
4202 Returns 1 if the extended double-precision floating-point value `a' is equal
4203 to the corresponding value `b', and 0 otherwise. The invalid exception is
4204 raised if either operand is a NaN. Otherwise, the comparison is performed
4205 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4206 -------------------------------------------------------------------------------
4208 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4211 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4212 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4213 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4214 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4216 float_raise( float_flag_invalid );
4217 return 0;
4219 return
4220 ( a.low == b.low )
4221 && ( ( a.high == b.high )
4222 || ( ( a.low == 0 )
4223 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4229 -------------------------------------------------------------------------------
4230 Returns 1 if the extended double-precision floating-point value `a' is less
4231 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4232 do not cause an exception. Otherwise, the comparison is performed according
4233 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 -------------------------------------------------------------------------------
4236 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4238 flag aSign, bSign;
4240 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4241 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4242 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4243 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4245 if ( floatx80_is_signaling_nan( a )
4246 || floatx80_is_signaling_nan( b ) ) {
4247 float_raise( float_flag_invalid );
4249 return 0;
4251 aSign = extractFloatx80Sign( a );
4252 bSign = extractFloatx80Sign( b );
4253 if ( aSign != bSign ) {
4254 return
4255 aSign
4256 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4257 == 0 );
4259 return
4260 aSign ? le128( b.high, b.low, a.high, a.low )
4261 : le128( a.high, a.low, b.high, b.low );
4266 -------------------------------------------------------------------------------
4267 Returns 1 if the extended double-precision floating-point value `a' is less
4268 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4269 an exception. Otherwise, the comparison is performed according to the
4270 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 -------------------------------------------------------------------------------
4273 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4275 flag aSign, bSign;
4277 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4278 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4279 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4280 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4282 if ( floatx80_is_signaling_nan( a )
4283 || floatx80_is_signaling_nan( b ) ) {
4284 float_raise( float_flag_invalid );
4286 return 0;
4288 aSign = extractFloatx80Sign( a );
4289 bSign = extractFloatx80Sign( b );
4290 if ( aSign != bSign ) {
4291 return
4292 aSign
4293 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4294 != 0 );
4296 return
4297 aSign ? lt128( b.high, b.low, a.high, a.low )
4298 : lt128( a.high, a.low, b.high, b.low );
4302 #endif
4304 #ifdef FLOAT128
4307 -------------------------------------------------------------------------------
4308 Returns the result of converting the quadruple-precision floating-point
4309 value `a' to the 32-bit two's complement integer format. The conversion
4310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4311 Arithmetic---which means in particular that the conversion is rounded
4312 according to the current rounding mode. If `a' is a NaN, the largest
4313 positive integer is returned. Otherwise, if the conversion overflows, the
4314 largest integer with the same sign as `a' is returned.
4315 -------------------------------------------------------------------------------
4317 int32 float128_to_int32( float128 a )
4319 flag aSign;
4320 int32 aExp, shiftCount;
4321 bits64 aSig0, aSig1;
4323 aSig1 = extractFloat128Frac1( a );
4324 aSig0 = extractFloat128Frac0( a );
4325 aExp = extractFloat128Exp( a );
4326 aSign = extractFloat128Sign( a );
4327 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4328 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4329 aSig0 |= ( aSig1 != 0 );
4330 shiftCount = 0x4028 - aExp;
4331 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4332 return roundAndPackInt32( aSign, aSig0 );
4337 -------------------------------------------------------------------------------
4338 Returns the result of converting the quadruple-precision floating-point
4339 value `a' to the 32-bit two's complement integer format. The conversion
4340 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4341 Arithmetic, except that the conversion is always rounded toward zero. If
4342 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4343 conversion overflows, the largest integer with the same sign as `a' is
4344 returned.
4345 -------------------------------------------------------------------------------
4347 int32 float128_to_int32_round_to_zero( float128 a )
4349 flag aSign;
4350 int32 aExp, shiftCount;
4351 bits64 aSig0, aSig1, savedASig;
4352 int32 z;
4354 aSig1 = extractFloat128Frac1( a );
4355 aSig0 = extractFloat128Frac0( a );
4356 aExp = extractFloat128Exp( a );
4357 aSign = extractFloat128Sign( a );
4358 aSig0 |= ( aSig1 != 0 );
4359 if ( 0x401E < aExp ) {
4360 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4361 goto invalid;
4363 else if ( aExp < 0x3FFF ) {
4364 if ( aExp || aSig0 ) set_float_exception_inexact_flag();
4365 return 0;
4367 aSig0 |= LIT64( 0x0001000000000000 );
4368 shiftCount = 0x402F - aExp;
4369 savedASig = aSig0;
4370 aSig0 >>= shiftCount;
4371 z = (int32)aSig0;
4372 if ( aSign ) z = - z;
4373 if ( ( z < 0 ) ^ aSign ) {
4374 invalid:
4375 float_raise( float_flag_invalid );
4376 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4378 if ( ( aSig0<<shiftCount ) != savedASig ) {
4379 set_float_exception_inexact_flag();
4381 return z;
4386 -------------------------------------------------------------------------------
4387 Returns the result of converting the quadruple-precision floating-point
4388 value `a' to the 64-bit two's complement integer format. The conversion
4389 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4390 Arithmetic---which means in particular that the conversion is rounded
4391 according to the current rounding mode. If `a' is a NaN, the largest
4392 positive integer is returned. Otherwise, if the conversion overflows, the
4393 largest integer with the same sign as `a' is returned.
4394 -------------------------------------------------------------------------------
4396 int64 float128_to_int64( float128 a )
4398 flag aSign;
4399 int32 aExp, shiftCount;
4400 bits64 aSig0, aSig1;
4402 aSig1 = extractFloat128Frac1( a );
4403 aSig0 = extractFloat128Frac0( a );
4404 aExp = extractFloat128Exp( a );
4405 aSign = extractFloat128Sign( a );
4406 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4407 shiftCount = 0x402F - aExp;
4408 if ( shiftCount <= 0 ) {
4409 if ( 0x403E < aExp ) {
4410 float_raise( float_flag_invalid );
4411 if ( ! aSign
4412 || ( ( aExp == 0x7FFF )
4413 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4416 return LIT64( 0x7FFFFFFFFFFFFFFF );
4418 return (sbits64) LIT64( 0x8000000000000000 );
4420 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4422 else {
4423 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4425 return roundAndPackInt64( aSign, aSig0, aSig1 );
4430 -------------------------------------------------------------------------------
4431 Returns the result of converting the quadruple-precision floating-point
4432 value `a' to the 64-bit two's complement integer format. The conversion
4433 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4434 Arithmetic, except that the conversion is always rounded toward zero.
4435 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4436 the conversion overflows, the largest integer with the same sign as `a' is
4437 returned.
4438 -------------------------------------------------------------------------------
4440 int64 float128_to_int64_round_to_zero( float128 a )
4442 flag aSign;
4443 int32 aExp, shiftCount;
4444 bits64 aSig0, aSig1;
4445 int64 z;
4447 aSig1 = extractFloat128Frac1( a );
4448 aSig0 = extractFloat128Frac0( a );
4449 aExp = extractFloat128Exp( a );
4450 aSign = extractFloat128Sign( a );
4451 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4452 shiftCount = aExp - 0x402F;
4453 if ( 0 < shiftCount ) {
4454 if ( 0x403E <= aExp ) {
4455 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4456 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4457 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4458 if ( aSig1 ) set_float_exception_inexact_flag();
4460 else {
4461 float_raise( float_flag_invalid );
4462 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4463 return LIT64( 0x7FFFFFFFFFFFFFFF );
4466 return (sbits64) LIT64( 0x8000000000000000 );
4468 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4469 if ( (bits64) ( aSig1<<shiftCount ) ) {
4470 set_float_exception_inexact_flag();
4473 else {
4474 if ( aExp < 0x3FFF ) {
4475 if ( aExp | aSig0 | aSig1 ) {
4476 set_float_exception_inexact_flag();
4478 return 0;
4480 z = aSig0>>( - shiftCount );
4481 if ( aSig1
4482 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4483 set_float_exception_inexact_flag();
4486 if ( aSign ) z = - z;
4487 return z;
#if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
    && defined(SOFTFLOAT_NEED_FIXUNS)
/*
 * Unsigned counterpart of float128_to_int64_round_to_zero(): converts the
 * quadruple-precision value `a' to a 64-bit unsigned integer, rounding
 * toward zero, without caring about overflow of signed results.
 *
 * Any value whose exponent is at least 0x403F (which includes infinities
 * and NaNs) raises the invalid exception and returns all ones.  For a
 * negative `a' the truncated magnitude is negated in unsigned arithmetic.
 * The inexact flag is set whenever nonzero fraction bits are discarded.
 */
uint64 float128_to_uint64_round_to_zero(float128 a)
{
    flag aSign;
    int32 aExp, shiftCount;
    bits64 aSig0, aSig1;
    uint64 z;

    aSig1 = extractFloat128Frac1(a);
    aSig0 = extractFloat128Frac0(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Nonzero exponent: make the leading significand bit explicit. */
    if (aExp) {
        aSig0 |= LIT64(0x0001000000000000);
    }

    shiftCount = aExp - 0x402F;
    if (0 < shiftCount) {
        if (0x403F <= aExp) {
            /*
             * Out of range.  The signed variant special-cases the bit
             * pattern 0xC03E... here, but that pattern has exponent 0x403E
             * and can never reach this branch, so invalid is always raised.
             */
            float_raise(float_flag_invalid);
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        /* The integer part straddles both significand words. */
        z = (aSig0 << shiftCount) | (aSig1 >> ((-shiftCount) & 63));
        if ((bits64)(aSig1 << shiftCount)) {
            set_float_exception_inexact_flag();
        }
    }
    else {
        if (aExp < 0x3FFF) {
            /* Magnitude below 1: result is 0, inexact unless `a' is zero. */
            if (aExp | aSig0 | aSig1) {
                set_float_exception_inexact_flag();
            }
            return 0;
        }
        /* The integer part lies entirely within the high word. */
        z = aSig0 >> (-shiftCount);
        if (aSig1
            || (shiftCount && (bits64)(aSig0 << (shiftCount & 63)))) {
            set_float_exception_inexact_flag();
        }
    }

    if (aSign) {
        z = -z;
    }
    return z;
}
#endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4545 -------------------------------------------------------------------------------
4546 Returns the result of converting the quadruple-precision floating-point
4547 value `a' to the single-precision floating-point format. The conversion
4548 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4549 Arithmetic.
4550 -------------------------------------------------------------------------------
4552 float32 float128_to_float32( float128 a )
4554 flag aSign;
4555 int32 aExp;
4556 bits64 aSig0, aSig1;
4557 bits32 zSig;
4559 aSig1 = extractFloat128Frac1( a );
4560 aSig0 = extractFloat128Frac0( a );
4561 aExp = extractFloat128Exp( a );
4562 aSign = extractFloat128Sign( a );
4563 if ( aExp == 0x7FFF ) {
4564 if ( aSig0 | aSig1 ) {
4565 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4567 return packFloat32( aSign, 0xFF, 0 );
4569 aSig0 |= ( aSig1 != 0 );
4570 shift64RightJamming( aSig0, 18, &aSig0 );
4571 zSig = (bits32)aSig0;
4572 if ( aExp || zSig ) {
4573 zSig |= 0x40000000;
4574 aExp -= 0x3F81;
4576 return roundAndPackFloat32( aSign, aExp, zSig );
4581 -------------------------------------------------------------------------------
4582 Returns the result of converting the quadruple-precision floating-point
4583 value `a' to the double-precision floating-point format. The conversion
4584 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4585 Arithmetic.
4586 -------------------------------------------------------------------------------
4588 float64 float128_to_float64( float128 a )
4590 flag aSign;
4591 int32 aExp;
4592 bits64 aSig0, aSig1;
4594 aSig1 = extractFloat128Frac1( a );
4595 aSig0 = extractFloat128Frac0( a );
4596 aExp = extractFloat128Exp( a );
4597 aSign = extractFloat128Sign( a );
4598 if ( aExp == 0x7FFF ) {
4599 if ( aSig0 | aSig1 ) {
4600 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4602 return packFloat64( aSign, 0x7FF, 0 );
4604 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4605 aSig0 |= ( aSig1 != 0 );
4606 if ( aExp || aSig0 ) {
4607 aSig0 |= LIT64( 0x4000000000000000 );
4608 aExp -= 0x3C01;
4610 return roundAndPackFloat64( aSign, aExp, aSig0 );
#ifdef FLOATX80

/*
-------------------------------------------------------------------------------
Converts the quadruple-precision value `a' to the extended double-precision
format, following the IEC/IEEE Standard for Binary Floating-Point
Arithmetic.
-------------------------------------------------------------------------------
*/
floatx80 float128_to_floatx80(float128 a)
{
    flag aSign;
    int32 aExp;
    bits64 aSig0, aSig1;

    aSig1 = extractFloat128Frac1(a);
    aSig0 = extractFloat128Frac0(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);

    if (aExp == 0x7FFF) {
        if (aSig0 | aSig1) {
            return commonNaNToFloatx80(float128ToCommonNaN(a));
        }
        return packFloatx80(aSign, 0x7FFF, LIT64(0x8000000000000000));
    }

    if (aExp != 0) {
        /* Normal number: make the leading significand bit explicit. */
        aSig0 |= LIT64(0x0001000000000000);
    }
    else {
        if ((aSig0 | aSig1) == 0) {
            return packFloatx80(aSign, 0, 0);
        }
        normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
    }

    shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1);
    return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1);
}

#endif
4655 -------------------------------------------------------------------------------
4656 Rounds the quadruple-precision floating-point value `a' to an integer, and
4657 returns the result as a quadruple-precision floating-point value. The
4658 operation is performed according to the IEC/IEEE Standard for Binary
4659 Floating-Point Arithmetic.
4660 -------------------------------------------------------------------------------
4662 float128 float128_round_to_int( float128 a )
4664 flag aSign;
4665 int32 aExp;
4666 bits64 lastBitMask, roundBitsMask;
4667 int8 roundingMode;
4668 float128 z;
4670 aExp = extractFloat128Exp( a );
4671 if ( 0x402F <= aExp ) {
4672 if ( 0x406F <= aExp ) {
4673 if ( ( aExp == 0x7FFF )
4674 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4676 return propagateFloat128NaN( a, a );
4678 return a;
4680 lastBitMask = 1;
4681 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4682 roundBitsMask = lastBitMask - 1;
4683 z = a;
4684 roundingMode = float_rounding_mode;
4685 if ( roundingMode == float_round_nearest_even ) {
4686 if ( lastBitMask ) {
4687 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4688 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4690 else {
4691 if ( (sbits64) z.low < 0 ) {
4692 ++z.high;
4693 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4697 else if ( roundingMode != float_round_to_zero ) {
4698 if ( extractFloat128Sign( z )
4699 ^ ( roundingMode == float_round_up ) ) {
4700 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4703 z.low &= ~ roundBitsMask;
4705 else {
4706 if ( aExp < 0x3FFF ) {
4707 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4708 set_float_exception_inexact_flag();
4709 aSign = extractFloat128Sign( a );
4710 switch ( float_rounding_mode ) {
4711 case float_round_nearest_even:
4712 if ( ( aExp == 0x3FFE )
4713 && ( extractFloat128Frac0( a )
4714 | extractFloat128Frac1( a ) )
4716 return packFloat128( aSign, 0x3FFF, 0, 0 );
4718 break;
4719 case float_round_to_zero:
4720 break;
4721 case float_round_down:
4722 return
4723 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4724 : packFloat128( 0, 0, 0, 0 );
4725 case float_round_up:
4726 return
4727 aSign ? packFloat128( 1, 0, 0, 0 )
4728 : packFloat128( 0, 0x3FFF, 0, 0 );
4730 return packFloat128( aSign, 0, 0, 0 );
4732 lastBitMask = 1;
4733 lastBitMask <<= 0x402F - aExp;
4734 roundBitsMask = lastBitMask - 1;
4735 z.low = 0;
4736 z.high = a.high;
4737 roundingMode = float_rounding_mode;
4738 if ( roundingMode == float_round_nearest_even ) {
4739 z.high += lastBitMask>>1;
4740 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4741 z.high &= ~ lastBitMask;
4744 else if ( roundingMode != float_round_to_zero ) {
4745 if ( extractFloat128Sign( z )
4746 ^ ( roundingMode == float_round_up ) ) {
4747 z.high |= ( a.low != 0 );
4748 z.high += roundBitsMask;
4751 z.high &= ~ roundBitsMask;
4753 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4754 set_float_exception_inexact_flag();
4756 return z;
4761 -------------------------------------------------------------------------------
4762 Returns the result of adding the absolute values of the quadruple-precision
4763 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4764 before being returned. `zSign' is ignored if the result is a NaN.
4765 The addition is performed according to the IEC/IEEE Standard for Binary
4766 Floating-Point Arithmetic.
4767 -------------------------------------------------------------------------------
4769 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4771 int32 aExp, bExp, zExp;
4772 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4773 int32 expDiff;
4775 aSig1 = extractFloat128Frac1( a );
4776 aSig0 = extractFloat128Frac0( a );
4777 aExp = extractFloat128Exp( a );
4778 bSig1 = extractFloat128Frac1( b );
4779 bSig0 = extractFloat128Frac0( b );
4780 bExp = extractFloat128Exp( b );
4781 expDiff = aExp - bExp;
4782 if ( 0 < expDiff ) {
4783 if ( aExp == 0x7FFF ) {
4784 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4785 return a;
4787 if ( bExp == 0 ) {
4788 --expDiff;
4790 else {
4791 bSig0 |= LIT64( 0x0001000000000000 );
4793 shift128ExtraRightJamming(
4794 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4795 zExp = aExp;
4797 else if ( expDiff < 0 ) {
4798 if ( bExp == 0x7FFF ) {
4799 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4800 return packFloat128( zSign, 0x7FFF, 0, 0 );
4802 if ( aExp == 0 ) {
4803 ++expDiff;
4805 else {
4806 aSig0 |= LIT64( 0x0001000000000000 );
4808 shift128ExtraRightJamming(
4809 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4810 zExp = bExp;
4812 else {
4813 if ( aExp == 0x7FFF ) {
4814 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4815 return propagateFloat128NaN( a, b );
4817 return a;
4819 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4820 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4821 zSig2 = 0;
4822 zSig0 |= LIT64( 0x0002000000000000 );
4823 zExp = aExp;
4824 goto shiftRight1;
4826 aSig0 |= LIT64( 0x0001000000000000 );
4827 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4828 --zExp;
4829 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4830 ++zExp;
4831 shiftRight1:
4832 shift128ExtraRightJamming(
4833 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4834 roundAndPack:
4835 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4840 -------------------------------------------------------------------------------
4841 Returns the result of subtracting the absolute values of the quadruple-
4842 precision floating-point values `a' and `b'. If `zSign' is 1, the
4843 difference is negated before being returned. `zSign' is ignored if the
4844 result is a NaN. The subtraction is performed according to the IEC/IEEE
4845 Standard for Binary Floating-Point Arithmetic.
4846 -------------------------------------------------------------------------------
4848 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4850 int32 aExp, bExp, zExp;
4851 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4852 int32 expDiff;
4853 float128 z;
4855 aSig1 = extractFloat128Frac1( a );
4856 aSig0 = extractFloat128Frac0( a );
4857 aExp = extractFloat128Exp( a );
4858 bSig1 = extractFloat128Frac1( b );
4859 bSig0 = extractFloat128Frac0( b );
4860 bExp = extractFloat128Exp( b );
4861 expDiff = aExp - bExp;
4862 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4863 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4864 if ( 0 < expDiff ) goto aExpBigger;
4865 if ( expDiff < 0 ) goto bExpBigger;
4866 if ( aExp == 0x7FFF ) {
4867 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4868 return propagateFloat128NaN( a, b );
4870 float_raise( float_flag_invalid );
4871 z.low = float128_default_nan_low;
4872 z.high = float128_default_nan_high;
4873 return z;
4875 if ( aExp == 0 ) {
4876 aExp = 1;
4877 bExp = 1;
4879 if ( bSig0 < aSig0 ) goto aBigger;
4880 if ( aSig0 < bSig0 ) goto bBigger;
4881 if ( bSig1 < aSig1 ) goto aBigger;
4882 if ( aSig1 < bSig1 ) goto bBigger;
4883 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4884 bExpBigger:
4885 if ( bExp == 0x7FFF ) {
4886 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4887 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4889 if ( aExp == 0 ) {
4890 ++expDiff;
4892 else {
4893 aSig0 |= LIT64( 0x4000000000000000 );
4895 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4896 bSig0 |= LIT64( 0x4000000000000000 );
4897 bBigger:
4898 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4899 zExp = bExp;
4900 zSign ^= 1;
4901 goto normalizeRoundAndPack;
4902 aExpBigger:
4903 if ( aExp == 0x7FFF ) {
4904 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4905 return a;
4907 if ( bExp == 0 ) {
4908 --expDiff;
4910 else {
4911 bSig0 |= LIT64( 0x4000000000000000 );
4913 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4914 aSig0 |= LIT64( 0x4000000000000000 );
4915 aBigger:
4916 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4917 zExp = aExp;
4918 normalizeRoundAndPack:
4919 --zExp;
4920 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4925 -------------------------------------------------------------------------------
4926 Returns the result of adding the quadruple-precision floating-point values
4927 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4928 for Binary Floating-Point Arithmetic.
4929 -------------------------------------------------------------------------------
4931 float128 float128_add( float128 a, float128 b )
4933 flag aSign, bSign;
4935 aSign = extractFloat128Sign( a );
4936 bSign = extractFloat128Sign( b );
4937 if ( aSign == bSign ) {
4938 return addFloat128Sigs( a, b, aSign );
4940 else {
4941 return subFloat128Sigs( a, b, aSign );
4947 -------------------------------------------------------------------------------
4948 Returns the result of subtracting the quadruple-precision floating-point
4949 values `a' and `b'. The operation is performed according to the IEC/IEEE
4950 Standard for Binary Floating-Point Arithmetic.
4951 -------------------------------------------------------------------------------
4953 float128 float128_sub( float128 a, float128 b )
4955 flag aSign, bSign;
4957 aSign = extractFloat128Sign( a );
4958 bSign = extractFloat128Sign( b );
4959 if ( aSign == bSign ) {
4960 return subFloat128Sigs( a, b, aSign );
4962 else {
4963 return addFloat128Sigs( a, b, aSign );
4969 -------------------------------------------------------------------------------
4970 Returns the result of multiplying the quadruple-precision floating-point
4971 values `a' and `b'. The operation is performed according to the IEC/IEEE
4972 Standard for Binary Floating-Point Arithmetic.
4973 -------------------------------------------------------------------------------
4975 float128 float128_mul( float128 a, float128 b )
4977 flag aSign, bSign, zSign;
4978 int32 aExp, bExp, zExp;
4979 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4980 float128 z;
4982 aSig1 = extractFloat128Frac1( a );
4983 aSig0 = extractFloat128Frac0( a );
4984 aExp = extractFloat128Exp( a );
4985 aSign = extractFloat128Sign( a );
4986 bSig1 = extractFloat128Frac1( b );
4987 bSig0 = extractFloat128Frac0( b );
4988 bExp = extractFloat128Exp( b );
4989 bSign = extractFloat128Sign( b );
4990 zSign = aSign ^ bSign;
4991 if ( aExp == 0x7FFF ) {
4992 if ( ( aSig0 | aSig1 )
4993 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4994 return propagateFloat128NaN( a, b );
4996 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4997 return packFloat128( zSign, 0x7FFF, 0, 0 );
4999 if ( bExp == 0x7FFF ) {
5000 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5001 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5002 invalid:
5003 float_raise( float_flag_invalid );
5004 z.low = float128_default_nan_low;
5005 z.high = float128_default_nan_high;
5006 return z;
5008 return packFloat128( zSign, 0x7FFF, 0, 0 );
5010 if ( aExp == 0 ) {
5011 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5012 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5014 if ( bExp == 0 ) {
5015 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5016 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5018 zExp = aExp + bExp - 0x4000;
5019 aSig0 |= LIT64( 0x0001000000000000 );
5020 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5021 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5022 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5023 zSig2 |= ( zSig3 != 0 );
5024 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5025 shift128ExtraRightJamming(
5026 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5027 ++zExp;
5029 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5034 -------------------------------------------------------------------------------
5035 Returns the result of dividing the quadruple-precision floating-point value
5036 `a' by the corresponding value `b'. The operation is performed according to
5037 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5038 -------------------------------------------------------------------------------
5040 float128 float128_div( float128 a, float128 b )
5042 flag aSign, bSign, zSign;
5043 int32 aExp, bExp, zExp;
5044 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5045 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5046 float128 z;
5048 aSig1 = extractFloat128Frac1( a );
5049 aSig0 = extractFloat128Frac0( a );
5050 aExp = extractFloat128Exp( a );
5051 aSign = extractFloat128Sign( a );
5052 bSig1 = extractFloat128Frac1( b );
5053 bSig0 = extractFloat128Frac0( b );
5054 bExp = extractFloat128Exp( b );
5055 bSign = extractFloat128Sign( b );
5056 zSign = aSign ^ bSign;
5057 if ( aExp == 0x7FFF ) {
5058 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5059 if ( bExp == 0x7FFF ) {
5060 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5061 goto invalid;
5063 return packFloat128( zSign, 0x7FFF, 0, 0 );
5065 if ( bExp == 0x7FFF ) {
5066 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5067 return packFloat128( zSign, 0, 0, 0 );
5069 if ( bExp == 0 ) {
5070 if ( ( bSig0 | bSig1 ) == 0 ) {
5071 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5072 invalid:
5073 float_raise( float_flag_invalid );
5074 z.low = float128_default_nan_low;
5075 z.high = float128_default_nan_high;
5076 return z;
5078 float_raise( float_flag_divbyzero );
5079 return packFloat128( zSign, 0x7FFF, 0, 0 );
5081 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5083 if ( aExp == 0 ) {
5084 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5085 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5087 zExp = aExp - bExp + 0x3FFD;
5088 shortShift128Left(
5089 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5090 shortShift128Left(
5091 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5092 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5093 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5094 ++zExp;
5096 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5097 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5098 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5099 while ( (sbits64) rem0 < 0 ) {
5100 --zSig0;
5101 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5103 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5104 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5105 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5106 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5107 while ( (sbits64) rem1 < 0 ) {
5108 --zSig1;
5109 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5111 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5113 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5119 -------------------------------------------------------------------------------
5120 Returns the remainder of the quadruple-precision floating-point value `a'
5121 with respect to the corresponding value `b'. The operation is performed
5122 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5123 -------------------------------------------------------------------------------
5125 float128 float128_rem( float128 a, float128 b )
5127 flag aSign, zSign;
5128 int32 aExp, bExp, expDiff;
5129 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5130 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5131 sbits64 sigMean0;
5132 float128 z;
5134 aSig1 = extractFloat128Frac1( a );
5135 aSig0 = extractFloat128Frac0( a );
5136 aExp = extractFloat128Exp( a );
5137 aSign = extractFloat128Sign( a );
5138 bSig1 = extractFloat128Frac1( b );
5139 bSig0 = extractFloat128Frac0( b );
5140 bExp = extractFloat128Exp( b );
5141 if ( aExp == 0x7FFF ) {
5142 if ( ( aSig0 | aSig1 )
5143 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5144 return propagateFloat128NaN( a, b );
5146 goto invalid;
5148 if ( bExp == 0x7FFF ) {
5149 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5150 return a;
5152 if ( bExp == 0 ) {
5153 if ( ( bSig0 | bSig1 ) == 0 ) {
5154 invalid:
5155 float_raise( float_flag_invalid );
5156 z.low = float128_default_nan_low;
5157 z.high = float128_default_nan_high;
5158 return z;
5160 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5162 if ( aExp == 0 ) {
5163 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5164 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5166 expDiff = aExp - bExp;
5167 if ( expDiff < -1 ) return a;
5168 shortShift128Left(
5169 aSig0 | LIT64( 0x0001000000000000 ),
5170 aSig1,
5171 15 - ( expDiff < 0 ),
5172 &aSig0,
5173 &aSig1
5175 shortShift128Left(
5176 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5177 q = le128( bSig0, bSig1, aSig0, aSig1 );
5178 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5179 expDiff -= 64;
5180 while ( 0 < expDiff ) {
5181 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5182 q = ( 4 < q ) ? q - 4 : 0;
5183 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5184 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5185 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5186 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5187 expDiff -= 61;
5189 if ( -64 < expDiff ) {
5190 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5191 q = ( 4 < q ) ? q - 4 : 0;
5192 q >>= - expDiff;
5193 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5194 expDiff += 52;
5195 if ( expDiff < 0 ) {
5196 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5198 else {
5199 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5201 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5202 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5204 else {
5205 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5206 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5208 do {
5209 alternateASig0 = aSig0;
5210 alternateASig1 = aSig1;
5211 ++q;
5212 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5213 } while ( 0 <= (sbits64) aSig0 );
5214 add128(
5215 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5216 if ( ( sigMean0 < 0 )
5217 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5218 aSig0 = alternateASig0;
5219 aSig1 = alternateASig1;
5221 zSign = ( (sbits64) aSig0 < 0 );
5222 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5223 return
5224 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5229 -------------------------------------------------------------------------------
5230 Returns the square root of the quadruple-precision floating-point value `a'.
5231 The operation is performed according to the IEC/IEEE Standard for Binary
5232 Floating-Point Arithmetic.
5233 -------------------------------------------------------------------------------
5235 float128 float128_sqrt( float128 a )
5237 flag aSign;
5238 int32 aExp, zExp;
5239 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5240 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5241 float128 z;
5243 aSig1 = extractFloat128Frac1( a );
5244 aSig0 = extractFloat128Frac0( a );
5245 aExp = extractFloat128Exp( a );
5246 aSign = extractFloat128Sign( a );
5247 if ( aExp == 0x7FFF ) {
5248 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5249 if ( ! aSign ) return a;
5250 goto invalid;
5252 if ( aSign ) {
5253 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5254 invalid:
5255 float_raise( float_flag_invalid );
5256 z.low = float128_default_nan_low;
5257 z.high = float128_default_nan_high;
5258 return z;
5260 if ( aExp == 0 ) {
5261 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5262 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5264 zExp = (int32) ( (aExp - 0x3FFF) >> 1) + 0x3FFE;
5265 aSig0 |= LIT64( 0x0001000000000000 );
5266 zSig0 = estimateSqrt32((int16)aExp, (bits32)(aSig0>>17));
5267 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5268 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5269 doubleZSig0 = zSig0<<1;
5270 mul64To128( zSig0, zSig0, &term0, &term1 );
5271 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5272 while ( (sbits64) rem0 < 0 ) {
5273 --zSig0;
5274 doubleZSig0 -= 2;
5275 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5277 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5278 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5279 if ( zSig1 == 0 ) zSig1 = 1;
5280 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5281 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5282 mul64To128( zSig1, zSig1, &term2, &term3 );
5283 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5284 while ( (sbits64) rem1 < 0 ) {
5285 --zSig1;
5286 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5287 term3 |= 1;
5288 term2 |= doubleZSig0;
5289 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5291 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5293 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5294 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5299 -------------------------------------------------------------------------------
5300 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5301 the corresponding value `b', and 0 otherwise. The comparison is performed
5302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 -------------------------------------------------------------------------------
5305 flag float128_eq( float128 a, float128 b )
5308 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5309 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5311 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5313 if ( float128_is_signaling_nan( a )
5314 || float128_is_signaling_nan( b ) ) {
5315 float_raise( float_flag_invalid );
5317 return 0;
5319 return
5320 ( a.low == b.low )
5321 && ( ( a.high == b.high )
5322 || ( ( a.low == 0 )
5323 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5329 -------------------------------------------------------------------------------
5330 Returns 1 if the quadruple-precision floating-point value `a' is less than
5331 or equal to the corresponding value `b', and 0 otherwise. The comparison
5332 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5333 Arithmetic.
5334 -------------------------------------------------------------------------------
5336 flag float128_le( float128 a, float128 b )
5338 flag aSign, bSign;
5340 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5341 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5342 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5343 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5345 float_raise( float_flag_invalid );
5346 return 0;
5348 aSign = extractFloat128Sign( a );
5349 bSign = extractFloat128Sign( b );
5350 if ( aSign != bSign ) {
5351 return
5352 aSign
5353 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5354 == 0 );
5356 return
5357 aSign ? le128( b.high, b.low, a.high, a.low )
5358 : le128( a.high, a.low, b.high, b.low );
5363 -------------------------------------------------------------------------------
5364 Returns 1 if the quadruple-precision floating-point value `a' is less than
5365 the corresponding value `b', and 0 otherwise. The comparison is performed
5366 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5367 -------------------------------------------------------------------------------
5369 flag float128_lt( float128 a, float128 b )
5371 flag aSign, bSign;
5373 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5374 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5375 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5376 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5378 float_raise( float_flag_invalid );
5379 return 0;
5381 aSign = extractFloat128Sign( a );
5382 bSign = extractFloat128Sign( b );
5383 if ( aSign != bSign ) {
5384 return
5385 aSign
5386 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387 != 0 );
5389 return
5390 aSign ? lt128( b.high, b.low, a.high, a.low )
5391 : lt128( a.high, a.low, b.high, b.low );
5396 -------------------------------------------------------------------------------
5397 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5398 the corresponding value `b', and 0 otherwise. The invalid exception is
5399 raised if either operand is a NaN. Otherwise, the comparison is performed
5400 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5401 -------------------------------------------------------------------------------
5403 flag float128_eq_signaling( float128 a, float128 b )
5406 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5407 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5408 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5409 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5411 float_raise( float_flag_invalid );
5412 return 0;
5414 return
5415 ( a.low == b.low )
5416 && ( ( a.high == b.high )
5417 || ( ( a.low == 0 )
5418 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5424 -------------------------------------------------------------------------------
5425 Returns 1 if the quadruple-precision floating-point value `a' is less than
5426 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5427 cause an exception. Otherwise, the comparison is performed according to the
5428 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5429 -------------------------------------------------------------------------------
5431 flag float128_le_quiet( float128 a, float128 b )
5433 flag aSign, bSign;
5435 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5436 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5437 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5438 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5440 if ( float128_is_signaling_nan( a )
5441 || float128_is_signaling_nan( b ) ) {
5442 float_raise( float_flag_invalid );
5444 return 0;
5446 aSign = extractFloat128Sign( a );
5447 bSign = extractFloat128Sign( b );
5448 if ( aSign != bSign ) {
5449 return
5450 aSign
5451 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5452 == 0 );
5454 return
5455 aSign ? le128( b.high, b.low, a.high, a.low )
5456 : le128( a.high, a.low, b.high, b.low );
5461 -------------------------------------------------------------------------------
5462 Returns 1 if the quadruple-precision floating-point value `a' is less than
5463 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5464 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5465 Standard for Binary Floating-Point Arithmetic.
5466 -------------------------------------------------------------------------------
5468 flag float128_lt_quiet( float128 a, float128 b )
5470 flag aSign, bSign;
5472 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5473 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5474 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5475 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5477 if ( float128_is_signaling_nan( a )
5478 || float128_is_signaling_nan( b ) ) {
5479 float_raise( float_flag_invalid );
5481 return 0;
5483 aSign = extractFloat128Sign( a );
5484 bSign = extractFloat128Sign( b );
5485 if ( aSign != bSign ) {
5486 return
5487 aSign
5488 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5489 != 0 );
5491 return
5492 aSign ? lt128( b.high, b.low, a.high, a.low )
5493 : lt128( a.high, a.low, b.high, b.low );
5497 #endif
5500 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5503 * These two routines are not part of the original softfloat distribution.
5505 * They are based on the corresponding conversions to integer but return
5506 * unsigned numbers instead since these functions are required by GCC.
5508 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5510 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5514 -------------------------------------------------------------------------------
5515 Returns the result of converting the double-precision floating-point value
5516 `a' to the 32-bit unsigned integer format. The conversion is
5517 performed according to the IEC/IEEE Standard for Binary Floating-point
5518 Arithmetic, except that the conversion is always rounded toward zero. If
5519 `a' is a NaN, the largest positive integer is returned. If the conversion
5520 overflows, the largest integer positive is returned.
5521 -------------------------------------------------------------------------------
5523 uint32 float64_to_uint32_round_to_zero( float64 a )
5525 flag aSign;
5526 int16 aExp, shiftCount;
5527 bits64 aSig, savedASig;
5528 uint32 z;
5530 aSig = extractFloat64Frac( a );
5531 aExp = extractFloat64Exp( a );
5532 aSign = extractFloat64Sign( a );
5534 if (aSign) {
5535 float_raise( float_flag_invalid );
5536 return(0);
5539 if ( 0x41E < aExp ) {
5540 float_raise( float_flag_invalid );
5541 return 0xffffffff;
5543 else if ( aExp < 0x3FF ) {
5544 if ( aExp || aSig ) set_float_exception_inexact_flag();
5545 return 0;
5547 aSig |= LIT64( 0x0010000000000000 );
5548 shiftCount = 0x433 - aExp;
5549 savedASig = aSig;
5550 aSig >>= shiftCount;
5551 z = (uint32)aSig;
5552 if ( ( aSig<<shiftCount ) != savedASig ) {
5553 set_float_exception_inexact_flag();
5555 return z;
5560 -------------------------------------------------------------------------------
5561 Returns the result of converting the single-precision floating-point value
5562 `a' to the 32-bit unsigned integer format. The conversion is
5563 performed according to the IEC/IEEE Standard for Binary Floating-point
5564 Arithmetic, except that the conversion is always rounded toward zero. If
5565 `a' is a NaN, the largest positive integer is returned. If the conversion
5566 overflows, the largest positive integer is returned.
5567 -------------------------------------------------------------------------------
5569 uint32 float32_to_uint32_round_to_zero( float32 a )
5571 flag aSign;
5572 int16 aExp, shiftCount;
5573 bits32 aSig;
5574 uint32 z;
5576 aSig = extractFloat32Frac( a );
5577 aExp = extractFloat32Exp( a );
5578 aSign = extractFloat32Sign( a );
5579 shiftCount = aExp - 0x9E;
5581 if (aSign) {
5582 float_raise( float_flag_invalid );
5583 return(0);
5585 if ( 0 < shiftCount ) {
5586 float_raise( float_flag_invalid );
5587 return 0xFFFFFFFF;
5589 else if ( aExp <= 0x7E ) {
5590 if ( aExp | aSig ) set_float_exception_inexact_flag();
5591 return 0;
5593 aSig = ( aSig | 0x800000 )<<8;
5594 z = aSig>>( - shiftCount );
5595 if ( aSig<<( shiftCount & 31 ) ) {
5596 set_float_exception_inexact_flag();
5598 return z;
5602 #endif