1 /* $NetBSD: softfloat.c,v 1.2.6.3 2004/09/21 13:35:54 skrll Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* If you need this in a boot program, you have bigger problems... */
50 #include <sys/cdefs.h>
51 #if defined(LIBC_SCCS) && !defined(lint)
52 __RCSID("$NetBSD: softfloat.c,v 1.2.6.3 2004/09/21 13:35:54 skrll Exp $");
53 #endif /* LIBC_SCCS and not lint */
55 #ifdef SOFTFLOAT_FOR_GCC
56 #include "softfloat-for-gcc.h"
60 #include "softfloat.h"
63 * Conversions between floats as stored in memory and floats as
66 #ifndef FLOAT64_DEMANGLE
67 #define FLOAT64_DEMANGLE(a) (a)
69 #ifndef FLOAT64_MANGLE
70 #define FLOAT64_MANGLE(a) (a)
74 -------------------------------------------------------------------------------
75 Floating-point rounding mode, extended double-precision rounding precision,
77 -------------------------------------------------------------------------------
81 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
82 * Right now, it does not. I've removed all other dynamic global
86 int8 floatx80_rounding_precision
= 80;
90 -------------------------------------------------------------------------------
91 Primitive arithmetic functions, including multi-word arithmetic, and
92 division and square root approximations. (Can be specialized to target if
94 -------------------------------------------------------------------------------
96 #include "softfloat-macros.h"
99 -------------------------------------------------------------------------------
100 Functions and definitions to determine: (1) whether tininess for underflow
101 is detected before or after rounding by default, (2) what (if anything)
102 happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 are propagated from function inputs to output. These details are target-
106 -------------------------------------------------------------------------------
108 #include "softfloat-specialize.h"
110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
112 -------------------------------------------------------------------------------
113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
114 and 7, and returns the properly rounded 32-bit integer corresponding to the
115 input. If `zSign' is 1, the input is negated before being converted to an
116 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
117 is simply rounded to an integer, with the inexact exception raised if the
118 input cannot be represented exactly as an integer. However, if the fixed-
119 point input is too large, the invalid exception is raised and the largest
120 positive or negative integer is returned.
121 -------------------------------------------------------------------------------
123 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
126 flag roundNearestEven
;
127 int8 roundIncrement
, roundBits
;
130 roundingMode
= float_rounding_mode();
131 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
132 roundIncrement
= 0x40;
133 if ( ! roundNearestEven
) {
134 if ( roundingMode
== float_round_to_zero
) {
138 roundIncrement
= 0x7F;
140 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
143 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
147 roundBits
= absZ
& 0x7F;
148 absZ
= ( absZ
+ roundIncrement
)>>7;
149 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
151 if ( zSign
) z
= - z
;
152 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
153 float_raise( float_flag_invalid
);
154 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
156 if ( roundBits
) float_set_inexact();
162 -------------------------------------------------------------------------------
163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
164 `absZ1', with binary point between bits 63 and 64 (between the input words),
165 and returns the properly rounded 64-bit integer corresponding to the input.
166 If `zSign' is 1, the input is negated before being converted to an integer.
167 Ordinarily, the fixed-point input is simply rounded to an integer, with
168 the inexact exception raised if the input cannot be represented exactly as
169 an integer. However, if the fixed-point input is too large, the invalid
170 exception is raised and the largest positive or negative integer is
172 -------------------------------------------------------------------------------
174 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1
)
177 flag roundNearestEven
, increment
;
180 roundingMode
= float_rounding_mode();
181 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
182 increment
= ( (sbits64
) absZ1
< 0 );
183 if ( ! roundNearestEven
) {
184 if ( roundingMode
== float_round_to_zero
) {
189 increment
= ( roundingMode
== float_round_down
) && absZ1
;
192 increment
= ( roundingMode
== float_round_up
) && absZ1
;
198 if ( absZ0
== 0 ) goto overflow
;
199 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
202 if ( zSign
) z
= - z
;
203 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
205 float_raise( float_flag_invalid
);
207 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
208 : LIT64( 0x7FFFFFFFFFFFFFFF );
210 if ( absZ1
) float_set_inexact();
217 -------------------------------------------------------------------------------
218 Returns the fraction bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
221 INLINE bits32
extractFloat32Frac( float32 a
)
224 return a
& 0x007FFFFF;
229 -------------------------------------------------------------------------------
230 Returns the exponent bits of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
233 INLINE int16
extractFloat32Exp( float32 a
)
236 return ( a
>>23 ) & 0xFF;
241 -------------------------------------------------------------------------------
242 Returns the sign bit of the single-precision floating-point value `a'.
243 -------------------------------------------------------------------------------
245 INLINE flag
extractFloat32Sign( float32 a
)
253 -------------------------------------------------------------------------------
254 Normalizes the subnormal single-precision floating-point value represented
255 by the denormalized significand `aSig'. The normalized exponent and
256 significand are stored at the locations pointed to by `zExpPtr' and
257 `zSigPtr', respectively.
258 -------------------------------------------------------------------------------
261 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
265 shiftCount
= countLeadingZeros32( aSig
) - 8;
266 *zSigPtr
= aSig
<<shiftCount
;
267 *zExpPtr
= 1 - shiftCount
;
272 -------------------------------------------------------------------------------
273 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274 single-precision floating-point value, returning the result. After being
275 shifted into the proper positions, the three fields are simply added
276 together to form the result. This means that any integer portion of `zSig'
277 will be added into the exponent. Since a properly normalized significand
278 will have an integer portion equal to 1, the `zExp' input should be 1 less
279 than the desired result exponent whenever `zSig' is a complete, normalized
281 -------------------------------------------------------------------------------
283 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
286 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
291 -------------------------------------------------------------------------------
292 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293 and significand `zSig', and returns the proper single-precision floating-
294 point value corresponding to the abstract input. Ordinarily, the abstract
295 value is simply rounded and packed into the single-precision format, with
296 the inexact exception raised if the abstract input cannot be represented
297 exactly. However, if the abstract value is too large, the overflow and
298 inexact exceptions are raised and an infinity or maximal finite value is
299 returned. If the abstract value is too small, the input value is rounded to
300 a subnormal number, and the underflow and inexact exceptions are raised if
301 the abstract input cannot be represented exactly as a subnormal single-
302 precision floating-point number.
303 The input significand `zSig' has its binary point between bits 30
304 and 29, which is 7 bits to the left of the usual location. This shifted
305 significand must be normalized or smaller. If `zSig' is not normalized,
306 `zExp' must be 0; in that case, the result returned is a subnormal number,
307 and it must not require rounding. In the usual case that `zSig' is
308 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309 The handling of underflow and overflow follows the IEC/IEEE Standard for
310 Binary Floating-Point Arithmetic.
311 -------------------------------------------------------------------------------
313 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
316 flag roundNearestEven
;
317 int8 roundIncrement
, roundBits
;
320 roundingMode
= float_rounding_mode();
321 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
322 roundIncrement
= 0x40;
323 if ( ! roundNearestEven
) {
324 if ( roundingMode
== float_round_to_zero
) {
328 roundIncrement
= 0x7F;
330 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
333 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
337 roundBits
= zSig
& 0x7F;
338 if ( 0xFD <= (bits16
) zExp
) {
340 || ( ( zExp
== 0xFD )
341 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
343 float_raise( float_flag_overflow
| float_flag_inexact
);
344 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
348 ( float_detect_tininess
== float_tininess_before_rounding
)
350 || ( zSig
+ roundIncrement
< 0x80000000 );
351 shift32RightJamming( zSig
, - zExp
, &zSig
);
353 roundBits
= zSig
& 0x7F;
354 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
357 if ( roundBits
) float_set_inexact();
358 zSig
= ( zSig
+ roundIncrement
)>>7;
359 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
360 if ( zSig
== 0 ) zExp
= 0;
361 return packFloat32( zSign
, zExp
, zSig
);
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper single-precision floating-
369 point value corresponding to the abstract input. This routine is just like
370 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
371 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
372 floating-point exponent.
373 -------------------------------------------------------------------------------
376 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
380 shiftCount
= countLeadingZeros32( zSig
) - 1;
381 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
386 -------------------------------------------------------------------------------
387 Returns the fraction bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
390 INLINE bits64
extractFloat64Frac( float64 a
)
393 return FLOAT64_DEMANGLE(a
) & LIT64( 0x000FFFFFFFFFFFFF );
398 -------------------------------------------------------------------------------
399 Returns the exponent bits of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
402 INLINE int16
extractFloat64Exp( float64 a
)
405 return ( FLOAT64_DEMANGLE(a
)>>52 ) & 0x7FF;
410 -------------------------------------------------------------------------------
411 Returns the sign bit of the double-precision floating-point value `a'.
412 -------------------------------------------------------------------------------
414 INLINE flag
extractFloat64Sign( float64 a
)
417 return FLOAT64_DEMANGLE(a
)>>63;
422 -------------------------------------------------------------------------------
423 Normalizes the subnormal double-precision floating-point value represented
424 by the denormalized significand `aSig'. The normalized exponent and
425 significand are stored at the locations pointed to by `zExpPtr' and
426 `zSigPtr', respectively.
427 -------------------------------------------------------------------------------
430 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
434 shiftCount
= countLeadingZeros64( aSig
) - 11;
435 *zSigPtr
= aSig
<<shiftCount
;
436 *zExpPtr
= 1 - shiftCount
;
441 -------------------------------------------------------------------------------
442 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
443 double-precision floating-point value, returning the result. After being
444 shifted into the proper positions, the three fields are simply added
445 together to form the result. This means that any integer portion of `zSig'
446 will be added into the exponent. Since a properly normalized significand
447 will have an integer portion equal to 1, the `zExp' input should be 1 less
448 than the desired result exponent whenever `zSig' is a complete, normalized
450 -------------------------------------------------------------------------------
452 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
455 return FLOAT64_MANGLE( ( ( (bits64
) zSign
)<<63 ) +
456 ( ( (bits64
) zExp
)<<52 ) + zSig
);
461 -------------------------------------------------------------------------------
462 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
463 and significand `zSig', and returns the proper double-precision floating-
464 point value corresponding to the abstract input. Ordinarily, the abstract
465 value is simply rounded and packed into the double-precision format, with
466 the inexact exception raised if the abstract input cannot be represented
467 exactly. However, if the abstract value is too large, the overflow and
468 inexact exceptions are raised and an infinity or maximal finite value is
469 returned. If the abstract value is too small, the input value is rounded to
470 a subnormal number, and the underflow and inexact exceptions are raised if
471 the abstract input cannot be represented exactly as a subnormal double-
472 precision floating-point number.
473 The input significand `zSig' has its binary point between bits 62
474 and 61, which is 10 bits to the left of the usual location. This shifted
475 significand must be normalized or smaller. If `zSig' is not normalized,
476 `zExp' must be 0; in that case, the result returned is a subnormal number,
477 and it must not require rounding. In the usual case that `zSig' is
478 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
479 The handling of underflow and overflow follows the IEC/IEEE Standard for
480 Binary Floating-Point Arithmetic.
481 -------------------------------------------------------------------------------
483 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
486 flag roundNearestEven
;
487 int16 roundIncrement
, roundBits
;
490 roundingMode
= float_rounding_mode();
491 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
492 roundIncrement
= 0x200;
493 if ( ! roundNearestEven
) {
494 if ( roundingMode
== float_round_to_zero
) {
498 roundIncrement
= 0x3FF;
500 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
503 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
507 roundBits
= zSig
& 0x3FF;
508 if ( 0x7FD <= (bits16
) zExp
) {
509 if ( ( 0x7FD < zExp
)
510 || ( ( zExp
== 0x7FD )
511 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
513 float_raise( float_flag_overflow
| float_flag_inexact
);
514 return FLOAT64_MANGLE(
515 FLOAT64_DEMANGLE(packFloat64( zSign
, 0x7FF, 0 )) -
516 ( roundIncrement
== 0 ));
520 ( float_detect_tininess
== float_tininess_before_rounding
)
522 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig
, - zExp
, &zSig
);
525 roundBits
= zSig
& 0x3FF;
526 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
529 if ( roundBits
) float_set_inexact();
530 zSig
= ( zSig
+ roundIncrement
)>>10;
531 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
532 if ( zSig
== 0 ) zExp
= 0;
533 return packFloat64( zSign
, zExp
, zSig
);
538 -------------------------------------------------------------------------------
539 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
540 and significand `zSig', and returns the proper double-precision floating-
541 point value corresponding to the abstract input. This routine is just like
542 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
543 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
544 floating-point exponent.
545 -------------------------------------------------------------------------------
548 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
552 shiftCount
= countLeadingZeros64( zSig
) - 1;
553 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
560 -------------------------------------------------------------------------------
561 Returns the fraction bits of the extended double-precision floating-point
563 -------------------------------------------------------------------------------
565 INLINE bits64
extractFloatx80Frac( floatx80 a
)
573 -------------------------------------------------------------------------------
574 Returns the exponent bits of the extended double-precision floating-point
576 -------------------------------------------------------------------------------
578 INLINE int32
extractFloatx80Exp( floatx80 a
)
581 return a
.high
& 0x7FFF;
586 -------------------------------------------------------------------------------
587 Returns the sign bit of the extended double-precision floating-point value
589 -------------------------------------------------------------------------------
591 INLINE flag
extractFloatx80Sign( floatx80 a
)
599 -------------------------------------------------------------------------------
600 Normalizes the subnormal extended double-precision floating-point value
601 represented by the denormalized significand `aSig'. The normalized exponent
602 and significand are stored at the locations pointed to by `zExpPtr' and
603 `zSigPtr', respectively.
604 -------------------------------------------------------------------------------
607 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
611 shiftCount
= countLeadingZeros64( aSig
);
612 *zSigPtr
= aSig
<<shiftCount
;
613 *zExpPtr
= 1 - shiftCount
;
618 -------------------------------------------------------------------------------
619 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
620 extended double-precision floating-point value, returning the result.
621 -------------------------------------------------------------------------------
623 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
628 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
634 -------------------------------------------------------------------------------
635 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
636 and extended significand formed by the concatenation of `zSig0' and `zSig1',
637 and returns the proper extended double-precision floating-point value
638 corresponding to the abstract input. Ordinarily, the abstract value is
639 rounded and packed into the extended double-precision format, with the
640 inexact exception raised if the abstract input cannot be represented
641 exactly. However, if the abstract value is too large, the overflow and
642 inexact exceptions are raised and an infinity or maximal finite value is
643 returned. If the abstract value is too small, the input value is rounded to
644 a subnormal number, and the underflow and inexact exceptions are raised if
645 the abstract input cannot be represented exactly as a subnormal extended
646 double-precision floating-point number.
647 If `roundingPrecision' is 32 or 64, the result is rounded to the same
648 number of bits as single or double precision, respectively. Otherwise, the
649 result is rounded to the full precision of the extended double-precision
651 The input significand must be normalized or smaller. If the input
652 significand is not normalized, `zExp' must be 0; in that case, the result
653 returned is a subnormal number, and it must not require rounding. The
654 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
655 Floating-Point Arithmetic.
656 -------------------------------------------------------------------------------
659 roundAndPackFloatx80(
660 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
664 flag roundNearestEven
, increment
, isTiny
;
665 int64 roundIncrement
, roundMask
, roundBits
;
667 roundingMode
= float_rounding_mode();
668 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
669 if ( roundingPrecision
== 80 ) goto precision80
;
670 if ( roundingPrecision
== 64 ) {
671 roundIncrement
= LIT64( 0x0000000000000400 );
672 roundMask
= LIT64( 0x00000000000007FF );
674 else if ( roundingPrecision
== 32 ) {
675 roundIncrement
= LIT64( 0x0000008000000000 );
676 roundMask
= LIT64( 0x000000FFFFFFFFFF );
681 zSig0
|= ( zSig1
!= 0 );
682 if ( ! roundNearestEven
) {
683 if ( roundingMode
== float_round_to_zero
) {
687 roundIncrement
= roundMask
;
689 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
692 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
696 roundBits
= zSig0
& roundMask
;
697 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
698 if ( ( 0x7FFE < zExp
)
699 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
705 ( float_detect_tininess
== float_tininess_before_rounding
)
707 || ( zSig0
<= zSig0
+ roundIncrement
);
708 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
710 roundBits
= zSig0
& roundMask
;
711 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
712 if ( roundBits
) float_set_inexact();
713 zSig0
+= roundIncrement
;
714 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
715 roundIncrement
= roundMask
+ 1;
716 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
717 roundMask
|= roundIncrement
;
719 zSig0
&= ~ roundMask
;
720 return packFloatx80( zSign
, zExp
, zSig0
);
723 if ( roundBits
) float_set_inexact();
724 zSig0
+= roundIncrement
;
725 if ( zSig0
< roundIncrement
) {
727 zSig0
= LIT64( 0x8000000000000000 );
729 roundIncrement
= roundMask
+ 1;
730 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
731 roundMask
|= roundIncrement
;
733 zSig0
&= ~ roundMask
;
734 if ( zSig0
== 0 ) zExp
= 0;
735 return packFloatx80( zSign
, zExp
, zSig0
);
737 increment
= ( (sbits64
) zSig1
< 0 );
738 if ( ! roundNearestEven
) {
739 if ( roundingMode
== float_round_to_zero
) {
744 increment
= ( roundingMode
== float_round_down
) && zSig1
;
747 increment
= ( roundingMode
== float_round_up
) && zSig1
;
751 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
752 if ( ( 0x7FFE < zExp
)
753 || ( ( zExp
== 0x7FFE )
754 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
760 float_raise( float_flag_overflow
| float_flag_inexact
);
761 if ( ( roundingMode
== float_round_to_zero
)
762 || ( zSign
&& ( roundingMode
== float_round_up
) )
763 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
765 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
767 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
771 ( float_detect_tininess
== float_tininess_before_rounding
)
774 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
775 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
777 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
778 if ( zSig1
) float_set_inexact();
779 if ( roundNearestEven
) {
780 increment
= ( (sbits64
) zSig1
< 0 );
784 increment
= ( roundingMode
== float_round_down
) && zSig1
;
787 increment
= ( roundingMode
== float_round_up
) && zSig1
;
793 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
794 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
796 return packFloatx80( zSign
, zExp
, zSig0
);
799 if ( zSig1
) float_set_inexact();
804 zSig0
= LIT64( 0x8000000000000000 );
807 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
811 if ( zSig0
== 0 ) zExp
= 0;
813 return packFloatx80( zSign
, zExp
, zSig0
);
818 -------------------------------------------------------------------------------
819 Takes an abstract floating-point value having sign `zSign', exponent
820 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
821 and returns the proper extended double-precision floating-point value
822 corresponding to the abstract input. This routine is just like
823 `roundAndPackFloatx80' except that the input significand does not have to be
825 -------------------------------------------------------------------------------
828 normalizeRoundAndPackFloatx80(
829 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
839 shiftCount
= countLeadingZeros64( zSig0
);
840 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
843 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
852 -------------------------------------------------------------------------------
853 Returns the least-significant 64 fraction bits of the quadruple-precision
854 floating-point value `a'.
855 -------------------------------------------------------------------------------
857 INLINE bits64
extractFloat128Frac1( float128 a
)
865 -------------------------------------------------------------------------------
866 Returns the most-significant 48 fraction bits of the quadruple-precision
867 floating-point value `a'.
868 -------------------------------------------------------------------------------
870 INLINE bits64
extractFloat128Frac0( float128 a
)
873 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
878 -------------------------------------------------------------------------------
879 Returns the exponent bits of the quadruple-precision floating-point value
881 -------------------------------------------------------------------------------
883 INLINE int32
extractFloat128Exp( float128 a
)
886 return ( a
.high
>>48 ) & 0x7FFF;
891 -------------------------------------------------------------------------------
892 Returns the sign bit of the quadruple-precision floating-point value `a'.
893 -------------------------------------------------------------------------------
895 INLINE flag
extractFloat128Sign( float128 a
)
903 -------------------------------------------------------------------------------
904 Normalizes the subnormal quadruple-precision floating-point value
905 represented by the denormalized significand formed by the concatenation of
906 `aSig0' and `aSig1'. The normalized exponent is stored at the location
907 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
908 significand are stored at the location pointed to by `zSig0Ptr', and the
909 least significant 64 bits of the normalized significand are stored at the
910 location pointed to by `zSig1Ptr'.
911 -------------------------------------------------------------------------------
914 normalizeFloat128Subnormal(
925 shiftCount
= countLeadingZeros64( aSig1
) - 15;
926 if ( shiftCount
< 0 ) {
927 *zSig0Ptr
= aSig1
>>( - shiftCount
);
928 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
931 *zSig0Ptr
= aSig1
<<shiftCount
;
934 *zExpPtr
= - shiftCount
- 63;
937 shiftCount
= countLeadingZeros64( aSig0
) - 15;
938 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
939 *zExpPtr
= 1 - shiftCount
;
945 -------------------------------------------------------------------------------
946 Packs the sign `zSign', the exponent `zExp', and the significand formed
947 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
948 floating-point value, returning the result. After being shifted into the
949 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
950 added together to form the most significant 32 bits of the result. This
951 means that any integer portion of `zSig0' will be added into the exponent.
952 Since a properly normalized significand will have an integer portion equal
953 to 1, the `zExp' input should be 1 less than the desired result exponent
954 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
956 -------------------------------------------------------------------------------
959 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
964 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
970 -------------------------------------------------------------------------------
971 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
972 and extended significand formed by the concatenation of `zSig0', `zSig1',
973 and `zSig2', and returns the proper quadruple-precision floating-point value
974 corresponding to the abstract input. Ordinarily, the abstract value is
975 simply rounded and packed into the quadruple-precision format, with the
976 inexact exception raised if the abstract input cannot be represented
977 exactly. However, if the abstract value is too large, the overflow and
978 inexact exceptions are raised and an infinity or maximal finite value is
979 returned. If the abstract value is too small, the input value is rounded to
980 a subnormal number, and the underflow and inexact exceptions are raised if
981 the abstract input cannot be represented exactly as a subnormal quadruple-
982 precision floating-point number.
983 The input significand must be normalized or smaller. If the input
984 significand is not normalized, `zExp' must be 0; in that case, the result
985 returned is a subnormal number, and it must not require rounding. In the
986 usual case that the input significand is normalized, `zExp' must be 1 less
987 than the ``true'' floating-point exponent. The handling of underflow and
988 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
989 -------------------------------------------------------------------------------
992 roundAndPackFloat128(
993 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2
)
996 flag roundNearestEven
, increment
, isTiny
;
998 roundingMode
= float_rounding_mode();
999 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1000 increment
= ( (sbits64
) zSig2
< 0 );
1001 if ( ! roundNearestEven
) {
1002 if ( roundingMode
== float_round_to_zero
) {
1007 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1010 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1014 if ( 0x7FFD <= (bits32
) zExp
) {
1015 if ( ( 0x7FFD < zExp
)
1016 || ( ( zExp
== 0x7FFD )
1018 LIT64( 0x0001FFFFFFFFFFFF ),
1019 LIT64( 0xFFFFFFFFFFFFFFFF ),
1026 float_raise( float_flag_overflow
| float_flag_inexact
);
1027 if ( ( roundingMode
== float_round_to_zero
)
1028 || ( zSign
&& ( roundingMode
== float_round_up
) )
1029 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1035 LIT64( 0x0000FFFFFFFFFFFF ),
1036 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1043 ( float_detect_tininess
== float_tininess_before_rounding
)
1049 LIT64( 0x0001FFFFFFFFFFFF ),
1050 LIT64( 0xFFFFFFFFFFFFFFFF )
1052 shift128ExtraRightJamming(
1053 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1055 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
1056 if ( roundNearestEven
) {
1057 increment
= ( (sbits64
) zSig2
< 0 );
1061 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1064 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1069 if ( zSig2
) float_set_inexact();
1071 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1072 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1075 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1077 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1082 -------------------------------------------------------------------------------
1083 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 returns the proper quadruple-precision floating-point value corresponding
1086 to the abstract input. This routine is just like `roundAndPackFloat128'
1087 except that the input significand has fewer bits and does not have to be
1088 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1090 -------------------------------------------------------------------------------
1093 normalizeRoundAndPackFloat128(
1094 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
1104 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1105 if ( 0 <= shiftCount
) {
1107 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1110 shift128ExtraRightJamming(
1111 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1114 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1121 -------------------------------------------------------------------------------
1122 Returns the result of converting the 32-bit two's complement integer `a'
1123 to the single-precision floating-point format. The conversion is performed
1124 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1125 -------------------------------------------------------------------------------
1127 float32
int32_to_float32( int32 a
)
1131 if ( a
== 0 ) return 0;
1132 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1134 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
1139 -------------------------------------------------------------------------------
1140 Returns the result of converting the 32-bit two's complement integer `a'
1141 to the double-precision floating-point format. The conversion is performed
1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1143 -------------------------------------------------------------------------------
1145 float64
int32_to_float64( int32 a
)
1152 if ( a
== 0 ) return 0;
1154 absA
= zSign
? - a
: a
;
1155 shiftCount
= countLeadingZeros32( absA
) + 21;
1157 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1164 -------------------------------------------------------------------------------
1165 Returns the result of converting the 32-bit two's complement integer `a'
1166 to the extended double-precision floating-point format. The conversion
1167 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1169 -------------------------------------------------------------------------------
1171 floatx80
int32_to_floatx80( int32 a
)
1178 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1180 absA
= zSign
? - a
: a
;
1181 shiftCount
= countLeadingZeros32( absA
) + 32;
1183 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1192 -------------------------------------------------------------------------------
1193 Returns the result of converting the 32-bit two's complement integer `a' to
1194 the quadruple-precision floating-point format. The conversion is performed
1195 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1196 -------------------------------------------------------------------------------
1198 float128
int32_to_float128( int32 a
)
1205 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1207 absA
= zSign
? - a
: a
;
1208 shiftCount
= countLeadingZeros32( absA
) + 17;
1210 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1216 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1218 -------------------------------------------------------------------------------
1219 Returns the result of converting the 64-bit two's complement integer `a'
1220 to the single-precision floating-point format. The conversion is performed
1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1222 -------------------------------------------------------------------------------
1224 float32
int64_to_float32( int64 a
)
1230 if ( a
== 0 ) return 0;
1232 absA
= zSign
? - a
: a
;
1233 shiftCount
= countLeadingZeros64( absA
) - 40;
1234 if ( 0 <= shiftCount
) {
1235 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1239 if ( shiftCount
< 0 ) {
1240 shift64RightJamming( absA
, - shiftCount
, &absA
);
1243 absA
<<= shiftCount
;
1245 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA
);
1251 -------------------------------------------------------------------------------
1252 Returns the result of converting the 64-bit two's complement integer `a'
1253 to the double-precision floating-point format. The conversion is performed
1254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 -------------------------------------------------------------------------------
1257 float64
int64_to_float64( int64 a
)
1261 if ( a
== 0 ) return 0;
1262 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1266 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a
);
1273 -------------------------------------------------------------------------------
1274 Returns the result of converting the 64-bit two's complement integer `a'
1275 to the extended double-precision floating-point format. The conversion
1276 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1278 -------------------------------------------------------------------------------
1280 floatx80
int64_to_floatx80( int64 a
)
1286 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1288 absA
= zSign
? - a
: a
;
1289 shiftCount
= countLeadingZeros64( absA
);
1290 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1299 -------------------------------------------------------------------------------
1300 Returns the result of converting the 64-bit two's complement integer `a' to
1301 the quadruple-precision floating-point format. The conversion is performed
1302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1303 -------------------------------------------------------------------------------
1305 float128
int64_to_float128( int64 a
)
1311 bits64 zSig0
, zSig1
;
1313 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1315 absA
= zSign
? - a
: a
;
1316 shiftCount
= countLeadingZeros64( absA
) + 49;
1317 zExp
= 0x406E - shiftCount
;
1318 if ( 64 <= shiftCount
) {
1327 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1328 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1333 #endif /* !SOFTFLOAT_FOR_GCC */
1335 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1337 -------------------------------------------------------------------------------
1338 Returns the result of converting the single-precision floating-point value
1339 `a' to the 32-bit two's complement integer format. The conversion is
1340 performed according to the IEC/IEEE Standard for Binary Floating-Point
1341 Arithmetic---which means in particular that the conversion is rounded
1342 according to the current rounding mode. If `a' is a NaN, the largest
1343 positive integer is returned. Otherwise, if the conversion overflows, the
1344 largest integer with the same sign as `a' is returned.
1345 -------------------------------------------------------------------------------
1347 int32
float32_to_int32( float32 a
)
1350 int16 aExp
, shiftCount
;
1354 aSig
= extractFloat32Frac( a
);
1355 aExp
= extractFloat32Exp( a
);
1356 aSign
= extractFloat32Sign( a
);
1357 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1358 if ( aExp
) aSig
|= 0x00800000;
1359 shiftCount
= 0xAF - aExp
;
1362 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1363 return roundAndPackInt32( aSign
, aSig64
);
1366 #endif /* !SOFTFLOAT_FOR_GCC */
1369 -------------------------------------------------------------------------------
1370 Returns the result of converting the single-precision floating-point value
1371 `a' to the 32-bit two's complement integer format. The conversion is
1372 performed according to the IEC/IEEE Standard for Binary Floating-Point
1373 Arithmetic, except that the conversion is always rounded toward zero.
1374 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1375 the conversion overflows, the largest integer with the same sign as `a' is
1377 -------------------------------------------------------------------------------
1379 int32
float32_to_int32_round_to_zero( float32 a
)
1382 int16 aExp
, shiftCount
;
1386 aSig
= extractFloat32Frac( a
);
1387 aExp
= extractFloat32Exp( a
);
1388 aSign
= extractFloat32Sign( a
);
1389 shiftCount
= aExp
- 0x9E;
1390 if ( 0 <= shiftCount
) {
1391 if ( a
!= 0xCF000000 ) {
1392 float_raise( float_flag_invalid
);
1393 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1395 return (sbits32
) 0x80000000;
1397 else if ( aExp
<= 0x7E ) {
1398 if ( aExp
| aSig
) float_set_inexact();
1401 aSig
= ( aSig
| 0x00800000 )<<8;
1402 z
= aSig
>>( - shiftCount
);
1403 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1404 float_set_inexact();
1406 if ( aSign
) z
= - z
;
1411 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1413 -------------------------------------------------------------------------------
1414 Returns the result of converting the single-precision floating-point value
1415 `a' to the 64-bit two's complement integer format. The conversion is
1416 performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 Arithmetic---which means in particular that the conversion is rounded
1418 according to the current rounding mode. If `a' is a NaN, the largest
1419 positive integer is returned. Otherwise, if the conversion overflows, the
1420 largest integer with the same sign as `a' is returned.
1421 -------------------------------------------------------------------------------
1423 int64
float32_to_int64( float32 a
)
1426 int16 aExp
, shiftCount
;
1428 bits64 aSig64
, aSigExtra
;
1430 aSig
= extractFloat32Frac( a
);
1431 aExp
= extractFloat32Exp( a
);
1432 aSign
= extractFloat32Sign( a
);
1433 shiftCount
= 0xBE - aExp
;
1434 if ( shiftCount
< 0 ) {
1435 float_raise( float_flag_invalid
);
1436 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1437 return LIT64( 0x7FFFFFFFFFFFFFFF );
1439 return (sbits64
) LIT64( 0x8000000000000000 );
1441 if ( aExp
) aSig
|= 0x00800000;
1444 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1445 return roundAndPackInt64( aSign
, aSig64
, aSigExtra
);
1450 -------------------------------------------------------------------------------
1451 Returns the result of converting the single-precision floating-point value
1452 `a' to the 64-bit two's complement integer format. The conversion is
1453 performed according to the IEC/IEEE Standard for Binary Floating-Point
1454 Arithmetic, except that the conversion is always rounded toward zero. If
1455 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1456 conversion overflows, the largest integer with the same sign as `a' is
1458 -------------------------------------------------------------------------------
1460 int64
float32_to_int64_round_to_zero( float32 a
)
1463 int16 aExp
, shiftCount
;
1468 aSig
= extractFloat32Frac( a
);
1469 aExp
= extractFloat32Exp( a
);
1470 aSign
= extractFloat32Sign( a
);
1471 shiftCount
= aExp
- 0xBE;
1472 if ( 0 <= shiftCount
) {
1473 if ( a
!= 0xDF000000 ) {
1474 float_raise( float_flag_invalid
);
1475 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1476 return LIT64( 0x7FFFFFFFFFFFFFFF );
1479 return (sbits64
) LIT64( 0x8000000000000000 );
1481 else if ( aExp
<= 0x7E ) {
1482 if ( aExp
| aSig
) float_set_inexact();
1485 aSig64
= aSig
| 0x00800000;
1487 z
= aSig64
>>( - shiftCount
);
1488 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1489 float_set_inexact();
1491 if ( aSign
) z
= - z
;
1495 #endif /* !SOFTFLOAT_FOR_GCC */
1498 -------------------------------------------------------------------------------
1499 Returns the result of converting the single-precision floating-point value
1500 `a' to the double-precision floating-point format. The conversion is
1501 performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 -------------------------------------------------------------------------------
1505 float64
float32_to_float64( float32 a
)
1511 aSig
= extractFloat32Frac( a
);
1512 aExp
= extractFloat32Exp( a
);
1513 aSign
= extractFloat32Sign( a
);
1514 if ( aExp
== 0xFF ) {
1515 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
1516 return packFloat64( aSign
, 0x7FF, 0 );
1519 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1520 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1523 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1530 -------------------------------------------------------------------------------
1531 Returns the result of converting the single-precision floating-point value
1532 `a' to the extended double-precision floating-point format. The conversion
1533 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1535 -------------------------------------------------------------------------------
1537 floatx80
float32_to_floatx80( float32 a
)
1543 aSig
= extractFloat32Frac( a
);
1544 aExp
= extractFloat32Exp( a
);
1545 aSign
= extractFloat32Sign( a
);
1546 if ( aExp
== 0xFF ) {
1547 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
1548 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1551 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1552 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1555 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1564 -------------------------------------------------------------------------------
1565 Returns the result of converting the single-precision floating-point value
1566 `a' to the double-precision floating-point format. The conversion is
1567 performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 -------------------------------------------------------------------------------
1571 float128
float32_to_float128( float32 a
)
1577 aSig
= extractFloat32Frac( a
);
1578 aExp
= extractFloat32Exp( a
);
1579 aSign
= extractFloat32Sign( a
);
1580 if ( aExp
== 0xFF ) {
1581 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a
) );
1582 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1585 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1586 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1589 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1595 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1597 -------------------------------------------------------------------------------
1598 Rounds the single-precision floating-point value `a' to an integer, and
1599 returns the result as a single-precision floating-point value. The
1600 operation is performed according to the IEC/IEEE Standard for Binary
1601 Floating-Point Arithmetic.
1602 -------------------------------------------------------------------------------
1604 float32
float32_round_to_int( float32 a
)
1608 bits32 lastBitMask
, roundBitsMask
;
1612 aExp
= extractFloat32Exp( a
);
1613 if ( 0x96 <= aExp
) {
1614 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1615 return propagateFloat32NaN( a
, a
);
1619 if ( aExp
<= 0x7E ) {
1620 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
1621 float_set_inexact();
1622 aSign
= extractFloat32Sign( a
);
1623 switch ( float_rounding_mode() ) {
1624 case float_round_nearest_even
:
1625 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1626 return packFloat32( aSign
, 0x7F, 0 );
1629 case float_round_down
:
1630 return aSign
? 0xBF800000 : 0;
1631 case float_round_up
:
1632 return aSign
? 0x80000000 : 0x3F800000;
1634 return packFloat32( aSign
, 0, 0 );
1637 lastBitMask
<<= 0x96 - aExp
;
1638 roundBitsMask
= lastBitMask
- 1;
1640 roundingMode
= float_rounding_mode();
1641 if ( roundingMode
== float_round_nearest_even
) {
1642 z
+= lastBitMask
>>1;
1643 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1645 else if ( roundingMode
!= float_round_to_zero
) {
1646 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1650 z
&= ~ roundBitsMask
;
1651 if ( z
!= a
) float_set_inexact();
1655 #endif /* !SOFTFLOAT_FOR_GCC */
1658 -------------------------------------------------------------------------------
1659 Returns the result of adding the absolute values of the single-precision
1660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1661 before being returned. `zSign' is ignored if the result is a NaN.
1662 The addition is performed according to the IEC/IEEE Standard for Binary
1663 Floating-Point Arithmetic.
1664 -------------------------------------------------------------------------------
1666 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1668 int16 aExp
, bExp
, zExp
;
1669 bits32 aSig
, bSig
, zSig
;
1672 aSig
= extractFloat32Frac( a
);
1673 aExp
= extractFloat32Exp( a
);
1674 bSig
= extractFloat32Frac( b
);
1675 bExp
= extractFloat32Exp( b
);
1676 expDiff
= aExp
- bExp
;
1679 if ( 0 < expDiff
) {
1680 if ( aExp
== 0xFF ) {
1681 if ( aSig
) return propagateFloat32NaN( a
, b
);
1690 shift32RightJamming( bSig
, expDiff
, &bSig
);
1693 else if ( expDiff
< 0 ) {
1694 if ( bExp
== 0xFF ) {
1695 if ( bSig
) return propagateFloat32NaN( a
, b
);
1696 return packFloat32( zSign
, 0xFF, 0 );
1704 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1708 if ( aExp
== 0xFF ) {
1709 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1712 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1713 zSig
= 0x40000000 + aSig
+ bSig
;
1718 zSig
= ( aSig
+ bSig
)<<1;
1720 if ( (sbits32
) zSig
< 0 ) {
1725 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1730 -------------------------------------------------------------------------------
1731 Returns the result of subtracting the absolute values of the single-
1732 precision floating-point values `a' and `b'. If `zSign' is 1, the
1733 difference is negated before being returned. `zSign' is ignored if the
1734 result is a NaN. The subtraction is performed according to the IEC/IEEE
1735 Standard for Binary Floating-Point Arithmetic.
1736 -------------------------------------------------------------------------------
1738 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1740 int16 aExp
, bExp
, zExp
;
1741 bits32 aSig
, bSig
, zSig
;
1744 aSig
= extractFloat32Frac( a
);
1745 aExp
= extractFloat32Exp( a
);
1746 bSig
= extractFloat32Frac( b
);
1747 bExp
= extractFloat32Exp( b
);
1748 expDiff
= aExp
- bExp
;
1751 if ( 0 < expDiff
) goto aExpBigger
;
1752 if ( expDiff
< 0 ) goto bExpBigger
;
1753 if ( aExp
== 0xFF ) {
1754 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1755 float_raise( float_flag_invalid
);
1756 return float32_default_nan
;
1762 if ( bSig
< aSig
) goto aBigger
;
1763 if ( aSig
< bSig
) goto bBigger
;
1764 return packFloat32( float_rounding_mode() == float_round_down
, 0, 0 );
1766 if ( bExp
== 0xFF ) {
1767 if ( bSig
) return propagateFloat32NaN( a
, b
);
1768 return packFloat32( zSign
^ 1, 0xFF, 0 );
1776 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1782 goto normalizeRoundAndPack
;
1784 if ( aExp
== 0xFF ) {
1785 if ( aSig
) return propagateFloat32NaN( a
, b
);
1794 shift32RightJamming( bSig
, expDiff
, &bSig
);
1799 normalizeRoundAndPack
:
1801 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1806 -------------------------------------------------------------------------------
1807 Returns the result of adding the single-precision floating-point values `a'
1808 and `b'. The operation is performed according to the IEC/IEEE Standard for
1809 Binary Floating-Point Arithmetic.
1810 -------------------------------------------------------------------------------
1812 float32
float32_add( float32 a
, float32 b
)
1816 aSign
= extractFloat32Sign( a
);
1817 bSign
= extractFloat32Sign( b
);
1818 if ( aSign
== bSign
) {
1819 return addFloat32Sigs( a
, b
, aSign
);
1822 return subFloat32Sigs( a
, b
, aSign
);
1828 -------------------------------------------------------------------------------
1829 Returns the result of subtracting the single-precision floating-point values
1830 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1831 for Binary Floating-Point Arithmetic.
1832 -------------------------------------------------------------------------------
1834 float32
float32_sub( float32 a
, float32 b
)
1838 aSign
= extractFloat32Sign( a
);
1839 bSign
= extractFloat32Sign( b
);
1840 if ( aSign
== bSign
) {
1841 return subFloat32Sigs( a
, b
, aSign
);
1844 return addFloat32Sigs( a
, b
, aSign
);
1850 -------------------------------------------------------------------------------
1851 Returns the result of multiplying the single-precision floating-point values
1852 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1853 for Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1856 float32
float32_mul( float32 a
, float32 b
)
1858 flag aSign
, bSign
, zSign
;
1859 int16 aExp
, bExp
, zExp
;
1864 aSig
= extractFloat32Frac( a
);
1865 aExp
= extractFloat32Exp( a
);
1866 aSign
= extractFloat32Sign( a
);
1867 bSig
= extractFloat32Frac( b
);
1868 bExp
= extractFloat32Exp( b
);
1869 bSign
= extractFloat32Sign( b
);
1870 zSign
= aSign
^ bSign
;
1871 if ( aExp
== 0xFF ) {
1872 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1873 return propagateFloat32NaN( a
, b
);
1875 if ( ( bExp
| bSig
) == 0 ) {
1876 float_raise( float_flag_invalid
);
1877 return float32_default_nan
;
1879 return packFloat32( zSign
, 0xFF, 0 );
1881 if ( bExp
== 0xFF ) {
1882 if ( bSig
) return propagateFloat32NaN( a
, b
);
1883 if ( ( aExp
| aSig
) == 0 ) {
1884 float_raise( float_flag_invalid
);
1885 return float32_default_nan
;
1887 return packFloat32( zSign
, 0xFF, 0 );
1890 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1891 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1894 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1895 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1897 zExp
= aExp
+ bExp
- 0x7F;
1898 aSig
= ( aSig
| 0x00800000 )<<7;
1899 bSig
= ( bSig
| 0x00800000 )<<8;
1900 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1902 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1906 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1911 -------------------------------------------------------------------------------
1912 Returns the result of dividing the single-precision floating-point value `a'
1913 by the corresponding value `b'. The operation is performed according to the
1914 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1915 -------------------------------------------------------------------------------
1917 float32
float32_div( float32 a
, float32 b
)
1919 flag aSign
, bSign
, zSign
;
1920 int16 aExp
, bExp
, zExp
;
1921 bits32 aSig
, bSig
, zSig
;
1923 aSig
= extractFloat32Frac( a
);
1924 aExp
= extractFloat32Exp( a
);
1925 aSign
= extractFloat32Sign( a
);
1926 bSig
= extractFloat32Frac( b
);
1927 bExp
= extractFloat32Exp( b
);
1928 bSign
= extractFloat32Sign( b
);
1929 zSign
= aSign
^ bSign
;
1930 if ( aExp
== 0xFF ) {
1931 if ( aSig
) return propagateFloat32NaN( a
, b
);
1932 if ( bExp
== 0xFF ) {
1933 if ( bSig
) return propagateFloat32NaN( a
, b
);
1934 float_raise( float_flag_invalid
);
1935 return float32_default_nan
;
1937 return packFloat32( zSign
, 0xFF, 0 );
1939 if ( bExp
== 0xFF ) {
1940 if ( bSig
) return propagateFloat32NaN( a
, b
);
1941 return packFloat32( zSign
, 0, 0 );
1945 if ( ( aExp
| aSig
) == 0 ) {
1946 float_raise( float_flag_invalid
);
1947 return float32_default_nan
;
1949 float_raise( float_flag_divbyzero
);
1950 return packFloat32( zSign
, 0xFF, 0 );
1952 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1955 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1956 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1958 zExp
= aExp
- bExp
+ 0x7D;
1959 aSig
= ( aSig
| 0x00800000 )<<7;
1960 bSig
= ( bSig
| 0x00800000 )<<8;
1961 if ( bSig
<= ( aSig
+ aSig
) ) {
1965 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1966 if ( ( zSig
& 0x3F ) == 0 ) {
1967 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1969 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1973 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1975 -------------------------------------------------------------------------------
1976 Returns the remainder of the single-precision floating-point value `a'
1977 with respect to the corresponding value `b'. The operation is performed
1978 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1979 -------------------------------------------------------------------------------
1981 float32
float32_rem( float32 a
, float32 b
)
1983 flag aSign
, bSign
, zSign
;
1984 int16 aExp
, bExp
, expDiff
;
1987 bits64 aSig64
, bSig64
, q64
;
1988 bits32 alternateASig
;
1991 aSig
= extractFloat32Frac( a
);
1992 aExp
= extractFloat32Exp( a
);
1993 aSign
= extractFloat32Sign( a
);
1994 bSig
= extractFloat32Frac( b
);
1995 bExp
= extractFloat32Exp( b
);
1996 bSign
= extractFloat32Sign( b
);
1997 if ( aExp
== 0xFF ) {
1998 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1999 return propagateFloat32NaN( a
, b
);
2001 float_raise( float_flag_invalid
);
2002 return float32_default_nan
;
2004 if ( bExp
== 0xFF ) {
2005 if ( bSig
) return propagateFloat32NaN( a
, b
);
2010 float_raise( float_flag_invalid
);
2011 return float32_default_nan
;
2013 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2016 if ( aSig
== 0 ) return a
;
2017 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2019 expDiff
= aExp
- bExp
;
2022 if ( expDiff
< 32 ) {
2025 if ( expDiff
< 0 ) {
2026 if ( expDiff
< -1 ) return a
;
2029 q
= ( bSig
<= aSig
);
2030 if ( q
) aSig
-= bSig
;
2031 if ( 0 < expDiff
) {
2032 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2035 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2043 if ( bSig
<= aSig
) aSig
-= bSig
;
2044 aSig64
= ( (bits64
) aSig
)<<40;
2045 bSig64
= ( (bits64
) bSig
)<<40;
2047 while ( 0 < expDiff
) {
2048 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2049 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2050 aSig64
= - ( ( bSig
* q64
)<<38 );
2054 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2055 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2056 q
= q64
>>( 64 - expDiff
);
2058 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2061 alternateASig
= aSig
;
2064 } while ( 0 <= (sbits32
) aSig
);
2065 sigMean
= aSig
+ alternateASig
;
2066 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2067 aSig
= alternateASig
;
2069 zSign
= ( (sbits32
) aSig
< 0 );
2070 if ( zSign
) aSig
= - aSig
;
2071 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
2074 #endif /* !SOFTFLOAT_FOR_GCC */
2076 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2078 -------------------------------------------------------------------------------
2079 Returns the square root of the single-precision floating-point value `a'.
2080 The operation is performed according to the IEC/IEEE Standard for Binary
2081 Floating-Point Arithmetic.
2082 -------------------------------------------------------------------------------
2084 float32
float32_sqrt( float32 a
)
2091 aSig
= extractFloat32Frac( a
);
2092 aExp
= extractFloat32Exp( a
);
2093 aSign
= extractFloat32Sign( a
);
2094 if ( aExp
== 0xFF ) {
2095 if ( aSig
) return propagateFloat32NaN( a
, 0 );
2096 if ( ! aSign
) return a
;
2097 float_raise( float_flag_invalid
);
2098 return float32_default_nan
;
2101 if ( ( aExp
| aSig
) == 0 ) return a
;
2102 float_raise( float_flag_invalid
);
2103 return float32_default_nan
;
2106 if ( aSig
== 0 ) return 0;
2107 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2109 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2110 aSig
= ( aSig
| 0x00800000 )<<8;
2111 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2112 if ( ( zSig
& 0x7F ) <= 5 ) {
2118 term
= ( (bits64
) zSig
) * zSig
;
2119 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2120 while ( (sbits64
) rem
< 0 ) {
2122 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2124 zSig
|= ( rem
!= 0 );
2126 shift32RightJamming( zSig
, 1, &zSig
);
2128 return roundAndPackFloat32( 0, zExp
, zSig
);
2131 #endif /* !SOFTFLOAT_FOR_GCC */
2134 -------------------------------------------------------------------------------
2135 Returns 1 if the single-precision floating-point value `a' is equal to
2136 the corresponding value `b', and 0 otherwise. The comparison is performed
2137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2138 -------------------------------------------------------------------------------
2140 flag
float32_eq( float32 a
, float32 b
)
2143 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2144 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2146 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2147 float_raise( float_flag_invalid
);
2151 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2156 -------------------------------------------------------------------------------
2157 Returns 1 if the single-precision floating-point value `a' is less than
2158 or equal to the corresponding value `b', and 0 otherwise. The comparison
2159 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2161 -------------------------------------------------------------------------------
2163 flag
float32_le( float32 a
, float32 b
)
2167 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2168 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2170 float_raise( float_flag_invalid
);
2173 aSign
= extractFloat32Sign( a
);
2174 bSign
= extractFloat32Sign( b
);
2175 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2176 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the single-precision floating-point value `a' is less than
2183 the corresponding value `b', and 0 otherwise. The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2187 flag
float32_lt( float32 a
, float32 b
)
2191 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2192 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2194 float_raise( float_flag_invalid
);
2197 aSign
= extractFloat32Sign( a
);
2198 bSign
= extractFloat32Sign( b
);
2199 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2200 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2204 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2206 -------------------------------------------------------------------------------
2207 Returns 1 if the single-precision floating-point value `a' is equal to
2208 the corresponding value `b', and 0 otherwise. The invalid exception is
2209 raised if either operand is a NaN. Otherwise, the comparison is performed
2210 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2211 -------------------------------------------------------------------------------
2213 flag
float32_eq_signaling( float32 a
, float32 b
)
2216 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2217 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2219 float_raise( float_flag_invalid
);
2222 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2227 -------------------------------------------------------------------------------
2228 Returns 1 if the single-precision floating-point value `a' is less than or
2229 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2230 cause an exception. Otherwise, the comparison is performed according to the
2231 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2232 -------------------------------------------------------------------------------
2234 flag
float32_le_quiet( float32 a
, float32 b
)
2238 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2239 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2241 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2242 float_raise( float_flag_invalid
);
2246 aSign
= extractFloat32Sign( a
);
2247 bSign
= extractFloat32Sign( b
);
2248 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2249 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2254 -------------------------------------------------------------------------------
2255 Returns 1 if the single-precision floating-point value `a' is less than
2256 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2257 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2258 Standard for Binary Floating-Point Arithmetic.
2259 -------------------------------------------------------------------------------
2261 flag
float32_lt_quiet( float32 a
, float32 b
)
2265 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2266 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2268 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2269 float_raise( float_flag_invalid
);
2273 aSign
= extractFloat32Sign( a
);
2274 bSign
= extractFloat32Sign( b
);
2275 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2276 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2279 #endif /* !SOFTFLOAT_FOR_GCC */
2281 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2283 -------------------------------------------------------------------------------
2284 Returns the result of converting the double-precision floating-point value
2285 `a' to the 32-bit two's complement integer format. The conversion is
2286 performed according to the IEC/IEEE Standard for Binary Floating-Point
2287 Arithmetic---which means in particular that the conversion is rounded
2288 according to the current rounding mode. If `a' is a NaN, the largest
2289 positive integer is returned. Otherwise, if the conversion overflows, the
2290 largest integer with the same sign as `a' is returned.
2291 -------------------------------------------------------------------------------
2293 int32
float64_to_int32( float64 a
)
2296 int16 aExp
, shiftCount
;
2299 aSig
= extractFloat64Frac( a
);
2300 aExp
= extractFloat64Exp( a
);
2301 aSign
= extractFloat64Sign( a
);
2302 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2303 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2304 shiftCount
= 0x42C - aExp
;
2305 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2306 return roundAndPackInt32( aSign
, aSig
);
2309 #endif /* !SOFTFLOAT_FOR_GCC */
2312 -------------------------------------------------------------------------------
2313 Returns the result of converting the double-precision floating-point value
2314 `a' to the 32-bit two's complement integer format. The conversion is
2315 performed according to the IEC/IEEE Standard for Binary Floating-Point
2316 Arithmetic, except that the conversion is always rounded toward zero.
2317 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2318 the conversion overflows, the largest integer with the same sign as `a' is
2320 -------------------------------------------------------------------------------
2322 int32
float64_to_int32_round_to_zero( float64 a
)
2325 int16 aExp
, shiftCount
;
2326 bits64 aSig
, savedASig
;
2329 aSig
= extractFloat64Frac( a
);
2330 aExp
= extractFloat64Exp( a
);
2331 aSign
= extractFloat64Sign( a
);
2332 if ( 0x41E < aExp
) {
2333 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2336 else if ( aExp
< 0x3FF ) {
2337 if ( aExp
|| aSig
) float_set_inexact();
2340 aSig
|= LIT64( 0x0010000000000000 );
2341 shiftCount
= 0x433 - aExp
;
2343 aSig
>>= shiftCount
;
2345 if ( aSign
) z
= - z
;
2346 if ( ( z
< 0 ) ^ aSign
) {
2348 float_raise( float_flag_invalid
);
2349 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2351 if ( ( aSig
<<shiftCount
) != savedASig
) {
2352 float_set_inexact();
2358 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2360 -------------------------------------------------------------------------------
2361 Returns the result of converting the double-precision floating-point value
2362 `a' to the 64-bit two's complement integer format. The conversion is
2363 performed according to the IEC/IEEE Standard for Binary Floating-Point
2364 Arithmetic---which means in particular that the conversion is rounded
2365 according to the current rounding mode. If `a' is a NaN, the largest
2366 positive integer is returned. Otherwise, if the conversion overflows, the
2367 largest integer with the same sign as `a' is returned.
2368 -------------------------------------------------------------------------------
2370 int64
float64_to_int64( float64 a
)
2373 int16 aExp
, shiftCount
;
2374 bits64 aSig
, aSigExtra
;
2376 aSig
= extractFloat64Frac( a
);
2377 aExp
= extractFloat64Exp( a
);
2378 aSign
= extractFloat64Sign( a
);
2379 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2380 shiftCount
= 0x433 - aExp
;
2381 if ( shiftCount
<= 0 ) {
2382 if ( 0x43E < aExp
) {
2383 float_raise( float_flag_invalid
);
2385 || ( ( aExp
== 0x7FF )
2386 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2388 return LIT64( 0x7FFFFFFFFFFFFFFF );
2390 return (sbits64
) LIT64( 0x8000000000000000 );
2393 aSig
<<= - shiftCount
;
2396 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2398 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
2403 -------------------------------------------------------------------------------
2404 Returns the result of converting the double-precision floating-point value
2405 `a' to the 64-bit two's complement integer format. The conversion is
2406 performed according to the IEC/IEEE Standard for Binary Floating-Point
2407 Arithmetic, except that the conversion is always rounded toward zero.
2408 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2409 the conversion overflows, the largest integer with the same sign as `a' is
2411 -------------------------------------------------------------------------------
2413 int64
float64_to_int64_round_to_zero( float64 a
)
2416 int16 aExp
, shiftCount
;
2420 aSig
= extractFloat64Frac( a
);
2421 aExp
= extractFloat64Exp( a
);
2422 aSign
= extractFloat64Sign( a
);
2423 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2424 shiftCount
= aExp
- 0x433;
2425 if ( 0 <= shiftCount
) {
2426 if ( 0x43E <= aExp
) {
2427 if ( a
!= LIT64( 0xC3E0000000000000 ) ) {
2428 float_raise( float_flag_invalid
);
2430 || ( ( aExp
== 0x7FF )
2431 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2433 return LIT64( 0x7FFFFFFFFFFFFFFF );
2436 return (sbits64
) LIT64( 0x8000000000000000 );
2438 z
= aSig
<<shiftCount
;
2441 if ( aExp
< 0x3FE ) {
2442 if ( aExp
| aSig
) float_set_inexact();
2445 z
= aSig
>>( - shiftCount
);
2446 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2447 float_set_inexact();
2450 if ( aSign
) z
= - z
;
2454 #endif /* !SOFTFLOAT_FOR_GCC */
2457 -------------------------------------------------------------------------------
2458 Returns the result of converting the double-precision floating-point value
2459 `a' to the single-precision floating-point format. The conversion is
2460 performed according to the IEC/IEEE Standard for Binary Floating-Point
2462 -------------------------------------------------------------------------------
2464 float32
float64_to_float32( float64 a
)
2471 aSig
= extractFloat64Frac( a
);
2472 aExp
= extractFloat64Exp( a
);
2473 aSign
= extractFloat64Sign( a
);
2474 if ( aExp
== 0x7FF ) {
2475 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
2476 return packFloat32( aSign
, 0xFF, 0 );
2478 shift64RightJamming( aSig
, 22, &aSig
);
2480 if ( aExp
|| zSig
) {
2484 return roundAndPackFloat32( aSign
, aExp
, zSig
);
2491 -------------------------------------------------------------------------------
2492 Returns the result of converting the double-precision floating-point value
2493 `a' to the extended double-precision floating-point format. The conversion
2494 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2496 -------------------------------------------------------------------------------
2498 floatx80
float64_to_floatx80( float64 a
)
2504 aSig
= extractFloat64Frac( a
);
2505 aExp
= extractFloat64Exp( a
);
2506 aSign
= extractFloat64Sign( a
);
2507 if ( aExp
== 0x7FF ) {
2508 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
2509 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2512 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2513 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2517 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2526 -------------------------------------------------------------------------------
2527 Returns the result of converting the double-precision floating-point value
2528 `a' to the quadruple-precision floating-point format. The conversion is
2529 performed according to the IEC/IEEE Standard for Binary Floating-Point
2531 -------------------------------------------------------------------------------
2533 float128
float64_to_float128( float64 a
)
2537 bits64 aSig
, zSig0
, zSig1
;
2539 aSig
= extractFloat64Frac( a
);
2540 aExp
= extractFloat64Exp( a
);
2541 aSign
= extractFloat64Sign( a
);
2542 if ( aExp
== 0x7FF ) {
2543 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a
) );
2544 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2547 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2548 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2551 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2552 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2558 #ifndef SOFTFLOAT_FOR_GCC
2560 -------------------------------------------------------------------------------
2561 Rounds the double-precision floating-point value `a' to an integer, and
2562 returns the result as a double-precision floating-point value. The
2563 operation is performed according to the IEC/IEEE Standard for Binary
2564 Floating-Point Arithmetic.
2565 -------------------------------------------------------------------------------
2567 float64
float64_round_to_int( float64 a
)
2571 bits64 lastBitMask
, roundBitsMask
;
2575 aExp
= extractFloat64Exp( a
);
2576 if ( 0x433 <= aExp
) {
2577 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2578 return propagateFloat64NaN( a
, a
);
2582 if ( aExp
< 0x3FF ) {
2583 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
2584 float_set_inexact();
2585 aSign
= extractFloat64Sign( a
);
2586 switch ( float_rounding_mode() ) {
2587 case float_round_nearest_even
:
2588 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2589 return packFloat64( aSign
, 0x3FF, 0 );
2592 case float_round_down
:
2593 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
2594 case float_round_up
:
2596 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2598 return packFloat64( aSign
, 0, 0 );
2601 lastBitMask
<<= 0x433 - aExp
;
2602 roundBitsMask
= lastBitMask
- 1;
2604 roundingMode
= float_rounding_mode();
2605 if ( roundingMode
== float_round_nearest_even
) {
2606 z
+= lastBitMask
>>1;
2607 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2609 else if ( roundingMode
!= float_round_to_zero
) {
2610 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2614 z
&= ~ roundBitsMask
;
2615 if ( z
!= a
) float_set_inexact();
2622 -------------------------------------------------------------------------------
2623 Returns the result of adding the absolute values of the double-precision
2624 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2625 before being returned. `zSign' is ignored if the result is a NaN.
2626 The addition is performed according to the IEC/IEEE Standard for Binary
2627 Floating-Point Arithmetic.
2628 -------------------------------------------------------------------------------
2630 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2632 int16 aExp
, bExp
, zExp
;
2633 bits64 aSig
, bSig
, zSig
;
2636 aSig
= extractFloat64Frac( a
);
2637 aExp
= extractFloat64Exp( a
);
2638 bSig
= extractFloat64Frac( b
);
2639 bExp
= extractFloat64Exp( b
);
2640 expDiff
= aExp
- bExp
;
2643 if ( 0 < expDiff
) {
2644 if ( aExp
== 0x7FF ) {
2645 if ( aSig
) return propagateFloat64NaN( a
, b
);
2652 bSig
|= LIT64( 0x2000000000000000 );
2654 shift64RightJamming( bSig
, expDiff
, &bSig
);
2657 else if ( expDiff
< 0 ) {
2658 if ( bExp
== 0x7FF ) {
2659 if ( bSig
) return propagateFloat64NaN( a
, b
);
2660 return packFloat64( zSign
, 0x7FF, 0 );
2666 aSig
|= LIT64( 0x2000000000000000 );
2668 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2672 if ( aExp
== 0x7FF ) {
2673 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2676 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2677 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2681 aSig
|= LIT64( 0x2000000000000000 );
2682 zSig
= ( aSig
+ bSig
)<<1;
2684 if ( (sbits64
) zSig
< 0 ) {
2689 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2694 -------------------------------------------------------------------------------
2695 Returns the result of subtracting the absolute values of the double-
2696 precision floating-point values `a' and `b'. If `zSign' is 1, the
2697 difference is negated before being returned. `zSign' is ignored if the
2698 result is a NaN. The subtraction is performed according to the IEC/IEEE
2699 Standard for Binary Floating-Point Arithmetic.
2700 -------------------------------------------------------------------------------
2702 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2704 int16 aExp
, bExp
, zExp
;
2705 bits64 aSig
, bSig
, zSig
;
2708 aSig
= extractFloat64Frac( a
);
2709 aExp
= extractFloat64Exp( a
);
2710 bSig
= extractFloat64Frac( b
);
2711 bExp
= extractFloat64Exp( b
);
2712 expDiff
= aExp
- bExp
;
2715 if ( 0 < expDiff
) goto aExpBigger
;
2716 if ( expDiff
< 0 ) goto bExpBigger
;
2717 if ( aExp
== 0x7FF ) {
2718 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2719 float_raise( float_flag_invalid
);
2720 return float64_default_nan
;
2726 if ( bSig
< aSig
) goto aBigger
;
2727 if ( aSig
< bSig
) goto bBigger
;
2728 return packFloat64( float_rounding_mode() == float_round_down
, 0, 0 );
2730 if ( bExp
== 0x7FF ) {
2731 if ( bSig
) return propagateFloat64NaN( a
, b
);
2732 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2738 aSig
|= LIT64( 0x4000000000000000 );
2740 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2741 bSig
|= LIT64( 0x4000000000000000 );
2746 goto normalizeRoundAndPack
;
2748 if ( aExp
== 0x7FF ) {
2749 if ( aSig
) return propagateFloat64NaN( a
, b
);
2756 bSig
|= LIT64( 0x4000000000000000 );
2758 shift64RightJamming( bSig
, expDiff
, &bSig
);
2759 aSig
|= LIT64( 0x4000000000000000 );
2763 normalizeRoundAndPack
:
2765 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2770 -------------------------------------------------------------------------------
2771 Returns the result of adding the double-precision floating-point values `a'
2772 and `b'. The operation is performed according to the IEC/IEEE Standard for
2773 Binary Floating-Point Arithmetic.
2774 -------------------------------------------------------------------------------
2776 float64
float64_add( float64 a
, float64 b
)
2780 aSign
= extractFloat64Sign( a
);
2781 bSign
= extractFloat64Sign( b
);
2782 if ( aSign
== bSign
) {
2783 return addFloat64Sigs( a
, b
, aSign
);
2786 return subFloat64Sigs( a
, b
, aSign
);
2792 -------------------------------------------------------------------------------
2793 Returns the result of subtracting the double-precision floating-point values
2794 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2795 for Binary Floating-Point Arithmetic.
2796 -------------------------------------------------------------------------------
2798 float64
float64_sub( float64 a
, float64 b
)
2802 aSign
= extractFloat64Sign( a
);
2803 bSign
= extractFloat64Sign( b
);
2804 if ( aSign
== bSign
) {
2805 return subFloat64Sigs( a
, b
, aSign
);
2808 return addFloat64Sigs( a
, b
, aSign
);
2814 -------------------------------------------------------------------------------
2815 Returns the result of multiplying the double-precision floating-point values
2816 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2817 for Binary Floating-Point Arithmetic.
2818 -------------------------------------------------------------------------------
2820 float64
float64_mul( float64 a
, float64 b
)
2822 flag aSign
, bSign
, zSign
;
2823 int16 aExp
, bExp
, zExp
;
2824 bits64 aSig
, bSig
, zSig0
, zSig1
;
2826 aSig
= extractFloat64Frac( a
);
2827 aExp
= extractFloat64Exp( a
);
2828 aSign
= extractFloat64Sign( a
);
2829 bSig
= extractFloat64Frac( b
);
2830 bExp
= extractFloat64Exp( b
);
2831 bSign
= extractFloat64Sign( b
);
2832 zSign
= aSign
^ bSign
;
2833 if ( aExp
== 0x7FF ) {
2834 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2835 return propagateFloat64NaN( a
, b
);
2837 if ( ( bExp
| bSig
) == 0 ) {
2838 float_raise( float_flag_invalid
);
2839 return float64_default_nan
;
2841 return packFloat64( zSign
, 0x7FF, 0 );
2843 if ( bExp
== 0x7FF ) {
2844 if ( bSig
) return propagateFloat64NaN( a
, b
);
2845 if ( ( aExp
| aSig
) == 0 ) {
2846 float_raise( float_flag_invalid
);
2847 return float64_default_nan
;
2849 return packFloat64( zSign
, 0x7FF, 0 );
2852 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2853 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2856 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2857 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2859 zExp
= aExp
+ bExp
- 0x3FF;
2860 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2861 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2862 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2863 zSig0
|= ( zSig1
!= 0 );
2864 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2868 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2873 -------------------------------------------------------------------------------
2874 Returns the result of dividing the double-precision floating-point value `a'
2875 by the corresponding value `b'. The operation is performed according to
2876 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2877 -------------------------------------------------------------------------------
2879 float64
float64_div( float64 a
, float64 b
)
2881 flag aSign
, bSign
, zSign
;
2882 int16 aExp
, bExp
, zExp
;
2883 bits64 aSig
, bSig
, zSig
;
2885 bits64 term0
, term1
;
2887 aSig
= extractFloat64Frac( a
);
2888 aExp
= extractFloat64Exp( a
);
2889 aSign
= extractFloat64Sign( a
);
2890 bSig
= extractFloat64Frac( b
);
2891 bExp
= extractFloat64Exp( b
);
2892 bSign
= extractFloat64Sign( b
);
2893 zSign
= aSign
^ bSign
;
2894 if ( aExp
== 0x7FF ) {
2895 if ( aSig
) return propagateFloat64NaN( a
, b
);
2896 if ( bExp
== 0x7FF ) {
2897 if ( bSig
) return propagateFloat64NaN( a
, b
);
2898 float_raise( float_flag_invalid
);
2899 return float64_default_nan
;
2901 return packFloat64( zSign
, 0x7FF, 0 );
2903 if ( bExp
== 0x7FF ) {
2904 if ( bSig
) return propagateFloat64NaN( a
, b
);
2905 return packFloat64( zSign
, 0, 0 );
2909 if ( ( aExp
| aSig
) == 0 ) {
2910 float_raise( float_flag_invalid
);
2911 return float64_default_nan
;
2913 float_raise( float_flag_divbyzero
);
2914 return packFloat64( zSign
, 0x7FF, 0 );
2916 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2919 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2920 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2922 zExp
= aExp
- bExp
+ 0x3FD;
2923 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2924 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2925 if ( bSig
<= ( aSig
+ aSig
) ) {
2929 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2930 if ( ( zSig
& 0x1FF ) <= 2 ) {
2931 mul64To128( bSig
, zSig
, &term0
, &term1
);
2932 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2933 while ( (sbits64
) rem0
< 0 ) {
2935 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2937 zSig
|= ( rem1
!= 0 );
2939 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2943 #ifndef SOFTFLOAT_FOR_GCC
2945 -------------------------------------------------------------------------------
2946 Returns the remainder of the double-precision floating-point value `a'
2947 with respect to the corresponding value `b'. The operation is performed
2948 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2949 -------------------------------------------------------------------------------
2951 float64
float64_rem( float64 a
, float64 b
)
2953 flag aSign
, bSign
, zSign
;
2954 int16 aExp
, bExp
, expDiff
;
2956 bits64 q
, alternateASig
;
2959 aSig
= extractFloat64Frac( a
);
2960 aExp
= extractFloat64Exp( a
);
2961 aSign
= extractFloat64Sign( a
);
2962 bSig
= extractFloat64Frac( b
);
2963 bExp
= extractFloat64Exp( b
);
2964 bSign
= extractFloat64Sign( b
);
2965 if ( aExp
== 0x7FF ) {
2966 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2967 return propagateFloat64NaN( a
, b
);
2969 float_raise( float_flag_invalid
);
2970 return float64_default_nan
;
2972 if ( bExp
== 0x7FF ) {
2973 if ( bSig
) return propagateFloat64NaN( a
, b
);
2978 float_raise( float_flag_invalid
);
2979 return float64_default_nan
;
2981 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2984 if ( aSig
== 0 ) return a
;
2985 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2987 expDiff
= aExp
- bExp
;
2988 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2989 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2990 if ( expDiff
< 0 ) {
2991 if ( expDiff
< -1 ) return a
;
2994 q
= ( bSig
<= aSig
);
2995 if ( q
) aSig
-= bSig
;
2997 while ( 0 < expDiff
) {
2998 q
= estimateDiv128To64( aSig
, 0, bSig
);
2999 q
= ( 2 < q
) ? q
- 2 : 0;
3000 aSig
= - ( ( bSig
>>2 ) * q
);
3004 if ( 0 < expDiff
) {
3005 q
= estimateDiv128To64( aSig
, 0, bSig
);
3006 q
= ( 2 < q
) ? q
- 2 : 0;
3009 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3016 alternateASig
= aSig
;
3019 } while ( 0 <= (sbits64
) aSig
);
3020 sigMean
= aSig
+ alternateASig
;
3021 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3022 aSig
= alternateASig
;
3024 zSign
= ( (sbits64
) aSig
< 0 );
3025 if ( zSign
) aSig
= - aSig
;
3026 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
3031 -------------------------------------------------------------------------------
3032 Returns the square root of the double-precision floating-point value `a'.
3033 The operation is performed according to the IEC/IEEE Standard for Binary
3034 Floating-Point Arithmetic.
3035 -------------------------------------------------------------------------------
3037 float64
float64_sqrt( float64 a
)
3041 bits64 aSig
, zSig
, doubleZSig
;
3042 bits64 rem0
, rem1
, term0
, term1
;
3044 aSig
= extractFloat64Frac( a
);
3045 aExp
= extractFloat64Exp( a
);
3046 aSign
= extractFloat64Sign( a
);
3047 if ( aExp
== 0x7FF ) {
3048 if ( aSig
) return propagateFloat64NaN( a
, a
);
3049 if ( ! aSign
) return a
;
3050 float_raise( float_flag_invalid
);
3051 return float64_default_nan
;
3054 if ( ( aExp
| aSig
) == 0 ) return a
;
3055 float_raise( float_flag_invalid
);
3056 return float64_default_nan
;
3059 if ( aSig
== 0 ) return 0;
3060 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3062 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3063 aSig
|= LIT64( 0x0010000000000000 );
3064 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3065 aSig
<<= 9 - ( aExp
& 1 );
3066 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3067 if ( ( zSig
& 0x1FF ) <= 5 ) {
3068 doubleZSig
= zSig
<<1;
3069 mul64To128( zSig
, zSig
, &term0
, &term1
);
3070 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3071 while ( (sbits64
) rem0
< 0 ) {
3074 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3076 zSig
|= ( ( rem0
| rem1
) != 0 );
3078 return roundAndPackFloat64( 0, zExp
, zSig
);
3084 -------------------------------------------------------------------------------
3085 Returns 1 if the double-precision floating-point value `a' is equal to the
3086 corresponding value `b', and 0 otherwise. The comparison is performed
3087 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3088 -------------------------------------------------------------------------------
3090 flag
float64_eq( float64 a
, float64 b
)
3093 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3094 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3096 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3097 float_raise( float_flag_invalid
);
3101 return ( a
== b
) ||
3102 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) == 0 );
3107 -------------------------------------------------------------------------------
3108 Returns 1 if the double-precision floating-point value `a' is less than or
3109 equal to the corresponding value `b', and 0 otherwise. The comparison is
3110 performed according to the IEC/IEEE Standard for Binary Floating-Point
3112 -------------------------------------------------------------------------------
3114 flag
float64_le( float64 a
, float64 b
)
3118 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3119 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3121 float_raise( float_flag_invalid
);
3124 aSign
= extractFloat64Sign( a
);
3125 bSign
= extractFloat64Sign( b
);
3126 if ( aSign
!= bSign
)
3128 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) ==
3130 return ( a
== b
) ||
3131 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3136 -------------------------------------------------------------------------------
3137 Returns 1 if the double-precision floating-point value `a' is less than
3138 the corresponding value `b', and 0 otherwise. The comparison is performed
3139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 -------------------------------------------------------------------------------
3142 flag
float64_lt( float64 a
, float64 b
)
3146 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3147 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3149 float_raise( float_flag_invalid
);
3152 aSign
= extractFloat64Sign( a
);
3153 bSign
= extractFloat64Sign( b
);
3154 if ( aSign
!= bSign
)
3156 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) !=
3158 return ( a
!= b
) &&
3159 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3163 #ifndef SOFTFLOAT_FOR_GCC
3165 -------------------------------------------------------------------------------
3166 Returns 1 if the double-precision floating-point value `a' is equal to the
3167 corresponding value `b', and 0 otherwise. The invalid exception is raised
3168 if either operand is a NaN. Otherwise, the comparison is performed
3169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3170 -------------------------------------------------------------------------------
3172 flag
float64_eq_signaling( float64 a
, float64 b
)
3175 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3176 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3178 float_raise( float_flag_invalid
);
3181 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3186 -------------------------------------------------------------------------------
3187 Returns 1 if the double-precision floating-point value `a' is less than or
3188 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3189 cause an exception. Otherwise, the comparison is performed according to the
3190 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3191 -------------------------------------------------------------------------------
3193 flag
float64_le_quiet( float64 a
, float64 b
)
3197 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3198 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3200 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3201 float_raise( float_flag_invalid
);
3205 aSign
= extractFloat64Sign( a
);
3206 bSign
= extractFloat64Sign( b
);
3207 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3208 return ( a
== b
) || ( aSign
^ ( a
< b
) );
3213 -------------------------------------------------------------------------------
3214 Returns 1 if the double-precision floating-point value `a' is less than
3215 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3216 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3217 Standard for Binary Floating-Point Arithmetic.
3218 -------------------------------------------------------------------------------
3220 flag
float64_lt_quiet( float64 a
, float64 b
)
3224 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3225 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3227 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3228 float_raise( float_flag_invalid
);
3232 aSign
= extractFloat64Sign( a
);
3233 bSign
= extractFloat64Sign( b
);
3234 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
3235 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3243 -------------------------------------------------------------------------------
3244 Returns the result of converting the extended double-precision floating-
3245 point value `a' to the 32-bit two's complement integer format. The
3246 conversion is performed according to the IEC/IEEE Standard for Binary
3247 Floating-Point Arithmetic---which means in particular that the conversion
3248 is rounded according to the current rounding mode. If `a' is a NaN, the
3249 largest positive integer is returned. Otherwise, if the conversion
3250 overflows, the largest integer with the same sign as `a' is returned.
3251 -------------------------------------------------------------------------------
3253 int32
floatx80_to_int32( floatx80 a
)
3256 int32 aExp
, shiftCount
;
3259 aSig
= extractFloatx80Frac( a
);
3260 aExp
= extractFloatx80Exp( a
);
3261 aSign
= extractFloatx80Sign( a
);
3262 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3263 shiftCount
= 0x4037 - aExp
;
3264 if ( shiftCount
<= 0 ) shiftCount
= 1;
3265 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3266 return roundAndPackInt32( aSign
, aSig
);
3271 -------------------------------------------------------------------------------
3272 Returns the result of converting the extended double-precision floating-
3273 point value `a' to the 32-bit two's complement integer format. The
3274 conversion is performed according to the IEC/IEEE Standard for Binary
3275 Floating-Point Arithmetic, except that the conversion is always rounded
3276 toward zero. If `a' is a NaN, the largest positive integer is returned.
3277 Otherwise, if the conversion overflows, the largest integer with the same
3278 sign as `a' is returned.
3279 -------------------------------------------------------------------------------
3281 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
3284 int32 aExp
, shiftCount
;
3285 bits64 aSig
, savedASig
;
3288 aSig
= extractFloatx80Frac( a
);
3289 aExp
= extractFloatx80Exp( a
);
3290 aSign
= extractFloatx80Sign( a
);
3291 if ( 0x401E < aExp
) {
3292 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3295 else if ( aExp
< 0x3FFF ) {
3296 if ( aExp
|| aSig
) float_set_inexact();
3299 shiftCount
= 0x403E - aExp
;
3301 aSig
>>= shiftCount
;
3303 if ( aSign
) z
= - z
;
3304 if ( ( z
< 0 ) ^ aSign
) {
3306 float_raise( float_flag_invalid
);
3307 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3309 if ( ( aSig
<<shiftCount
) != savedASig
) {
3310 float_set_inexact();
3317 -------------------------------------------------------------------------------
3318 Returns the result of converting the extended double-precision floating-
3319 point value `a' to the 64-bit two's complement integer format. The
3320 conversion is performed according to the IEC/IEEE Standard for Binary
3321 Floating-Point Arithmetic---which means in particular that the conversion
3322 is rounded according to the current rounding mode. If `a' is a NaN,
3323 the largest positive integer is returned. Otherwise, if the conversion
3324 overflows, the largest integer with the same sign as `a' is returned.
3325 -------------------------------------------------------------------------------
3327 int64
floatx80_to_int64( floatx80 a
)
3330 int32 aExp
, shiftCount
;
3331 bits64 aSig
, aSigExtra
;
3333 aSig
= extractFloatx80Frac( a
);
3334 aExp
= extractFloatx80Exp( a
);
3335 aSign
= extractFloatx80Sign( a
);
3336 shiftCount
= 0x403E - aExp
;
3337 if ( shiftCount
<= 0 ) {
3339 float_raise( float_flag_invalid
);
3341 || ( ( aExp
== 0x7FFF )
3342 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3344 return LIT64( 0x7FFFFFFFFFFFFFFF );
3346 return (sbits64
) LIT64( 0x8000000000000000 );
3351 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3353 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
3358 -------------------------------------------------------------------------------
3359 Returns the result of converting the extended double-precision floating-
3360 point value `a' to the 64-bit two's complement integer format. The
3361 conversion is performed according to the IEC/IEEE Standard for Binary
3362 Floating-Point Arithmetic, except that the conversion is always rounded
3363 toward zero. If `a' is a NaN, the largest positive integer is returned.
3364 Otherwise, if the conversion overflows, the largest integer with the same
3365 sign as `a' is returned.
3366 -------------------------------------------------------------------------------
3368 int64
floatx80_to_int64_round_to_zero( floatx80 a
)
3371 int32 aExp
, shiftCount
;
3375 aSig
= extractFloatx80Frac( a
);
3376 aExp
= extractFloatx80Exp( a
);
3377 aSign
= extractFloatx80Sign( a
);
3378 shiftCount
= aExp
- 0x403E;
3379 if ( 0 <= shiftCount
) {
3380 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3381 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3382 float_raise( float_flag_invalid
);
3383 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3384 return LIT64( 0x7FFFFFFFFFFFFFFF );
3387 return (sbits64
) LIT64( 0x8000000000000000 );
3389 else if ( aExp
< 0x3FFF ) {
3390 if ( aExp
| aSig
) float_set_inexact();
3393 z
= aSig
>>( - shiftCount
);
3394 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3395 float_set_inexact();
3397 if ( aSign
) z
= - z
;
3403 -------------------------------------------------------------------------------
3404 Returns the result of converting the extended double-precision floating-
3405 point value `a' to the single-precision floating-point format. The
3406 conversion is performed according to the IEC/IEEE Standard for Binary
3407 Floating-Point Arithmetic.
3408 -------------------------------------------------------------------------------
3410 float32
floatx80_to_float32( floatx80 a
)
3416 aSig
= extractFloatx80Frac( a
);
3417 aExp
= extractFloatx80Exp( a
);
3418 aSign
= extractFloatx80Sign( a
);
3419 if ( aExp
== 0x7FFF ) {
3420 if ( (bits64
) ( aSig
<<1 ) ) {
3421 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
3423 return packFloat32( aSign
, 0xFF, 0 );
3425 shift64RightJamming( aSig
, 33, &aSig
);
3426 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3427 return roundAndPackFloat32( aSign
, aExp
, aSig
);
3432 -------------------------------------------------------------------------------
3433 Returns the result of converting the extended double-precision floating-
3434 point value `a' to the double-precision floating-point format. The
3435 conversion is performed according to the IEC/IEEE Standard for Binary
3436 Floating-Point Arithmetic.
3437 -------------------------------------------------------------------------------
3439 float64
floatx80_to_float64( floatx80 a
)
3445 aSig
= extractFloatx80Frac( a
);
3446 aExp
= extractFloatx80Exp( a
);
3447 aSign
= extractFloatx80Sign( a
);
3448 if ( aExp
== 0x7FFF ) {
3449 if ( (bits64
) ( aSig
<<1 ) ) {
3450 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
3452 return packFloat64( aSign
, 0x7FF, 0 );
3454 shift64RightJamming( aSig
, 1, &zSig
);
3455 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3456 return roundAndPackFloat64( aSign
, aExp
, zSig
);
3463 -------------------------------------------------------------------------------
3464 Returns the result of converting the extended double-precision floating-
3465 point value `a' to the quadruple-precision floating-point format. The
3466 conversion is performed according to the IEC/IEEE Standard for Binary
3467 Floating-Point Arithmetic.
3468 -------------------------------------------------------------------------------
3470 float128
floatx80_to_float128( floatx80 a
)
3474 bits64 aSig
, zSig0
, zSig1
;
3476 aSig
= extractFloatx80Frac( a
);
3477 aExp
= extractFloatx80Exp( a
);
3478 aSign
= extractFloatx80Sign( a
);
3479 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3480 return commonNaNToFloat128( floatx80ToCommonNaN( a
) );
3482 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3483 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3490 -------------------------------------------------------------------------------
3491 Rounds the extended double-precision floating-point value `a' to an integer,
3492 and returns the result as an extended quadruple-precision floating-point
3493 value. The operation is performed according to the IEC/IEEE Standard for
3494 Binary Floating-Point Arithmetic.
3495 -------------------------------------------------------------------------------
3497 floatx80
floatx80_round_to_int( floatx80 a
)
3501 bits64 lastBitMask
, roundBitsMask
;
3505 aExp
= extractFloatx80Exp( a
);
3506 if ( 0x403E <= aExp
) {
3507 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3508 return propagateFloatx80NaN( a
, a
);
3512 if ( aExp
< 0x3FFF ) {
3514 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3517 float_set_inexact();
3518 aSign
= extractFloatx80Sign( a
);
3519 switch ( float_rounding_mode() ) {
3520 case float_round_nearest_even
:
3521 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3524 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3527 case float_round_down
:
3530 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3531 : packFloatx80( 0, 0, 0 );
3532 case float_round_up
:
3534 aSign
? packFloatx80( 1, 0, 0 )
3535 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3537 return packFloatx80( aSign
, 0, 0 );
3540 lastBitMask
<<= 0x403E - aExp
;
3541 roundBitsMask
= lastBitMask
- 1;
3543 roundingMode
= float_rounding_mode();
3544 if ( roundingMode
== float_round_nearest_even
) {
3545 z
.low
+= lastBitMask
>>1;
3546 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3548 else if ( roundingMode
!= float_round_to_zero
) {
3549 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3550 z
.low
+= roundBitsMask
;
3553 z
.low
&= ~ roundBitsMask
;
3556 z
.low
= LIT64( 0x8000000000000000 );
3558 if ( z
.low
!= a
.low
) float_set_inexact();
3564 -------------------------------------------------------------------------------
3565 Returns the result of adding the absolute values of the extended double-
3566 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3567 negated before being returned. `zSign' is ignored if the result is a NaN.
3568 The addition is performed according to the IEC/IEEE Standard for Binary
3569 Floating-Point Arithmetic.
3570 -------------------------------------------------------------------------------
3572 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3574 int32 aExp
, bExp
, zExp
;
3575 bits64 aSig
, bSig
, zSig0
, zSig1
;
3578 aSig
= extractFloatx80Frac( a
);
3579 aExp
= extractFloatx80Exp( a
);
3580 bSig
= extractFloatx80Frac( b
);
3581 bExp
= extractFloatx80Exp( b
);
3582 expDiff
= aExp
- bExp
;
3583 if ( 0 < expDiff
) {
3584 if ( aExp
== 0x7FFF ) {
3585 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3588 if ( bExp
== 0 ) --expDiff
;
3589 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3592 else if ( expDiff
< 0 ) {
3593 if ( bExp
== 0x7FFF ) {
3594 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3595 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3597 if ( aExp
== 0 ) ++expDiff
;
3598 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3602 if ( aExp
== 0x7FFF ) {
3603 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3604 return propagateFloatx80NaN( a
, b
);
3609 zSig0
= aSig
+ bSig
;
3611 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3617 zSig0
= aSig
+ bSig
;
3618 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3620 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3621 zSig0
|= LIT64( 0x8000000000000000 );
3625 roundAndPackFloatx80(
3626 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3631 -------------------------------------------------------------------------------
3632 Returns the result of subtracting the absolute values of the extended
3633 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3634 difference is negated before being returned. `zSign' is ignored if the
3635 result is a NaN. The subtraction is performed according to the IEC/IEEE
3636 Standard for Binary Floating-Point Arithmetic.
3637 -------------------------------------------------------------------------------
3639 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3641 int32 aExp
, bExp
, zExp
;
3642 bits64 aSig
, bSig
, zSig0
, zSig1
;
3646 aSig
= extractFloatx80Frac( a
);
3647 aExp
= extractFloatx80Exp( a
);
3648 bSig
= extractFloatx80Frac( b
);
3649 bExp
= extractFloatx80Exp( b
);
3650 expDiff
= aExp
- bExp
;
3651 if ( 0 < expDiff
) goto aExpBigger
;
3652 if ( expDiff
< 0 ) goto bExpBigger
;
3653 if ( aExp
== 0x7FFF ) {
3654 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3655 return propagateFloatx80NaN( a
, b
);
3657 float_raise( float_flag_invalid
);
3658 z
.low
= floatx80_default_nan_low
;
3659 z
.high
= floatx80_default_nan_high
;
3667 if ( bSig
< aSig
) goto aBigger
;
3668 if ( aSig
< bSig
) goto bBigger
;
3669 return packFloatx80( float_rounding_mode() == float_round_down
, 0, 0 );
3671 if ( bExp
== 0x7FFF ) {
3672 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3673 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3675 if ( aExp
== 0 ) ++expDiff
;
3676 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3678 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3681 goto normalizeRoundAndPack
;
3683 if ( aExp
== 0x7FFF ) {
3684 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3687 if ( bExp
== 0 ) --expDiff
;
3688 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3690 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3692 normalizeRoundAndPack
:
3694 normalizeRoundAndPackFloatx80(
3695 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3700 -------------------------------------------------------------------------------
3701 Returns the result of adding the extended double-precision floating-point
3702 values `a' and `b'. The operation is performed according to the IEC/IEEE
3703 Standard for Binary Floating-Point Arithmetic.
3704 -------------------------------------------------------------------------------
3706 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
3710 aSign
= extractFloatx80Sign( a
);
3711 bSign
= extractFloatx80Sign( b
);
3712 if ( aSign
== bSign
) {
3713 return addFloatx80Sigs( a
, b
, aSign
);
3716 return subFloatx80Sigs( a
, b
, aSign
);
3722 -------------------------------------------------------------------------------
3723 Returns the result of subtracting the extended double-precision floating-
3724 point values `a' and `b'. The operation is performed according to the
3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3726 -------------------------------------------------------------------------------
3728 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
3732 aSign
= extractFloatx80Sign( a
);
3733 bSign
= extractFloatx80Sign( b
);
3734 if ( aSign
== bSign
) {
3735 return subFloatx80Sigs( a
, b
, aSign
);
3738 return addFloatx80Sigs( a
, b
, aSign
);
3744 -------------------------------------------------------------------------------
3745 Returns the result of multiplying the extended double-precision floating-
3746 point values `a' and `b'. The operation is performed according to the
3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3750 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
3752 flag aSign
, bSign
, zSign
;
3753 int32 aExp
, bExp
, zExp
;
3754 bits64 aSig
, bSig
, zSig0
, zSig1
;
3757 aSig
= extractFloatx80Frac( a
);
3758 aExp
= extractFloatx80Exp( a
);
3759 aSign
= extractFloatx80Sign( a
);
3760 bSig
= extractFloatx80Frac( b
);
3761 bExp
= extractFloatx80Exp( b
);
3762 bSign
= extractFloatx80Sign( b
);
3763 zSign
= aSign
^ bSign
;
3764 if ( aExp
== 0x7FFF ) {
3765 if ( (bits64
) ( aSig
<<1 )
3766 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3767 return propagateFloatx80NaN( a
, b
);
3769 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3770 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3772 if ( bExp
== 0x7FFF ) {
3773 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3774 if ( ( aExp
| aSig
) == 0 ) {
3776 float_raise( float_flag_invalid
);
3777 z
.low
= floatx80_default_nan_low
;
3778 z
.high
= floatx80_default_nan_high
;
3781 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3784 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3785 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3788 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3789 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3791 zExp
= aExp
+ bExp
- 0x3FFE;
3792 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3793 if ( 0 < (sbits64
) zSig0
) {
3794 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3798 roundAndPackFloatx80(
3799 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3804 -------------------------------------------------------------------------------
3805 Returns the result of dividing the extended double-precision floating-point
3806 value `a' by the corresponding value `b'. The operation is performed
3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 -------------------------------------------------------------------------------
3810 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
3812 flag aSign
, bSign
, zSign
;
3813 int32 aExp
, bExp
, zExp
;
3814 bits64 aSig
, bSig
, zSig0
, zSig1
;
3815 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3818 aSig
= extractFloatx80Frac( a
);
3819 aExp
= extractFloatx80Exp( a
);
3820 aSign
= extractFloatx80Sign( a
);
3821 bSig
= extractFloatx80Frac( b
);
3822 bExp
= extractFloatx80Exp( b
);
3823 bSign
= extractFloatx80Sign( b
);
3824 zSign
= aSign
^ bSign
;
3825 if ( aExp
== 0x7FFF ) {
3826 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3827 if ( bExp
== 0x7FFF ) {
3828 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3831 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3833 if ( bExp
== 0x7FFF ) {
3834 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3835 return packFloatx80( zSign
, 0, 0 );
3839 if ( ( aExp
| aSig
) == 0 ) {
3841 float_raise( float_flag_invalid
);
3842 z
.low
= floatx80_default_nan_low
;
3843 z
.high
= floatx80_default_nan_high
;
3846 float_raise( float_flag_divbyzero
);
3847 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3849 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3852 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3853 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3855 zExp
= aExp
- bExp
+ 0x3FFE;
3857 if ( bSig
<= aSig
) {
3858 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3861 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3862 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3863 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3864 while ( (sbits64
) rem0
< 0 ) {
3866 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3868 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3869 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3870 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3871 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3872 while ( (sbits64
) rem1
< 0 ) {
3874 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3876 zSig1
|= ( ( rem1
| rem2
) != 0 );
3879 roundAndPackFloatx80(
3880 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3885 -------------------------------------------------------------------------------
3886 Returns the remainder of the extended double-precision floating-point value
3887 `a' with respect to the corresponding value `b'. The operation is performed
3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3889 -------------------------------------------------------------------------------
3891 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3893 flag aSign
, bSign
, zSign
;
3894 int32 aExp
, bExp
, expDiff
;
3895 bits64 aSig0
, aSig1
, bSig
;
3896 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3899 aSig0
= extractFloatx80Frac( a
);
3900 aExp
= extractFloatx80Exp( a
);
3901 aSign
= extractFloatx80Sign( a
);
3902 bSig
= extractFloatx80Frac( b
);
3903 bExp
= extractFloatx80Exp( b
);
3904 bSign
= extractFloatx80Sign( b
);
3905 if ( aExp
== 0x7FFF ) {
3906 if ( (bits64
) ( aSig0
<<1 )
3907 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3908 return propagateFloatx80NaN( a
, b
);
3912 if ( bExp
== 0x7FFF ) {
3913 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3919 float_raise( float_flag_invalid
);
3920 z
.low
= floatx80_default_nan_low
;
3921 z
.high
= floatx80_default_nan_high
;
3924 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3927 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3928 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3930 bSig
|= LIT64( 0x8000000000000000 );
3932 expDiff
= aExp
- bExp
;
3934 if ( expDiff
< 0 ) {
3935 if ( expDiff
< -1 ) return a
;
3936 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3939 q
= ( bSig
<= aSig0
);
3940 if ( q
) aSig0
-= bSig
;
3942 while ( 0 < expDiff
) {
3943 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3944 q
= ( 2 < q
) ? q
- 2 : 0;
3945 mul64To128( bSig
, q
, &term0
, &term1
);
3946 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3947 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3951 if ( 0 < expDiff
) {
3952 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3953 q
= ( 2 < q
) ? q
- 2 : 0;
3955 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3956 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3957 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3958 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3960 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3967 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3968 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3969 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3972 aSig0
= alternateASig0
;
3973 aSig1
= alternateASig1
;
3977 normalizeRoundAndPackFloatx80(
3978 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3983 -------------------------------------------------------------------------------
3984 Returns the square root of the extended double-precision floating-point
3985 value `a'. The operation is performed according to the IEC/IEEE Standard
3986 for Binary Floating-Point Arithmetic.
3987 -------------------------------------------------------------------------------
3989 floatx80
floatx80_sqrt( floatx80 a
)
3993 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
3994 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3997 aSig0
= extractFloatx80Frac( a
);
3998 aExp
= extractFloatx80Exp( a
);
3999 aSign
= extractFloatx80Sign( a
);
4000 if ( aExp
== 0x7FFF ) {
4001 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
4002 if ( ! aSign
) return a
;
4006 if ( ( aExp
| aSig0
) == 0 ) return a
;
4008 float_raise( float_flag_invalid
);
4009 z
.low
= floatx80_default_nan_low
;
4010 z
.high
= floatx80_default_nan_high
;
4014 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4015 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4017 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4018 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4019 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4020 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4021 doubleZSig0
= zSig0
<<1;
4022 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4023 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4024 while ( (sbits64
) rem0
< 0 ) {
4027 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4029 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4030 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4031 if ( zSig1
== 0 ) zSig1
= 1;
4032 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4033 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4034 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4035 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4036 while ( (sbits64
) rem1
< 0 ) {
4038 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4040 term2
|= doubleZSig0
;
4041 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4043 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4045 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4046 zSig0
|= doubleZSig0
;
4048 roundAndPackFloatx80(
4049 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
4054 -------------------------------------------------------------------------------
4055 Returns 1 if the extended double-precision floating-point value `a' is
4056 equal to the corresponding value `b', and 0 otherwise. The comparison is
4057 performed according to the IEC/IEEE Standard for Binary Floating-Point
4059 -------------------------------------------------------------------------------
4061 flag
floatx80_eq( floatx80 a
, floatx80 b
)
4064 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4065 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4066 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4067 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4069 if ( floatx80_is_signaling_nan( a
)
4070 || floatx80_is_signaling_nan( b
) ) {
4071 float_raise( float_flag_invalid
);
4077 && ( ( a
.high
== b
.high
)
4079 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4085 -------------------------------------------------------------------------------
4086 Returns 1 if the extended double-precision floating-point value `a' is
4087 less than or equal to the corresponding value `b', and 0 otherwise. The
4088 comparison is performed according to the IEC/IEEE Standard for Binary
4089 Floating-Point Arithmetic.
4090 -------------------------------------------------------------------------------
4092 flag
floatx80_le( floatx80 a
, floatx80 b
)
4096 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4097 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4098 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4099 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4101 float_raise( float_flag_invalid
);
4104 aSign
= extractFloatx80Sign( a
);
4105 bSign
= extractFloatx80Sign( b
);
4106 if ( aSign
!= bSign
) {
4109 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4113 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4114 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4119 -------------------------------------------------------------------------------
4120 Returns 1 if the extended double-precision floating-point value `a' is
4121 less than the corresponding value `b', and 0 otherwise. The comparison
4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4124 -------------------------------------------------------------------------------
4126 flag
floatx80_lt( floatx80 a
, floatx80 b
)
4130 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4131 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4132 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4133 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4135 float_raise( float_flag_invalid
);
4138 aSign
= extractFloatx80Sign( a
);
4139 bSign
= extractFloatx80Sign( b
);
4140 if ( aSign
!= bSign
) {
4143 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4147 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4148 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4153 -------------------------------------------------------------------------------
4154 Returns 1 if the extended double-precision floating-point value `a' is equal
4155 to the corresponding value `b', and 0 otherwise. The invalid exception is
4156 raised if either operand is a NaN. Otherwise, the comparison is performed
4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4158 -------------------------------------------------------------------------------
4160 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
4163 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4164 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4165 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4166 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4168 float_raise( float_flag_invalid
);
4173 && ( ( a
.high
== b
.high
)
4175 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4181 -------------------------------------------------------------------------------
4182 Returns 1 if the extended double-precision floating-point value `a' is less
4183 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4184 do not cause an exception. Otherwise, the comparison is performed according
4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4186 -------------------------------------------------------------------------------
4188 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
4192 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4193 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4194 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4195 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4197 if ( floatx80_is_signaling_nan( a
)
4198 || floatx80_is_signaling_nan( b
) ) {
4199 float_raise( float_flag_invalid
);
4203 aSign
= extractFloatx80Sign( a
);
4204 bSign
= extractFloatx80Sign( b
);
4205 if ( aSign
!= bSign
) {
4208 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4212 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4213 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4218 -------------------------------------------------------------------------------
4219 Returns 1 if the extended double-precision floating-point value `a' is less
4220 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4221 an exception. Otherwise, the comparison is performed according to the
4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4223 -------------------------------------------------------------------------------
4225 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
4229 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4230 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4231 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4232 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4234 if ( floatx80_is_signaling_nan( a
)
4235 || floatx80_is_signaling_nan( b
) ) {
4236 float_raise( float_flag_invalid
);
4240 aSign
= extractFloatx80Sign( a
);
4241 bSign
= extractFloatx80Sign( b
);
4242 if ( aSign
!= bSign
) {
4245 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4249 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4250 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4259 -------------------------------------------------------------------------------
4260 Returns the result of converting the quadruple-precision floating-point
4261 value `a' to the 32-bit two's complement integer format. The conversion
4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4263 Arithmetic---which means in particular that the conversion is rounded
4264 according to the current rounding mode. If `a' is a NaN, the largest
4265 positive integer is returned. Otherwise, if the conversion overflows, the
4266 largest integer with the same sign as `a' is returned.
4267 -------------------------------------------------------------------------------
4269 int32
float128_to_int32( float128 a
)
4272 int32 aExp
, shiftCount
;
4273 bits64 aSig0
, aSig1
;
4275 aSig1
= extractFloat128Frac1( a
);
4276 aSig0
= extractFloat128Frac0( a
);
4277 aExp
= extractFloat128Exp( a
);
4278 aSign
= extractFloat128Sign( a
);
4279 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4280 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4281 aSig0
|= ( aSig1
!= 0 );
4282 shiftCount
= 0x4028 - aExp
;
4283 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4284 return roundAndPackInt32( aSign
, aSig0
);
4289 -------------------------------------------------------------------------------
4290 Returns the result of converting the quadruple-precision floating-point
4291 value `a' to the 32-bit two's complement integer format. The conversion
4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 Arithmetic, except that the conversion is always rounded toward zero. If
4294 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4295 conversion overflows, the largest integer with the same sign as `a' is
4297 -------------------------------------------------------------------------------
4299 int32
float128_to_int32_round_to_zero( float128 a
)
4302 int32 aExp
, shiftCount
;
4303 bits64 aSig0
, aSig1
, savedASig
;
4306 aSig1
= extractFloat128Frac1( a
);
4307 aSig0
= extractFloat128Frac0( a
);
4308 aExp
= extractFloat128Exp( a
);
4309 aSign
= extractFloat128Sign( a
);
4310 aSig0
|= ( aSig1
!= 0 );
4311 if ( 0x401E < aExp
) {
4312 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4315 else if ( aExp
< 0x3FFF ) {
4316 if ( aExp
|| aSig0
) float_set_inexact();
4319 aSig0
|= LIT64( 0x0001000000000000 );
4320 shiftCount
= 0x402F - aExp
;
4322 aSig0
>>= shiftCount
;
4324 if ( aSign
) z
= - z
;
4325 if ( ( z
< 0 ) ^ aSign
) {
4327 float_raise( float_flag_invalid
);
4328 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4330 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4331 float_set_inexact();
4338 -------------------------------------------------------------------------------
4339 Returns the result of converting the quadruple-precision floating-point
4340 value `a' to the 64-bit two's complement integer format. The conversion
4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 Arithmetic---which means in particular that the conversion is rounded
4343 according to the current rounding mode. If `a' is a NaN, the largest
4344 positive integer is returned. Otherwise, if the conversion overflows, the
4345 largest integer with the same sign as `a' is returned.
4346 -------------------------------------------------------------------------------
4348 int64
float128_to_int64( float128 a
)
4351 int32 aExp
, shiftCount
;
4352 bits64 aSig0
, aSig1
;
4354 aSig1
= extractFloat128Frac1( a
);
4355 aSig0
= extractFloat128Frac0( a
);
4356 aExp
= extractFloat128Exp( a
);
4357 aSign
= extractFloat128Sign( a
);
4358 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4359 shiftCount
= 0x402F - aExp
;
4360 if ( shiftCount
<= 0 ) {
4361 if ( 0x403E < aExp
) {
4362 float_raise( float_flag_invalid
);
4364 || ( ( aExp
== 0x7FFF )
4365 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4368 return LIT64( 0x7FFFFFFFFFFFFFFF );
4370 return (sbits64
) LIT64( 0x8000000000000000 );
4372 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4375 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4377 return roundAndPackInt64( aSign
, aSig0
, aSig1
);
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format. The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic, except that the conversion is always rounded toward zero.
4387 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4388 the conversion overflows, the largest integer with the same sign as `a' is
4390 -------------------------------------------------------------------------------
4392 int64
float128_to_int64_round_to_zero( float128 a
)
4395 int32 aExp
, shiftCount
;
4396 bits64 aSig0
, aSig1
;
4399 aSig1
= extractFloat128Frac1( a
);
4400 aSig0
= extractFloat128Frac0( a
);
4401 aExp
= extractFloat128Exp( a
);
4402 aSign
= extractFloat128Sign( a
);
4403 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4404 shiftCount
= aExp
- 0x402F;
4405 if ( 0 < shiftCount
) {
4406 if ( 0x403E <= aExp
) {
4407 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4408 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4409 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4410 if ( aSig1
) float_set_inexact();
4413 float_raise( float_flag_invalid
);
4414 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4415 return LIT64( 0x7FFFFFFFFFFFFFFF );
4418 return (sbits64
) LIT64( 0x8000000000000000 );
4420 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4421 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4422 float_set_inexact();
4426 if ( aExp
< 0x3FFF ) {
4427 if ( aExp
| aSig0
| aSig1
) {
4428 float_set_inexact();
4432 z
= aSig0
>>( - shiftCount
);
4434 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4435 float_set_inexact();
4438 if ( aSign
) z
= - z
;
4444 -------------------------------------------------------------------------------
4445 Returns the result of converting the quadruple-precision floating-point
4446 value `a' to the single-precision floating-point format. The conversion
4447 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4449 -------------------------------------------------------------------------------
4451 float32
float128_to_float32( float128 a
)
4455 bits64 aSig0
, aSig1
;
4458 aSig1
= extractFloat128Frac1( a
);
4459 aSig0
= extractFloat128Frac0( a
);
4460 aExp
= extractFloat128Exp( a
);
4461 aSign
= extractFloat128Sign( a
);
4462 if ( aExp
== 0x7FFF ) {
4463 if ( aSig0
| aSig1
) {
4464 return commonNaNToFloat32( float128ToCommonNaN( a
) );
4466 return packFloat32( aSign
, 0xFF, 0 );
4468 aSig0
|= ( aSig1
!= 0 );
4469 shift64RightJamming( aSig0
, 18, &aSig0
);
4471 if ( aExp
|| zSig
) {
4475 return roundAndPackFloat32( aSign
, aExp
, zSig
);
4480 -------------------------------------------------------------------------------
4481 Returns the result of converting the quadruple-precision floating-point
4482 value `a' to the double-precision floating-point format. The conversion
4483 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4485 -------------------------------------------------------------------------------
4487 float64
float128_to_float64( float128 a
)
4491 bits64 aSig0
, aSig1
;
4493 aSig1
= extractFloat128Frac1( a
);
4494 aSig0
= extractFloat128Frac0( a
);
4495 aExp
= extractFloat128Exp( a
);
4496 aSign
= extractFloat128Sign( a
);
4497 if ( aExp
== 0x7FFF ) {
4498 if ( aSig0
| aSig1
) {
4499 return commonNaNToFloat64( float128ToCommonNaN( a
) );
4501 return packFloat64( aSign
, 0x7FF, 0 );
4503 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4504 aSig0
|= ( aSig1
!= 0 );
4505 if ( aExp
|| aSig0
) {
4506 aSig0
|= LIT64( 0x4000000000000000 );
4509 return roundAndPackFloat64( aSign
, aExp
, aSig0
);
4516 -------------------------------------------------------------------------------
4517 Returns the result of converting the quadruple-precision floating-point
4518 value `a' to the extended double-precision floating-point format. The
4519 conversion is performed according to the IEC/IEEE Standard for Binary
4520 Floating-Point Arithmetic.
4521 -------------------------------------------------------------------------------
4523 floatx80
float128_to_floatx80( float128 a
)
4527 bits64 aSig0
, aSig1
;
4529 aSig1
= extractFloat128Frac1( a
);
4530 aSig0
= extractFloat128Frac0( a
);
4531 aExp
= extractFloat128Exp( a
);
4532 aSign
= extractFloat128Sign( a
);
4533 if ( aExp
== 0x7FFF ) {
4534 if ( aSig0
| aSig1
) {
4535 return commonNaNToFloatx80( float128ToCommonNaN( a
) );
4537 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4540 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4541 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4544 aSig0
|= LIT64( 0x0001000000000000 );
4546 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4547 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1
);
4554 -------------------------------------------------------------------------------
4555 Rounds the quadruple-precision floating-point value `a' to an integer, and
4556 returns the result as a quadruple-precision floating-point value. The
4557 operation is performed according to the IEC/IEEE Standard for Binary
4558 Floating-Point Arithmetic.
4559 -------------------------------------------------------------------------------
4561 float128
float128_round_to_int( float128 a
)
4565 bits64 lastBitMask
, roundBitsMask
;
4569 aExp
= extractFloat128Exp( a
);
4570 if ( 0x402F <= aExp
) {
4571 if ( 0x406F <= aExp
) {
4572 if ( ( aExp
== 0x7FFF )
4573 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4575 return propagateFloat128NaN( a
, a
);
4580 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4581 roundBitsMask
= lastBitMask
- 1;
4583 roundingMode
= float_rounding_mode();
4584 if ( roundingMode
== float_round_nearest_even
) {
4585 if ( lastBitMask
) {
4586 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4587 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4590 if ( (sbits64
) z
.low
< 0 ) {
4592 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4596 else if ( roundingMode
!= float_round_to_zero
) {
4597 if ( extractFloat128Sign( z
)
4598 ^ ( roundingMode
== float_round_up
) ) {
4599 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4602 z
.low
&= ~ roundBitsMask
;
4605 if ( aExp
< 0x3FFF ) {
4606 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4607 float_set_inexact();
4608 aSign
= extractFloat128Sign( a
);
4609 switch ( float_rounding_mode() ) {
4610 case float_round_nearest_even
:
4611 if ( ( aExp
== 0x3FFE )
4612 && ( extractFloat128Frac0( a
)
4613 | extractFloat128Frac1( a
) )
4615 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4618 case float_round_down
:
4620 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4621 : packFloat128( 0, 0, 0, 0 );
4622 case float_round_up
:
4624 aSign
? packFloat128( 1, 0, 0, 0 )
4625 : packFloat128( 0, 0x3FFF, 0, 0 );
4627 return packFloat128( aSign
, 0, 0, 0 );
4630 lastBitMask
<<= 0x402F - aExp
;
4631 roundBitsMask
= lastBitMask
- 1;
4634 roundingMode
= float_rounding_mode();
4635 if ( roundingMode
== float_round_nearest_even
) {
4636 z
.high
+= lastBitMask
>>1;
4637 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4638 z
.high
&= ~ lastBitMask
;
4641 else if ( roundingMode
!= float_round_to_zero
) {
4642 if ( extractFloat128Sign( z
)
4643 ^ ( roundingMode
== float_round_up
) ) {
4644 z
.high
|= ( a
.low
!= 0 );
4645 z
.high
+= roundBitsMask
;
4648 z
.high
&= ~ roundBitsMask
;
4650 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4651 float_set_inexact();
4658 -------------------------------------------------------------------------------
4659 Returns the result of adding the absolute values of the quadruple-precision
4660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4661 before being returned. `zSign' is ignored if the result is a NaN.
4662 The addition is performed according to the IEC/IEEE Standard for Binary
4663 Floating-Point Arithmetic.
4664 -------------------------------------------------------------------------------
4666 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4668 int32 aExp
, bExp
, zExp
;
4669 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4672 aSig1
= extractFloat128Frac1( a
);
4673 aSig0
= extractFloat128Frac0( a
);
4674 aExp
= extractFloat128Exp( a
);
4675 bSig1
= extractFloat128Frac1( b
);
4676 bSig0
= extractFloat128Frac0( b
);
4677 bExp
= extractFloat128Exp( b
);
4678 expDiff
= aExp
- bExp
;
4679 if ( 0 < expDiff
) {
4680 if ( aExp
== 0x7FFF ) {
4681 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4688 bSig0
|= LIT64( 0x0001000000000000 );
4690 shift128ExtraRightJamming(
4691 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4694 else if ( expDiff
< 0 ) {
4695 if ( bExp
== 0x7FFF ) {
4696 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4697 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4703 aSig0
|= LIT64( 0x0001000000000000 );
4705 shift128ExtraRightJamming(
4706 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4710 if ( aExp
== 0x7FFF ) {
4711 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4712 return propagateFloat128NaN( a
, b
);
4716 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4717 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4719 zSig0
|= LIT64( 0x0002000000000000 );
4723 aSig0
|= LIT64( 0x0001000000000000 );
4724 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4726 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4729 shift128ExtraRightJamming(
4730 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4732 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4737 -------------------------------------------------------------------------------
4738 Returns the result of subtracting the absolute values of the quadruple-
4739 precision floating-point values `a' and `b'. If `zSign' is 1, the
4740 difference is negated before being returned. `zSign' is ignored if the
4741 result is a NaN. The subtraction is performed according to the IEC/IEEE
4742 Standard for Binary Floating-Point Arithmetic.
4743 -------------------------------------------------------------------------------
4745 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4747 int32 aExp
, bExp
, zExp
;
4748 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4752 aSig1
= extractFloat128Frac1( a
);
4753 aSig0
= extractFloat128Frac0( a
);
4754 aExp
= extractFloat128Exp( a
);
4755 bSig1
= extractFloat128Frac1( b
);
4756 bSig0
= extractFloat128Frac0( b
);
4757 bExp
= extractFloat128Exp( b
);
4758 expDiff
= aExp
- bExp
;
4759 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4760 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4761 if ( 0 < expDiff
) goto aExpBigger
;
4762 if ( expDiff
< 0 ) goto bExpBigger
;
4763 if ( aExp
== 0x7FFF ) {
4764 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4765 return propagateFloat128NaN( a
, b
);
4767 float_raise( float_flag_invalid
);
4768 z
.low
= float128_default_nan_low
;
4769 z
.high
= float128_default_nan_high
;
4776 if ( bSig0
< aSig0
) goto aBigger
;
4777 if ( aSig0
< bSig0
) goto bBigger
;
4778 if ( bSig1
< aSig1
) goto aBigger
;
4779 if ( aSig1
< bSig1
) goto bBigger
;
4780 return packFloat128( float_rounding_mode() == float_round_down
, 0, 0, 0 );
4782 if ( bExp
== 0x7FFF ) {
4783 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4784 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4790 aSig0
|= LIT64( 0x4000000000000000 );
4792 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4793 bSig0
|= LIT64( 0x4000000000000000 );
4795 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4798 goto normalizeRoundAndPack
;
4800 if ( aExp
== 0x7FFF ) {
4801 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4808 bSig0
|= LIT64( 0x4000000000000000 );
4810 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4811 aSig0
|= LIT64( 0x4000000000000000 );
4813 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4815 normalizeRoundAndPack
:
4817 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1
);
4822 -------------------------------------------------------------------------------
4823 Returns the result of adding the quadruple-precision floating-point values
4824 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4825 for Binary Floating-Point Arithmetic.
4826 -------------------------------------------------------------------------------
4828 float128
float128_add( float128 a
, float128 b
)
4832 aSign
= extractFloat128Sign( a
);
4833 bSign
= extractFloat128Sign( b
);
4834 if ( aSign
== bSign
) {
4835 return addFloat128Sigs( a
, b
, aSign
);
4838 return subFloat128Sigs( a
, b
, aSign
);
4844 -------------------------------------------------------------------------------
4845 Returns the result of subtracting the quadruple-precision floating-point
4846 values `a' and `b'. The operation is performed according to the IEC/IEEE
4847 Standard for Binary Floating-Point Arithmetic.
4848 -------------------------------------------------------------------------------
4850 float128
float128_sub( float128 a
, float128 b
)
4854 aSign
= extractFloat128Sign( a
);
4855 bSign
= extractFloat128Sign( b
);
4856 if ( aSign
== bSign
) {
4857 return subFloat128Sigs( a
, b
, aSign
);
4860 return addFloat128Sigs( a
, b
, aSign
);
4866 -------------------------------------------------------------------------------
4867 Returns the result of multiplying the quadruple-precision floating-point
4868 values `a' and `b'. The operation is performed according to the IEC/IEEE
4869 Standard for Binary Floating-Point Arithmetic.
4870 -------------------------------------------------------------------------------
4872 float128
float128_mul( float128 a
, float128 b
)
4874 flag aSign
, bSign
, zSign
;
4875 int32 aExp
, bExp
, zExp
;
4876 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4879 aSig1
= extractFloat128Frac1( a
);
4880 aSig0
= extractFloat128Frac0( a
);
4881 aExp
= extractFloat128Exp( a
);
4882 aSign
= extractFloat128Sign( a
);
4883 bSig1
= extractFloat128Frac1( b
);
4884 bSig0
= extractFloat128Frac0( b
);
4885 bExp
= extractFloat128Exp( b
);
4886 bSign
= extractFloat128Sign( b
);
4887 zSign
= aSign
^ bSign
;
4888 if ( aExp
== 0x7FFF ) {
4889 if ( ( aSig0
| aSig1
)
4890 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4891 return propagateFloat128NaN( a
, b
);
4893 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4894 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4896 if ( bExp
== 0x7FFF ) {
4897 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4898 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4900 float_raise( float_flag_invalid
);
4901 z
.low
= float128_default_nan_low
;
4902 z
.high
= float128_default_nan_high
;
4905 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4908 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4909 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4912 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4913 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4915 zExp
= aExp
+ bExp
- 0x4000;
4916 aSig0
|= LIT64( 0x0001000000000000 );
4917 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
4918 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
4919 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4920 zSig2
|= ( zSig3
!= 0 );
4921 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
4922 shift128ExtraRightJamming(
4923 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4926 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4931 -------------------------------------------------------------------------------
4932 Returns the result of dividing the quadruple-precision floating-point value
4933 `a' by the corresponding value `b'. The operation is performed according to
4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935 -------------------------------------------------------------------------------
4937 float128
float128_div( float128 a
, float128 b
)
4939 flag aSign
, bSign
, zSign
;
4940 int32 aExp
, bExp
, zExp
;
4941 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4942 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4945 aSig1
= extractFloat128Frac1( a
);
4946 aSig0
= extractFloat128Frac0( a
);
4947 aExp
= extractFloat128Exp( a
);
4948 aSign
= extractFloat128Sign( a
);
4949 bSig1
= extractFloat128Frac1( b
);
4950 bSig0
= extractFloat128Frac0( b
);
4951 bExp
= extractFloat128Exp( b
);
4952 bSign
= extractFloat128Sign( b
);
4953 zSign
= aSign
^ bSign
;
4954 if ( aExp
== 0x7FFF ) {
4955 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4956 if ( bExp
== 0x7FFF ) {
4957 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4960 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4962 if ( bExp
== 0x7FFF ) {
4963 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4964 return packFloat128( zSign
, 0, 0, 0 );
4967 if ( ( bSig0
| bSig1
) == 0 ) {
4968 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4970 float_raise( float_flag_invalid
);
4971 z
.low
= float128_default_nan_low
;
4972 z
.high
= float128_default_nan_high
;
4975 float_raise( float_flag_divbyzero
);
4976 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4978 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4981 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4982 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4984 zExp
= aExp
- bExp
+ 0x3FFD;
4986 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
4988 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4989 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
4990 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
4993 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4994 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
4995 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
4996 while ( (sbits64
) rem0
< 0 ) {
4998 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5000 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5001 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5002 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5003 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5004 while ( (sbits64
) rem1
< 0 ) {
5006 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5008 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5010 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5011 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
5016 -------------------------------------------------------------------------------
5017 Returns the remainder of the quadruple-precision floating-point value `a'
5018 with respect to the corresponding value `b'. The operation is performed
5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 -------------------------------------------------------------------------------
5022 float128
float128_rem( float128 a
, float128 b
)
5024 flag aSign
, bSign
, zSign
;
5025 int32 aExp
, bExp
, expDiff
;
5026 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5027 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5031 aSig1
= extractFloat128Frac1( a
);
5032 aSig0
= extractFloat128Frac0( a
);
5033 aExp
= extractFloat128Exp( a
);
5034 aSign
= extractFloat128Sign( a
);
5035 bSig1
= extractFloat128Frac1( b
);
5036 bSig0
= extractFloat128Frac0( b
);
5037 bExp
= extractFloat128Exp( b
);
5038 bSign
= extractFloat128Sign( b
);
5039 if ( aExp
== 0x7FFF ) {
5040 if ( ( aSig0
| aSig1
)
5041 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5042 return propagateFloat128NaN( a
, b
);
5046 if ( bExp
== 0x7FFF ) {
5047 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5051 if ( ( bSig0
| bSig1
) == 0 ) {
5053 float_raise( float_flag_invalid
);
5054 z
.low
= float128_default_nan_low
;
5055 z
.high
= float128_default_nan_high
;
5058 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5061 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5062 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5064 expDiff
= aExp
- bExp
;
5065 if ( expDiff
< -1 ) return a
;
5067 aSig0
| LIT64( 0x0001000000000000 ),
5069 15 - ( expDiff
< 0 ),
5074 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5075 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5076 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5078 while ( 0 < expDiff
) {
5079 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5080 q
= ( 4 < q
) ? q
- 4 : 0;
5081 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5082 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5083 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5084 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5087 if ( -64 < expDiff
) {
5088 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5089 q
= ( 4 < q
) ? q
- 4 : 0;
5091 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5093 if ( expDiff
< 0 ) {
5094 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5097 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5099 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5100 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5103 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5104 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5107 alternateASig0
= aSig0
;
5108 alternateASig1
= aSig1
;
5110 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5111 } while ( 0 <= (sbits64
) aSig0
);
5113 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
5114 if ( ( sigMean0
< 0 )
5115 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5116 aSig0
= alternateASig0
;
5117 aSig1
= alternateASig1
;
5119 zSign
= ( (sbits64
) aSig0
< 0 );
5120 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5122 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
5127 -------------------------------------------------------------------------------
5128 Returns the square root of the quadruple-precision floating-point value `a'.
5129 The operation is performed according to the IEC/IEEE Standard for Binary
5130 Floating-Point Arithmetic.
5131 -------------------------------------------------------------------------------
5133 float128
float128_sqrt( float128 a
)
5137 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5138 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5141 aSig1
= extractFloat128Frac1( a
);
5142 aSig0
= extractFloat128Frac0( a
);
5143 aExp
= extractFloat128Exp( a
);
5144 aSign
= extractFloat128Sign( a
);
5145 if ( aExp
== 0x7FFF ) {
5146 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a
);
5147 if ( ! aSign
) return a
;
5151 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5153 float_raise( float_flag_invalid
);
5154 z
.low
= float128_default_nan_low
;
5155 z
.high
= float128_default_nan_high
;
5159 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5160 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5162 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5163 aSig0
|= LIT64( 0x0001000000000000 );
5164 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5165 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5166 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5167 doubleZSig0
= zSig0
<<1;
5168 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5169 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5170 while ( (sbits64
) rem0
< 0 ) {
5173 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5175 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5176 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5177 if ( zSig1
== 0 ) zSig1
= 1;
5178 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5179 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5180 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5181 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5182 while ( (sbits64
) rem1
< 0 ) {
5184 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5186 term2
|= doubleZSig0
;
5187 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5189 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5191 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5192 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2
);
5197 -------------------------------------------------------------------------------
5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5199 the corresponding value `b', and 0 otherwise. The comparison is performed
5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5201 -------------------------------------------------------------------------------
5203 flag
float128_eq( float128 a
, float128 b
)
5206 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5207 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5208 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5209 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5211 if ( float128_is_signaling_nan( a
)
5212 || float128_is_signaling_nan( b
) ) {
5213 float_raise( float_flag_invalid
);
5219 && ( ( a
.high
== b
.high
)
5221 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5227 -------------------------------------------------------------------------------
5228 Returns 1 if the quadruple-precision floating-point value `a' is less than
5229 or equal to the corresponding value `b', and 0 otherwise. The comparison
5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5232 -------------------------------------------------------------------------------
5234 flag
float128_le( float128 a
, float128 b
)
5238 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5239 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5240 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5241 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5243 float_raise( float_flag_invalid
);
5246 aSign
= extractFloat128Sign( a
);
5247 bSign
= extractFloat128Sign( b
);
5248 if ( aSign
!= bSign
) {
5251 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5255 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5256 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5261 -------------------------------------------------------------------------------
5262 Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 the corresponding value `b', and 0 otherwise. The comparison is performed
5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 -------------------------------------------------------------------------------
5267 flag
float128_lt( float128 a
, float128 b
)
5271 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5272 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5273 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5274 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5276 float_raise( float_flag_invalid
);
5279 aSign
= extractFloat128Sign( a
);
5280 bSign
= extractFloat128Sign( b
);
5281 if ( aSign
!= bSign
) {
5284 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5288 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5289 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5294 -------------------------------------------------------------------------------
5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 the corresponding value `b', and 0 otherwise. The invalid exception is
5297 raised if either operand is a NaN. Otherwise, the comparison is performed
5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5299 -------------------------------------------------------------------------------
5301 flag
float128_eq_signaling( float128 a
, float128 b
)
5304 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5305 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5306 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5307 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5309 float_raise( float_flag_invalid
);
5314 && ( ( a
.high
== b
.high
)
5316 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5322 -------------------------------------------------------------------------------
5323 Returns 1 if the quadruple-precision floating-point value `a' is less than
5324 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5325 cause an exception. Otherwise, the comparison is performed according to the
5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 -------------------------------------------------------------------------------
5329 flag
float128_le_quiet( float128 a
, float128 b
)
5333 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5334 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5335 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5336 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5338 if ( float128_is_signaling_nan( a
)
5339 || float128_is_signaling_nan( b
) ) {
5340 float_raise( float_flag_invalid
);
5344 aSign
= extractFloat128Sign( a
);
5345 bSign
= extractFloat128Sign( b
);
5346 if ( aSign
!= bSign
) {
5349 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5353 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5354 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5359 -------------------------------------------------------------------------------
5360 Returns 1 if the quadruple-precision floating-point value `a' is less than
5361 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5362 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5363 Standard for Binary Floating-Point Arithmetic.
5364 -------------------------------------------------------------------------------
5366 flag
float128_lt_quiet( float128 a
, float128 b
)
5370 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5371 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5372 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5373 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5375 if ( float128_is_signaling_nan( a
)
5376 || float128_is_signaling_nan( b
) ) {
5377 float_raise( float_flag_invalid
);
5381 aSign
= extractFloat128Sign( a
);
5382 bSign
= extractFloat128Sign( b
);
5383 if ( aSign
!= bSign
) {
5386 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5390 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5391 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5401 * These two routines are not part of the original softfloat distribution.
5403 * They are based on the corresponding conversions to integer but return
5404 * unsigned numbers instead since these functions are required by GCC.
5406 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5408 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5412 -------------------------------------------------------------------------------
5413 Returns the result of converting the double-precision floating-point value
5414 `a' to the 32-bit unsigned integer format. The conversion is
5415 performed according to the IEC/IEEE Standard for Binary Floating-point
5416 Arithmetic, except that the conversion is always rounded toward zero. If
5417 `a' is a NaN, the largest positive integer is returned. If the conversion
5418 overflows, the largest integer positive is returned.
5419 -------------------------------------------------------------------------------
5421 uint32
float64_to_uint32_round_to_zero( float64 a
)
5424 int16 aExp
, shiftCount
;
5425 bits64 aSig
, savedASig
;
5428 aSig
= extractFloat64Frac( a
);
5429 aExp
= extractFloat64Exp( a
);
5430 aSign
= extractFloat64Sign( a
);
5433 float_raise( float_flag_invalid
);
5437 if ( 0x41E < aExp
) {
5438 float_raise( float_flag_invalid
);
5441 else if ( aExp
< 0x3FF ) {
5442 if ( aExp
|| aSig
) float_set_inexact();
5445 aSig
|= LIT64( 0x0010000000000000 );
5446 shiftCount
= 0x433 - aExp
;
5448 aSig
>>= shiftCount
;
5450 if ( ( aSig
<<shiftCount
) != savedASig
) {
5451 float_set_inexact();
5458 -------------------------------------------------------------------------------
5459 Returns the result of converting the single-precision floating-point value
5460 `a' to the 32-bit unsigned integer format. The conversion is
5461 performed according to the IEC/IEEE Standard for Binary Floating-point
5462 Arithmetic, except that the conversion is always rounded toward zero. If
5463 `a' is a NaN, the largest positive integer is returned. If the conversion
5464 overflows, the largest positive integer is returned.
5465 -------------------------------------------------------------------------------
5467 uint32
float32_to_uint32_round_to_zero( float32 a
)
5470 int16 aExp
, shiftCount
;
5474 aSig
= extractFloat32Frac( a
);
5475 aExp
= extractFloat32Exp( a
);
5476 aSign
= extractFloat32Sign( a
);
5477 shiftCount
= aExp
- 0x9E;
5480 float_raise( float_flag_invalid
);
5483 if ( 0 < shiftCount
) {
5484 float_raise( float_flag_invalid
);
5487 else if ( aExp
<= 0x7E ) {
5488 if ( aExp
| aSig
) float_set_inexact();
5491 aSig
= ( aSig
| 0x800000 )<<8;
5492 z
= aSig
>>( - shiftCount
);
5493 if ( aSig
<<( shiftCount
& 31 ) ) {
5494 float_set_inexact();
5502 #endif /* _STANDALONE */