1 /* $NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
25 ===============================================================================
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
51 ===============================================================================
54 #include <sys/cdefs.h>
55 #if defined(LIBC_SCCS) && !defined(lint)
56 __RCSID("$NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $");
57 #endif /* LIBC_SCCS and not lint */
59 #ifdef SOFTFLOAT_FOR_GCC
60 #include "softfloat-for-gcc.h"
64 #include "softfloat.h"
67 * Conversions between floats as stored in memory and floats as
70 #ifndef FLOAT64_DEMANGLE
71 #define FLOAT64_DEMANGLE(a) (a)
73 #ifndef FLOAT64_MANGLE
74 #define FLOAT64_MANGLE(a) (a)
78 -------------------------------------------------------------------------------
79 Floating-point rounding mode and exception flags.
80 -------------------------------------------------------------------------------
82 #ifndef set_float_rounding_mode
83 fp_rnd float_rounding_mode
= float_round_nearest_even
;
84 fp_except float_exception_flags
= 0;
86 #ifndef set_float_exception_inexact_flag
87 #define set_float_exception_inexact_flag() \
88 ((void)(float_exception_flags |= float_flag_inexact))
92 -------------------------------------------------------------------------------
93 Primitive arithmetic functions, including multi-word arithmetic, and
94 division and square root approximations. (Can be specialized to target if
96 -------------------------------------------------------------------------------
98 #include "softfloat-macros"
101 -------------------------------------------------------------------------------
102 Functions and definitions to determine: (1) whether tininess for underflow
103 is detected before or after rounding by default, (2) what (if anything)
104 happens when exceptions are raised, (3) how signaling NaNs are distinguished
105 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
106 are propagated from function inputs to output. These details are target-
108 -------------------------------------------------------------------------------
110 #include "softfloat-specialize"
113 -------------------------------------------------------------------------------
114 Returns the fraction bits of the single-precision floating-point value `a'.
115 -------------------------------------------------------------------------------
117 INLINE bits32
extractFloat32Frac( float32 a
)
120 return a
& 0x007FFFFF;
125 -------------------------------------------------------------------------------
126 Returns the exponent bits of the single-precision floating-point value `a'.
127 -------------------------------------------------------------------------------
129 INLINE int16
extractFloat32Exp( float32 a
)
132 return ( a
>>23 ) & 0xFF;
137 -------------------------------------------------------------------------------
138 Returns the sign bit of the single-precision floating-point value `a'.
139 -------------------------------------------------------------------------------
141 INLINE flag
extractFloat32Sign( float32 a
)
149 -------------------------------------------------------------------------------
150 Normalizes the subnormal single-precision floating-point value represented
151 by the denormalized significand `aSig'. The normalized exponent and
152 significand are stored at the locations pointed to by `zExpPtr' and
153 `zSigPtr', respectively.
154 -------------------------------------------------------------------------------
157 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
161 shiftCount
= countLeadingZeros32( aSig
) - 8;
162 *zSigPtr
= aSig
<<shiftCount
;
163 *zExpPtr
= 1 - shiftCount
;
168 -------------------------------------------------------------------------------
169 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
170 single-precision floating-point value, returning the result. After being
171 shifted into the proper positions, the three fields are simply added
172 together to form the result. This means that any integer portion of `zSig'
173 will be added into the exponent. Since a properly normalized significand
174 will have an integer portion equal to 1, the `zExp' input should be 1 less
175 than the desired result exponent whenever `zSig' is a complete, normalized
177 -------------------------------------------------------------------------------
179 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
182 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
187 -------------------------------------------------------------------------------
188 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
189 and significand `zSig', and returns the proper single-precision floating-
190 point value corresponding to the abstract input. Ordinarily, the abstract
191 value is simply rounded and packed into the single-precision format, with
192 the inexact exception raised if the abstract input cannot be represented
193 exactly. However, if the abstract value is too large, the overflow and
194 inexact exceptions are raised and an infinity or maximal finite value is
195 returned. If the abstract value is too small, the input value is rounded to
196 a subnormal number, and the underflow and inexact exceptions are raised if
197 the abstract input cannot be represented exactly as a subnormal single-
198 precision floating-point number.
199 The input significand `zSig' has its binary point between bits 30
200 and 29, which is 7 bits to the left of the usual location. This shifted
201 significand must be normalized or smaller. If `zSig' is not normalized,
202 `zExp' must be 0; in that case, the result returned is a subnormal number,
203 and it must not require rounding. In the usual case that `zSig' is
204 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
205 The handling of underflow and overflow follows the IEC/IEEE Standard for
206 Binary Floating-Point Arithmetic.
207 -------------------------------------------------------------------------------
209 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
212 flag roundNearestEven
;
213 int8 roundIncrement
, roundBits
;
216 roundingMode
= float_rounding_mode
;
217 roundNearestEven
= roundingMode
== float_round_nearest_even
;
218 roundIncrement
= 0x40;
219 if ( ! roundNearestEven
) {
220 if ( roundingMode
== float_round_to_zero
) {
224 roundIncrement
= 0x7F;
226 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
229 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
233 roundBits
= zSig
& 0x7F;
234 if ( 0xFD <= (bits16
) zExp
) {
236 || ( ( zExp
== 0xFD )
237 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
239 float_raise( float_flag_overflow
| float_flag_inexact
);
240 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
244 ( float_detect_tininess
== float_tininess_before_rounding
)
246 || ( zSig
+ roundIncrement
< (uint32
)0x80000000 );
247 shift32RightJamming( zSig
, - zExp
, &zSig
);
249 roundBits
= zSig
& 0x7F;
250 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
253 if ( roundBits
) set_float_exception_inexact_flag();
254 zSig
= ( zSig
+ roundIncrement
)>>7;
255 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
256 if ( zSig
== 0 ) zExp
= 0;
257 return packFloat32( zSign
, zExp
, zSig
);
262 -------------------------------------------------------------------------------
263 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
264 and significand `zSig', and returns the proper single-precision floating-
265 point value corresponding to the abstract input. This routine is just like
266 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
267 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
268 floating-point exponent.
269 -------------------------------------------------------------------------------
272 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
276 shiftCount
= countLeadingZeros32( zSig
) - 1;
277 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
282 -------------------------------------------------------------------------------
283 Returns the least-significant 32 fraction bits of the double-precision
284 floating-point value `a'.
285 -------------------------------------------------------------------------------
287 INLINE bits32
extractFloat64Frac1( float64 a
)
290 return (bits32
)(FLOAT64_DEMANGLE(a
) & LIT64(0x00000000FFFFFFFF));
295 -------------------------------------------------------------------------------
296 Returns the most-significant 20 fraction bits of the double-precision
297 floating-point value `a'.
298 -------------------------------------------------------------------------------
300 INLINE bits32
extractFloat64Frac0( float64 a
)
303 return (bits32
)((FLOAT64_DEMANGLE(a
) >> 32) & 0x000FFFFF);
308 -------------------------------------------------------------------------------
309 Returns the exponent bits of the double-precision floating-point value `a'.
310 -------------------------------------------------------------------------------
312 INLINE int16
extractFloat64Exp( float64 a
)
315 return (int16
)((FLOAT64_DEMANGLE(a
) >> 52) & 0x7FF);
320 -------------------------------------------------------------------------------
321 Returns the sign bit of the double-precision floating-point value `a'.
322 -------------------------------------------------------------------------------
324 INLINE flag
extractFloat64Sign( float64 a
)
327 return (flag
)(FLOAT64_DEMANGLE(a
) >> 63);
332 -------------------------------------------------------------------------------
333 Normalizes the subnormal double-precision floating-point value represented
334 by the denormalized significand formed by the concatenation of `aSig0' and
335 `aSig1'. The normalized exponent is stored at the location pointed to by
336 `zExpPtr'. The most significant 21 bits of the normalized significand are
337 stored at the location pointed to by `zSig0Ptr', and the least significant
338 32 bits of the normalized significand are stored at the location pointed to
340 -------------------------------------------------------------------------------
343 normalizeFloat64Subnormal(
354 shiftCount
= countLeadingZeros32( aSig1
) - 11;
355 if ( shiftCount
< 0 ) {
356 *zSig0Ptr
= aSig1
>>( - shiftCount
);
357 *zSig1Ptr
= aSig1
<<( shiftCount
& 31 );
360 *zSig0Ptr
= aSig1
<<shiftCount
;
363 *zExpPtr
= - shiftCount
- 31;
366 shiftCount
= countLeadingZeros32( aSig0
) - 11;
367 shortShift64Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
368 *zExpPtr
= 1 - shiftCount
;
374 -------------------------------------------------------------------------------
375 Packs the sign `zSign', the exponent `zExp', and the significand formed by
376 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
377 point value, returning the result. After being shifted into the proper
378 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
379 together to form the most significant 32 bits of the result. This means
380 that any integer portion of `zSig0' will be added into the exponent. Since
381 a properly normalized significand will have an integer portion equal to 1,
382 the `zExp' input should be 1 less than the desired result exponent whenever
383 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
384 -------------------------------------------------------------------------------
387 packFloat64( flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
)
390 return FLOAT64_MANGLE( ( ( (bits64
) zSign
)<<63 ) +
391 ( ( (bits64
) zExp
)<<52 ) +
392 ( ( (bits64
) zSig0
)<<32 ) + zSig1
);
398 -------------------------------------------------------------------------------
399 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
400 and extended significand formed by the concatenation of `zSig0', `zSig1',
401 and `zSig2', and returns the proper double-precision floating-point value
402 corresponding to the abstract input. Ordinarily, the abstract value is
403 simply rounded and packed into the double-precision format, with the inexact
404 exception raised if the abstract input cannot be represented exactly.
405 However, if the abstract value is too large, the overflow and inexact
406 exceptions are raised and an infinity or maximal finite value is returned.
407 If the abstract value is too small, the input value is rounded to a
408 subnormal number, and the underflow and inexact exceptions are raised if the
409 abstract input cannot be represented exactly as a subnormal double-precision
410 floating-point number.
411 The input significand must be normalized or smaller. If the input
412 significand is not normalized, `zExp' must be 0; in that case, the result
413 returned is a subnormal number, and it must not require rounding. In the
414 usual case that the input significand is normalized, `zExp' must be 1 less
415 than the ``true'' floating-point exponent. The handling of underflow and
416 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
417 -------------------------------------------------------------------------------
421 flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
, bits32 zSig2
)
424 flag roundNearestEven
, increment
, isTiny
;
426 roundingMode
= float_rounding_mode
;
427 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
428 increment
= ( (sbits32
) zSig2
< 0 );
429 if ( ! roundNearestEven
) {
430 if ( roundingMode
== float_round_to_zero
) {
435 increment
= ( roundingMode
== float_round_down
) && zSig2
;
438 increment
= ( roundingMode
== float_round_up
) && zSig2
;
442 if ( 0x7FD <= (bits16
) zExp
) {
443 if ( ( 0x7FD < zExp
)
444 || ( ( zExp
== 0x7FD )
445 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0
, zSig1
)
449 float_raise( float_flag_overflow
| float_flag_inexact
);
450 if ( ( roundingMode
== float_round_to_zero
)
451 || ( zSign
&& ( roundingMode
== float_round_up
) )
452 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
454 return packFloat64( zSign
, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
456 return packFloat64( zSign
, 0x7FF, 0, 0 );
460 ( float_detect_tininess
== float_tininess_before_rounding
)
463 || lt64( zSig0
, zSig1
, 0x001FFFFF, 0xFFFFFFFF );
464 shift64ExtraRightJamming(
465 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
467 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
468 if ( roundNearestEven
) {
469 increment
= ( (sbits32
) zSig2
< 0 );
473 increment
= ( roundingMode
== float_round_down
) && zSig2
;
476 increment
= ( roundingMode
== float_round_up
) && zSig2
;
481 if ( zSig2
) set_float_exception_inexact_flag();
483 add64( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
484 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
487 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
489 return packFloat64( zSign
, zExp
, zSig0
, zSig1
);
494 -------------------------------------------------------------------------------
495 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
496 and significand formed by the concatenation of `zSig0' and `zSig1', and
497 returns the proper double-precision floating-point value corresponding
498 to the abstract input. This routine is just like `roundAndPackFloat64'
499 except that the input significand has fewer bits and does not have to be
500 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
502 -------------------------------------------------------------------------------
505 normalizeRoundAndPackFloat64(
506 flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
)
516 shiftCount
= countLeadingZeros32( zSig0
) - 11;
517 if ( 0 <= shiftCount
) {
519 shortShift64Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
522 shift64ExtraRightJamming(
523 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
526 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
531 -------------------------------------------------------------------------------
532 Returns the result of converting the 32-bit two's complement integer `a' to
533 the single-precision floating-point format. The conversion is performed
534 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
535 -------------------------------------------------------------------------------
537 float32
int32_to_float32( int32 a
)
541 if ( a
== 0 ) return 0;
542 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
544 return normalizeRoundAndPackFloat32(zSign
, 0x9C, (uint32
)(zSign
? - a
: a
));
549 -------------------------------------------------------------------------------
550 Returns the result of converting the 32-bit two's complement integer `a' to
551 the double-precision floating-point format. The conversion is performed
552 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
553 -------------------------------------------------------------------------------
555 float64
int32_to_float64( int32 a
)
562 if ( a
== 0 ) return packFloat64( 0, 0, 0, 0 );
564 absA
= zSign
? - a
: a
;
565 shiftCount
= countLeadingZeros32( absA
) - 11;
566 if ( 0 <= shiftCount
) {
567 zSig0
= absA
<<shiftCount
;
571 shift64Right( absA
, 0, - shiftCount
, &zSig0
, &zSig1
);
573 return packFloat64( zSign
, 0x412 - shiftCount
, zSig0
, zSig1
);
577 #ifndef SOFTFLOAT_FOR_GCC
579 -------------------------------------------------------------------------------
580 Returns the result of converting the single-precision floating-point value
581 `a' to the 32-bit two's complement integer format. The conversion is
582 performed according to the IEC/IEEE Standard for Binary Floating-Point
583 Arithmetic---which means in particular that the conversion is rounded
584 according to the current rounding mode. If `a' is a NaN, the largest
585 positive integer is returned. Otherwise, if the conversion overflows, the
586 largest integer with the same sign as `a' is returned.
587 -------------------------------------------------------------------------------
589 int32
float32_to_int32( float32 a
)
592 int16 aExp
, shiftCount
;
593 bits32 aSig
, aSigExtra
;
597 aSig
= extractFloat32Frac( a
);
598 aExp
= extractFloat32Exp( a
);
599 aSign
= extractFloat32Sign( a
);
600 shiftCount
= aExp
- 0x96;
601 if ( 0 <= shiftCount
) {
602 if ( 0x9E <= aExp
) {
603 if ( a
!= 0xCF000000 ) {
604 float_raise( float_flag_invalid
);
605 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
609 return (sbits32
) 0x80000000;
611 z
= ( aSig
| 0x00800000 )<<shiftCount
;
612 if ( aSign
) z
= - z
;
616 aSigExtra
= aExp
| aSig
;
621 aSigExtra
= aSig
<<( shiftCount
& 31 );
622 z
= aSig
>>( - shiftCount
);
624 if ( aSigExtra
) set_float_exception_inexact_flag();
625 roundingMode
= float_rounding_mode
;
626 if ( roundingMode
== float_round_nearest_even
) {
627 if ( (sbits32
) aSigExtra
< 0 ) {
629 if ( (bits32
) ( aSigExtra
<<1 ) == 0 ) z
&= ~1;
631 if ( aSign
) z
= - z
;
634 aSigExtra
= ( aSigExtra
!= 0 );
636 z
+= ( roundingMode
== float_round_down
) & aSigExtra
;
640 z
+= ( roundingMode
== float_round_up
) & aSigExtra
;
650 -------------------------------------------------------------------------------
651 Returns the result of converting the single-precision floating-point value
652 `a' to the 32-bit two's complement integer format. The conversion is
653 performed according to the IEC/IEEE Standard for Binary Floating-Point
654 Arithmetic, except that the conversion is always rounded toward zero.
655 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
656 the conversion overflows, the largest integer with the same sign as `a' is
658 -------------------------------------------------------------------------------
660 int32
float32_to_int32_round_to_zero( float32 a
)
663 int16 aExp
, shiftCount
;
667 aSig
= extractFloat32Frac( a
);
668 aExp
= extractFloat32Exp( a
);
669 aSign
= extractFloat32Sign( a
);
670 shiftCount
= aExp
- 0x9E;
671 if ( 0 <= shiftCount
) {
672 if ( a
!= 0xCF000000 ) {
673 float_raise( float_flag_invalid
);
674 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
676 return (sbits32
) 0x80000000;
678 else if ( aExp
<= 0x7E ) {
679 if ( aExp
| aSig
) set_float_exception_inexact_flag();
682 aSig
= ( aSig
| 0x00800000 )<<8;
683 z
= aSig
>>( - shiftCount
);
684 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
685 set_float_exception_inexact_flag();
687 if ( aSign
) z
= - z
;
693 -------------------------------------------------------------------------------
694 Returns the result of converting the single-precision floating-point value
695 `a' to the double-precision floating-point format. The conversion is
696 performed according to the IEC/IEEE Standard for Binary Floating-Point
698 -------------------------------------------------------------------------------
700 float64
float32_to_float64( float32 a
)
704 bits32 aSig
, zSig0
, zSig1
;
706 aSig
= extractFloat32Frac( a
);
707 aExp
= extractFloat32Exp( a
);
708 aSign
= extractFloat32Sign( a
);
709 if ( aExp
== 0xFF ) {
710 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
711 return packFloat64( aSign
, 0x7FF, 0, 0 );
714 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0, 0 );
715 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
718 shift64Right( aSig
, 0, 3, &zSig0
, &zSig1
);
719 return packFloat64( aSign
, aExp
+ 0x380, zSig0
, zSig1
);
723 #ifndef SOFTFLOAT_FOR_GCC
725 -------------------------------------------------------------------------------
726 Rounds the single-precision floating-point value `a' to an integer,
727 and returns the result as a single-precision floating-point value. The
728 operation is performed according to the IEC/IEEE Standard for Binary
729 Floating-Point Arithmetic.
730 -------------------------------------------------------------------------------
732 float32
float32_round_to_int( float32 a
)
736 bits32 lastBitMask
, roundBitsMask
;
740 aExp
= extractFloat32Exp( a
);
741 if ( 0x96 <= aExp
) {
742 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
743 return propagateFloat32NaN( a
, a
);
747 if ( aExp
<= 0x7E ) {
748 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
749 set_float_exception_inexact_flag();
750 aSign
= extractFloat32Sign( a
);
751 switch ( float_rounding_mode
) {
752 case float_round_nearest_even
:
753 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
754 return packFloat32( aSign
, 0x7F, 0 );
757 case float_round_to_zero
:
759 case float_round_down
:
760 return aSign
? 0xBF800000 : 0;
762 return aSign
? 0x80000000 : 0x3F800000;
764 return packFloat32( aSign
, 0, 0 );
767 lastBitMask
<<= 0x96 - aExp
;
768 roundBitsMask
= lastBitMask
- 1;
770 roundingMode
= float_rounding_mode
;
771 if ( roundingMode
== float_round_nearest_even
) {
773 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
775 else if ( roundingMode
!= float_round_to_zero
) {
776 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
780 z
&= ~ roundBitsMask
;
781 if ( z
!= a
) set_float_exception_inexact_flag();
788 -------------------------------------------------------------------------------
789 Returns the result of adding the absolute values of the single-precision
790 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
791 before being returned. `zSign' is ignored if the result is a NaN.
792 The addition is performed according to the IEC/IEEE Standard for Binary
793 Floating-Point Arithmetic.
794 -------------------------------------------------------------------------------
796 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
798 int16 aExp
, bExp
, zExp
;
799 bits32 aSig
, bSig
, zSig
;
802 aSig
= extractFloat32Frac( a
);
803 aExp
= extractFloat32Exp( a
);
804 bSig
= extractFloat32Frac( b
);
805 bExp
= extractFloat32Exp( b
);
806 expDiff
= aExp
- bExp
;
810 if ( aExp
== 0xFF ) {
811 if ( aSig
) return propagateFloat32NaN( a
, b
);
820 shift32RightJamming( bSig
, expDiff
, &bSig
);
823 else if ( expDiff
< 0 ) {
824 if ( bExp
== 0xFF ) {
825 if ( bSig
) return propagateFloat32NaN( a
, b
);
826 return packFloat32( zSign
, 0xFF, 0 );
834 shift32RightJamming( aSig
, - expDiff
, &aSig
);
838 if ( aExp
== 0xFF ) {
839 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
842 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
843 zSig
= 0x40000000 + aSig
+ bSig
;
848 zSig
= ( aSig
+ bSig
)<<1;
850 if ( (sbits32
) zSig
< 0 ) {
855 return roundAndPackFloat32( zSign
, zExp
, zSig
);
860 -------------------------------------------------------------------------------
861 Returns the result of subtracting the absolute values of the single-
862 precision floating-point values `a' and `b'. If `zSign' is 1, the
863 difference is negated before being returned. `zSign' is ignored if the
864 result is a NaN. The subtraction is performed according to the IEC/IEEE
865 Standard for Binary Floating-Point Arithmetic.
866 -------------------------------------------------------------------------------
868 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
870 int16 aExp
, bExp
, zExp
;
871 bits32 aSig
, bSig
, zSig
;
874 aSig
= extractFloat32Frac( a
);
875 aExp
= extractFloat32Exp( a
);
876 bSig
= extractFloat32Frac( b
);
877 bExp
= extractFloat32Exp( b
);
878 expDiff
= aExp
- bExp
;
881 if ( 0 < expDiff
) goto aExpBigger
;
882 if ( expDiff
< 0 ) goto bExpBigger
;
883 if ( aExp
== 0xFF ) {
884 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
885 float_raise( float_flag_invalid
);
886 return float32_default_nan
;
892 if ( bSig
< aSig
) goto aBigger
;
893 if ( aSig
< bSig
) goto bBigger
;
894 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
896 if ( bExp
== 0xFF ) {
897 if ( bSig
) return propagateFloat32NaN( a
, b
);
898 return packFloat32( zSign
^ 1, 0xFF, 0 );
906 shift32RightJamming( aSig
, - expDiff
, &aSig
);
912 goto normalizeRoundAndPack
;
914 if ( aExp
== 0xFF ) {
915 if ( aSig
) return propagateFloat32NaN( a
, b
);
924 shift32RightJamming( bSig
, expDiff
, &bSig
);
929 normalizeRoundAndPack
:
931 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
936 -------------------------------------------------------------------------------
937 Returns the result of adding the single-precision floating-point values `a'
938 and `b'. The operation is performed according to the IEC/IEEE Standard for
939 Binary Floating-Point Arithmetic.
940 -------------------------------------------------------------------------------
942 float32
float32_add( float32 a
, float32 b
)
946 aSign
= extractFloat32Sign( a
);
947 bSign
= extractFloat32Sign( b
);
948 if ( aSign
== bSign
) {
949 return addFloat32Sigs( a
, b
, aSign
);
952 return subFloat32Sigs( a
, b
, aSign
);
958 -------------------------------------------------------------------------------
959 Returns the result of subtracting the single-precision floating-point values
960 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
961 for Binary Floating-Point Arithmetic.
962 -------------------------------------------------------------------------------
964 float32
float32_sub( float32 a
, float32 b
)
968 aSign
= extractFloat32Sign( a
);
969 bSign
= extractFloat32Sign( b
);
970 if ( aSign
== bSign
) {
971 return subFloat32Sigs( a
, b
, aSign
);
974 return addFloat32Sigs( a
, b
, aSign
);
980 -------------------------------------------------------------------------------
981 Returns the result of multiplying the single-precision floating-point values
982 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
983 for Binary Floating-Point Arithmetic.
984 -------------------------------------------------------------------------------
986 float32
float32_mul( float32 a
, float32 b
)
988 flag aSign
, bSign
, zSign
;
989 int16 aExp
, bExp
, zExp
;
990 bits32 aSig
, bSig
, zSig0
, zSig1
;
992 aSig
= extractFloat32Frac( a
);
993 aExp
= extractFloat32Exp( a
);
994 aSign
= extractFloat32Sign( a
);
995 bSig
= extractFloat32Frac( b
);
996 bExp
= extractFloat32Exp( b
);
997 bSign
= extractFloat32Sign( b
);
998 zSign
= aSign
^ bSign
;
999 if ( aExp
== 0xFF ) {
1000 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1001 return propagateFloat32NaN( a
, b
);
1003 if ( ( bExp
| bSig
) == 0 ) {
1004 float_raise( float_flag_invalid
);
1005 return float32_default_nan
;
1007 return packFloat32( zSign
, 0xFF, 0 );
1009 if ( bExp
== 0xFF ) {
1010 if ( bSig
) return propagateFloat32NaN( a
, b
);
1011 if ( ( aExp
| aSig
) == 0 ) {
1012 float_raise( float_flag_invalid
);
1013 return float32_default_nan
;
1015 return packFloat32( zSign
, 0xFF, 0 );
1018 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1019 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1022 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1023 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1025 zExp
= aExp
+ bExp
- 0x7F;
1026 aSig
= ( aSig
| 0x00800000 )<<7;
1027 bSig
= ( bSig
| 0x00800000 )<<8;
1028 mul32To64( aSig
, bSig
, &zSig0
, &zSig1
);
1029 zSig0
|= ( zSig1
!= 0 );
1030 if ( 0 <= (sbits32
) ( zSig0
<<1 ) ) {
1034 return roundAndPackFloat32( zSign
, zExp
, zSig0
);
1039 -------------------------------------------------------------------------------
1040 Returns the result of dividing the single-precision floating-point value `a'
1041 by the corresponding value `b'. The operation is performed according to the
1042 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1043 -------------------------------------------------------------------------------
1045 float32
float32_div( float32 a
, float32 b
)
1047 flag aSign
, bSign
, zSign
;
1048 int16 aExp
, bExp
, zExp
;
1049 bits32 aSig
, bSig
, zSig
, rem0
, rem1
, term0
, term1
;
1051 aSig
= extractFloat32Frac( a
);
1052 aExp
= extractFloat32Exp( a
);
1053 aSign
= extractFloat32Sign( a
);
1054 bSig
= extractFloat32Frac( b
);
1055 bExp
= extractFloat32Exp( b
);
1056 bSign
= extractFloat32Sign( b
);
1057 zSign
= aSign
^ bSign
;
1058 if ( aExp
== 0xFF ) {
1059 if ( aSig
) return propagateFloat32NaN( a
, b
);
1060 if ( bExp
== 0xFF ) {
1061 if ( bSig
) return propagateFloat32NaN( a
, b
);
1062 float_raise( float_flag_invalid
);
1063 return float32_default_nan
;
1065 return packFloat32( zSign
, 0xFF, 0 );
1067 if ( bExp
== 0xFF ) {
1068 if ( bSig
) return propagateFloat32NaN( a
, b
);
1069 return packFloat32( zSign
, 0, 0 );
1073 if ( ( aExp
| aSig
) == 0 ) {
1074 float_raise( float_flag_invalid
);
1075 return float32_default_nan
;
1077 float_raise( float_flag_divbyzero
);
1078 return packFloat32( zSign
, 0xFF, 0 );
1080 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1083 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1084 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1086 zExp
= aExp
- bExp
+ 0x7D;
1087 aSig
= ( aSig
| 0x00800000 )<<7;
1088 bSig
= ( bSig
| 0x00800000 )<<8;
1089 if ( bSig
<= ( aSig
+ aSig
) ) {
1093 zSig
= estimateDiv64To32( aSig
, 0, bSig
);
1094 if ( ( zSig
& 0x3F ) <= 2 ) {
1095 mul32To64( bSig
, zSig
, &term0
, &term1
);
1096 sub64( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
1097 while ( (sbits32
) rem0
< 0 ) {
1099 add64( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
1101 zSig
|= ( rem1
!= 0 );
1103 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1107 #ifndef SOFTFLOAT_FOR_GCC
1109 -------------------------------------------------------------------------------
1110 Returns the remainder of the single-precision floating-point value `a'
1111 with respect to the corresponding value `b'. The operation is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1115 float32
float32_rem( float32 a
, float32 b
)
1117 flag aSign
, bSign
, zSign
;
1118 int16 aExp
, bExp
, expDiff
;
1119 bits32 aSig
, bSig
, q
, allZero
, alternateASig
;
1122 aSig
= extractFloat32Frac( a
);
1123 aExp
= extractFloat32Exp( a
);
1124 aSign
= extractFloat32Sign( a
);
1125 bSig
= extractFloat32Frac( b
);
1126 bExp
= extractFloat32Exp( b
);
1127 bSign
= extractFloat32Sign( b
);
1128 if ( aExp
== 0xFF ) {
1129 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1130 return propagateFloat32NaN( a
, b
);
1132 float_raise( float_flag_invalid
);
1133 return float32_default_nan
;
1135 if ( bExp
== 0xFF ) {
1136 if ( bSig
) return propagateFloat32NaN( a
, b
);
1141 float_raise( float_flag_invalid
);
1142 return float32_default_nan
;
1144 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1147 if ( aSig
== 0 ) return a
;
1148 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1150 expDiff
= aExp
- bExp
;
1151 aSig
= ( aSig
| 0x00800000 )<<8;
1152 bSig
= ( bSig
| 0x00800000 )<<8;
1153 if ( expDiff
< 0 ) {
1154 if ( expDiff
< -1 ) return a
;
1157 q
= ( bSig
<= aSig
);
1158 if ( q
) aSig
-= bSig
;
1160 while ( 0 < expDiff
) {
1161 q
= estimateDiv64To32( aSig
, 0, bSig
);
1162 q
= ( 2 < q
) ? q
- 2 : 0;
1163 aSig
= - ( ( bSig
>>2 ) * q
);
1167 if ( 0 < expDiff
) {
1168 q
= estimateDiv64To32( aSig
, 0, bSig
);
1169 q
= ( 2 < q
) ? q
- 2 : 0;
1172 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1179 alternateASig
= aSig
;
1182 } while ( 0 <= (sbits32
) aSig
);
1183 sigMean
= aSig
+ alternateASig
;
1184 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1185 aSig
= alternateASig
;
1187 zSign
= ( (sbits32
) aSig
< 0 );
1188 if ( zSign
) aSig
= - aSig
;
1189 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
1194 #ifndef SOFTFLOAT_FOR_GCC
1196 -------------------------------------------------------------------------------
1197 Returns the square root of the single-precision floating-point value `a'.
1198 The operation is performed according to the IEC/IEEE Standard for Binary
1199 Floating-Point Arithmetic.
1200 -------------------------------------------------------------------------------
1202 float32
float32_sqrt( float32 a
)
1206 bits32 aSig
, zSig
, rem0
, rem1
, term0
, term1
;
1208 aSig
= extractFloat32Frac( a
);
1209 aExp
= extractFloat32Exp( a
);
1210 aSign
= extractFloat32Sign( a
);
1211 if ( aExp
== 0xFF ) {
1212 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1213 if ( ! aSign
) return a
;
1214 float_raise( float_flag_invalid
);
1215 return float32_default_nan
;
1218 if ( ( aExp
| aSig
) == 0 ) return a
;
1219 float_raise( float_flag_invalid
);
1220 return float32_default_nan
;
1223 if ( aSig
== 0 ) return 0;
1224 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1226 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1227 aSig
= ( aSig
| 0x00800000 )<<8;
1228 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1229 if ( ( zSig
& 0x7F ) <= 5 ) {
1236 mul32To64( zSig
, zSig
, &term0
, &term1
);
1237 sub64( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
1238 while ( (sbits32
) rem0
< 0 ) {
1240 shortShift64Left( 0, zSig
, 1, &term0
, &term1
);
1242 add64( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
1244 zSig
|= ( ( rem0
| rem1
) != 0 );
1247 shift32RightJamming( zSig
, 1, &zSig
);
1249 return roundAndPackFloat32( 0, zExp
, zSig
);
1255 -------------------------------------------------------------------------------
1256 Returns 1 if the single-precision floating-point value `a' is equal to
1257 the corresponding value `b', and 0 otherwise. The comparison is performed
1258 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 -------------------------------------------------------------------------------
1261 flag
float32_eq( float32 a
, float32 b
)
1264 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1265 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1267 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1268 float_raise( float_flag_invalid
);
1272 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1277 -------------------------------------------------------------------------------
1278 Returns 1 if the single-precision floating-point value `a' is less than
1279 or equal to the corresponding value `b', and 0 otherwise. The comparison
1280 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1282 -------------------------------------------------------------------------------
1284 flag
float32_le( float32 a
, float32 b
)
1288 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1289 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1291 float_raise( float_flag_invalid
);
1294 aSign
= extractFloat32Sign( a
);
1295 bSign
= extractFloat32Sign( b
);
1296 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1297 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1302 -------------------------------------------------------------------------------
1303 Returns 1 if the single-precision floating-point value `a' is less than
1304 the corresponding value `b', and 0 otherwise. The comparison is performed
1305 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306 -------------------------------------------------------------------------------
1308 flag
float32_lt( float32 a
, float32 b
)
1312 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1313 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1315 float_raise( float_flag_invalid
);
1318 aSign
= extractFloat32Sign( a
);
1319 bSign
= extractFloat32Sign( b
);
1320 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1321 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1327 -------------------------------------------------------------------------------
1328 Returns 1 if the single-precision floating-point value `a' is equal to
1329 the corresponding value `b', and 0 otherwise. The invalid exception is
1330 raised if either operand is a NaN. Otherwise, the comparison is performed
1331 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 -------------------------------------------------------------------------------
1334 flag
float32_eq_signaling( float32 a
, float32 b
)
1337 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1338 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1340 float_raise( float_flag_invalid
);
1343 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1348 -------------------------------------------------------------------------------
1349 Returns 1 if the single-precision floating-point value `a' is less than or
1350 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1351 cause an exception. Otherwise, the comparison is performed according to the
1352 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1353 -------------------------------------------------------------------------------
1355 flag
float32_le_quiet( float32 a
, float32 b
)
1360 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1361 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1363 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1364 float_raise( float_flag_invalid
);
1368 aSign
= extractFloat32Sign( a
);
1369 bSign
= extractFloat32Sign( b
);
1370 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1371 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1376 -------------------------------------------------------------------------------
1377 Returns 1 if the single-precision floating-point value `a' is less than
1378 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1379 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1380 Standard for Binary Floating-Point Arithmetic.
1381 -------------------------------------------------------------------------------
1383 flag
float32_lt_quiet( float32 a
, float32 b
)
1387 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1388 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1390 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1391 float_raise( float_flag_invalid
);
1395 aSign
= extractFloat32Sign( a
);
1396 bSign
= extractFloat32Sign( b
);
1397 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1398 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1401 #endif /* !SOFTFLOAT_FOR_GCC */
1403 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the double-precision floating-point value
1407 `a' to the 32-bit two's complement integer format. The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic---which means in particular that the conversion is rounded
1410 according to the current rounding mode. If `a' is a NaN, the largest
1411 positive integer is returned. Otherwise, if the conversion overflows, the
1412 largest integer with the same sign as `a' is returned.
1413 -------------------------------------------------------------------------------
1415 int32
float64_to_int32( float64 a
)
1418 int16 aExp
, shiftCount
;
1419 bits32 aSig0
, aSig1
, absZ
, aSigExtra
;
1423 aSig1
= extractFloat64Frac1( a
);
1424 aSig0
= extractFloat64Frac0( a
);
1425 aExp
= extractFloat64Exp( a
);
1426 aSign
= extractFloat64Sign( a
);
1427 shiftCount
= aExp
- 0x413;
1428 if ( 0 <= shiftCount
) {
1429 if ( 0x41E < aExp
) {
1430 if ( ( aExp
== 0x7FF ) && ( aSig0
| aSig1
) ) aSign
= 0;
1434 aSig0
| 0x00100000, aSig1
, shiftCount
, &absZ
, &aSigExtra
);
1435 if ( 0x80000000 < absZ
) goto invalid
;
1438 aSig1
= ( aSig1
!= 0 );
1439 if ( aExp
< 0x3FE ) {
1440 aSigExtra
= aExp
| aSig0
| aSig1
;
1444 aSig0
|= 0x00100000;
1445 aSigExtra
= ( aSig0
<<( shiftCount
& 31 ) ) | aSig1
;
1446 absZ
= aSig0
>>( - shiftCount
);
1449 roundingMode
= float_rounding_mode
;
1450 if ( roundingMode
== float_round_nearest_even
) {
1451 if ( (sbits32
) aSigExtra
< 0 ) {
1453 if ( (bits32
) ( aSigExtra
<<1 ) == 0 ) absZ
&= ~1;
1455 z
= aSign
? - absZ
: absZ
;
1458 aSigExtra
= ( aSigExtra
!= 0 );
1461 + ( ( roundingMode
== float_round_down
) & aSigExtra
) );
1464 z
= absZ
+ ( ( roundingMode
== float_round_up
) & aSigExtra
);
1467 if ( ( aSign
^ ( z
< 0 ) ) && z
) {
1469 float_raise( float_flag_invalid
);
1470 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
1472 if ( aSigExtra
) set_float_exception_inexact_flag();
1476 #endif /* !SOFTFLOAT_FOR_GCC */
1479 -------------------------------------------------------------------------------
1480 Returns the result of converting the double-precision floating-point value
1481 `a' to the 32-bit two's complement integer format. The conversion is
1482 performed according to the IEC/IEEE Standard for Binary Floating-Point
1483 Arithmetic, except that the conversion is always rounded toward zero.
1484 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1485 the conversion overflows, the largest integer with the same sign as `a' is
1487 -------------------------------------------------------------------------------
1489 int32
float64_to_int32_round_to_zero( float64 a
)
1492 int16 aExp
, shiftCount
;
1493 bits32 aSig0
, aSig1
, absZ
, aSigExtra
;
1496 aSig1
= extractFloat64Frac1( a
);
1497 aSig0
= extractFloat64Frac0( a
);
1498 aExp
= extractFloat64Exp( a
);
1499 aSign
= extractFloat64Sign( a
);
1500 shiftCount
= aExp
- 0x413;
1501 if ( 0 <= shiftCount
) {
1502 if ( 0x41E < aExp
) {
1503 if ( ( aExp
== 0x7FF ) && ( aSig0
| aSig1
) ) aSign
= 0;
1507 aSig0
| 0x00100000, aSig1
, shiftCount
, &absZ
, &aSigExtra
);
1510 if ( aExp
< 0x3FF ) {
1511 if ( aExp
| aSig0
| aSig1
) {
1512 set_float_exception_inexact_flag();
1516 aSig0
|= 0x00100000;
1517 aSigExtra
= ( aSig0
<<( shiftCount
& 31 ) ) | aSig1
;
1518 absZ
= aSig0
>>( - shiftCount
);
1520 z
= aSign
? - absZ
: absZ
;
1521 if ( ( aSign
^ ( z
< 0 ) ) && z
) {
1523 float_raise( float_flag_invalid
);
1524 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
1526 if ( aSigExtra
) set_float_exception_inexact_flag();
1532 -------------------------------------------------------------------------------
1533 Returns the result of converting the double-precision floating-point value
1534 `a' to the single-precision floating-point format. The conversion is
1535 performed according to the IEC/IEEE Standard for Binary Floating-Point
1537 -------------------------------------------------------------------------------
1539 float32
float64_to_float32( float64 a
)
1543 bits32 aSig0
, aSig1
, zSig
;
1546 aSig1
= extractFloat64Frac1( a
);
1547 aSig0
= extractFloat64Frac0( a
);
1548 aExp
= extractFloat64Exp( a
);
1549 aSign
= extractFloat64Sign( a
);
1550 if ( aExp
== 0x7FF ) {
1551 if ( aSig0
| aSig1
) {
1552 return commonNaNToFloat32( float64ToCommonNaN( a
) );
1554 return packFloat32( aSign
, 0xFF, 0 );
1556 shift64RightJamming( aSig0
, aSig1
, 22, &allZero
, &zSig
);
1557 if ( aExp
) zSig
|= 0x40000000;
1558 return roundAndPackFloat32( aSign
, aExp
- 0x381, zSig
);
1562 #ifndef SOFTFLOAT_FOR_GCC
1564 -------------------------------------------------------------------------------
1565 Rounds the double-precision floating-point value `a' to an integer,
1566 and returns the result as a double-precision floating-point value. The
1567 operation is performed according to the IEC/IEEE Standard for Binary
1568 Floating-Point Arithmetic.
1569 -------------------------------------------------------------------------------
1571 float64
float64_round_to_int( float64 a
)
1575 bits32 lastBitMask
, roundBitsMask
;
1579 aExp
= extractFloat64Exp( a
);
1580 if ( 0x413 <= aExp
) {
1581 if ( 0x433 <= aExp
) {
1582 if ( ( aExp
== 0x7FF )
1583 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) ) {
1584 return propagateFloat64NaN( a
, a
);
1589 lastBitMask
= ( lastBitMask
<<( 0x432 - aExp
) )<<1;
1590 roundBitsMask
= lastBitMask
- 1;
1592 roundingMode
= float_rounding_mode
;
1593 if ( roundingMode
== float_round_nearest_even
) {
1594 if ( lastBitMask
) {
1595 add64( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
1596 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
1599 if ( (sbits32
) z
.low
< 0 ) {
1601 if ( (bits32
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
1605 else if ( roundingMode
!= float_round_to_zero
) {
1606 if ( extractFloat64Sign( z
)
1607 ^ ( roundingMode
== float_round_up
) ) {
1608 add64( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
1611 z
.low
&= ~ roundBitsMask
;
1614 if ( aExp
<= 0x3FE ) {
1615 if ( ( ( (bits32
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
1616 set_float_exception_inexact_flag();
1617 aSign
= extractFloat64Sign( a
);
1618 switch ( float_rounding_mode
) {
1619 case float_round_nearest_even
:
1620 if ( ( aExp
== 0x3FE )
1621 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) )
1623 return packFloat64( aSign
, 0x3FF, 0, 0 );
1626 case float_round_down
:
1628 aSign
? packFloat64( 1, 0x3FF, 0, 0 )
1629 : packFloat64( 0, 0, 0, 0 );
1630 case float_round_up
:
1632 aSign
? packFloat64( 1, 0, 0, 0 )
1633 : packFloat64( 0, 0x3FF, 0, 0 );
1635 return packFloat64( aSign
, 0, 0, 0 );
1638 lastBitMask
<<= 0x413 - aExp
;
1639 roundBitsMask
= lastBitMask
- 1;
1642 roundingMode
= float_rounding_mode
;
1643 if ( roundingMode
== float_round_nearest_even
) {
1644 z
.high
+= lastBitMask
>>1;
1645 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
1646 z
.high
&= ~ lastBitMask
;
1649 else if ( roundingMode
!= float_round_to_zero
) {
1650 if ( extractFloat64Sign( z
)
1651 ^ ( roundingMode
== float_round_up
) ) {
1652 z
.high
|= ( a
.low
!= 0 );
1653 z
.high
+= roundBitsMask
;
1656 z
.high
&= ~ roundBitsMask
;
1658 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
1659 set_float_exception_inexact_flag();
1667 -------------------------------------------------------------------------------
1668 Returns the result of adding the absolute values of the double-precision
1669 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1670 before being returned. `zSign' is ignored if the result is a NaN.
1671 The addition is performed according to the IEC/IEEE Standard for Binary
1672 Floating-Point Arithmetic.
1673 -------------------------------------------------------------------------------
1675 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1677 int16 aExp
, bExp
, zExp
;
1678 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
1681 aSig1
= extractFloat64Frac1( a
);
1682 aSig0
= extractFloat64Frac0( a
);
1683 aExp
= extractFloat64Exp( a
);
1684 bSig1
= extractFloat64Frac1( b
);
1685 bSig0
= extractFloat64Frac0( b
);
1686 bExp
= extractFloat64Exp( b
);
1687 expDiff
= aExp
- bExp
;
1688 if ( 0 < expDiff
) {
1689 if ( aExp
== 0x7FF ) {
1690 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1697 bSig0
|= 0x00100000;
1699 shift64ExtraRightJamming(
1700 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
1703 else if ( expDiff
< 0 ) {
1704 if ( bExp
== 0x7FF ) {
1705 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1706 return packFloat64( zSign
, 0x7FF, 0, 0 );
1712 aSig0
|= 0x00100000;
1714 shift64ExtraRightJamming(
1715 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
1719 if ( aExp
== 0x7FF ) {
1720 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
1721 return propagateFloat64NaN( a
, b
);
1725 add64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1726 if ( aExp
== 0 ) return packFloat64( zSign
, 0, zSig0
, zSig1
);
1728 zSig0
|= 0x00200000;
1732 aSig0
|= 0x00100000;
1733 add64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1735 if ( zSig0
< 0x00200000 ) goto roundAndPack
;
1738 shift64ExtraRightJamming( zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
1740 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1745 -------------------------------------------------------------------------------
1746 Returns the result of subtracting the absolute values of the double-
1747 precision floating-point values `a' and `b'. If `zSign' is 1, the
1748 difference is negated before being returned. `zSign' is ignored if the
1749 result is a NaN. The subtraction is performed according to the IEC/IEEE
1750 Standard for Binary Floating-Point Arithmetic.
1751 -------------------------------------------------------------------------------
1753 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1755 int16 aExp
, bExp
, zExp
;
1756 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
1759 aSig1
= extractFloat64Frac1( a
);
1760 aSig0
= extractFloat64Frac0( a
);
1761 aExp
= extractFloat64Exp( a
);
1762 bSig1
= extractFloat64Frac1( b
);
1763 bSig0
= extractFloat64Frac0( b
);
1764 bExp
= extractFloat64Exp( b
);
1765 expDiff
= aExp
- bExp
;
1766 shortShift64Left( aSig0
, aSig1
, 10, &aSig0
, &aSig1
);
1767 shortShift64Left( bSig0
, bSig1
, 10, &bSig0
, &bSig1
);
1768 if ( 0 < expDiff
) goto aExpBigger
;
1769 if ( expDiff
< 0 ) goto bExpBigger
;
1770 if ( aExp
== 0x7FF ) {
1771 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
1772 return propagateFloat64NaN( a
, b
);
1774 float_raise( float_flag_invalid
);
1775 return float64_default_nan
;
1781 if ( bSig0
< aSig0
) goto aBigger
;
1782 if ( aSig0
< bSig0
) goto bBigger
;
1783 if ( bSig1
< aSig1
) goto aBigger
;
1784 if ( aSig1
< bSig1
) goto bBigger
;
1785 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0, 0 );
1787 if ( bExp
== 0x7FF ) {
1788 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1789 return packFloat64( zSign
^ 1, 0x7FF, 0, 0 );
1795 aSig0
|= 0x40000000;
1797 shift64RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
1798 bSig0
|= 0x40000000;
1800 sub64( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
1803 goto normalizeRoundAndPack
;
1805 if ( aExp
== 0x7FF ) {
1806 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1813 bSig0
|= 0x40000000;
1815 shift64RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
1816 aSig0
|= 0x40000000;
1818 sub64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1820 normalizeRoundAndPack
:
1822 return normalizeRoundAndPackFloat64( zSign
, zExp
- 10, zSig0
, zSig1
);
1827 -------------------------------------------------------------------------------
1828 Returns the result of adding the double-precision floating-point values `a'
1829 and `b'. The operation is performed according to the IEC/IEEE Standard for
1830 Binary Floating-Point Arithmetic.
1831 -------------------------------------------------------------------------------
1833 float64
float64_add( float64 a
, float64 b
)
1837 aSign
= extractFloat64Sign( a
);
1838 bSign
= extractFloat64Sign( b
);
1839 if ( aSign
== bSign
) {
1840 return addFloat64Sigs( a
, b
, aSign
);
1843 return subFloat64Sigs( a
, b
, aSign
);
1849 -------------------------------------------------------------------------------
1850 Returns the result of subtracting the double-precision floating-point values
1851 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1852 for Binary Floating-Point Arithmetic.
1853 -------------------------------------------------------------------------------
1855 float64
float64_sub( float64 a
, float64 b
)
1859 aSign
= extractFloat64Sign( a
);
1860 bSign
= extractFloat64Sign( b
);
1861 if ( aSign
== bSign
) {
1862 return subFloat64Sigs( a
, b
, aSign
);
1865 return addFloat64Sigs( a
, b
, aSign
);
1871 -------------------------------------------------------------------------------
1872 Returns the result of multiplying the double-precision floating-point values
1873 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1874 for Binary Floating-Point Arithmetic.
1875 -------------------------------------------------------------------------------
1877 float64
float64_mul( float64 a
, float64 b
)
1879 flag aSign
, bSign
, zSign
;
1880 int16 aExp
, bExp
, zExp
;
1881 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
1883 aSig1
= extractFloat64Frac1( a
);
1884 aSig0
= extractFloat64Frac0( a
);
1885 aExp
= extractFloat64Exp( a
);
1886 aSign
= extractFloat64Sign( a
);
1887 bSig1
= extractFloat64Frac1( b
);
1888 bSig0
= extractFloat64Frac0( b
);
1889 bExp
= extractFloat64Exp( b
);
1890 bSign
= extractFloat64Sign( b
);
1891 zSign
= aSign
^ bSign
;
1892 if ( aExp
== 0x7FF ) {
1893 if ( ( aSig0
| aSig1
)
1894 || ( ( bExp
== 0x7FF ) && ( bSig0
| bSig1
) ) ) {
1895 return propagateFloat64NaN( a
, b
);
1897 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
1898 return packFloat64( zSign
, 0x7FF, 0, 0 );
1900 if ( bExp
== 0x7FF ) {
1901 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1902 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
1904 float_raise( float_flag_invalid
);
1905 return float64_default_nan
;
1907 return packFloat64( zSign
, 0x7FF, 0, 0 );
1910 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1911 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
1914 if ( ( bSig0
| bSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1915 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
1917 zExp
= aExp
+ bExp
- 0x400;
1918 aSig0
|= 0x00100000;
1919 shortShift64Left( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
1920 mul64To128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
1921 add64( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
1922 zSig2
|= ( zSig3
!= 0 );
1923 if ( 0x00200000 <= zSig0
) {
1924 shift64ExtraRightJamming(
1925 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
1928 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1933 -------------------------------------------------------------------------------
1934 Returns the result of dividing the double-precision floating-point value `a'
1935 by the corresponding value `b'. The operation is performed according to the
1936 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1937 -------------------------------------------------------------------------------
1939 float64
float64_div( float64 a
, float64 b
)
1941 flag aSign
, bSign
, zSign
;
1942 int16 aExp
, bExp
, zExp
;
1943 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
1944 bits32 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
1946 aSig1
= extractFloat64Frac1( a
);
1947 aSig0
= extractFloat64Frac0( a
);
1948 aExp
= extractFloat64Exp( a
);
1949 aSign
= extractFloat64Sign( a
);
1950 bSig1
= extractFloat64Frac1( b
);
1951 bSig0
= extractFloat64Frac0( b
);
1952 bExp
= extractFloat64Exp( b
);
1953 bSign
= extractFloat64Sign( b
);
1954 zSign
= aSign
^ bSign
;
1955 if ( aExp
== 0x7FF ) {
1956 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1957 if ( bExp
== 0x7FF ) {
1958 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1961 return packFloat64( zSign
, 0x7FF, 0, 0 );
1963 if ( bExp
== 0x7FF ) {
1964 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1965 return packFloat64( zSign
, 0, 0, 0 );
1968 if ( ( bSig0
| bSig1
) == 0 ) {
1969 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
1971 float_raise( float_flag_invalid
);
1972 return float64_default_nan
;
1974 float_raise( float_flag_divbyzero
);
1975 return packFloat64( zSign
, 0x7FF, 0, 0 );
1977 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
1980 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1981 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
1983 zExp
= aExp
- bExp
+ 0x3FD;
1984 shortShift64Left( aSig0
| 0x00100000, aSig1
, 11, &aSig0
, &aSig1
);
1985 shortShift64Left( bSig0
| 0x00100000, bSig1
, 11, &bSig0
, &bSig1
);
1986 if ( le64( bSig0
, bSig1
, aSig0
, aSig1
) ) {
1987 shift64Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
1990 zSig0
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
1991 mul64By32To96( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
1992 sub96( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
1993 while ( (sbits32
) rem0
< 0 ) {
1995 add96( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
1997 zSig1
= estimateDiv64To32( rem1
, rem2
, bSig0
);
1998 if ( ( zSig1
& 0x3FF ) <= 4 ) {
1999 mul64By32To96( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
2000 sub96( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
2001 while ( (sbits32
) rem1
< 0 ) {
2003 add96( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
2005 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2007 shift64ExtraRightJamming( zSig0
, zSig1
, 0, 11, &zSig0
, &zSig1
, &zSig2
);
2008 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
2012 #ifndef SOFTFLOAT_FOR_GCC
2014 -------------------------------------------------------------------------------
2015 Returns the remainder of the double-precision floating-point value `a'
2016 with respect to the corresponding value `b'. The operation is performed
2017 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018 -------------------------------------------------------------------------------
2020 float64
float64_rem( float64 a
, float64 b
)
2022 flag aSign
, bSign
, zSign
;
2023 int16 aExp
, bExp
, expDiff
;
2024 bits32 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
2025 bits32 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
2029 aSig1
= extractFloat64Frac1( a
);
2030 aSig0
= extractFloat64Frac0( a
);
2031 aExp
= extractFloat64Exp( a
);
2032 aSign
= extractFloat64Sign( a
);
2033 bSig1
= extractFloat64Frac1( b
);
2034 bSig0
= extractFloat64Frac0( b
);
2035 bExp
= extractFloat64Exp( b
);
2036 bSign
= extractFloat64Sign( b
);
2037 if ( aExp
== 0x7FF ) {
2038 if ( ( aSig0
| aSig1
)
2039 || ( ( bExp
== 0x7FF ) && ( bSig0
| bSig1
) ) ) {
2040 return propagateFloat64NaN( a
, b
);
2044 if ( bExp
== 0x7FF ) {
2045 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
2049 if ( ( bSig0
| bSig1
) == 0 ) {
2051 float_raise( float_flag_invalid
);
2052 return float64_default_nan
;
2054 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
2057 if ( ( aSig0
| aSig1
) == 0 ) return a
;
2058 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2060 expDiff
= aExp
- bExp
;
2061 if ( expDiff
< -1 ) return a
;
2063 aSig0
| 0x00100000, aSig1
, 11 - ( expDiff
< 0 ), &aSig0
, &aSig1
);
2064 shortShift64Left( bSig0
| 0x00100000, bSig1
, 11, &bSig0
, &bSig1
);
2065 q
= le64( bSig0
, bSig1
, aSig0
, aSig1
);
2066 if ( q
) sub64( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2068 while ( 0 < expDiff
) {
2069 q
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
2070 q
= ( 4 < q
) ? q
- 4 : 0;
2071 mul64By32To96( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2072 shortShift96Left( term0
, term1
, term2
, 29, &term1
, &term2
, &allZero
);
2073 shortShift64Left( aSig0
, aSig1
, 29, &aSig0
, &allZero
);
2074 sub64( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
2077 if ( -32 < expDiff
) {
2078 q
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
2079 q
= ( 4 < q
) ? q
- 4 : 0;
2081 shift64Right( bSig0
, bSig1
, 8, &bSig0
, &bSig1
);
2083 if ( expDiff
< 0 ) {
2084 shift64Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
2087 shortShift64Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
2089 mul64By32To96( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2090 sub64( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
2093 shift64Right( aSig0
, aSig1
, 8, &aSig0
, &aSig1
);
2094 shift64Right( bSig0
, bSig1
, 8, &bSig0
, &bSig1
);
2097 alternateASig0
= aSig0
;
2098 alternateASig1
= aSig1
;
2100 sub64( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2101 } while ( 0 <= (sbits32
) aSig0
);
2103 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
2104 if ( ( sigMean0
< 0 )
2105 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
2106 aSig0
= alternateASig0
;
2107 aSig1
= alternateASig1
;
2109 zSign
= ( (sbits32
) aSig0
< 0 );
2110 if ( zSign
) sub64( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
2112 normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
2117 #ifndef SOFTFLOAT_FOR_GCC
2119 -------------------------------------------------------------------------------
2120 Returns the square root of the double-precision floating-point value `a'.
2121 The operation is performed according to the IEC/IEEE Standard for Binary
2122 Floating-Point Arithmetic.
2123 -------------------------------------------------------------------------------
2125 float64
float64_sqrt( float64 a
)
2129 bits32 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
2130 bits32 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
2133 aSig1
= extractFloat64Frac1( a
);
2134 aSig0
= extractFloat64Frac0( a
);
2135 aExp
= extractFloat64Exp( a
);
2136 aSign
= extractFloat64Sign( a
);
2137 if ( aExp
== 0x7FF ) {
2138 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, a
);
2139 if ( ! aSign
) return a
;
2143 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
2145 float_raise( float_flag_invalid
);
2146 return float64_default_nan
;
2149 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( 0, 0, 0, 0 );
2150 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2152 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2153 aSig0
|= 0x00100000;
2154 shortShift64Left( aSig0
, aSig1
, 11, &term0
, &term1
);
2155 zSig0
= ( estimateSqrt32( aExp
, term0
)>>1 ) + 1;
2156 if ( zSig0
== 0 ) zSig0
= 0x7FFFFFFF;
2157 doubleZSig0
= zSig0
+ zSig0
;
2158 shortShift64Left( aSig0
, aSig1
, 9 - ( aExp
& 1 ), &aSig0
, &aSig1
);
2159 mul32To64( zSig0
, zSig0
, &term0
, &term1
);
2160 sub64( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
2161 while ( (sbits32
) rem0
< 0 ) {
2164 add64( rem0
, rem1
, 0, doubleZSig0
| 1, &rem0
, &rem1
);
2166 zSig1
= estimateDiv64To32( rem1
, 0, doubleZSig0
);
2167 if ( ( zSig1
& 0x1FF ) <= 5 ) {
2168 if ( zSig1
== 0 ) zSig1
= 1;
2169 mul32To64( doubleZSig0
, zSig1
, &term1
, &term2
);
2170 sub64( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
2171 mul32To64( zSig1
, zSig1
, &term2
, &term3
);
2172 sub96( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2173 while ( (sbits32
) rem1
< 0 ) {
2175 shortShift64Left( 0, zSig1
, 1, &term2
, &term3
);
2177 term2
|= doubleZSig0
;
2178 add96( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2180 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2182 shift64ExtraRightJamming( zSig0
, zSig1
, 0, 10, &zSig0
, &zSig1
, &zSig2
);
2183 return roundAndPackFloat64( 0, zExp
, zSig0
, zSig1
, zSig2
);
2189 -------------------------------------------------------------------------------
2190 Returns 1 if the double-precision floating-point value `a' is equal to
2191 the corresponding value `b', and 0 otherwise. The comparison is performed
2192 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2193 -------------------------------------------------------------------------------
2195 flag
float64_eq( float64 a
, float64 b
)
2198 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2199 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2200 || ( ( extractFloat64Exp( b
) == 0x7FF )
2201 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2203 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2204 float_raise( float_flag_invalid
);
2208 return ( a
== b
) ||
2209 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) == 0 );
2214 -------------------------------------------------------------------------------
2215 Returns 1 if the double-precision floating-point value `a' is less than
2216 or equal to the corresponding value `b', and 0 otherwise. The comparison
2217 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2219 -------------------------------------------------------------------------------
2221 flag
float64_le( float64 a
, float64 b
)
2225 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2226 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2227 || ( ( extractFloat64Exp( b
) == 0x7FF )
2228 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2230 float_raise( float_flag_invalid
);
2233 aSign
= extractFloat64Sign( a
);
2234 bSign
= extractFloat64Sign( b
);
2235 if ( aSign
!= bSign
)
2237 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) ==
2239 return ( a
== b
) ||
2240 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
2244 -------------------------------------------------------------------------------
2245 Returns 1 if the double-precision floating-point value `a' is less than
2246 the corresponding value `b', and 0 otherwise. The comparison is performed
2247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248 -------------------------------------------------------------------------------
2250 flag
float64_lt( float64 a
, float64 b
)
2254 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2255 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2256 || ( ( extractFloat64Exp( b
) == 0x7FF )
2257 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2259 float_raise( float_flag_invalid
);
2262 aSign
= extractFloat64Sign( a
);
2263 bSign
= extractFloat64Sign( b
);
2264 if ( aSign
!= bSign
)
2266 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) !=
2268 return ( a
!= b
) &&
2269 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
2273 #ifndef SOFTFLOAT_FOR_GCC
2275 -------------------------------------------------------------------------------
2276 Returns 1 if the double-precision floating-point value `a' is equal to
2277 the corresponding value `b', and 0 otherwise. The invalid exception is
2278 raised if either operand is a NaN. Otherwise, the comparison is performed
2279 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2280 -------------------------------------------------------------------------------
2282 flag
float64_eq_signaling( float64 a
, float64 b
)
2285 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2286 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2287 || ( ( extractFloat64Exp( b
) == 0x7FF )
2288 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2290 float_raise( float_flag_invalid
);
2293 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2298 -------------------------------------------------------------------------------
2299 Returns 1 if the double-precision floating-point value `a' is less than or
2300 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2301 cause an exception. Otherwise, the comparison is performed according to the
2302 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2303 -------------------------------------------------------------------------------
2305 flag
float64_le_quiet( float64 a
, float64 b
)
2309 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2310 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2311 || ( ( extractFloat64Exp( b
) == 0x7FF )
2312 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2314 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2315 float_raise( float_flag_invalid
);
2319 aSign
= extractFloat64Sign( a
);
2320 bSign
= extractFloat64Sign( b
);
2321 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2322 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2327 -------------------------------------------------------------------------------
2328 Returns 1 if the double-precision floating-point value `a' is less than
2329 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2330 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2331 Standard for Binary Floating-Point Arithmetic.
2332 -------------------------------------------------------------------------------
2334 flag
float64_lt_quiet( float64 a
, float64 b
)
2338 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2339 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2340 || ( ( extractFloat64Exp( b
) == 0x7FF )
2341 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2343 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2344 float_raise( float_flag_invalid
);
2348 aSign
= extractFloat64Sign( a
);
2349 bSign
= extractFloat64Sign( b
);
2350 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2351 return ( a
!= b
) && ( aSign
^ ( a
< b
) );