4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
25 ===============================================================================
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
51 ===============================================================================
54 #include <sys/cdefs.h>
55 #if defined(LIBC_SCCS) && !defined(lint)
56 __RCSID("$NetBSD: softfloat.c,v 1.9 2002/05/07 10:02:42 bjh21 Exp $");
57 #endif /* LIBC_SCCS and not lint */
59 #ifdef SOFTFLOAT_FOR_GCC
60 #include "softfloat-for-gcc.h"
64 #include "softfloat.h"
67 * Conversions between floats as stored in memory and floats as
70 #ifndef FLOAT64_DEMANGLE
71 #define FLOAT64_DEMANGLE(a) (a)
73 #ifndef FLOAT64_MANGLE
74 #define FLOAT64_MANGLE(a) (a)
78 -------------------------------------------------------------------------------
79 Floating-point rounding mode and exception flags.
80 -------------------------------------------------------------------------------
82 fp_rnd float_rounding_mode
= float_round_nearest_even
;
83 fp_except float_exception_flags
= 0;
86 -------------------------------------------------------------------------------
87 Primitive arithmetic functions, including multi-word arithmetic, and
88 division and square root approximations. (Can be specialized to target if
90 -------------------------------------------------------------------------------
92 #include "softfloat-macros"
95 -------------------------------------------------------------------------------
96 Functions and definitions to determine: (1) whether tininess for underflow
97 is detected before or after rounding by default, (2) what (if anything)
98 happens when exceptions are raised, (3) how signaling NaNs are distinguished
99 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
100 are propagated from function inputs to output. These details are target-
102 -------------------------------------------------------------------------------
104 #include "softfloat-specialize"
107 -------------------------------------------------------------------------------
108 Returns the fraction bits of the single-precision floating-point value `a'.
109 -------------------------------------------------------------------------------
111 INLINE bits32
extractFloat32Frac( float32 a
)
114 return a
& 0x007FFFFF;
119 -------------------------------------------------------------------------------
120 Returns the exponent bits of the single-precision floating-point value `a'.
121 -------------------------------------------------------------------------------
123 INLINE int16
extractFloat32Exp( float32 a
)
126 return ( a
>>23 ) & 0xFF;
131 -------------------------------------------------------------------------------
132 Returns the sign bit of the single-precision floating-point value `a'.
133 -------------------------------------------------------------------------------
135 INLINE flag
extractFloat32Sign( float32 a
)
143 -------------------------------------------------------------------------------
144 Normalizes the subnormal single-precision floating-point value represented
145 by the denormalized significand `aSig'. The normalized exponent and
146 significand are stored at the locations pointed to by `zExpPtr' and
147 `zSigPtr', respectively.
148 -------------------------------------------------------------------------------
151 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
155 shiftCount
= countLeadingZeros32( aSig
) - 8;
156 *zSigPtr
= aSig
<<shiftCount
;
157 *zExpPtr
= 1 - shiftCount
;
162 -------------------------------------------------------------------------------
163 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
164 single-precision floating-point value, returning the result. After being
165 shifted into the proper positions, the three fields are simply added
166 together to form the result. This means that any integer portion of `zSig'
167 will be added into the exponent. Since a properly normalized significand
168 will have an integer portion equal to 1, the `zExp' input should be 1 less
169 than the desired result exponent whenever `zSig' is a complete, normalized
171 -------------------------------------------------------------------------------
173 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
176 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
181 -------------------------------------------------------------------------------
182 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
183 and significand `zSig', and returns the proper single-precision floating-
184 point value corresponding to the abstract input. Ordinarily, the abstract
185 value is simply rounded and packed into the single-precision format, with
186 the inexact exception raised if the abstract input cannot be represented
187 exactly. However, if the abstract value is too large, the overflow and
188 inexact exceptions are raised and an infinity or maximal finite value is
189 returned. If the abstract value is too small, the input value is rounded to
190 a subnormal number, and the underflow and inexact exceptions are raised if
191 the abstract input cannot be represented exactly as a subnormal single-
192 precision floating-point number.
193 The input significand `zSig' has its binary point between bits 30
194 and 29, which is 7 bits to the left of the usual location. This shifted
195 significand must be normalized or smaller. If `zSig' is not normalized,
196 `zExp' must be 0; in that case, the result returned is a subnormal number,
197 and it must not require rounding. In the usual case that `zSig' is
198 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
199 The handling of underflow and overflow follows the IEC/IEEE Standard for
200 Binary Floating-Point Arithmetic.
201 -------------------------------------------------------------------------------
203 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
206 flag roundNearestEven
;
207 int8 roundIncrement
, roundBits
;
210 roundingMode
= float_rounding_mode
;
211 roundNearestEven
= roundingMode
== float_round_nearest_even
;
212 roundIncrement
= 0x40;
213 if ( ! roundNearestEven
) {
214 if ( roundingMode
== float_round_to_zero
) {
218 roundIncrement
= 0x7F;
220 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
223 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
227 roundBits
= zSig
& 0x7F;
228 if ( 0xFD <= (bits16
) zExp
) {
230 || ( ( zExp
== 0xFD )
231 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
233 float_raise( float_flag_overflow
| float_flag_inexact
);
234 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
238 ( float_detect_tininess
== float_tininess_before_rounding
)
240 || ( zSig
+ roundIncrement
< 0x80000000 );
241 shift32RightJamming( zSig
, - zExp
, &zSig
);
243 roundBits
= zSig
& 0x7F;
244 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
247 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
248 zSig
= ( zSig
+ roundIncrement
)>>7;
249 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
250 if ( zSig
== 0 ) zExp
= 0;
251 return packFloat32( zSign
, zExp
, zSig
);
256 -------------------------------------------------------------------------------
257 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
258 and significand `zSig', and returns the proper single-precision floating-
259 point value corresponding to the abstract input. This routine is just like
260 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
261 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
262 floating-point exponent.
263 -------------------------------------------------------------------------------
266 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
270 shiftCount
= countLeadingZeros32( zSig
) - 1;
271 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
276 -------------------------------------------------------------------------------
277 Returns the least-significant 32 fraction bits of the double-precision
278 floating-point value `a'.
279 -------------------------------------------------------------------------------
281 INLINE bits32
extractFloat64Frac1( float64 a
)
284 return FLOAT64_DEMANGLE(a
) & LIT64( 0x00000000FFFFFFFF );
289 -------------------------------------------------------------------------------
290 Returns the most-significant 20 fraction bits of the double-precision
291 floating-point value `a'.
292 -------------------------------------------------------------------------------
294 INLINE bits32
extractFloat64Frac0( float64 a
)
297 return ( FLOAT64_DEMANGLE(a
)>>32 ) & 0x000FFFFF;
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
306 INLINE int16
extractFloat64Exp( float64 a
)
309 return ( FLOAT64_DEMANGLE(a
)>>52 ) & 0x7FF;
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
318 INLINE flag
extractFloat64Sign( float64 a
)
321 return FLOAT64_DEMANGLE(a
)>>63;
326 -------------------------------------------------------------------------------
327 Normalizes the subnormal double-precision floating-point value represented
328 by the denormalized significand formed by the concatenation of `aSig0' and
329 `aSig1'. The normalized exponent is stored at the location pointed to by
330 `zExpPtr'. The most significant 21 bits of the normalized significand are
331 stored at the location pointed to by `zSig0Ptr', and the least significant
332 32 bits of the normalized significand are stored at the location pointed to
334 -------------------------------------------------------------------------------
337 normalizeFloat64Subnormal(
348 shiftCount
= countLeadingZeros32( aSig1
) - 11;
349 if ( shiftCount
< 0 ) {
350 *zSig0Ptr
= aSig1
>>( - shiftCount
);
351 *zSig1Ptr
= aSig1
<<( shiftCount
& 31 );
354 *zSig0Ptr
= aSig1
<<shiftCount
;
357 *zExpPtr
= - shiftCount
- 31;
360 shiftCount
= countLeadingZeros32( aSig0
) - 11;
361 shortShift64Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
362 *zExpPtr
= 1 - shiftCount
;
368 -------------------------------------------------------------------------------
369 Packs the sign `zSign', the exponent `zExp', and the significand formed by
370 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
371 point value, returning the result. After being shifted into the proper
372 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
373 together to form the most significant 32 bits of the result. This means
374 that any integer portion of `zSig0' will be added into the exponent. Since
375 a properly normalized significand will have an integer portion equal to 1,
376 the `zExp' input should be 1 less than the desired result exponent whenever
377 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
378 -------------------------------------------------------------------------------
381 packFloat64( flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
)
384 return FLOAT64_MANGLE( ( ( (bits64
) zSign
)<<63 ) +
385 ( ( (bits64
) zExp
)<<52 ) +
386 ( ( (bits64
) zSig0
)<<32 ) + zSig1
);
392 -------------------------------------------------------------------------------
393 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
394 and extended significand formed by the concatenation of `zSig0', `zSig1',
395 and `zSig2', and returns the proper double-precision floating-point value
396 corresponding to the abstract input. Ordinarily, the abstract value is
397 simply rounded and packed into the double-precision format, with the inexact
398 exception raised if the abstract input cannot be represented exactly.
399 However, if the abstract value is too large, the overflow and inexact
400 exceptions are raised and an infinity or maximal finite value is returned.
401 If the abstract value is too small, the input value is rounded to a
402 subnormal number, and the underflow and inexact exceptions are raised if the
403 abstract input cannot be represented exactly as a subnormal double-precision
404 floating-point number.
405 The input significand must be normalized or smaller. If the input
406 significand is not normalized, `zExp' must be 0; in that case, the result
407 returned is a subnormal number, and it must not require rounding. In the
408 usual case that the input significand is normalized, `zExp' must be 1 less
409 than the ``true'' floating-point exponent. The handling of underflow and
410 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
411 -------------------------------------------------------------------------------
415 flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
, bits32 zSig2
)
418 flag roundNearestEven
, increment
, isTiny
;
420 roundingMode
= float_rounding_mode
;
421 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
422 increment
= ( (sbits32
) zSig2
< 0 );
423 if ( ! roundNearestEven
) {
424 if ( roundingMode
== float_round_to_zero
) {
429 increment
= ( roundingMode
== float_round_down
) && zSig2
;
432 increment
= ( roundingMode
== float_round_up
) && zSig2
;
436 if ( 0x7FD <= (bits16
) zExp
) {
437 if ( ( 0x7FD < zExp
)
438 || ( ( zExp
== 0x7FD )
439 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0
, zSig1
)
443 float_raise( float_flag_overflow
| float_flag_inexact
);
444 if ( ( roundingMode
== float_round_to_zero
)
445 || ( zSign
&& ( roundingMode
== float_round_up
) )
446 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
448 return packFloat64( zSign
, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
450 return packFloat64( zSign
, 0x7FF, 0, 0 );
454 ( float_detect_tininess
== float_tininess_before_rounding
)
457 || lt64( zSig0
, zSig1
, 0x001FFFFF, 0xFFFFFFFF );
458 shift64ExtraRightJamming(
459 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
461 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
462 if ( roundNearestEven
) {
463 increment
= ( (sbits32
) zSig2
< 0 );
467 increment
= ( roundingMode
== float_round_down
) && zSig2
;
470 increment
= ( roundingMode
== float_round_up
) && zSig2
;
475 if ( zSig2
) float_exception_flags
|= float_flag_inexact
;
477 add64( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
478 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
481 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
483 return packFloat64( zSign
, zExp
, zSig0
, zSig1
);
488 -------------------------------------------------------------------------------
489 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
490 and significand formed by the concatenation of `zSig0' and `zSig1', and
491 returns the proper double-precision floating-point value corresponding
492 to the abstract input. This routine is just like `roundAndPackFloat64'
493 except that the input significand has fewer bits and does not have to be
494 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
496 -------------------------------------------------------------------------------
499 normalizeRoundAndPackFloat64(
500 flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
)
510 shiftCount
= countLeadingZeros32( zSig0
) - 11;
511 if ( 0 <= shiftCount
) {
513 shortShift64Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
516 shift64ExtraRightJamming(
517 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
520 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
525 -------------------------------------------------------------------------------
526 Returns the result of converting the 32-bit two's complement integer `a' to
527 the single-precision floating-point format. The conversion is performed
528 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
529 -------------------------------------------------------------------------------
531 float32
int32_to_float32( int32 a
)
535 if ( a
== 0 ) return 0;
536 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
538 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
543 -------------------------------------------------------------------------------
544 Returns the result of converting the 32-bit two's complement integer `a' to
545 the double-precision floating-point format. The conversion is performed
546 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
547 -------------------------------------------------------------------------------
549 float64
int32_to_float64( int32 a
)
556 if ( a
== 0 ) return packFloat64( 0, 0, 0, 0 );
558 absA
= zSign
? - a
: a
;
559 shiftCount
= countLeadingZeros32( absA
) - 11;
560 if ( 0 <= shiftCount
) {
561 zSig0
= absA
<<shiftCount
;
565 shift64Right( absA
, 0, - shiftCount
, &zSig0
, &zSig1
);
567 return packFloat64( zSign
, 0x412 - shiftCount
, zSig0
, zSig1
);
571 #ifndef SOFTFLOAT_FOR_GCC
573 -------------------------------------------------------------------------------
574 Returns the result of converting the single-precision floating-point value
575 `a' to the 32-bit two's complement integer format. The conversion is
576 performed according to the IEC/IEEE Standard for Binary Floating-Point
577 Arithmetic---which means in particular that the conversion is rounded
578 according to the current rounding mode. If `a' is a NaN, the largest
579 positive integer is returned. Otherwise, if the conversion overflows, the
580 largest integer with the same sign as `a' is returned.
581 -------------------------------------------------------------------------------
583 int32
float32_to_int32( float32 a
)
586 int16 aExp
, shiftCount
;
587 bits32 aSig
, aSigExtra
;
591 aSig
= extractFloat32Frac( a
);
592 aExp
= extractFloat32Exp( a
);
593 aSign
= extractFloat32Sign( a
);
594 shiftCount
= aExp
- 0x96;
595 if ( 0 <= shiftCount
) {
596 if ( 0x9E <= aExp
) {
597 if ( a
!= 0xCF000000 ) {
598 float_raise( float_flag_invalid
);
599 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
603 return (sbits32
) 0x80000000;
605 z
= ( aSig
| 0x00800000 )<<shiftCount
;
606 if ( aSign
) z
= - z
;
610 aSigExtra
= aExp
| aSig
;
615 aSigExtra
= aSig
<<( shiftCount
& 31 );
616 z
= aSig
>>( - shiftCount
);
618 if ( aSigExtra
) float_exception_flags
|= float_flag_inexact
;
619 roundingMode
= float_rounding_mode
;
620 if ( roundingMode
== float_round_nearest_even
) {
621 if ( (sbits32
) aSigExtra
< 0 ) {
623 if ( (bits32
) ( aSigExtra
<<1 ) == 0 ) z
&= ~1;
625 if ( aSign
) z
= - z
;
628 aSigExtra
= ( aSigExtra
!= 0 );
630 z
+= ( roundingMode
== float_round_down
) & aSigExtra
;
634 z
+= ( roundingMode
== float_round_up
) & aSigExtra
;
644 -------------------------------------------------------------------------------
645 Returns the result of converting the single-precision floating-point value
646 `a' to the 32-bit two's complement integer format. The conversion is
647 performed according to the IEC/IEEE Standard for Binary Floating-Point
648 Arithmetic, except that the conversion is always rounded toward zero.
649 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
650 the conversion overflows, the largest integer with the same sign as `a' is
652 -------------------------------------------------------------------------------
654 int32
float32_to_int32_round_to_zero( float32 a
)
657 int16 aExp
, shiftCount
;
661 aSig
= extractFloat32Frac( a
);
662 aExp
= extractFloat32Exp( a
);
663 aSign
= extractFloat32Sign( a
);
664 shiftCount
= aExp
- 0x9E;
665 if ( 0 <= shiftCount
) {
666 if ( a
!= 0xCF000000 ) {
667 float_raise( float_flag_invalid
);
668 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
670 return (sbits32
) 0x80000000;
672 else if ( aExp
<= 0x7E ) {
673 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
676 aSig
= ( aSig
| 0x00800000 )<<8;
677 z
= aSig
>>( - shiftCount
);
678 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
679 float_exception_flags
|= float_flag_inexact
;
681 if ( aSign
) z
= - z
;
687 -------------------------------------------------------------------------------
688 Returns the result of converting the single-precision floating-point value
689 `a' to the double-precision floating-point format. The conversion is
690 performed according to the IEC/IEEE Standard for Binary Floating-Point
692 -------------------------------------------------------------------------------
694 float64
float32_to_float64( float32 a
)
698 bits32 aSig
, zSig0
, zSig1
;
700 aSig
= extractFloat32Frac( a
);
701 aExp
= extractFloat32Exp( a
);
702 aSign
= extractFloat32Sign( a
);
703 if ( aExp
== 0xFF ) {
704 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
705 return packFloat64( aSign
, 0x7FF, 0, 0 );
708 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0, 0 );
709 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
712 shift64Right( aSig
, 0, 3, &zSig0
, &zSig1
);
713 return packFloat64( aSign
, aExp
+ 0x380, zSig0
, zSig1
);
717 #ifndef SOFTFLOAT_FOR_GCC
719 -------------------------------------------------------------------------------
720 Rounds the single-precision floating-point value `a' to an integer,
721 and returns the result as a single-precision floating-point value. The
722 operation is performed according to the IEC/IEEE Standard for Binary
723 Floating-Point Arithmetic.
724 -------------------------------------------------------------------------------
726 float32
float32_round_to_int( float32 a
)
730 bits32 lastBitMask
, roundBitsMask
;
734 aExp
= extractFloat32Exp( a
);
735 if ( 0x96 <= aExp
) {
736 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
737 return propagateFloat32NaN( a
, a
);
741 if ( aExp
<= 0x7E ) {
742 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
743 float_exception_flags
|= float_flag_inexact
;
744 aSign
= extractFloat32Sign( a
);
745 switch ( float_rounding_mode
) {
746 case float_round_nearest_even
:
747 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
748 return packFloat32( aSign
, 0x7F, 0 );
751 case float_round_to_zero
:
753 case float_round_down
:
754 return aSign
? 0xBF800000 : 0;
756 return aSign
? 0x80000000 : 0x3F800000;
758 return packFloat32( aSign
, 0, 0 );
761 lastBitMask
<<= 0x96 - aExp
;
762 roundBitsMask
= lastBitMask
- 1;
764 roundingMode
= float_rounding_mode
;
765 if ( roundingMode
== float_round_nearest_even
) {
767 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
769 else if ( roundingMode
!= float_round_to_zero
) {
770 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
774 z
&= ~ roundBitsMask
;
775 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
782 -------------------------------------------------------------------------------
783 Returns the result of adding the absolute values of the single-precision
784 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
785 before being returned. `zSign' is ignored if the result is a NaN.
786 The addition is performed according to the IEC/IEEE Standard for Binary
787 Floating-Point Arithmetic.
788 -------------------------------------------------------------------------------
790 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
792 int16 aExp
, bExp
, zExp
;
793 bits32 aSig
, bSig
, zSig
;
796 aSig
= extractFloat32Frac( a
);
797 aExp
= extractFloat32Exp( a
);
798 bSig
= extractFloat32Frac( b
);
799 bExp
= extractFloat32Exp( b
);
800 expDiff
= aExp
- bExp
;
804 if ( aExp
== 0xFF ) {
805 if ( aSig
) return propagateFloat32NaN( a
, b
);
814 shift32RightJamming( bSig
, expDiff
, &bSig
);
817 else if ( expDiff
< 0 ) {
818 if ( bExp
== 0xFF ) {
819 if ( bSig
) return propagateFloat32NaN( a
, b
);
820 return packFloat32( zSign
, 0xFF, 0 );
828 shift32RightJamming( aSig
, - expDiff
, &aSig
);
832 if ( aExp
== 0xFF ) {
833 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
836 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
837 zSig
= 0x40000000 + aSig
+ bSig
;
842 zSig
= ( aSig
+ bSig
)<<1;
844 if ( (sbits32
) zSig
< 0 ) {
849 return roundAndPackFloat32( zSign
, zExp
, zSig
);
854 -------------------------------------------------------------------------------
855 Returns the result of subtracting the absolute values of the single-
856 precision floating-point values `a' and `b'. If `zSign' is 1, the
857 difference is negated before being returned. `zSign' is ignored if the
858 result is a NaN. The subtraction is performed according to the IEC/IEEE
859 Standard for Binary Floating-Point Arithmetic.
860 -------------------------------------------------------------------------------
862 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
864 int16 aExp
, bExp
, zExp
;
865 bits32 aSig
, bSig
, zSig
;
868 aSig
= extractFloat32Frac( a
);
869 aExp
= extractFloat32Exp( a
);
870 bSig
= extractFloat32Frac( b
);
871 bExp
= extractFloat32Exp( b
);
872 expDiff
= aExp
- bExp
;
875 if ( 0 < expDiff
) goto aExpBigger
;
876 if ( expDiff
< 0 ) goto bExpBigger
;
877 if ( aExp
== 0xFF ) {
878 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
879 float_raise( float_flag_invalid
);
880 return float32_default_nan
;
886 if ( bSig
< aSig
) goto aBigger
;
887 if ( aSig
< bSig
) goto bBigger
;
888 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
890 if ( bExp
== 0xFF ) {
891 if ( bSig
) return propagateFloat32NaN( a
, b
);
892 return packFloat32( zSign
^ 1, 0xFF, 0 );
900 shift32RightJamming( aSig
, - expDiff
, &aSig
);
906 goto normalizeRoundAndPack
;
908 if ( aExp
== 0xFF ) {
909 if ( aSig
) return propagateFloat32NaN( a
, b
);
918 shift32RightJamming( bSig
, expDiff
, &bSig
);
923 normalizeRoundAndPack
:
925 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
930 -------------------------------------------------------------------------------
931 Returns the result of adding the single-precision floating-point values `a'
932 and `b'. The operation is performed according to the IEC/IEEE Standard for
933 Binary Floating-Point Arithmetic.
934 -------------------------------------------------------------------------------
936 float32
float32_add( float32 a
, float32 b
)
940 aSign
= extractFloat32Sign( a
);
941 bSign
= extractFloat32Sign( b
);
942 if ( aSign
== bSign
) {
943 return addFloat32Sigs( a
, b
, aSign
);
946 return subFloat32Sigs( a
, b
, aSign
);
952 -------------------------------------------------------------------------------
953 Returns the result of subtracting the single-precision floating-point values
954 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
955 for Binary Floating-Point Arithmetic.
956 -------------------------------------------------------------------------------
958 float32
float32_sub( float32 a
, float32 b
)
962 aSign
= extractFloat32Sign( a
);
963 bSign
= extractFloat32Sign( b
);
964 if ( aSign
== bSign
) {
965 return subFloat32Sigs( a
, b
, aSign
);
968 return addFloat32Sigs( a
, b
, aSign
);
974 -------------------------------------------------------------------------------
975 Returns the result of multiplying the single-precision floating-point values
976 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
977 for Binary Floating-Point Arithmetic.
978 -------------------------------------------------------------------------------
980 float32
float32_mul( float32 a
, float32 b
)
982 flag aSign
, bSign
, zSign
;
983 int16 aExp
, bExp
, zExp
;
984 bits32 aSig
, bSig
, zSig0
, zSig1
;
986 aSig
= extractFloat32Frac( a
);
987 aExp
= extractFloat32Exp( a
);
988 aSign
= extractFloat32Sign( a
);
989 bSig
= extractFloat32Frac( b
);
990 bExp
= extractFloat32Exp( b
);
991 bSign
= extractFloat32Sign( b
);
992 zSign
= aSign
^ bSign
;
993 if ( aExp
== 0xFF ) {
994 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
995 return propagateFloat32NaN( a
, b
);
997 if ( ( bExp
| bSig
) == 0 ) {
998 float_raise( float_flag_invalid
);
999 return float32_default_nan
;
1001 return packFloat32( zSign
, 0xFF, 0 );
1003 if ( bExp
== 0xFF ) {
1004 if ( bSig
) return propagateFloat32NaN( a
, b
);
1005 if ( ( aExp
| aSig
) == 0 ) {
1006 float_raise( float_flag_invalid
);
1007 return float32_default_nan
;
1009 return packFloat32( zSign
, 0xFF, 0 );
1012 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1013 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1016 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1017 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1019 zExp
= aExp
+ bExp
- 0x7F;
1020 aSig
= ( aSig
| 0x00800000 )<<7;
1021 bSig
= ( bSig
| 0x00800000 )<<8;
1022 mul32To64( aSig
, bSig
, &zSig0
, &zSig1
);
1023 zSig0
|= ( zSig1
!= 0 );
1024 if ( 0 <= (sbits32
) ( zSig0
<<1 ) ) {
1028 return roundAndPackFloat32( zSign
, zExp
, zSig0
);
1033 -------------------------------------------------------------------------------
1034 Returns the result of dividing the single-precision floating-point value `a'
1035 by the corresponding value `b'. The operation is performed according to the
1036 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1037 -------------------------------------------------------------------------------
1039 float32
float32_div( float32 a
, float32 b
)
1041 flag aSign
, bSign
, zSign
;
1042 int16 aExp
, bExp
, zExp
;
1043 bits32 aSig
, bSig
, zSig
, rem0
, rem1
, term0
, term1
;
1045 aSig
= extractFloat32Frac( a
);
1046 aExp
= extractFloat32Exp( a
);
1047 aSign
= extractFloat32Sign( a
);
1048 bSig
= extractFloat32Frac( b
);
1049 bExp
= extractFloat32Exp( b
);
1050 bSign
= extractFloat32Sign( b
);
1051 zSign
= aSign
^ bSign
;
1052 if ( aExp
== 0xFF ) {
1053 if ( aSig
) return propagateFloat32NaN( a
, b
);
1054 if ( bExp
== 0xFF ) {
1055 if ( bSig
) return propagateFloat32NaN( a
, b
);
1056 float_raise( float_flag_invalid
);
1057 return float32_default_nan
;
1059 return packFloat32( zSign
, 0xFF, 0 );
1061 if ( bExp
== 0xFF ) {
1062 if ( bSig
) return propagateFloat32NaN( a
, b
);
1063 return packFloat32( zSign
, 0, 0 );
1067 if ( ( aExp
| aSig
) == 0 ) {
1068 float_raise( float_flag_invalid
);
1069 return float32_default_nan
;
1071 float_raise( float_flag_divbyzero
);
1072 return packFloat32( zSign
, 0xFF, 0 );
1074 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1077 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1078 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1080 zExp
= aExp
- bExp
+ 0x7D;
1081 aSig
= ( aSig
| 0x00800000 )<<7;
1082 bSig
= ( bSig
| 0x00800000 )<<8;
1083 if ( bSig
<= ( aSig
+ aSig
) ) {
1087 zSig
= estimateDiv64To32( aSig
, 0, bSig
);
1088 if ( ( zSig
& 0x3F ) <= 2 ) {
1089 mul32To64( bSig
, zSig
, &term0
, &term1
);
1090 sub64( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
1091 while ( (sbits32
) rem0
< 0 ) {
1093 add64( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
1095 zSig
|= ( rem1
!= 0 );
1097 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1101 #ifndef SOFTFLOAT_FOR_GCC
1103 -------------------------------------------------------------------------------
1104 Returns the remainder of the single-precision floating-point value `a'
1105 with respect to the corresponding value `b'. The operation is performed
1106 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1107 -------------------------------------------------------------------------------
1109 float32
float32_rem( float32 a
, float32 b
)
1111 flag aSign
, bSign
, zSign
;
1112 int16 aExp
, bExp
, expDiff
;
1113 bits32 aSig
, bSig
, q
, allZero
, alternateASig
;
1116 aSig
= extractFloat32Frac( a
);
1117 aExp
= extractFloat32Exp( a
);
1118 aSign
= extractFloat32Sign( a
);
1119 bSig
= extractFloat32Frac( b
);
1120 bExp
= extractFloat32Exp( b
);
1121 bSign
= extractFloat32Sign( b
);
1122 if ( aExp
== 0xFF ) {
1123 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1124 return propagateFloat32NaN( a
, b
);
1126 float_raise( float_flag_invalid
);
1127 return float32_default_nan
;
1129 if ( bExp
== 0xFF ) {
1130 if ( bSig
) return propagateFloat32NaN( a
, b
);
1135 float_raise( float_flag_invalid
);
1136 return float32_default_nan
;
1138 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1141 if ( aSig
== 0 ) return a
;
1142 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1144 expDiff
= aExp
- bExp
;
1145 aSig
= ( aSig
| 0x00800000 )<<8;
1146 bSig
= ( bSig
| 0x00800000 )<<8;
1147 if ( expDiff
< 0 ) {
1148 if ( expDiff
< -1 ) return a
;
1151 q
= ( bSig
<= aSig
);
1152 if ( q
) aSig
-= bSig
;
1154 while ( 0 < expDiff
) {
1155 q
= estimateDiv64To32( aSig
, 0, bSig
);
1156 q
= ( 2 < q
) ? q
- 2 : 0;
1157 aSig
= - ( ( bSig
>>2 ) * q
);
1161 if ( 0 < expDiff
) {
1162 q
= estimateDiv64To32( aSig
, 0, bSig
);
1163 q
= ( 2 < q
) ? q
- 2 : 0;
1166 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1173 alternateASig
= aSig
;
1176 } while ( 0 <= (sbits32
) aSig
);
1177 sigMean
= aSig
+ alternateASig
;
1178 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1179 aSig
= alternateASig
;
1181 zSign
= ( (sbits32
) aSig
< 0 );
1182 if ( zSign
) aSig
= - aSig
;
1183 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
1188 #ifndef SOFTFLOAT_FOR_GCC
1190 -------------------------------------------------------------------------------
1191 Returns the square root of the single-precision floating-point value `a'.
1192 The operation is performed according to the IEC/IEEE Standard for Binary
1193 Floating-Point Arithmetic.
1194 -------------------------------------------------------------------------------
1196 float32
float32_sqrt( float32 a
)
1200 bits32 aSig
, zSig
, rem0
, rem1
, term0
, term1
;
1202 aSig
= extractFloat32Frac( a
);
1203 aExp
= extractFloat32Exp( a
);
1204 aSign
= extractFloat32Sign( a
);
1205 if ( aExp
== 0xFF ) {
1206 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1207 if ( ! aSign
) return a
;
1208 float_raise( float_flag_invalid
);
1209 return float32_default_nan
;
1212 if ( ( aExp
| aSig
) == 0 ) return a
;
1213 float_raise( float_flag_invalid
);
1214 return float32_default_nan
;
1217 if ( aSig
== 0 ) return 0;
1218 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1220 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1221 aSig
= ( aSig
| 0x00800000 )<<8;
1222 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1223 if ( ( zSig
& 0x7F ) <= 5 ) {
1230 mul32To64( zSig
, zSig
, &term0
, &term1
);
1231 sub64( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
1232 while ( (sbits32
) rem0
< 0 ) {
1234 shortShift64Left( 0, zSig
, 1, &term0
, &term1
);
1236 add64( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
1238 zSig
|= ( ( rem0
| rem1
) != 0 );
1241 shift32RightJamming( zSig
, 1, &zSig
);
1243 return roundAndPackFloat32( 0, zExp
, zSig
);
1249 -------------------------------------------------------------------------------
1250 Returns 1 if the single-precision floating-point value `a' is equal to
1251 the corresponding value `b', and 0 otherwise. The comparison is performed
1252 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 -------------------------------------------------------------------------------
1255 flag
float32_eq( float32 a
, float32 b
)
1258 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1259 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1261 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1262 float_raise( float_flag_invalid
);
1266 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1271 -------------------------------------------------------------------------------
1272 Returns 1 if the single-precision floating-point value `a' is less than
1273 or equal to the corresponding value `b', and 0 otherwise. The comparison
1274 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1276 -------------------------------------------------------------------------------
1278 flag
float32_le( float32 a
, float32 b
)
1282 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1283 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1285 float_raise( float_flag_invalid
);
1288 aSign
= extractFloat32Sign( a
);
1289 bSign
= extractFloat32Sign( b
);
1290 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1291 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1296 -------------------------------------------------------------------------------
1297 Returns 1 if the single-precision floating-point value `a' is less than
1298 the corresponding value `b', and 0 otherwise. The comparison is performed
1299 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1300 -------------------------------------------------------------------------------
1302 flag
float32_lt( float32 a
, float32 b
)
1306 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1307 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1309 float_raise( float_flag_invalid
);
1312 aSign
= extractFloat32Sign( a
);
1313 bSign
= extractFloat32Sign( b
);
1314 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1315 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1319 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1321 -------------------------------------------------------------------------------
1322 Returns 1 if the single-precision floating-point value `a' is equal to
1323 the corresponding value `b', and 0 otherwise. The invalid exception is
1324 raised if either operand is a NaN. Otherwise, the comparison is performed
1325 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1326 -------------------------------------------------------------------------------
1328 flag
float32_eq_signaling( float32 a
, float32 b
)
1331 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1332 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1334 float_raise( float_flag_invalid
);
1337 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1342 -------------------------------------------------------------------------------
1343 Returns 1 if the single-precision floating-point value `a' is less than or
1344 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1345 cause an exception. Otherwise, the comparison is performed according to the
1346 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1347 -------------------------------------------------------------------------------
1349 flag
float32_le_quiet( float32 a
, float32 b
)
1354 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1355 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1357 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1358 float_raise( float_flag_invalid
);
1362 aSign
= extractFloat32Sign( a
);
1363 bSign
= extractFloat32Sign( b
);
1364 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1365 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1370 -------------------------------------------------------------------------------
1371 Returns 1 if the single-precision floating-point value `a' is less than
1372 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1373 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1374 Standard for Binary Floating-Point Arithmetic.
1375 -------------------------------------------------------------------------------
1377 flag
float32_lt_quiet( float32 a
, float32 b
)
1381 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1382 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1384 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1385 float_raise( float_flag_invalid
);
1389 aSign
= extractFloat32Sign( a
);
1390 bSign
= extractFloat32Sign( b
);
1391 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1392 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1395 #endif /* !SOFTFLOAT_FOR_GCC */
1397 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1399 -------------------------------------------------------------------------------
1400 Returns the result of converting the double-precision floating-point value
1401 `a' to the 32-bit two's complement integer format. The conversion is
1402 performed according to the IEC/IEEE Standard for Binary Floating-Point
1403 Arithmetic---which means in particular that the conversion is rounded
1404 according to the current rounding mode. If `a' is a NaN, the largest
1405 positive integer is returned. Otherwise, if the conversion overflows, the
1406 largest integer with the same sign as `a' is returned.
1407 -------------------------------------------------------------------------------
1409 int32
float64_to_int32( float64 a
)
1412 int16 aExp
, shiftCount
;
1413 bits32 aSig0
, aSig1
, absZ
, aSigExtra
;
1417 aSig1
= extractFloat64Frac1( a
);
1418 aSig0
= extractFloat64Frac0( a
);
1419 aExp
= extractFloat64Exp( a
);
1420 aSign
= extractFloat64Sign( a
);
1421 shiftCount
= aExp
- 0x413;
1422 if ( 0 <= shiftCount
) {
1423 if ( 0x41E < aExp
) {
1424 if ( ( aExp
== 0x7FF ) && ( aSig0
| aSig1
) ) aSign
= 0;
1428 aSig0
| 0x00100000, aSig1
, shiftCount
, &absZ
, &aSigExtra
);
1429 if ( 0x80000000 < absZ
) goto invalid
;
1432 aSig1
= ( aSig1
!= 0 );
1433 if ( aExp
< 0x3FE ) {
1434 aSigExtra
= aExp
| aSig0
| aSig1
;
1438 aSig0
|= 0x00100000;
1439 aSigExtra
= ( aSig0
<<( shiftCount
& 31 ) ) | aSig1
;
1440 absZ
= aSig0
>>( - shiftCount
);
1443 roundingMode
= float_rounding_mode
;
1444 if ( roundingMode
== float_round_nearest_even
) {
1445 if ( (sbits32
) aSigExtra
< 0 ) {
1447 if ( (bits32
) ( aSigExtra
<<1 ) == 0 ) absZ
&= ~1;
1449 z
= aSign
? - absZ
: absZ
;
1452 aSigExtra
= ( aSigExtra
!= 0 );
1455 + ( ( roundingMode
== float_round_down
) & aSigExtra
) );
1458 z
= absZ
+ ( ( roundingMode
== float_round_up
) & aSigExtra
);
1461 if ( ( aSign
^ ( z
< 0 ) ) && z
) {
1463 float_raise( float_flag_invalid
);
1464 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
1466 if ( aSigExtra
) float_exception_flags
|= float_flag_inexact
;
1470 #endif /* !SOFTFLOAT_FOR_GCC */
1473 -------------------------------------------------------------------------------
1474 Returns the result of converting the double-precision floating-point value
1475 `a' to the 32-bit two's complement integer format. The conversion is
1476 performed according to the IEC/IEEE Standard for Binary Floating-Point
1477 Arithmetic, except that the conversion is always rounded toward zero.
1478 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1479 the conversion overflows, the largest integer with the same sign as `a' is
1481 -------------------------------------------------------------------------------
1483 int32
float64_to_int32_round_to_zero( float64 a
)
1486 int16 aExp
, shiftCount
;
1487 bits32 aSig0
, aSig1
, absZ
, aSigExtra
;
1490 aSig1
= extractFloat64Frac1( a
);
1491 aSig0
= extractFloat64Frac0( a
);
1492 aExp
= extractFloat64Exp( a
);
1493 aSign
= extractFloat64Sign( a
);
1494 shiftCount
= aExp
- 0x413;
1495 if ( 0 <= shiftCount
) {
1496 if ( 0x41E < aExp
) {
1497 if ( ( aExp
== 0x7FF ) && ( aSig0
| aSig1
) ) aSign
= 0;
1501 aSig0
| 0x00100000, aSig1
, shiftCount
, &absZ
, &aSigExtra
);
1504 if ( aExp
< 0x3FF ) {
1505 if ( aExp
| aSig0
| aSig1
) {
1506 float_exception_flags
|= float_flag_inexact
;
1510 aSig0
|= 0x00100000;
1511 aSigExtra
= ( aSig0
<<( shiftCount
& 31 ) ) | aSig1
;
1512 absZ
= aSig0
>>( - shiftCount
);
1514 z
= aSign
? - absZ
: absZ
;
1515 if ( ( aSign
^ ( z
< 0 ) ) && z
) {
1517 float_raise( float_flag_invalid
);
1518 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
1520 if ( aSigExtra
) float_exception_flags
|= float_flag_inexact
;
1526 -------------------------------------------------------------------------------
1527 Returns the result of converting the double-precision floating-point value
1528 `a' to the single-precision floating-point format. The conversion is
1529 performed according to the IEC/IEEE Standard for Binary Floating-Point
1531 -------------------------------------------------------------------------------
1533 float32
float64_to_float32( float64 a
)
1537 bits32 aSig0
, aSig1
, zSig
;
1540 aSig1
= extractFloat64Frac1( a
);
1541 aSig0
= extractFloat64Frac0( a
);
1542 aExp
= extractFloat64Exp( a
);
1543 aSign
= extractFloat64Sign( a
);
1544 if ( aExp
== 0x7FF ) {
1545 if ( aSig0
| aSig1
) {
1546 return commonNaNToFloat32( float64ToCommonNaN( a
) );
1548 return packFloat32( aSign
, 0xFF, 0 );
1550 shift64RightJamming( aSig0
, aSig1
, 22, &allZero
, &zSig
);
1551 if ( aExp
) zSig
|= 0x40000000;
1552 return roundAndPackFloat32( aSign
, aExp
- 0x381, zSig
);
1556 #ifndef SOFTFLOAT_FOR_GCC
1558 -------------------------------------------------------------------------------
1559 Rounds the double-precision floating-point value `a' to an integer,
1560 and returns the result as a double-precision floating-point value. The
1561 operation is performed according to the IEC/IEEE Standard for Binary
1562 Floating-Point Arithmetic.
1563 -------------------------------------------------------------------------------
1565 float64
float64_round_to_int( float64 a
)
1569 bits32 lastBitMask
, roundBitsMask
;
1573 aExp
= extractFloat64Exp( a
);
1574 if ( 0x413 <= aExp
) {
1575 if ( 0x433 <= aExp
) {
1576 if ( ( aExp
== 0x7FF )
1577 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) ) {
1578 return propagateFloat64NaN( a
, a
);
1583 lastBitMask
= ( lastBitMask
<<( 0x432 - aExp
) )<<1;
1584 roundBitsMask
= lastBitMask
- 1;
1586 roundingMode
= float_rounding_mode
;
1587 if ( roundingMode
== float_round_nearest_even
) {
1588 if ( lastBitMask
) {
1589 add64( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
1590 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
1593 if ( (sbits32
) z
.low
< 0 ) {
1595 if ( (bits32
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
1599 else if ( roundingMode
!= float_round_to_zero
) {
1600 if ( extractFloat64Sign( z
)
1601 ^ ( roundingMode
== float_round_up
) ) {
1602 add64( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
1605 z
.low
&= ~ roundBitsMask
;
1608 if ( aExp
<= 0x3FE ) {
1609 if ( ( ( (bits32
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
1610 float_exception_flags
|= float_flag_inexact
;
1611 aSign
= extractFloat64Sign( a
);
1612 switch ( float_rounding_mode
) {
1613 case float_round_nearest_even
:
1614 if ( ( aExp
== 0x3FE )
1615 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) )
1617 return packFloat64( aSign
, 0x3FF, 0, 0 );
1620 case float_round_down
:
1622 aSign
? packFloat64( 1, 0x3FF, 0, 0 )
1623 : packFloat64( 0, 0, 0, 0 );
1624 case float_round_up
:
1626 aSign
? packFloat64( 1, 0, 0, 0 )
1627 : packFloat64( 0, 0x3FF, 0, 0 );
1629 return packFloat64( aSign
, 0, 0, 0 );
1632 lastBitMask
<<= 0x413 - aExp
;
1633 roundBitsMask
= lastBitMask
- 1;
1636 roundingMode
= float_rounding_mode
;
1637 if ( roundingMode
== float_round_nearest_even
) {
1638 z
.high
+= lastBitMask
>>1;
1639 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
1640 z
.high
&= ~ lastBitMask
;
1643 else if ( roundingMode
!= float_round_to_zero
) {
1644 if ( extractFloat64Sign( z
)
1645 ^ ( roundingMode
== float_round_up
) ) {
1646 z
.high
|= ( a
.low
!= 0 );
1647 z
.high
+= roundBitsMask
;
1650 z
.high
&= ~ roundBitsMask
;
1652 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
1653 float_exception_flags
|= float_flag_inexact
;
1661 -------------------------------------------------------------------------------
1662 Returns the result of adding the absolute values of the double-precision
1663 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1664 before being returned. `zSign' is ignored if the result is a NaN.
1665 The addition is performed according to the IEC/IEEE Standard for Binary
1666 Floating-Point Arithmetic.
1667 -------------------------------------------------------------------------------
1669 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1671 int16 aExp
, bExp
, zExp
;
1672 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
1675 aSig1
= extractFloat64Frac1( a
);
1676 aSig0
= extractFloat64Frac0( a
);
1677 aExp
= extractFloat64Exp( a
);
1678 bSig1
= extractFloat64Frac1( b
);
1679 bSig0
= extractFloat64Frac0( b
);
1680 bExp
= extractFloat64Exp( b
);
1681 expDiff
= aExp
- bExp
;
1682 if ( 0 < expDiff
) {
1683 if ( aExp
== 0x7FF ) {
1684 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1691 bSig0
|= 0x00100000;
1693 shift64ExtraRightJamming(
1694 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
1697 else if ( expDiff
< 0 ) {
1698 if ( bExp
== 0x7FF ) {
1699 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1700 return packFloat64( zSign
, 0x7FF, 0, 0 );
1706 aSig0
|= 0x00100000;
1708 shift64ExtraRightJamming(
1709 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
1713 if ( aExp
== 0x7FF ) {
1714 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
1715 return propagateFloat64NaN( a
, b
);
1719 add64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1720 if ( aExp
== 0 ) return packFloat64( zSign
, 0, zSig0
, zSig1
);
1722 zSig0
|= 0x00200000;
1726 aSig0
|= 0x00100000;
1727 add64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1729 if ( zSig0
< 0x00200000 ) goto roundAndPack
;
1732 shift64ExtraRightJamming( zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
1734 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1739 -------------------------------------------------------------------------------
1740 Returns the result of subtracting the absolute values of the double-
1741 precision floating-point values `a' and `b'. If `zSign' is 1, the
1742 difference is negated before being returned. `zSign' is ignored if the
1743 result is a NaN. The subtraction is performed according to the IEC/IEEE
1744 Standard for Binary Floating-Point Arithmetic.
1745 -------------------------------------------------------------------------------
1747 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1749 int16 aExp
, bExp
, zExp
;
1750 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
1753 aSig1
= extractFloat64Frac1( a
);
1754 aSig0
= extractFloat64Frac0( a
);
1755 aExp
= extractFloat64Exp( a
);
1756 bSig1
= extractFloat64Frac1( b
);
1757 bSig0
= extractFloat64Frac0( b
);
1758 bExp
= extractFloat64Exp( b
);
1759 expDiff
= aExp
- bExp
;
1760 shortShift64Left( aSig0
, aSig1
, 10, &aSig0
, &aSig1
);
1761 shortShift64Left( bSig0
, bSig1
, 10, &bSig0
, &bSig1
);
1762 if ( 0 < expDiff
) goto aExpBigger
;
1763 if ( expDiff
< 0 ) goto bExpBigger
;
1764 if ( aExp
== 0x7FF ) {
1765 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
1766 return propagateFloat64NaN( a
, b
);
1768 float_raise( float_flag_invalid
);
1769 return float64_default_nan
;
1775 if ( bSig0
< aSig0
) goto aBigger
;
1776 if ( aSig0
< bSig0
) goto bBigger
;
1777 if ( bSig1
< aSig1
) goto aBigger
;
1778 if ( aSig1
< bSig1
) goto bBigger
;
1779 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0, 0 );
1781 if ( bExp
== 0x7FF ) {
1782 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1783 return packFloat64( zSign
^ 1, 0x7FF, 0, 0 );
1789 aSig0
|= 0x40000000;
1791 shift64RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
1792 bSig0
|= 0x40000000;
1794 sub64( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
1797 goto normalizeRoundAndPack
;
1799 if ( aExp
== 0x7FF ) {
1800 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1807 bSig0
|= 0x40000000;
1809 shift64RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
1810 aSig0
|= 0x40000000;
1812 sub64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1814 normalizeRoundAndPack
:
1816 return normalizeRoundAndPackFloat64( zSign
, zExp
- 10, zSig0
, zSig1
);
1821 -------------------------------------------------------------------------------
1822 Returns the result of adding the double-precision floating-point values `a'
1823 and `b'. The operation is performed according to the IEC/IEEE Standard for
1824 Binary Floating-Point Arithmetic.
1825 -------------------------------------------------------------------------------
1827 float64
float64_add( float64 a
, float64 b
)
1831 aSign
= extractFloat64Sign( a
);
1832 bSign
= extractFloat64Sign( b
);
1833 if ( aSign
== bSign
) {
1834 return addFloat64Sigs( a
, b
, aSign
);
1837 return subFloat64Sigs( a
, b
, aSign
);
1843 -------------------------------------------------------------------------------
1844 Returns the result of subtracting the double-precision floating-point values
1845 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1846 for Binary Floating-Point Arithmetic.
1847 -------------------------------------------------------------------------------
1849 float64
float64_sub( float64 a
, float64 b
)
1853 aSign
= extractFloat64Sign( a
);
1854 bSign
= extractFloat64Sign( b
);
1855 if ( aSign
== bSign
) {
1856 return subFloat64Sigs( a
, b
, aSign
);
1859 return addFloat64Sigs( a
, b
, aSign
);
1865 -------------------------------------------------------------------------------
1866 Returns the result of multiplying the double-precision floating-point values
1867 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1868 for Binary Floating-Point Arithmetic.
1869 -------------------------------------------------------------------------------
1871 float64
float64_mul( float64 a
, float64 b
)
1873 flag aSign
, bSign
, zSign
;
1874 int16 aExp
, bExp
, zExp
;
1875 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
1877 aSig1
= extractFloat64Frac1( a
);
1878 aSig0
= extractFloat64Frac0( a
);
1879 aExp
= extractFloat64Exp( a
);
1880 aSign
= extractFloat64Sign( a
);
1881 bSig1
= extractFloat64Frac1( b
);
1882 bSig0
= extractFloat64Frac0( b
);
1883 bExp
= extractFloat64Exp( b
);
1884 bSign
= extractFloat64Sign( b
);
1885 zSign
= aSign
^ bSign
;
1886 if ( aExp
== 0x7FF ) {
1887 if ( ( aSig0
| aSig1
)
1888 || ( ( bExp
== 0x7FF ) && ( bSig0
| bSig1
) ) ) {
1889 return propagateFloat64NaN( a
, b
);
1891 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
1892 return packFloat64( zSign
, 0x7FF, 0, 0 );
1894 if ( bExp
== 0x7FF ) {
1895 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1896 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
1898 float_raise( float_flag_invalid
);
1899 return float64_default_nan
;
1901 return packFloat64( zSign
, 0x7FF, 0, 0 );
1904 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1905 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
1908 if ( ( bSig0
| bSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1909 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
1911 zExp
= aExp
+ bExp
- 0x400;
1912 aSig0
|= 0x00100000;
1913 shortShift64Left( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
1914 mul64To128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
1915 add64( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
1916 zSig2
|= ( zSig3
!= 0 );
1917 if ( 0x00200000 <= zSig0
) {
1918 shift64ExtraRightJamming(
1919 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
1922 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1927 -------------------------------------------------------------------------------
1928 Returns the result of dividing the double-precision floating-point value `a'
1929 by the corresponding value `b'. The operation is performed according to the
1930 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1931 -------------------------------------------------------------------------------
1933 float64
float64_div( float64 a
, float64 b
)
1935 flag aSign
, bSign
, zSign
;
1936 int16 aExp
, bExp
, zExp
;
1937 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
1938 bits32 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
1940 aSig1
= extractFloat64Frac1( a
);
1941 aSig0
= extractFloat64Frac0( a
);
1942 aExp
= extractFloat64Exp( a
);
1943 aSign
= extractFloat64Sign( a
);
1944 bSig1
= extractFloat64Frac1( b
);
1945 bSig0
= extractFloat64Frac0( b
);
1946 bExp
= extractFloat64Exp( b
);
1947 bSign
= extractFloat64Sign( b
);
1948 zSign
= aSign
^ bSign
;
1949 if ( aExp
== 0x7FF ) {
1950 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1951 if ( bExp
== 0x7FF ) {
1952 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1955 return packFloat64( zSign
, 0x7FF, 0, 0 );
1957 if ( bExp
== 0x7FF ) {
1958 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1959 return packFloat64( zSign
, 0, 0, 0 );
1962 if ( ( bSig0
| bSig1
) == 0 ) {
1963 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
1965 float_raise( float_flag_invalid
);
1966 return float64_default_nan
;
1968 float_raise( float_flag_divbyzero
);
1969 return packFloat64( zSign
, 0x7FF, 0, 0 );
1971 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
1974 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1975 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
1977 zExp
= aExp
- bExp
+ 0x3FD;
1978 shortShift64Left( aSig0
| 0x00100000, aSig1
, 11, &aSig0
, &aSig1
);
1979 shortShift64Left( bSig0
| 0x00100000, bSig1
, 11, &bSig0
, &bSig1
);
1980 if ( le64( bSig0
, bSig1
, aSig0
, aSig1
) ) {
1981 shift64Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
1984 zSig0
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
1985 mul64By32To96( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
1986 sub96( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
1987 while ( (sbits32
) rem0
< 0 ) {
1989 add96( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
1991 zSig1
= estimateDiv64To32( rem1
, rem2
, bSig0
);
1992 if ( ( zSig1
& 0x3FF ) <= 4 ) {
1993 mul64By32To96( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
1994 sub96( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
1995 while ( (sbits32
) rem1
< 0 ) {
1997 add96( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
1999 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2001 shift64ExtraRightJamming( zSig0
, zSig1
, 0, 11, &zSig0
, &zSig1
, &zSig2
);
2002 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
2006 #ifndef SOFTFLOAT_FOR_GCC
2008 -------------------------------------------------------------------------------
2009 Returns the remainder of the double-precision floating-point value `a'
2010 with respect to the corresponding value `b'. The operation is performed
2011 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2012 -------------------------------------------------------------------------------
2014 float64
float64_rem( float64 a
, float64 b
)
2016 flag aSign
, bSign
, zSign
;
2017 int16 aExp
, bExp
, expDiff
;
2018 bits32 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
2019 bits32 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
2023 aSig1
= extractFloat64Frac1( a
);
2024 aSig0
= extractFloat64Frac0( a
);
2025 aExp
= extractFloat64Exp( a
);
2026 aSign
= extractFloat64Sign( a
);
2027 bSig1
= extractFloat64Frac1( b
);
2028 bSig0
= extractFloat64Frac0( b
);
2029 bExp
= extractFloat64Exp( b
);
2030 bSign
= extractFloat64Sign( b
);
2031 if ( aExp
== 0x7FF ) {
2032 if ( ( aSig0
| aSig1
)
2033 || ( ( bExp
== 0x7FF ) && ( bSig0
| bSig1
) ) ) {
2034 return propagateFloat64NaN( a
, b
);
2038 if ( bExp
== 0x7FF ) {
2039 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
2043 if ( ( bSig0
| bSig1
) == 0 ) {
2045 float_raise( float_flag_invalid
);
2046 return float64_default_nan
;
2048 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
2051 if ( ( aSig0
| aSig1
) == 0 ) return a
;
2052 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2054 expDiff
= aExp
- bExp
;
2055 if ( expDiff
< -1 ) return a
;
2057 aSig0
| 0x00100000, aSig1
, 11 - ( expDiff
< 0 ), &aSig0
, &aSig1
);
2058 shortShift64Left( bSig0
| 0x00100000, bSig1
, 11, &bSig0
, &bSig1
);
2059 q
= le64( bSig0
, bSig1
, aSig0
, aSig1
);
2060 if ( q
) sub64( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2062 while ( 0 < expDiff
) {
2063 q
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
2064 q
= ( 4 < q
) ? q
- 4 : 0;
2065 mul64By32To96( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2066 shortShift96Left( term0
, term1
, term2
, 29, &term1
, &term2
, &allZero
);
2067 shortShift64Left( aSig0
, aSig1
, 29, &aSig0
, &allZero
);
2068 sub64( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
2071 if ( -32 < expDiff
) {
2072 q
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
2073 q
= ( 4 < q
) ? q
- 4 : 0;
2075 shift64Right( bSig0
, bSig1
, 8, &bSig0
, &bSig1
);
2077 if ( expDiff
< 0 ) {
2078 shift64Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
2081 shortShift64Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
2083 mul64By32To96( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2084 sub64( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
2087 shift64Right( aSig0
, aSig1
, 8, &aSig0
, &aSig1
);
2088 shift64Right( bSig0
, bSig1
, 8, &bSig0
, &bSig1
);
2091 alternateASig0
= aSig0
;
2092 alternateASig1
= aSig1
;
2094 sub64( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2095 } while ( 0 <= (sbits32
) aSig0
);
2097 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
2098 if ( ( sigMean0
< 0 )
2099 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
2100 aSig0
= alternateASig0
;
2101 aSig1
= alternateASig1
;
2103 zSign
= ( (sbits32
) aSig0
< 0 );
2104 if ( zSign
) sub64( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
2106 normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
2111 #ifndef SOFTFLOAT_FOR_GCC
2113 -------------------------------------------------------------------------------
2114 Returns the square root of the double-precision floating-point value `a'.
2115 The operation is performed according to the IEC/IEEE Standard for Binary
2116 Floating-Point Arithmetic.
2117 -------------------------------------------------------------------------------
2119 float64
float64_sqrt( float64 a
)
2123 bits32 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
2124 bits32 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
2127 aSig1
= extractFloat64Frac1( a
);
2128 aSig0
= extractFloat64Frac0( a
);
2129 aExp
= extractFloat64Exp( a
);
2130 aSign
= extractFloat64Sign( a
);
2131 if ( aExp
== 0x7FF ) {
2132 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, a
);
2133 if ( ! aSign
) return a
;
2137 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
2139 float_raise( float_flag_invalid
);
2140 return float64_default_nan
;
2143 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( 0, 0, 0, 0 );
2144 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2146 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2147 aSig0
|= 0x00100000;
2148 shortShift64Left( aSig0
, aSig1
, 11, &term0
, &term1
);
2149 zSig0
= ( estimateSqrt32( aExp
, term0
)>>1 ) + 1;
2150 if ( zSig0
== 0 ) zSig0
= 0x7FFFFFFF;
2151 doubleZSig0
= zSig0
+ zSig0
;
2152 shortShift64Left( aSig0
, aSig1
, 9 - ( aExp
& 1 ), &aSig0
, &aSig1
);
2153 mul32To64( zSig0
, zSig0
, &term0
, &term1
);
2154 sub64( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
2155 while ( (sbits32
) rem0
< 0 ) {
2158 add64( rem0
, rem1
, 0, doubleZSig0
| 1, &rem0
, &rem1
);
2160 zSig1
= estimateDiv64To32( rem1
, 0, doubleZSig0
);
2161 if ( ( zSig1
& 0x1FF ) <= 5 ) {
2162 if ( zSig1
== 0 ) zSig1
= 1;
2163 mul32To64( doubleZSig0
, zSig1
, &term1
, &term2
);
2164 sub64( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
2165 mul32To64( zSig1
, zSig1
, &term2
, &term3
);
2166 sub96( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2167 while ( (sbits32
) rem1
< 0 ) {
2169 shortShift64Left( 0, zSig1
, 1, &term2
, &term3
);
2171 term2
|= doubleZSig0
;
2172 add96( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2174 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2176 shift64ExtraRightJamming( zSig0
, zSig1
, 0, 10, &zSig0
, &zSig1
, &zSig2
);
2177 return roundAndPackFloat64( 0, zExp
, zSig0
, zSig1
, zSig2
);
2183 -------------------------------------------------------------------------------
2184 Returns 1 if the double-precision floating-point value `a' is equal to
2185 the corresponding value `b', and 0 otherwise. The comparison is performed
2186 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2187 -------------------------------------------------------------------------------
2189 flag
float64_eq( float64 a
, float64 b
)
2192 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2193 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2194 || ( ( extractFloat64Exp( b
) == 0x7FF )
2195 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2197 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2198 float_raise( float_flag_invalid
);
2202 return ( a
== b
) ||
2203 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) == 0 );
2208 -------------------------------------------------------------------------------
2209 Returns 1 if the double-precision floating-point value `a' is less than
2210 or equal to the corresponding value `b', and 0 otherwise. The comparison
2211 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2213 -------------------------------------------------------------------------------
2215 flag
float64_le( float64 a
, float64 b
)
2219 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2220 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2221 || ( ( extractFloat64Exp( b
) == 0x7FF )
2222 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2224 float_raise( float_flag_invalid
);
2227 aSign
= extractFloat64Sign( a
);
2228 bSign
= extractFloat64Sign( b
);
2229 if ( aSign
!= bSign
)
2231 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) ==
2233 return ( a
== b
) ||
2234 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
2238 -------------------------------------------------------------------------------
2239 Returns 1 if the double-precision floating-point value `a' is less than
2240 the corresponding value `b', and 0 otherwise. The comparison is performed
2241 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2242 -------------------------------------------------------------------------------
2244 flag
float64_lt( float64 a
, float64 b
)
2248 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2249 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2250 || ( ( extractFloat64Exp( b
) == 0x7FF )
2251 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2253 float_raise( float_flag_invalid
);
2256 aSign
= extractFloat64Sign( a
);
2257 bSign
= extractFloat64Sign( b
);
2258 if ( aSign
!= bSign
)
2260 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) !=
2262 return ( a
!= b
) &&
2263 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
2267 #ifndef SOFTFLOAT_FOR_GCC
2269 -------------------------------------------------------------------------------
2270 Returns 1 if the double-precision floating-point value `a' is equal to
2271 the corresponding value `b', and 0 otherwise. The invalid exception is
2272 raised if either operand is a NaN. Otherwise, the comparison is performed
2273 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2274 -------------------------------------------------------------------------------
2276 flag
float64_eq_signaling( float64 a
, float64 b
)
2279 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2280 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2281 || ( ( extractFloat64Exp( b
) == 0x7FF )
2282 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2284 float_raise( float_flag_invalid
);
2287 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2292 -------------------------------------------------------------------------------
2293 Returns 1 if the double-precision floating-point value `a' is less than or
2294 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2295 cause an exception. Otherwise, the comparison is performed according to the
2296 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2297 -------------------------------------------------------------------------------
2299 flag
float64_le_quiet( float64 a
, float64 b
)
2303 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2304 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2305 || ( ( extractFloat64Exp( b
) == 0x7FF )
2306 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2308 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2309 float_raise( float_flag_invalid
);
2313 aSign
= extractFloat64Sign( a
);
2314 bSign
= extractFloat64Sign( b
);
2315 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2316 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2321 -------------------------------------------------------------------------------
2322 Returns 1 if the double-precision floating-point value `a' is less than
2323 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2324 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2325 Standard for Binary Floating-Point Arithmetic.
2326 -------------------------------------------------------------------------------
2328 flag
float64_lt_quiet( float64 a
, float64 b
)
2332 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2333 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2334 || ( ( extractFloat64Exp( b
) == 0x7FF )
2335 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2337 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2338 float_raise( float_flag_invalid
);
2342 aSign
= extractFloat64Sign( a
);
2343 bSign
= extractFloat64Sign( b
);
2344 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2345 return ( a
!= b
) && ( aSign
^ ( a
< b
) );