1 /* $NetBSD: softfloat.c,v 1.5 2007/11/08 21:31:04 martin Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 #ifdef SOFTFLOAT_FOR_GCC
48 #include "softfloat-for-gcc.h"
52 #include "softfloat.h"
55 * Conversions between floats as stored in memory and floats as
58 #ifndef FLOAT64_DEMANGLE
59 #define FLOAT64_DEMANGLE(a) (a)
61 #ifndef FLOAT64_MANGLE
62 #define FLOAT64_MANGLE(a) (a)
66 -------------------------------------------------------------------------------
67 Floating-point rounding mode, extended double-precision rounding precision,
69 -------------------------------------------------------------------------------
71 fp_rnd float_rounding_mode
= float_round_nearest_even
;
72 fp_except float_exception_flags
= 0;
74 int8 floatx80_rounding_precision
= 80;
78 -------------------------------------------------------------------------------
79 Primitive arithmetic functions, including multi-word arithmetic, and
80 division and square root approximations. (Can be specialized to target if
82 -------------------------------------------------------------------------------
84 #include "softfloat-macros"
87 -------------------------------------------------------------------------------
88 Functions and definitions to determine: (1) whether tininess for underflow
89 is detected before or after rounding by default, (2) what (if anything)
90 happens when exceptions are raised, (3) how signaling NaNs are distinguished
91 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
92 are propagated from function inputs to output. These details are target-
94 -------------------------------------------------------------------------------
96 #include "softfloat-specialize"
98 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
100 -------------------------------------------------------------------------------
101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102 and 7, and returns the properly rounded 32-bit integer corresponding to the
103 input. If `zSign' is 1, the input is negated before being converted to an
104 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
105 is simply rounded to an integer, with the inexact exception raised if the
106 input cannot be represented exactly as an integer. However, if the fixed-
107 point input is too large, the invalid exception is raised and the largest
108 positive or negative integer is returned.
109 -------------------------------------------------------------------------------
111 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
114 flag roundNearestEven
;
115 int8 roundIncrement
, roundBits
;
118 roundingMode
= float_rounding_mode
;
119 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
120 roundIncrement
= 0x40;
121 if ( ! roundNearestEven
) {
122 if ( roundingMode
== float_round_to_zero
) {
126 roundIncrement
= 0x7F;
128 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
131 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
135 roundBits
= absZ
& 0x7F;
136 absZ
= ( absZ
+ roundIncrement
)>>7;
137 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
139 if ( zSign
) z
= - z
;
140 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
141 float_raise( float_flag_invalid
);
142 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
144 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
150 -------------------------------------------------------------------------------
151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 `absZ1', with binary point between bits 63 and 64 (between the input words),
153 and returns the properly rounded 64-bit integer corresponding to the input.
154 If `zSign' is 1, the input is negated before being converted to an integer.
155 Ordinarily, the fixed-point input is simply rounded to an integer, with
156 the inexact exception raised if the input cannot be represented exactly as
157 an integer. However, if the fixed-point input is too large, the invalid
158 exception is raised and the largest positive or negative integer is
160 -------------------------------------------------------------------------------
162 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1
)
165 flag roundNearestEven
, increment
;
168 roundingMode
= float_rounding_mode
;
169 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
170 increment
= ( (sbits64
) absZ1
< 0 );
171 if ( ! roundNearestEven
) {
172 if ( roundingMode
== float_round_to_zero
) {
177 increment
= ( roundingMode
== float_round_down
) && absZ1
;
180 increment
= ( roundingMode
== float_round_up
) && absZ1
;
186 if ( absZ0
== 0 ) goto overflow
;
187 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
190 if ( zSign
) z
= - z
;
191 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
193 float_raise( float_flag_invalid
);
195 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
198 if ( absZ1
) float_exception_flags
|= float_flag_inexact
;
205 -------------------------------------------------------------------------------
206 Returns the fraction bits of the single-precision floating-point value `a'.
207 -------------------------------------------------------------------------------
209 INLINE bits32
extractFloat32Frac( float32 a
)
212 return a
& 0x007FFFFF;
217 -------------------------------------------------------------------------------
218 Returns the exponent bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
221 INLINE int16
extractFloat32Exp( float32 a
)
224 return ( a
>>23 ) & 0xFF;
229 -------------------------------------------------------------------------------
230 Returns the sign bit of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
233 INLINE flag
extractFloat32Sign( float32 a
)
241 -------------------------------------------------------------------------------
242 Normalizes the subnormal single-precision floating-point value represented
243 by the denormalized significand `aSig'. The normalized exponent and
244 significand are stored at the locations pointed to by `zExpPtr' and
245 `zSigPtr', respectively.
246 -------------------------------------------------------------------------------
249 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
253 shiftCount
= countLeadingZeros32( aSig
) - 8;
254 *zSigPtr
= aSig
<<shiftCount
;
255 *zExpPtr
= 1 - shiftCount
;
260 -------------------------------------------------------------------------------
261 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
262 single-precision floating-point value, returning the result. After being
263 shifted into the proper positions, the three fields are simply added
264 together to form the result. This means that any integer portion of `zSig'
265 will be added into the exponent. Since a properly normalized significand
266 will have an integer portion equal to 1, the `zExp' input should be 1 less
267 than the desired result exponent whenever `zSig' is a complete, normalized
269 -------------------------------------------------------------------------------
271 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
274 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
279 -------------------------------------------------------------------------------
280 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281 and significand `zSig', and returns the proper single-precision floating-
282 point value corresponding to the abstract input. Ordinarily, the abstract
283 value is simply rounded and packed into the single-precision format, with
284 the inexact exception raised if the abstract input cannot be represented
285 exactly. However, if the abstract value is too large, the overflow and
286 inexact exceptions are raised and an infinity or maximal finite value is
287 returned. If the abstract value is too small, the input value is rounded to
288 a subnormal number, and the underflow and inexact exceptions are raised if
289 the abstract input cannot be represented exactly as a subnormal single-
290 precision floating-point number.
291 The input significand `zSig' has its binary point between bits 30
292 and 29, which is 7 bits to the left of the usual location. This shifted
293 significand must be normalized or smaller. If `zSig' is not normalized,
294 `zExp' must be 0; in that case, the result returned is a subnormal number,
295 and it must not require rounding. In the usual case that `zSig' is
296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
297 The handling of underflow and overflow follows the IEC/IEEE Standard for
298 Binary Floating-Point Arithmetic.
299 -------------------------------------------------------------------------------
301 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
304 flag roundNearestEven
;
305 int8 roundIncrement
, roundBits
;
308 roundingMode
= float_rounding_mode
;
309 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
310 roundIncrement
= 0x40;
311 if ( ! roundNearestEven
) {
312 if ( roundingMode
== float_round_to_zero
) {
316 roundIncrement
= 0x7F;
318 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
321 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
325 roundBits
= zSig
& 0x7F;
326 if ( 0xFD <= (bits16
) zExp
) {
328 || ( ( zExp
== 0xFD )
329 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
331 float_raise( float_flag_overflow
| float_flag_inexact
);
332 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
336 ( float_detect_tininess
== float_tininess_before_rounding
)
338 || ( zSig
+ roundIncrement
< 0x80000000 );
339 shift32RightJamming( zSig
, - zExp
, &zSig
);
341 roundBits
= zSig
& 0x7F;
342 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
345 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
346 zSig
= ( zSig
+ roundIncrement
)>>7;
347 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
348 if ( zSig
== 0 ) zExp
= 0;
349 return packFloat32( zSign
, zExp
, zSig
);
354 -------------------------------------------------------------------------------
355 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356 and significand `zSig', and returns the proper single-precision floating-
357 point value corresponding to the abstract input. This routine is just like
358 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
359 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
360 floating-point exponent.
361 -------------------------------------------------------------------------------
364 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
368 shiftCount
= countLeadingZeros32( zSig
) - 1;
369 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
374 -------------------------------------------------------------------------------
375 Returns the fraction bits of the double-precision floating-point value `a'.
376 -------------------------------------------------------------------------------
378 INLINE bits64
extractFloat64Frac( float64 a
)
381 return FLOAT64_DEMANGLE(a
) & LIT64( 0x000FFFFFFFFFFFFF );
386 -------------------------------------------------------------------------------
387 Returns the exponent bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
390 INLINE int16
extractFloat64Exp( float64 a
)
393 return ( FLOAT64_DEMANGLE(a
)>>52 ) & 0x7FF;
398 -------------------------------------------------------------------------------
399 Returns the sign bit of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
402 INLINE flag
extractFloat64Sign( float64 a
)
405 return FLOAT64_DEMANGLE(a
)>>63;
410 -------------------------------------------------------------------------------
411 Normalizes the subnormal double-precision floating-point value represented
412 by the denormalized significand `aSig'. The normalized exponent and
413 significand are stored at the locations pointed to by `zExpPtr' and
414 `zSigPtr', respectively.
415 -------------------------------------------------------------------------------
418 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
422 shiftCount
= countLeadingZeros64( aSig
) - 11;
423 *zSigPtr
= aSig
<<shiftCount
;
424 *zExpPtr
= 1 - shiftCount
;
429 -------------------------------------------------------------------------------
430 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
431 double-precision floating-point value, returning the result. After being
432 shifted into the proper positions, the three fields are simply added
433 together to form the result. This means that any integer portion of `zSig'
434 will be added into the exponent. Since a properly normalized significand
435 will have an integer portion equal to 1, the `zExp' input should be 1 less
436 than the desired result exponent whenever `zSig' is a complete, normalized
438 -------------------------------------------------------------------------------
440 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
443 return FLOAT64_MANGLE( ( ( (bits64
) zSign
)<<63 ) +
444 ( ( (bits64
) zExp
)<<52 ) + zSig
);
449 -------------------------------------------------------------------------------
450 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451 and significand `zSig', and returns the proper double-precision floating-
452 point value corresponding to the abstract input. Ordinarily, the abstract
453 value is simply rounded and packed into the double-precision format, with
454 the inexact exception raised if the abstract input cannot be represented
455 exactly. However, if the abstract value is too large, the overflow and
456 inexact exceptions are raised and an infinity or maximal finite value is
457 returned. If the abstract value is too small, the input value is rounded to
458 a subnormal number, and the underflow and inexact exceptions are raised if
459 the abstract input cannot be represented exactly as a subnormal double-
460 precision floating-point number.
461 The input significand `zSig' has its binary point between bits 62
462 and 61, which is 10 bits to the left of the usual location. This shifted
463 significand must be normalized or smaller. If `zSig' is not normalized,
464 `zExp' must be 0; in that case, the result returned is a subnormal number,
465 and it must not require rounding. In the usual case that `zSig' is
466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
467 The handling of underflow and overflow follows the IEC/IEEE Standard for
468 Binary Floating-Point Arithmetic.
469 -------------------------------------------------------------------------------
471 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
474 flag roundNearestEven
;
475 int16 roundIncrement
, roundBits
;
478 roundingMode
= float_rounding_mode
;
479 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
480 roundIncrement
= 0x200;
481 if ( ! roundNearestEven
) {
482 if ( roundingMode
== float_round_to_zero
) {
486 roundIncrement
= 0x3FF;
488 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
491 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
495 roundBits
= zSig
& 0x3FF;
496 if ( 0x7FD <= (bits16
) zExp
) {
497 if ( ( 0x7FD < zExp
)
498 || ( ( zExp
== 0x7FD )
499 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
501 float_raise( float_flag_overflow
| float_flag_inexact
);
502 return FLOAT64_MANGLE(
503 FLOAT64_DEMANGLE(packFloat64( zSign
, 0x7FF, 0 )) -
504 ( roundIncrement
== 0 ));
508 ( float_detect_tininess
== float_tininess_before_rounding
)
510 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
511 shift64RightJamming( zSig
, - zExp
, &zSig
);
513 roundBits
= zSig
& 0x3FF;
514 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
517 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
518 zSig
= ( zSig
+ roundIncrement
)>>10;
519 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
520 if ( zSig
== 0 ) zExp
= 0;
521 return packFloat64( zSign
, zExp
, zSig
);
526 -------------------------------------------------------------------------------
527 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528 and significand `zSig', and returns the proper double-precision floating-
529 point value corresponding to the abstract input. This routine is just like
530 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
531 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
532 floating-point exponent.
533 -------------------------------------------------------------------------------
536 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
540 shiftCount
= countLeadingZeros64( zSig
) - 1;
541 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
548 -------------------------------------------------------------------------------
549 Returns the fraction bits of the extended double-precision floating-point
551 -------------------------------------------------------------------------------
553 INLINE bits64
extractFloatx80Frac( floatx80 a
)
561 -------------------------------------------------------------------------------
562 Returns the exponent bits of the extended double-precision floating-point
564 -------------------------------------------------------------------------------
566 INLINE int32
extractFloatx80Exp( floatx80 a
)
569 return a
.high
& 0x7FFF;
574 -------------------------------------------------------------------------------
575 Returns the sign bit of the extended double-precision floating-point value
577 -------------------------------------------------------------------------------
579 INLINE flag
extractFloatx80Sign( floatx80 a
)
587 -------------------------------------------------------------------------------
588 Normalizes the subnormal extended double-precision floating-point value
589 represented by the denormalized significand `aSig'. The normalized exponent
590 and significand are stored at the locations pointed to by `zExpPtr' and
591 `zSigPtr', respectively.
592 -------------------------------------------------------------------------------
595 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
599 shiftCount
= countLeadingZeros64( aSig
);
600 *zSigPtr
= aSig
<<shiftCount
;
601 *zExpPtr
= 1 - shiftCount
;
606 -------------------------------------------------------------------------------
607 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
608 extended double-precision floating-point value, returning the result.
609 -------------------------------------------------------------------------------
611 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
616 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
622 -------------------------------------------------------------------------------
623 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
624 and extended significand formed by the concatenation of `zSig0' and `zSig1',
625 and returns the proper extended double-precision floating-point value
626 corresponding to the abstract input. Ordinarily, the abstract value is
627 rounded and packed into the extended double-precision format, with the
628 inexact exception raised if the abstract input cannot be represented
629 exactly. However, if the abstract value is too large, the overflow and
630 inexact exceptions are raised and an infinity or maximal finite value is
631 returned. If the abstract value is too small, the input value is rounded to
632 a subnormal number, and the underflow and inexact exceptions are raised if
633 the abstract input cannot be represented exactly as a subnormal extended
634 double-precision floating-point number.
635 If `roundingPrecision' is 32 or 64, the result is rounded to the same
636 number of bits as single or double precision, respectively. Otherwise, the
637 result is rounded to the full precision of the extended double-precision
639 The input significand must be normalized or smaller. If the input
640 significand is not normalized, `zExp' must be 0; in that case, the result
641 returned is a subnormal number, and it must not require rounding. The
642 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
643 Floating-Point Arithmetic.
644 -------------------------------------------------------------------------------
647 roundAndPackFloatx80(
648 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
652 flag roundNearestEven
, increment
, isTiny
;
653 int64 roundIncrement
, roundMask
, roundBits
;
655 roundingMode
= float_rounding_mode
;
656 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
657 if ( roundingPrecision
== 80 ) goto precision80
;
658 if ( roundingPrecision
== 64 ) {
659 roundIncrement
= LIT64( 0x0000000000000400 );
660 roundMask
= LIT64( 0x00000000000007FF );
662 else if ( roundingPrecision
== 32 ) {
663 roundIncrement
= LIT64( 0x0000008000000000 );
664 roundMask
= LIT64( 0x000000FFFFFFFFFF );
669 zSig0
|= ( zSig1
!= 0 );
670 if ( ! roundNearestEven
) {
671 if ( roundingMode
== float_round_to_zero
) {
675 roundIncrement
= roundMask
;
677 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
680 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
684 roundBits
= zSig0
& roundMask
;
685 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
686 if ( ( 0x7FFE < zExp
)
687 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
693 ( float_detect_tininess
== float_tininess_before_rounding
)
695 || ( zSig0
<= zSig0
+ roundIncrement
);
696 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
698 roundBits
= zSig0
& roundMask
;
699 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
700 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
701 zSig0
+= roundIncrement
;
702 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
703 roundIncrement
= roundMask
+ 1;
704 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
705 roundMask
|= roundIncrement
;
707 zSig0
&= ~ roundMask
;
708 return packFloatx80( zSign
, zExp
, zSig0
);
711 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
712 zSig0
+= roundIncrement
;
713 if ( zSig0
< roundIncrement
) {
715 zSig0
= LIT64( 0x8000000000000000 );
717 roundIncrement
= roundMask
+ 1;
718 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
719 roundMask
|= roundIncrement
;
721 zSig0
&= ~ roundMask
;
722 if ( zSig0
== 0 ) zExp
= 0;
723 return packFloatx80( zSign
, zExp
, zSig0
);
725 increment
= ( (sbits64
) zSig1
< 0 );
726 if ( ! roundNearestEven
) {
727 if ( roundingMode
== float_round_to_zero
) {
732 increment
= ( roundingMode
== float_round_down
) && zSig1
;
735 increment
= ( roundingMode
== float_round_up
) && zSig1
;
739 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
740 if ( ( 0x7FFE < zExp
)
741 || ( ( zExp
== 0x7FFE )
742 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
748 float_raise( float_flag_overflow
| float_flag_inexact
);
749 if ( ( roundingMode
== float_round_to_zero
)
750 || ( zSign
&& ( roundingMode
== float_round_up
) )
751 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
753 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
755 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
759 ( float_detect_tininess
== float_tininess_before_rounding
)
762 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
763 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
765 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
766 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
767 if ( roundNearestEven
) {
768 increment
= ( (sbits64
) zSig1
< 0 );
772 increment
= ( roundingMode
== float_round_down
) && zSig1
;
775 increment
= ( roundingMode
== float_round_up
) && zSig1
;
781 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
782 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
784 return packFloatx80( zSign
, zExp
, zSig0
);
787 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
792 zSig0
= LIT64( 0x8000000000000000 );
795 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
799 if ( zSig0
== 0 ) zExp
= 0;
801 return packFloatx80( zSign
, zExp
, zSig0
);
806 -------------------------------------------------------------------------------
807 Takes an abstract floating-point value having sign `zSign', exponent
808 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809 and returns the proper extended double-precision floating-point value
810 corresponding to the abstract input. This routine is just like
811 `roundAndPackFloatx80' except that the input significand does not have to be
813 -------------------------------------------------------------------------------
816 normalizeRoundAndPackFloatx80(
817 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
827 shiftCount
= countLeadingZeros64( zSig0
);
828 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
831 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
840 -------------------------------------------------------------------------------
841 Returns the least-significant 64 fraction bits of the quadruple-precision
842 floating-point value `a'.
843 -------------------------------------------------------------------------------
845 INLINE bits64
extractFloat128Frac1( float128 a
)
853 -------------------------------------------------------------------------------
854 Returns the most-significant 48 fraction bits of the quadruple-precision
855 floating-point value `a'.
856 -------------------------------------------------------------------------------
858 INLINE bits64
extractFloat128Frac0( float128 a
)
861 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
866 -------------------------------------------------------------------------------
867 Returns the exponent bits of the quadruple-precision floating-point value
869 -------------------------------------------------------------------------------
871 INLINE int32
extractFloat128Exp( float128 a
)
874 return ( a
.high
>>48 ) & 0x7FFF;
879 -------------------------------------------------------------------------------
880 Returns the sign bit of the quadruple-precision floating-point value `a'.
881 -------------------------------------------------------------------------------
883 INLINE flag
extractFloat128Sign( float128 a
)
891 -------------------------------------------------------------------------------
892 Normalizes the subnormal quadruple-precision floating-point value
893 represented by the denormalized significand formed by the concatenation of
894 `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 significand are stored at the location pointed to by `zSig0Ptr', and the
897 least significant 64 bits of the normalized significand are stored at the
898 location pointed to by `zSig1Ptr'.
899 -------------------------------------------------------------------------------
902 normalizeFloat128Subnormal(
913 shiftCount
= countLeadingZeros64( aSig1
) - 15;
914 if ( shiftCount
< 0 ) {
915 *zSig0Ptr
= aSig1
>>( - shiftCount
);
916 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
919 *zSig0Ptr
= aSig1
<<shiftCount
;
922 *zExpPtr
= - shiftCount
- 63;
925 shiftCount
= countLeadingZeros64( aSig0
) - 15;
926 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
927 *zExpPtr
= 1 - shiftCount
;
933 -------------------------------------------------------------------------------
934 Packs the sign `zSign', the exponent `zExp', and the significand formed
935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936 floating-point value, returning the result. After being shifted into the
937 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
938 added together to form the most significant 32 bits of the result. This
939 means that any integer portion of `zSig0' will be added into the exponent.
940 Since a properly normalized significand will have an integer portion equal
941 to 1, the `zExp' input should be 1 less than the desired result exponent
942 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
944 -------------------------------------------------------------------------------
947 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
952 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
958 -------------------------------------------------------------------------------
959 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
960 and extended significand formed by the concatenation of `zSig0', `zSig1',
961 and `zSig2', and returns the proper quadruple-precision floating-point value
962 corresponding to the abstract input. Ordinarily, the abstract value is
963 simply rounded and packed into the quadruple-precision format, with the
964 inexact exception raised if the abstract input cannot be represented
965 exactly. However, if the abstract value is too large, the overflow and
966 inexact exceptions are raised and an infinity or maximal finite value is
967 returned. If the abstract value is too small, the input value is rounded to
968 a subnormal number, and the underflow and inexact exceptions are raised if
969 the abstract input cannot be represented exactly as a subnormal quadruple-
970 precision floating-point number.
971 The input significand must be normalized or smaller. If the input
972 significand is not normalized, `zExp' must be 0; in that case, the result
973 returned is a subnormal number, and it must not require rounding. In the
974 usual case that the input significand is normalized, `zExp' must be 1 less
975 than the ``true'' floating-point exponent. The handling of underflow and
976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977 -------------------------------------------------------------------------------
980 roundAndPackFloat128(
981 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2
)
984 flag roundNearestEven
, increment
, isTiny
;
986 roundingMode
= float_rounding_mode
;
987 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
988 increment
= ( (sbits64
) zSig2
< 0 );
989 if ( ! roundNearestEven
) {
990 if ( roundingMode
== float_round_to_zero
) {
995 increment
= ( roundingMode
== float_round_down
) && zSig2
;
998 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1002 if ( 0x7FFD <= (bits32
) zExp
) {
1003 if ( ( 0x7FFD < zExp
)
1004 || ( ( zExp
== 0x7FFD )
1006 LIT64( 0x0001FFFFFFFFFFFF ),
1007 LIT64( 0xFFFFFFFFFFFFFFFF ),
1014 float_raise( float_flag_overflow
| float_flag_inexact
);
1015 if ( ( roundingMode
== float_round_to_zero
)
1016 || ( zSign
&& ( roundingMode
== float_round_up
) )
1017 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1023 LIT64( 0x0000FFFFFFFFFFFF ),
1024 LIT64( 0xFFFFFFFFFFFFFFFF )
1027 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1031 ( float_detect_tininess
== float_tininess_before_rounding
)
1037 LIT64( 0x0001FFFFFFFFFFFF ),
1038 LIT64( 0xFFFFFFFFFFFFFFFF )
1040 shift128ExtraRightJamming(
1041 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1043 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
1044 if ( roundNearestEven
) {
1045 increment
= ( (sbits64
) zSig2
< 0 );
1049 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1052 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1057 if ( zSig2
) float_exception_flags
|= float_flag_inexact
;
1059 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1060 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1063 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1065 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1070 -------------------------------------------------------------------------------
1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1072 and significand formed by the concatenation of `zSig0' and `zSig1', and
1073 returns the proper quadruple-precision floating-point value corresponding
1074 to the abstract input. This routine is just like `roundAndPackFloat128'
1075 except that the input significand has fewer bits and does not have to be
1076 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1078 -------------------------------------------------------------------------------
1081 normalizeRoundAndPackFloat128(
1082 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
1092 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1093 if ( 0 <= shiftCount
) {
1095 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1098 shift128ExtraRightJamming(
1099 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1102 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1109 -------------------------------------------------------------------------------
1110 Returns the result of converting the 32-bit two's complement integer `a'
1111 to the single-precision floating-point format. The conversion is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1115 float32
int32_to_float32( int32 a
)
1119 if ( a
== 0 ) return 0;
1120 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1122 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
1127 -------------------------------------------------------------------------------
1128 Returns the result of converting the 32-bit two's complement integer `a'
1129 to the double-precision floating-point format. The conversion is performed
1130 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1131 -------------------------------------------------------------------------------
1133 float64
int32_to_float64( int32 a
)
1140 if ( a
== 0 ) return 0;
1142 absA
= zSign
? - a
: a
;
1143 shiftCount
= countLeadingZeros32( absA
) + 21;
1145 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1152 -------------------------------------------------------------------------------
1153 Returns the result of converting the 32-bit two's complement integer `a'
1154 to the extended double-precision floating-point format. The conversion
1155 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1157 -------------------------------------------------------------------------------
1159 floatx80
int32_to_floatx80( int32 a
)
1166 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1168 absA
= zSign
? - a
: a
;
1169 shiftCount
= countLeadingZeros32( absA
) + 32;
1171 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1180 -------------------------------------------------------------------------------
1181 Returns the result of converting the 32-bit two's complement integer `a' to
1182 the quadruple-precision floating-point format. The conversion is performed
1183 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 -------------------------------------------------------------------------------
1186 float128
int32_to_float128( int32 a
)
1193 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1195 absA
= zSign
? - a
: a
;
1196 shiftCount
= countLeadingZeros32( absA
) + 17;
1198 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1204 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1206 -------------------------------------------------------------------------------
1207 Returns the result of converting the 64-bit two's complement integer `a'
1208 to the single-precision floating-point format. The conversion is performed
1209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1210 -------------------------------------------------------------------------------
1212 float32
int64_to_float32( int64 a
)
1218 if ( a
== 0 ) return 0;
1220 absA
= zSign
? - a
: a
;
1221 shiftCount
= countLeadingZeros64( absA
) - 40;
1222 if ( 0 <= shiftCount
) {
1223 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1227 if ( shiftCount
< 0 ) {
1228 shift64RightJamming( absA
, - shiftCount
, &absA
);
1231 absA
<<= shiftCount
;
1233 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA
);
1239 -------------------------------------------------------------------------------
1240 Returns the result of converting the 64-bit two's complement integer `a'
1241 to the double-precision floating-point format. The conversion is performed
1242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1243 -------------------------------------------------------------------------------
1245 float64
int64_to_float64( int64 a
)
1249 if ( a
== 0 ) return 0;
1250 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1251 return packFloat64( 1, 0x43E, 0 );
1254 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a
);
1261 -------------------------------------------------------------------------------
1262 Returns the result of converting the 64-bit two's complement integer `a'
1263 to the extended double-precision floating-point format. The conversion
1264 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1266 -------------------------------------------------------------------------------
1268 floatx80
int64_to_floatx80( int64 a
)
1274 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1276 absA
= zSign
? - a
: a
;
1277 shiftCount
= countLeadingZeros64( absA
);
1278 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1284 #endif /* !SOFTFLOAT_FOR_GCC */
1289 -------------------------------------------------------------------------------
1290 Returns the result of converting the 64-bit two's complement integer `a' to
1291 the quadruple-precision floating-point format. The conversion is performed
1292 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 -------------------------------------------------------------------------------
1295 float128
int64_to_float128( int64 a
)
1301 bits64 zSig0
, zSig1
;
1303 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1305 absA
= zSign
? - a
: a
;
1306 shiftCount
= countLeadingZeros64( absA
) + 49;
1307 zExp
= 0x406E - shiftCount
;
1308 if ( 64 <= shiftCount
) {
1317 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1318 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1324 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1326 -------------------------------------------------------------------------------
1327 Returns the result of converting the single-precision floating-point value
1328 `a' to the 32-bit two's complement integer format. The conversion is
1329 performed according to the IEC/IEEE Standard for Binary Floating-Point
1330 Arithmetic---which means in particular that the conversion is rounded
1331 according to the current rounding mode. If `a' is a NaN, the largest
1332 positive integer is returned. Otherwise, if the conversion overflows, the
1333 largest integer with the same sign as `a' is returned.
1334 -------------------------------------------------------------------------------
1336 int32
float32_to_int32( float32 a
)
1339 int16 aExp
, shiftCount
;
1343 aSig
= extractFloat32Frac( a
);
1344 aExp
= extractFloat32Exp( a
);
1345 aSign
= extractFloat32Sign( a
);
1346 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1347 if ( aExp
) aSig
|= 0x00800000;
1348 shiftCount
= 0xAF - aExp
;
1351 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1352 return roundAndPackInt32( aSign
, aSig64
);
1355 #endif /* !SOFTFLOAT_FOR_GCC */
1358 -------------------------------------------------------------------------------
1359 Returns the result of converting the single-precision floating-point value
1360 `a' to the 32-bit two's complement integer format. The conversion is
1361 performed according to the IEC/IEEE Standard for Binary Floating-Point
1362 Arithmetic, except that the conversion is always rounded toward zero.
1363 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1364 the conversion overflows, the largest integer with the same sign as `a' is
1366 -------------------------------------------------------------------------------
1368 int32
float32_to_int32_round_to_zero( float32 a
)
1371 int16 aExp
, shiftCount
;
1375 aSig
= extractFloat32Frac( a
);
1376 aExp
= extractFloat32Exp( a
);
1377 aSign
= extractFloat32Sign( a
);
1378 shiftCount
= aExp
- 0x9E;
1379 if ( 0 <= shiftCount
) {
1380 if ( a
!= 0xCF000000 ) {
1381 float_raise( float_flag_invalid
);
1382 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1384 return (sbits32
) 0x80000000;
1386 else if ( aExp
<= 0x7E ) {
1387 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
1390 aSig
= ( aSig
| 0x00800000 )<<8;
1391 z
= aSig
>>( - shiftCount
);
1392 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1393 float_exception_flags
|= float_flag_inexact
;
1395 if ( aSign
) z
= - z
;
1400 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1402 -------------------------------------------------------------------------------
1403 Returns the result of converting the single-precision floating-point value
1404 `a' to the 64-bit two's complement integer format. The conversion is
1405 performed according to the IEC/IEEE Standard for Binary Floating-Point
1406 Arithmetic---which means in particular that the conversion is rounded
1407 according to the current rounding mode. If `a' is a NaN, the largest
1408 positive integer is returned. Otherwise, if the conversion overflows, the
1409 largest integer with the same sign as `a' is returned.
1410 -------------------------------------------------------------------------------
1412 int64
float32_to_int64( float32 a
)
1415 int16 aExp
, shiftCount
;
1417 bits64 aSig64
, aSigExtra
;
1419 aSig
= extractFloat32Frac( a
);
1420 aExp
= extractFloat32Exp( a
);
1421 aSign
= extractFloat32Sign( a
);
1422 shiftCount
= 0xBE - aExp
;
1423 if ( shiftCount
< 0 ) {
1424 float_raise( float_flag_invalid
);
1425 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1426 return LIT64( 0x7FFFFFFFFFFFFFFF );
1428 return (sbits64
) LIT64( 0x8000000000000000 );
1430 if ( aExp
) aSig
|= 0x00800000;
1433 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1434 return roundAndPackInt64( aSign
, aSig64
, aSigExtra
);
1439 -------------------------------------------------------------------------------
1440 Returns the result of converting the single-precision floating-point value
1441 `a' to the 64-bit two's complement integer format. The conversion is
1442 performed according to the IEC/IEEE Standard for Binary Floating-Point
1443 Arithmetic, except that the conversion is always rounded toward zero. If
1444 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1445 conversion overflows, the largest integer with the same sign as `a' is
1447 -------------------------------------------------------------------------------
1449 int64
float32_to_int64_round_to_zero( float32 a
)
1452 int16 aExp
, shiftCount
;
1457 aSig
= extractFloat32Frac( a
);
1458 aExp
= extractFloat32Exp( a
);
1459 aSign
= extractFloat32Sign( a
);
1460 shiftCount
= aExp
- 0xBE;
1461 if ( 0 <= shiftCount
) {
1462 if ( a
!= 0xDF000000 ) {
1463 float_raise( float_flag_invalid
);
1464 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1465 return LIT64( 0x7FFFFFFFFFFFFFFF );
1468 return (sbits64
) LIT64( 0x8000000000000000 );
1470 else if ( aExp
<= 0x7E ) {
1471 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
1474 aSig64
= aSig
| 0x00800000;
1476 z
= aSig64
>>( - shiftCount
);
1477 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1478 float_exception_flags
|= float_flag_inexact
;
1480 if ( aSign
) z
= - z
;
1484 #endif /* !SOFTFLOAT_FOR_GCC */
1487 -------------------------------------------------------------------------------
1488 Returns the result of converting the single-precision floating-point value
1489 `a' to the double-precision floating-point format. The conversion is
1490 performed according to the IEC/IEEE Standard for Binary Floating-Point
1492 -------------------------------------------------------------------------------
1494 float64
float32_to_float64( float32 a
)
1500 aSig
= extractFloat32Frac( a
);
1501 aExp
= extractFloat32Exp( a
);
1502 aSign
= extractFloat32Sign( a
);
1503 if ( aExp
== 0xFF ) {
1504 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
1505 return packFloat64( aSign
, 0x7FF, 0 );
1508 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1509 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1512 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1519 -------------------------------------------------------------------------------
1520 Returns the result of converting the single-precision floating-point value
1521 `a' to the extended double-precision floating-point format. The conversion
1522 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1524 -------------------------------------------------------------------------------
1526 floatx80
float32_to_floatx80( float32 a
)
1532 aSig
= extractFloat32Frac( a
);
1533 aExp
= extractFloat32Exp( a
);
1534 aSign
= extractFloat32Sign( a
);
1535 if ( aExp
== 0xFF ) {
1536 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
1537 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1540 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1541 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1544 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1553 -------------------------------------------------------------------------------
1554 Returns the result of converting the single-precision floating-point value
1555 `a' to the double-precision floating-point format. The conversion is
1556 performed according to the IEC/IEEE Standard for Binary Floating-Point
1558 -------------------------------------------------------------------------------
1560 float128
float32_to_float128( float32 a
)
1566 aSig
= extractFloat32Frac( a
);
1567 aExp
= extractFloat32Exp( a
);
1568 aSign
= extractFloat32Sign( a
);
1569 if ( aExp
== 0xFF ) {
1570 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a
) );
1571 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1574 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1575 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1578 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1584 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1586 -------------------------------------------------------------------------------
1587 Rounds the single-precision floating-point value `a' to an integer, and
1588 returns the result as a single-precision floating-point value. The
1589 operation is performed according to the IEC/IEEE Standard for Binary
1590 Floating-Point Arithmetic.
1591 -------------------------------------------------------------------------------
1593 float32
float32_round_to_int( float32 a
)
1597 bits32 lastBitMask
, roundBitsMask
;
1601 aExp
= extractFloat32Exp( a
);
1602 if ( 0x96 <= aExp
) {
1603 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1604 return propagateFloat32NaN( a
, a
);
1608 if ( aExp
<= 0x7E ) {
1609 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
1610 float_exception_flags
|= float_flag_inexact
;
1611 aSign
= extractFloat32Sign( a
);
1612 switch ( float_rounding_mode
) {
1613 case float_round_nearest_even
:
1614 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1615 return packFloat32( aSign
, 0x7F, 0 );
1618 case float_round_to_zero
:
1620 case float_round_down
:
1621 return aSign
? 0xBF800000 : 0;
1622 case float_round_up
:
1623 return aSign
? 0x80000000 : 0x3F800000;
1625 return packFloat32( aSign
, 0, 0 );
1628 lastBitMask
<<= 0x96 - aExp
;
1629 roundBitsMask
= lastBitMask
- 1;
1631 roundingMode
= float_rounding_mode
;
1632 if ( roundingMode
== float_round_nearest_even
) {
1633 z
+= lastBitMask
>>1;
1634 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1636 else if ( roundingMode
!= float_round_to_zero
) {
1637 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1641 z
&= ~ roundBitsMask
;
1642 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1646 #endif /* !SOFTFLOAT_FOR_GCC */
1649 -------------------------------------------------------------------------------
1650 Returns the result of adding the absolute values of the single-precision
1651 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1652 before being returned. `zSign' is ignored if the result is a NaN.
1653 The addition is performed according to the IEC/IEEE Standard for Binary
1654 Floating-Point Arithmetic.
1655 -------------------------------------------------------------------------------
1657 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1659 int16 aExp
, bExp
, zExp
;
1660 bits32 aSig
, bSig
, zSig
;
1663 aSig
= extractFloat32Frac( a
);
1664 aExp
= extractFloat32Exp( a
);
1665 bSig
= extractFloat32Frac( b
);
1666 bExp
= extractFloat32Exp( b
);
1667 expDiff
= aExp
- bExp
;
1670 if ( 0 < expDiff
) {
1671 if ( aExp
== 0xFF ) {
1672 if ( aSig
) return propagateFloat32NaN( a
, b
);
1681 shift32RightJamming( bSig
, expDiff
, &bSig
);
1684 else if ( expDiff
< 0 ) {
1685 if ( bExp
== 0xFF ) {
1686 if ( bSig
) return propagateFloat32NaN( a
, b
);
1687 return packFloat32( zSign
, 0xFF, 0 );
1695 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1699 if ( aExp
== 0xFF ) {
1700 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1703 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1704 zSig
= 0x40000000 + aSig
+ bSig
;
1709 zSig
= ( aSig
+ bSig
)<<1;
1711 if ( (sbits32
) zSig
< 0 ) {
1716 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1721 -------------------------------------------------------------------------------
1722 Returns the result of subtracting the absolute values of the single-
1723 precision floating-point values `a' and `b'. If `zSign' is 1, the
1724 difference is negated before being returned. `zSign' is ignored if the
1725 result is a NaN. The subtraction is performed according to the IEC/IEEE
1726 Standard for Binary Floating-Point Arithmetic.
1727 -------------------------------------------------------------------------------
1729 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1731 int16 aExp
, bExp
, zExp
;
1732 bits32 aSig
, bSig
, zSig
;
1735 aSig
= extractFloat32Frac( a
);
1736 aExp
= extractFloat32Exp( a
);
1737 bSig
= extractFloat32Frac( b
);
1738 bExp
= extractFloat32Exp( b
);
1739 expDiff
= aExp
- bExp
;
1742 if ( 0 < expDiff
) goto aExpBigger
;
1743 if ( expDiff
< 0 ) goto bExpBigger
;
1744 if ( aExp
== 0xFF ) {
1745 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1746 float_raise( float_flag_invalid
);
1747 return float32_default_nan
;
1753 if ( bSig
< aSig
) goto aBigger
;
1754 if ( aSig
< bSig
) goto bBigger
;
1755 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
1757 if ( bExp
== 0xFF ) {
1758 if ( bSig
) return propagateFloat32NaN( a
, b
);
1759 return packFloat32( zSign
^ 1, 0xFF, 0 );
1767 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1773 goto normalizeRoundAndPack
;
1775 if ( aExp
== 0xFF ) {
1776 if ( aSig
) return propagateFloat32NaN( a
, b
);
1785 shift32RightJamming( bSig
, expDiff
, &bSig
);
1790 normalizeRoundAndPack
:
1792 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1797 -------------------------------------------------------------------------------
1798 Returns the result of adding the single-precision floating-point values `a'
1799 and `b'. The operation is performed according to the IEC/IEEE Standard for
1800 Binary Floating-Point Arithmetic.
1801 -------------------------------------------------------------------------------
1803 float32
float32_add( float32 a
, float32 b
)
1807 aSign
= extractFloat32Sign( a
);
1808 bSign
= extractFloat32Sign( b
);
1809 if ( aSign
== bSign
) {
1810 return addFloat32Sigs( a
, b
, aSign
);
1813 return subFloat32Sigs( a
, b
, aSign
);
1819 -------------------------------------------------------------------------------
1820 Returns the result of subtracting the single-precision floating-point values
1821 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1822 for Binary Floating-Point Arithmetic.
1823 -------------------------------------------------------------------------------
1825 float32
float32_sub( float32 a
, float32 b
)
1829 aSign
= extractFloat32Sign( a
);
1830 bSign
= extractFloat32Sign( b
);
1831 if ( aSign
== bSign
) {
1832 return subFloat32Sigs( a
, b
, aSign
);
1835 return addFloat32Sigs( a
, b
, aSign
);
1841 -------------------------------------------------------------------------------
1842 Returns the result of multiplying the single-precision floating-point values
1843 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1844 for Binary Floating-Point Arithmetic.
1845 -------------------------------------------------------------------------------
1847 float32
float32_mul( float32 a
, float32 b
)
1849 flag aSign
, bSign
, zSign
;
1850 int16 aExp
, bExp
, zExp
;
1855 aSig
= extractFloat32Frac( a
);
1856 aExp
= extractFloat32Exp( a
);
1857 aSign
= extractFloat32Sign( a
);
1858 bSig
= extractFloat32Frac( b
);
1859 bExp
= extractFloat32Exp( b
);
1860 bSign
= extractFloat32Sign( b
);
1861 zSign
= aSign
^ bSign
;
1862 if ( aExp
== 0xFF ) {
1863 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1864 return propagateFloat32NaN( a
, b
);
1866 if ( ( bExp
| bSig
) == 0 ) {
1867 float_raise( float_flag_invalid
);
1868 return float32_default_nan
;
1870 return packFloat32( zSign
, 0xFF, 0 );
1872 if ( bExp
== 0xFF ) {
1873 if ( bSig
) return propagateFloat32NaN( a
, b
);
1874 if ( ( aExp
| aSig
) == 0 ) {
1875 float_raise( float_flag_invalid
);
1876 return float32_default_nan
;
1878 return packFloat32( zSign
, 0xFF, 0 );
1881 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1882 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1885 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1886 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1888 zExp
= aExp
+ bExp
- 0x7F;
1889 aSig
= ( aSig
| 0x00800000 )<<7;
1890 bSig
= ( bSig
| 0x00800000 )<<8;
1891 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1893 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1897 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1902 -------------------------------------------------------------------------------
1903 Returns the result of dividing the single-precision floating-point value `a'
1904 by the corresponding value `b'. The operation is performed according to the
1905 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1906 -------------------------------------------------------------------------------
1908 float32
float32_div( float32 a
, float32 b
)
1910 flag aSign
, bSign
, zSign
;
1911 int16 aExp
, bExp
, zExp
;
1912 bits32 aSig
, bSig
, zSig
;
1914 aSig
= extractFloat32Frac( a
);
1915 aExp
= extractFloat32Exp( a
);
1916 aSign
= extractFloat32Sign( a
);
1917 bSig
= extractFloat32Frac( b
);
1918 bExp
= extractFloat32Exp( b
);
1919 bSign
= extractFloat32Sign( b
);
1920 zSign
= aSign
^ bSign
;
1921 if ( aExp
== 0xFF ) {
1922 if ( aSig
) return propagateFloat32NaN( a
, b
);
1923 if ( bExp
== 0xFF ) {
1924 if ( bSig
) return propagateFloat32NaN( a
, b
);
1925 float_raise( float_flag_invalid
);
1926 return float32_default_nan
;
1928 return packFloat32( zSign
, 0xFF, 0 );
1930 if ( bExp
== 0xFF ) {
1931 if ( bSig
) return propagateFloat32NaN( a
, b
);
1932 return packFloat32( zSign
, 0, 0 );
1936 if ( ( aExp
| aSig
) == 0 ) {
1937 float_raise( float_flag_invalid
);
1938 return float32_default_nan
;
1940 float_raise( float_flag_divbyzero
);
1941 return packFloat32( zSign
, 0xFF, 0 );
1943 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1946 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1947 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1949 zExp
= aExp
- bExp
+ 0x7D;
1950 aSig
= ( aSig
| 0x00800000 )<<7;
1951 bSig
= ( bSig
| 0x00800000 )<<8;
1952 if ( bSig
<= ( aSig
+ aSig
) ) {
1956 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1957 if ( ( zSig
& 0x3F ) == 0 ) {
1958 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1960 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1964 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1966 -------------------------------------------------------------------------------
1967 Returns the remainder of the single-precision floating-point value `a'
1968 with respect to the corresponding value `b'. The operation is performed
1969 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1970 -------------------------------------------------------------------------------
1972 float32
float32_rem( float32 a
, float32 b
)
1974 flag aSign
, bSign
, zSign
;
1975 int16 aExp
, bExp
, expDiff
;
1978 bits64 aSig64
, bSig64
, q64
;
1979 bits32 alternateASig
;
1982 aSig
= extractFloat32Frac( a
);
1983 aExp
= extractFloat32Exp( a
);
1984 aSign
= extractFloat32Sign( a
);
1985 bSig
= extractFloat32Frac( b
);
1986 bExp
= extractFloat32Exp( b
);
1987 bSign
= extractFloat32Sign( b
);
1988 if ( aExp
== 0xFF ) {
1989 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1990 return propagateFloat32NaN( a
, b
);
1992 float_raise( float_flag_invalid
);
1993 return float32_default_nan
;
1995 if ( bExp
== 0xFF ) {
1996 if ( bSig
) return propagateFloat32NaN( a
, b
);
2001 float_raise( float_flag_invalid
);
2002 return float32_default_nan
;
2004 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2007 if ( aSig
== 0 ) return a
;
2008 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2010 expDiff
= aExp
- bExp
;
2013 if ( expDiff
< 32 ) {
2016 if ( expDiff
< 0 ) {
2017 if ( expDiff
< -1 ) return a
;
2020 q
= ( bSig
<= aSig
);
2021 if ( q
) aSig
-= bSig
;
2022 if ( 0 < expDiff
) {
2023 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2026 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2034 if ( bSig
<= aSig
) aSig
-= bSig
;
2035 aSig64
= ( (bits64
) aSig
)<<40;
2036 bSig64
= ( (bits64
) bSig
)<<40;
2038 while ( 0 < expDiff
) {
2039 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2040 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2041 aSig64
= - ( ( bSig
* q64
)<<38 );
2045 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2046 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2047 q
= q64
>>( 64 - expDiff
);
2049 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2052 alternateASig
= aSig
;
2055 } while ( 0 <= (sbits32
) aSig
);
2056 sigMean
= aSig
+ alternateASig
;
2057 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2058 aSig
= alternateASig
;
2060 zSign
= ( (sbits32
) aSig
< 0 );
2061 if ( zSign
) aSig
= - aSig
;
2062 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
2065 #endif /* !SOFTFLOAT_FOR_GCC */
2067 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2069 -------------------------------------------------------------------------------
2070 Returns the square root of the single-precision floating-point value `a'.
2071 The operation is performed according to the IEC/IEEE Standard for Binary
2072 Floating-Point Arithmetic.
2073 -------------------------------------------------------------------------------
2075 float32
float32_sqrt( float32 a
)
2082 aSig
= extractFloat32Frac( a
);
2083 aExp
= extractFloat32Exp( a
);
2084 aSign
= extractFloat32Sign( a
);
2085 if ( aExp
== 0xFF ) {
2086 if ( aSig
) return propagateFloat32NaN( a
, 0 );
2087 if ( ! aSign
) return a
;
2088 float_raise( float_flag_invalid
);
2089 return float32_default_nan
;
2092 if ( ( aExp
| aSig
) == 0 ) return a
;
2093 float_raise( float_flag_invalid
);
2094 return float32_default_nan
;
2097 if ( aSig
== 0 ) return 0;
2098 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2100 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2101 aSig
= ( aSig
| 0x00800000 )<<8;
2102 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2103 if ( ( zSig
& 0x7F ) <= 5 ) {
2109 term
= ( (bits64
) zSig
) * zSig
;
2110 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2111 while ( (sbits64
) rem
< 0 ) {
2113 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2115 zSig
|= ( rem
!= 0 );
2117 shift32RightJamming( zSig
, 1, &zSig
);
2119 return roundAndPackFloat32( 0, zExp
, zSig
);
2122 #endif /* !SOFTFLOAT_FOR_GCC */
2125 -------------------------------------------------------------------------------
2126 Returns 1 if the single-precision floating-point value `a' is equal to
2127 the corresponding value `b', and 0 otherwise. The comparison is performed
2128 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2129 -------------------------------------------------------------------------------
2131 flag
float32_eq( float32 a
, float32 b
)
2134 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2135 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2137 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2138 float_raise( float_flag_invalid
);
2142 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2147 -------------------------------------------------------------------------------
2148 Returns 1 if the single-precision floating-point value `a' is less than
2149 or equal to the corresponding value `b', and 0 otherwise. The comparison
2150 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2152 -------------------------------------------------------------------------------
2154 flag
float32_le( float32 a
, float32 b
)
2158 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2159 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2161 float_raise( float_flag_invalid
);
2164 aSign
= extractFloat32Sign( a
);
2165 bSign
= extractFloat32Sign( b
);
2166 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2167 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2172 -------------------------------------------------------------------------------
2173 Returns 1 if the single-precision floating-point value `a' is less than
2174 the corresponding value `b', and 0 otherwise. The comparison is performed
2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 -------------------------------------------------------------------------------
2178 flag
float32_lt( float32 a
, float32 b
)
2182 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2183 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2185 float_raise( float_flag_invalid
);
2188 aSign
= extractFloat32Sign( a
);
2189 bSign
= extractFloat32Sign( b
);
2190 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2191 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2195 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2197 -------------------------------------------------------------------------------
2198 Returns 1 if the single-precision floating-point value `a' is equal to
2199 the corresponding value `b', and 0 otherwise. The invalid exception is
2200 raised if either operand is a NaN. Otherwise, the comparison is performed
2201 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2202 -------------------------------------------------------------------------------
2204 flag
float32_eq_signaling( float32 a
, float32 b
)
2207 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2208 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2210 float_raise( float_flag_invalid
);
2213 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2218 -------------------------------------------------------------------------------
2219 Returns 1 if the single-precision floating-point value `a' is less than or
2220 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2221 cause an exception. Otherwise, the comparison is performed according to the
2222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 -------------------------------------------------------------------------------
2225 flag
float32_le_quiet( float32 a
, float32 b
)
2229 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2230 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2232 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2233 float_raise( float_flag_invalid
);
2237 aSign
= extractFloat32Sign( a
);
2238 bSign
= extractFloat32Sign( b
);
2239 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2240 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2245 -------------------------------------------------------------------------------
2246 Returns 1 if the single-precision floating-point value `a' is less than
2247 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2248 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2249 Standard for Binary Floating-Point Arithmetic.
2250 -------------------------------------------------------------------------------
2252 flag
float32_lt_quiet( float32 a
, float32 b
)
2256 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2257 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2259 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2260 float_raise( float_flag_invalid
);
2264 aSign
= extractFloat32Sign( a
);
2265 bSign
= extractFloat32Sign( b
);
2266 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2267 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2270 #endif /* !SOFTFLOAT_FOR_GCC */
2272 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2274 -------------------------------------------------------------------------------
2275 Returns the result of converting the double-precision floating-point value
2276 `a' to the 32-bit two's complement integer format. The conversion is
2277 performed according to the IEC/IEEE Standard for Binary Floating-Point
2278 Arithmetic---which means in particular that the conversion is rounded
2279 according to the current rounding mode. If `a' is a NaN, the largest
2280 positive integer is returned. Otherwise, if the conversion overflows, the
2281 largest integer with the same sign as `a' is returned.
2282 -------------------------------------------------------------------------------
2284 int32
float64_to_int32( float64 a
)
2287 int16 aExp
, shiftCount
;
2290 aSig
= extractFloat64Frac( a
);
2291 aExp
= extractFloat64Exp( a
);
2292 aSign
= extractFloat64Sign( a
);
2293 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2294 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2295 shiftCount
= 0x42C - aExp
;
2296 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2297 return roundAndPackInt32( aSign
, aSig
);
2300 #endif /* !SOFTFLOAT_FOR_GCC */
2303 -------------------------------------------------------------------------------
2304 Returns the result of converting the double-precision floating-point value
2305 `a' to the 32-bit two's complement integer format. The conversion is
2306 performed according to the IEC/IEEE Standard for Binary Floating-Point
2307 Arithmetic, except that the conversion is always rounded toward zero.
2308 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2309 the conversion overflows, the largest integer with the same sign as `a' is
2311 -------------------------------------------------------------------------------
2313 int32
float64_to_int32_round_to_zero( float64 a
)
2316 int16 aExp
, shiftCount
;
2317 bits64 aSig
, savedASig
;
2320 aSig
= extractFloat64Frac( a
);
2321 aExp
= extractFloat64Exp( a
);
2322 aSign
= extractFloat64Sign( a
);
2323 if ( 0x41E < aExp
) {
2324 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2327 else if ( aExp
< 0x3FF ) {
2328 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2331 aSig
|= LIT64( 0x0010000000000000 );
2332 shiftCount
= 0x433 - aExp
;
2334 aSig
>>= shiftCount
;
2336 if ( aSign
) z
= - z
;
2337 if ( ( z
< 0 ) ^ aSign
) {
2339 float_raise( float_flag_invalid
);
2340 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2342 if ( ( aSig
<<shiftCount
) != savedASig
) {
2343 float_exception_flags
|= float_flag_inexact
;
2349 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2351 -------------------------------------------------------------------------------
2352 Returns the result of converting the double-precision floating-point value
2353 `a' to the 64-bit two's complement integer format. The conversion is
2354 performed according to the IEC/IEEE Standard for Binary Floating-Point
2355 Arithmetic---which means in particular that the conversion is rounded
2356 according to the current rounding mode. If `a' is a NaN, the largest
2357 positive integer is returned. Otherwise, if the conversion overflows, the
2358 largest integer with the same sign as `a' is returned.
2359 -------------------------------------------------------------------------------
2361 int64
float64_to_int64( float64 a
)
2364 int16 aExp
, shiftCount
;
2365 bits64 aSig
, aSigExtra
;
2367 aSig
= extractFloat64Frac( a
);
2368 aExp
= extractFloat64Exp( a
);
2369 aSign
= extractFloat64Sign( a
);
2370 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2371 shiftCount
= 0x433 - aExp
;
2372 if ( shiftCount
<= 0 ) {
2373 if ( 0x43E < aExp
) {
2374 float_raise( float_flag_invalid
);
2376 || ( ( aExp
== 0x7FF )
2377 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2379 return LIT64( 0x7FFFFFFFFFFFFFFF );
2381 return (sbits64
) LIT64( 0x8000000000000000 );
2384 aSig
<<= - shiftCount
;
2387 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2389 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
2394 -------------------------------------------------------------------------------
2395 Returns the result of converting the double-precision floating-point value
2396 `a' to the 64-bit two's complement integer format. The conversion is
2397 performed according to the IEC/IEEE Standard for Binary Floating-Point
2398 Arithmetic, except that the conversion is always rounded toward zero.
2399 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2400 the conversion overflows, the largest integer with the same sign as `a' is
2402 -------------------------------------------------------------------------------
2404 int64
float64_to_int64_round_to_zero( float64 a
)
2407 int16 aExp
, shiftCount
;
2411 aSig
= extractFloat64Frac( a
);
2412 aExp
= extractFloat64Exp( a
);
2413 aSign
= extractFloat64Sign( a
);
2414 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2415 shiftCount
= aExp
- 0x433;
2416 if ( 0 <= shiftCount
) {
2417 if ( 0x43E <= aExp
) {
2418 if ( a
!= LIT64( 0xC3E0000000000000 ) ) {
2419 float_raise( float_flag_invalid
);
2421 || ( ( aExp
== 0x7FF )
2422 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2424 return LIT64( 0x7FFFFFFFFFFFFFFF );
2427 return (sbits64
) LIT64( 0x8000000000000000 );
2429 z
= aSig
<<shiftCount
;
2432 if ( aExp
< 0x3FE ) {
2433 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
2436 z
= aSig
>>( - shiftCount
);
2437 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2438 float_exception_flags
|= float_flag_inexact
;
2441 if ( aSign
) z
= - z
;
2445 #endif /* !SOFTFLOAT_FOR_GCC */
2448 -------------------------------------------------------------------------------
2449 Returns the result of converting the double-precision floating-point value
2450 `a' to the single-precision floating-point format. The conversion is
2451 performed according to the IEC/IEEE Standard for Binary Floating-Point
2453 -------------------------------------------------------------------------------
2455 float32
float64_to_float32( float64 a
)
2462 aSig
= extractFloat64Frac( a
);
2463 aExp
= extractFloat64Exp( a
);
2464 aSign
= extractFloat64Sign( a
);
2465 if ( aExp
== 0x7FF ) {
2466 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
2467 return packFloat32( aSign
, 0xFF, 0 );
2469 shift64RightJamming( aSig
, 22, &aSig
);
2471 if ( aExp
|| zSig
) {
2475 return roundAndPackFloat32( aSign
, aExp
, zSig
);
2482 -------------------------------------------------------------------------------
2483 Returns the result of converting the double-precision floating-point value
2484 `a' to the extended double-precision floating-point format. The conversion
2485 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2487 -------------------------------------------------------------------------------
2489 floatx80
float64_to_floatx80( float64 a
)
2495 aSig
= extractFloat64Frac( a
);
2496 aExp
= extractFloat64Exp( a
);
2497 aSign
= extractFloat64Sign( a
);
2498 if ( aExp
== 0x7FF ) {
2499 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
2500 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2503 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2504 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2508 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2517 -------------------------------------------------------------------------------
2518 Returns the result of converting the double-precision floating-point value
2519 `a' to the quadruple-precision floating-point format. The conversion is
2520 performed according to the IEC/IEEE Standard for Binary Floating-Point
2522 -------------------------------------------------------------------------------
2524 float128
float64_to_float128( float64 a
)
2528 bits64 aSig
, zSig0
, zSig1
;
2530 aSig
= extractFloat64Frac( a
);
2531 aExp
= extractFloat64Exp( a
);
2532 aSign
= extractFloat64Sign( a
);
2533 if ( aExp
== 0x7FF ) {
2534 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a
) );
2535 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2538 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2539 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2542 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2543 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2549 #ifndef SOFTFLOAT_FOR_GCC
2551 -------------------------------------------------------------------------------
2552 Rounds the double-precision floating-point value `a' to an integer, and
2553 returns the result as a double-precision floating-point value. The
2554 operation is performed according to the IEC/IEEE Standard for Binary
2555 Floating-Point Arithmetic.
2556 -------------------------------------------------------------------------------
2558 float64
float64_round_to_int( float64 a
)
2562 bits64 lastBitMask
, roundBitsMask
;
2566 aExp
= extractFloat64Exp( a
);
2567 if ( 0x433 <= aExp
) {
2568 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2569 return propagateFloat64NaN( a
, a
);
2573 if ( aExp
< 0x3FF ) {
2574 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
2575 float_exception_flags
|= float_flag_inexact
;
2576 aSign
= extractFloat64Sign( a
);
2577 switch ( float_rounding_mode
) {
2578 case float_round_nearest_even
:
2579 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2580 return packFloat64( aSign
, 0x3FF, 0 );
2583 case float_round_to_zero
:
2585 case float_round_down
:
2586 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
2587 case float_round_up
:
2589 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2591 return packFloat64( aSign
, 0, 0 );
2594 lastBitMask
<<= 0x433 - aExp
;
2595 roundBitsMask
= lastBitMask
- 1;
2597 roundingMode
= float_rounding_mode
;
2598 if ( roundingMode
== float_round_nearest_even
) {
2599 z
+= lastBitMask
>>1;
2600 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2602 else if ( roundingMode
!= float_round_to_zero
) {
2603 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2607 z
&= ~ roundBitsMask
;
2608 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
2615 -------------------------------------------------------------------------------
2616 Returns the result of adding the absolute values of the double-precision
2617 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2618 before being returned. `zSign' is ignored if the result is a NaN.
2619 The addition is performed according to the IEC/IEEE Standard for Binary
2620 Floating-Point Arithmetic.
2621 -------------------------------------------------------------------------------
2623 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2625 int16 aExp
, bExp
, zExp
;
2626 bits64 aSig
, bSig
, zSig
;
2629 aSig
= extractFloat64Frac( a
);
2630 aExp
= extractFloat64Exp( a
);
2631 bSig
= extractFloat64Frac( b
);
2632 bExp
= extractFloat64Exp( b
);
2633 expDiff
= aExp
- bExp
;
2636 if ( 0 < expDiff
) {
2637 if ( aExp
== 0x7FF ) {
2638 if ( aSig
) return propagateFloat64NaN( a
, b
);
2645 bSig
|= LIT64( 0x2000000000000000 );
2647 shift64RightJamming( bSig
, expDiff
, &bSig
);
2650 else if ( expDiff
< 0 ) {
2651 if ( bExp
== 0x7FF ) {
2652 if ( bSig
) return propagateFloat64NaN( a
, b
);
2653 return packFloat64( zSign
, 0x7FF, 0 );
2659 aSig
|= LIT64( 0x2000000000000000 );
2661 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2665 if ( aExp
== 0x7FF ) {
2666 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2669 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2670 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2674 aSig
|= LIT64( 0x2000000000000000 );
2675 zSig
= ( aSig
+ bSig
)<<1;
2677 if ( (sbits64
) zSig
< 0 ) {
2682 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2687 -------------------------------------------------------------------------------
2688 Returns the result of subtracting the absolute values of the double-
2689 precision floating-point values `a' and `b'. If `zSign' is 1, the
2690 difference is negated before being returned. `zSign' is ignored if the
2691 result is a NaN. The subtraction is performed according to the IEC/IEEE
2692 Standard for Binary Floating-Point Arithmetic.
2693 -------------------------------------------------------------------------------
2695 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2697 int16 aExp
, bExp
, zExp
;
2698 bits64 aSig
, bSig
, zSig
;
2701 aSig
= extractFloat64Frac( a
);
2702 aExp
= extractFloat64Exp( a
);
2703 bSig
= extractFloat64Frac( b
);
2704 bExp
= extractFloat64Exp( b
);
2705 expDiff
= aExp
- bExp
;
2708 if ( 0 < expDiff
) goto aExpBigger
;
2709 if ( expDiff
< 0 ) goto bExpBigger
;
2710 if ( aExp
== 0x7FF ) {
2711 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2712 float_raise( float_flag_invalid
);
2713 return float64_default_nan
;
2719 if ( bSig
< aSig
) goto aBigger
;
2720 if ( aSig
< bSig
) goto bBigger
;
2721 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0 );
2723 if ( bExp
== 0x7FF ) {
2724 if ( bSig
) return propagateFloat64NaN( a
, b
);
2725 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2731 aSig
|= LIT64( 0x4000000000000000 );
2733 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2734 bSig
|= LIT64( 0x4000000000000000 );
2739 goto normalizeRoundAndPack
;
2741 if ( aExp
== 0x7FF ) {
2742 if ( aSig
) return propagateFloat64NaN( a
, b
);
2749 bSig
|= LIT64( 0x4000000000000000 );
2751 shift64RightJamming( bSig
, expDiff
, &bSig
);
2752 aSig
|= LIT64( 0x4000000000000000 );
2756 normalizeRoundAndPack
:
2758 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2763 -------------------------------------------------------------------------------
2764 Returns the result of adding the double-precision floating-point values `a'
2765 and `b'. The operation is performed according to the IEC/IEEE Standard for
2766 Binary Floating-Point Arithmetic.
2767 -------------------------------------------------------------------------------
2769 float64
float64_add( float64 a
, float64 b
)
2773 aSign
= extractFloat64Sign( a
);
2774 bSign
= extractFloat64Sign( b
);
2775 if ( aSign
== bSign
) {
2776 return addFloat64Sigs( a
, b
, aSign
);
2779 return subFloat64Sigs( a
, b
, aSign
);
2785 -------------------------------------------------------------------------------
2786 Returns the result of subtracting the double-precision floating-point values
2787 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2788 for Binary Floating-Point Arithmetic.
2789 -------------------------------------------------------------------------------
2791 float64
float64_sub( float64 a
, float64 b
)
2795 aSign
= extractFloat64Sign( a
);
2796 bSign
= extractFloat64Sign( b
);
2797 if ( aSign
== bSign
) {
2798 return subFloat64Sigs( a
, b
, aSign
);
2801 return addFloat64Sigs( a
, b
, aSign
);
2807 -------------------------------------------------------------------------------
2808 Returns the result of multiplying the double-precision floating-point values
2809 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2810 for Binary Floating-Point Arithmetic.
2811 -------------------------------------------------------------------------------
2813 float64
float64_mul( float64 a
, float64 b
)
2815 flag aSign
, bSign
, zSign
;
2816 int16 aExp
, bExp
, zExp
;
2817 bits64 aSig
, bSig
, zSig0
, zSig1
;
2819 aSig
= extractFloat64Frac( a
);
2820 aExp
= extractFloat64Exp( a
);
2821 aSign
= extractFloat64Sign( a
);
2822 bSig
= extractFloat64Frac( b
);
2823 bExp
= extractFloat64Exp( b
);
2824 bSign
= extractFloat64Sign( b
);
2825 zSign
= aSign
^ bSign
;
2826 if ( aExp
== 0x7FF ) {
2827 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2828 return propagateFloat64NaN( a
, b
);
2830 if ( ( bExp
| bSig
) == 0 ) {
2831 float_raise( float_flag_invalid
);
2832 return float64_default_nan
;
2834 return packFloat64( zSign
, 0x7FF, 0 );
2836 if ( bExp
== 0x7FF ) {
2837 if ( bSig
) return propagateFloat64NaN( a
, b
);
2838 if ( ( aExp
| aSig
) == 0 ) {
2839 float_raise( float_flag_invalid
);
2840 return float64_default_nan
;
2842 return packFloat64( zSign
, 0x7FF, 0 );
2845 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2846 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2849 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2850 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2852 zExp
= aExp
+ bExp
- 0x3FF;
2853 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2854 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2855 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2856 zSig0
|= ( zSig1
!= 0 );
2857 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2861 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2866 -------------------------------------------------------------------------------
2867 Returns the result of dividing the double-precision floating-point value `a'
2868 by the corresponding value `b'. The operation is performed according to
2869 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2870 -------------------------------------------------------------------------------
2872 float64
float64_div( float64 a
, float64 b
)
2874 flag aSign
, bSign
, zSign
;
2875 int16 aExp
, bExp
, zExp
;
2876 bits64 aSig
, bSig
, zSig
;
2878 bits64 term0
, term1
;
2880 aSig
= extractFloat64Frac( a
);
2881 aExp
= extractFloat64Exp( a
);
2882 aSign
= extractFloat64Sign( a
);
2883 bSig
= extractFloat64Frac( b
);
2884 bExp
= extractFloat64Exp( b
);
2885 bSign
= extractFloat64Sign( b
);
2886 zSign
= aSign
^ bSign
;
2887 if ( aExp
== 0x7FF ) {
2888 if ( aSig
) return propagateFloat64NaN( a
, b
);
2889 if ( bExp
== 0x7FF ) {
2890 if ( bSig
) return propagateFloat64NaN( a
, b
);
2891 float_raise( float_flag_invalid
);
2892 return float64_default_nan
;
2894 return packFloat64( zSign
, 0x7FF, 0 );
2896 if ( bExp
== 0x7FF ) {
2897 if ( bSig
) return propagateFloat64NaN( a
, b
);
2898 return packFloat64( zSign
, 0, 0 );
2902 if ( ( aExp
| aSig
) == 0 ) {
2903 float_raise( float_flag_invalid
);
2904 return float64_default_nan
;
2906 float_raise( float_flag_divbyzero
);
2907 return packFloat64( zSign
, 0x7FF, 0 );
2909 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2912 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2913 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2915 zExp
= aExp
- bExp
+ 0x3FD;
2916 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2917 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2918 if ( bSig
<= ( aSig
+ aSig
) ) {
2922 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2923 if ( ( zSig
& 0x1FF ) <= 2 ) {
2924 mul64To128( bSig
, zSig
, &term0
, &term1
);
2925 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2926 while ( (sbits64
) rem0
< 0 ) {
2928 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2930 zSig
|= ( rem1
!= 0 );
2932 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2936 #ifndef SOFTFLOAT_FOR_GCC
2938 -------------------------------------------------------------------------------
2939 Returns the remainder of the double-precision floating-point value `a'
2940 with respect to the corresponding value `b'. The operation is performed
2941 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942 -------------------------------------------------------------------------------
2944 float64
float64_rem( float64 a
, float64 b
)
2946 flag aSign
, bSign
, zSign
;
2947 int16 aExp
, bExp
, expDiff
;
2949 bits64 q
, alternateASig
;
2952 aSig
= extractFloat64Frac( a
);
2953 aExp
= extractFloat64Exp( a
);
2954 aSign
= extractFloat64Sign( a
);
2955 bSig
= extractFloat64Frac( b
);
2956 bExp
= extractFloat64Exp( b
);
2957 bSign
= extractFloat64Sign( b
);
2958 if ( aExp
== 0x7FF ) {
2959 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2960 return propagateFloat64NaN( a
, b
);
2962 float_raise( float_flag_invalid
);
2963 return float64_default_nan
;
2965 if ( bExp
== 0x7FF ) {
2966 if ( bSig
) return propagateFloat64NaN( a
, b
);
2971 float_raise( float_flag_invalid
);
2972 return float64_default_nan
;
2974 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2977 if ( aSig
== 0 ) return a
;
2978 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2980 expDiff
= aExp
- bExp
;
2981 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2982 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2983 if ( expDiff
< 0 ) {
2984 if ( expDiff
< -1 ) return a
;
2987 q
= ( bSig
<= aSig
);
2988 if ( q
) aSig
-= bSig
;
2990 while ( 0 < expDiff
) {
2991 q
= estimateDiv128To64( aSig
, 0, bSig
);
2992 q
= ( 2 < q
) ? q
- 2 : 0;
2993 aSig
= - ( ( bSig
>>2 ) * q
);
2997 if ( 0 < expDiff
) {
2998 q
= estimateDiv128To64( aSig
, 0, bSig
);
2999 q
= ( 2 < q
) ? q
- 2 : 0;
3002 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3009 alternateASig
= aSig
;
3012 } while ( 0 <= (sbits64
) aSig
);
3013 sigMean
= aSig
+ alternateASig
;
3014 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3015 aSig
= alternateASig
;
3017 zSign
= ( (sbits64
) aSig
< 0 );
3018 if ( zSign
) aSig
= - aSig
;
3019 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
3024 -------------------------------------------------------------------------------
3025 Returns the square root of the double-precision floating-point value `a'.
3026 The operation is performed according to the IEC/IEEE Standard for Binary
3027 Floating-Point Arithmetic.
3028 -------------------------------------------------------------------------------
3030 float64
float64_sqrt( float64 a
)
3034 bits64 aSig
, zSig
, doubleZSig
;
3035 bits64 rem0
, rem1
, term0
, term1
;
3037 aSig
= extractFloat64Frac( a
);
3038 aExp
= extractFloat64Exp( a
);
3039 aSign
= extractFloat64Sign( a
);
3040 if ( aExp
== 0x7FF ) {
3041 if ( aSig
) return propagateFloat64NaN( a
, a
);
3042 if ( ! aSign
) return a
;
3043 float_raise( float_flag_invalid
);
3044 return float64_default_nan
;
3047 if ( ( aExp
| aSig
) == 0 ) return a
;
3048 float_raise( float_flag_invalid
);
3049 return float64_default_nan
;
3052 if ( aSig
== 0 ) return 0;
3053 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3055 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3056 aSig
|= LIT64( 0x0010000000000000 );
3057 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3058 aSig
<<= 9 - ( aExp
& 1 );
3059 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3060 if ( ( zSig
& 0x1FF ) <= 5 ) {
3061 doubleZSig
= zSig
<<1;
3062 mul64To128( zSig
, zSig
, &term0
, &term1
);
3063 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3064 while ( (sbits64
) rem0
< 0 ) {
3067 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3069 zSig
|= ( ( rem0
| rem1
) != 0 );
3071 return roundAndPackFloat64( 0, zExp
, zSig
);
3077 -------------------------------------------------------------------------------
3078 Returns 1 if the double-precision floating-point value `a' is equal to the
3079 corresponding value `b', and 0 otherwise. The comparison is performed
3080 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3081 -------------------------------------------------------------------------------
3083 flag
float64_eq( float64 a
, float64 b
)
3086 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3087 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3089 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3090 float_raise( float_flag_invalid
);
3094 return ( a
== b
) ||
3095 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) == 0 );
3100 -------------------------------------------------------------------------------
3101 Returns 1 if the double-precision floating-point value `a' is less than or
3102 equal to the corresponding value `b', and 0 otherwise. The comparison is
3103 performed according to the IEC/IEEE Standard for Binary Floating-Point
3105 -------------------------------------------------------------------------------
3107 flag
float64_le( float64 a
, float64 b
)
3111 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3112 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3114 float_raise( float_flag_invalid
);
3117 aSign
= extractFloat64Sign( a
);
3118 bSign
= extractFloat64Sign( b
);
3119 if ( aSign
!= bSign
)
3121 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) ==
3123 return ( a
== b
) ||
3124 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3129 -------------------------------------------------------------------------------
3130 Returns 1 if the double-precision floating-point value `a' is less than
3131 the corresponding value `b', and 0 otherwise. The comparison is performed
3132 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3133 -------------------------------------------------------------------------------
3135 flag
float64_lt( float64 a
, float64 b
)
3139 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3140 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3142 float_raise( float_flag_invalid
);
3145 aSign
= extractFloat64Sign( a
);
3146 bSign
= extractFloat64Sign( b
);
3147 if ( aSign
!= bSign
)
3149 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) !=
3151 return ( a
!= b
) &&
3152 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3156 #ifndef SOFTFLOAT_FOR_GCC
3158 -------------------------------------------------------------------------------
3159 Returns 1 if the double-precision floating-point value `a' is equal to the
3160 corresponding value `b', and 0 otherwise. The invalid exception is raised
3161 if either operand is a NaN. Otherwise, the comparison is performed
3162 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3163 -------------------------------------------------------------------------------
3165 flag
float64_eq_signaling( float64 a
, float64 b
)
3168 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3169 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3171 float_raise( float_flag_invalid
);
3174 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3179 -------------------------------------------------------------------------------
3180 Returns 1 if the double-precision floating-point value `a' is less than or
3181 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3182 cause an exception. Otherwise, the comparison is performed according to the
3183 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3184 -------------------------------------------------------------------------------
3186 flag
float64_le_quiet( float64 a
, float64 b
)
3190 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3191 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3193 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3194 float_raise( float_flag_invalid
);
3198 aSign
= extractFloat64Sign( a
);
3199 bSign
= extractFloat64Sign( b
);
3200 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3201 return ( a
== b
) || ( aSign
^ ( a
< b
) );
3206 -------------------------------------------------------------------------------
3207 Returns 1 if the double-precision floating-point value `a' is less than
3208 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3209 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3210 Standard for Binary Floating-Point Arithmetic.
3211 -------------------------------------------------------------------------------
3213 flag
float64_lt_quiet( float64 a
, float64 b
)
3217 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3218 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3220 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3221 float_raise( float_flag_invalid
);
3225 aSign
= extractFloat64Sign( a
);
3226 bSign
= extractFloat64Sign( b
);
3227 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
3228 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3236 -------------------------------------------------------------------------------
3237 Returns the result of converting the extended double-precision floating-
3238 point value `a' to the 32-bit two's complement integer format. The
3239 conversion is performed according to the IEC/IEEE Standard for Binary
3240 Floating-Point Arithmetic---which means in particular that the conversion
3241 is rounded according to the current rounding mode. If `a' is a NaN, the
3242 largest positive integer is returned. Otherwise, if the conversion
3243 overflows, the largest integer with the same sign as `a' is returned.
3244 -------------------------------------------------------------------------------
3246 int32
floatx80_to_int32( floatx80 a
)
3249 int32 aExp
, shiftCount
;
3252 aSig
= extractFloatx80Frac( a
);
3253 aExp
= extractFloatx80Exp( a
);
3254 aSign
= extractFloatx80Sign( a
);
3255 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3256 shiftCount
= 0x4037 - aExp
;
3257 if ( shiftCount
<= 0 ) shiftCount
= 1;
3258 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3259 return roundAndPackInt32( aSign
, aSig
);
3264 -------------------------------------------------------------------------------
3265 Returns the result of converting the extended double-precision floating-
3266 point value `a' to the 32-bit two's complement integer format. The
3267 conversion is performed according to the IEC/IEEE Standard for Binary
3268 Floating-Point Arithmetic, except that the conversion is always rounded
3269 toward zero. If `a' is a NaN, the largest positive integer is returned.
3270 Otherwise, if the conversion overflows, the largest integer with the same
3271 sign as `a' is returned.
3272 -------------------------------------------------------------------------------
3274 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
3277 int32 aExp
, shiftCount
;
3278 bits64 aSig
, savedASig
;
3281 aSig
= extractFloatx80Frac( a
);
3282 aExp
= extractFloatx80Exp( a
);
3283 aSign
= extractFloatx80Sign( a
);
3284 if ( 0x401E < aExp
) {
3285 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3288 else if ( aExp
< 0x3FFF ) {
3289 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
3292 shiftCount
= 0x403E - aExp
;
3294 aSig
>>= shiftCount
;
3296 if ( aSign
) z
= - z
;
3297 if ( ( z
< 0 ) ^ aSign
) {
3299 float_raise( float_flag_invalid
);
3300 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3302 if ( ( aSig
<<shiftCount
) != savedASig
) {
3303 float_exception_flags
|= float_flag_inexact
;
3310 -------------------------------------------------------------------------------
3311 Returns the result of converting the extended double-precision floating-
3312 point value `a' to the 64-bit two's complement integer format. The
3313 conversion is performed according to the IEC/IEEE Standard for Binary
3314 Floating-Point Arithmetic---which means in particular that the conversion
3315 is rounded according to the current rounding mode. If `a' is a NaN,
3316 the largest positive integer is returned. Otherwise, if the conversion
3317 overflows, the largest integer with the same sign as `a' is returned.
3318 -------------------------------------------------------------------------------
3320 int64
floatx80_to_int64( floatx80 a
)
3323 int32 aExp
, shiftCount
;
3324 bits64 aSig
, aSigExtra
;
3326 aSig
= extractFloatx80Frac( a
);
3327 aExp
= extractFloatx80Exp( a
);
3328 aSign
= extractFloatx80Sign( a
);
3329 shiftCount
= 0x403E - aExp
;
3330 if ( shiftCount
<= 0 ) {
3332 float_raise( float_flag_invalid
);
3334 || ( ( aExp
== 0x7FFF )
3335 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3337 return LIT64( 0x7FFFFFFFFFFFFFFF );
3339 return (sbits64
) LIT64( 0x8000000000000000 );
3344 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3346 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
3351 -------------------------------------------------------------------------------
3352 Returns the result of converting the extended double-precision floating-
3353 point value `a' to the 64-bit two's complement integer format. The
3354 conversion is performed according to the IEC/IEEE Standard for Binary
3355 Floating-Point Arithmetic, except that the conversion is always rounded
3356 toward zero. If `a' is a NaN, the largest positive integer is returned.
3357 Otherwise, if the conversion overflows, the largest integer with the same
3358 sign as `a' is returned.
3359 -------------------------------------------------------------------------------
3361 int64
floatx80_to_int64_round_to_zero( floatx80 a
)
3364 int32 aExp
, shiftCount
;
3368 aSig
= extractFloatx80Frac( a
);
3369 aExp
= extractFloatx80Exp( a
);
3370 aSign
= extractFloatx80Sign( a
);
3371 shiftCount
= aExp
- 0x403E;
3372 if ( 0 <= shiftCount
) {
3373 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3374 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3375 float_raise( float_flag_invalid
);
3376 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3377 return LIT64( 0x7FFFFFFFFFFFFFFF );
3380 return (sbits64
) LIT64( 0x8000000000000000 );
3382 else if ( aExp
< 0x3FFF ) {
3383 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
3386 z
= aSig
>>( - shiftCount
);
3387 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3388 float_exception_flags
|= float_flag_inexact
;
3390 if ( aSign
) z
= - z
;
3396 -------------------------------------------------------------------------------
3397 Returns the result of converting the extended double-precision floating-
3398 point value `a' to the single-precision floating-point format. The
3399 conversion is performed according to the IEC/IEEE Standard for Binary
3400 Floating-Point Arithmetic.
3401 -------------------------------------------------------------------------------
3403 float32
floatx80_to_float32( floatx80 a
)
3409 aSig
= extractFloatx80Frac( a
);
3410 aExp
= extractFloatx80Exp( a
);
3411 aSign
= extractFloatx80Sign( a
);
3412 if ( aExp
== 0x7FFF ) {
3413 if ( (bits64
) ( aSig
<<1 ) ) {
3414 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
3416 return packFloat32( aSign
, 0xFF, 0 );
3418 shift64RightJamming( aSig
, 33, &aSig
);
3419 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3420 return roundAndPackFloat32( aSign
, aExp
, aSig
);
3425 -------------------------------------------------------------------------------
3426 Returns the result of converting the extended double-precision floating-
3427 point value `a' to the double-precision floating-point format. The
3428 conversion is performed according to the IEC/IEEE Standard for Binary
3429 Floating-Point Arithmetic.
3430 -------------------------------------------------------------------------------
3432 float64
floatx80_to_float64( floatx80 a
)
3438 aSig
= extractFloatx80Frac( a
);
3439 aExp
= extractFloatx80Exp( a
);
3440 aSign
= extractFloatx80Sign( a
);
3441 if ( aExp
== 0x7FFF ) {
3442 if ( (bits64
) ( aSig
<<1 ) ) {
3443 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
3445 return packFloat64( aSign
, 0x7FF, 0 );
3447 shift64RightJamming( aSig
, 1, &zSig
);
3448 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3449 return roundAndPackFloat64( aSign
, aExp
, zSig
);
3456 -------------------------------------------------------------------------------
3457 Returns the result of converting the extended double-precision floating-
3458 point value `a' to the quadruple-precision floating-point format. The
3459 conversion is performed according to the IEC/IEEE Standard for Binary
3460 Floating-Point Arithmetic.
3461 -------------------------------------------------------------------------------
3463 float128
floatx80_to_float128( floatx80 a
)
3467 bits64 aSig
, zSig0
, zSig1
;
3469 aSig
= extractFloatx80Frac( a
);
3470 aExp
= extractFloatx80Exp( a
);
3471 aSign
= extractFloatx80Sign( a
);
3472 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3473 return commonNaNToFloat128( floatx80ToCommonNaN( a
) );
3475 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3476 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3483 -------------------------------------------------------------------------------
3484 Rounds the extended double-precision floating-point value `a' to an integer,
3485 and returns the result as an extended quadruple-precision floating-point
3486 value. The operation is performed according to the IEC/IEEE Standard for
3487 Binary Floating-Point Arithmetic.
3488 -------------------------------------------------------------------------------
3490 floatx80
floatx80_round_to_int( floatx80 a
)
3494 bits64 lastBitMask
, roundBitsMask
;
3498 aExp
= extractFloatx80Exp( a
);
3499 if ( 0x403E <= aExp
) {
3500 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3501 return propagateFloatx80NaN( a
, a
);
3505 if ( aExp
< 0x3FFF ) {
3507 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3510 float_exception_flags
|= float_flag_inexact
;
3511 aSign
= extractFloatx80Sign( a
);
3512 switch ( float_rounding_mode
) {
3513 case float_round_nearest_even
:
3514 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3517 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3520 case float_round_to_zero
:
3522 case float_round_down
:
3525 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3526 : packFloatx80( 0, 0, 0 );
3527 case float_round_up
:
3529 aSign
? packFloatx80( 1, 0, 0 )
3530 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3532 return packFloatx80( aSign
, 0, 0 );
3535 lastBitMask
<<= 0x403E - aExp
;
3536 roundBitsMask
= lastBitMask
- 1;
3538 roundingMode
= float_rounding_mode
;
3539 if ( roundingMode
== float_round_nearest_even
) {
3540 z
.low
+= lastBitMask
>>1;
3541 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3543 else if ( roundingMode
!= float_round_to_zero
) {
3544 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3545 z
.low
+= roundBitsMask
;
3548 z
.low
&= ~ roundBitsMask
;
3551 z
.low
= LIT64( 0x8000000000000000 );
3553 if ( z
.low
!= a
.low
) float_exception_flags
|= float_flag_inexact
;
3559 -------------------------------------------------------------------------------
3560 Returns the result of adding the absolute values of the extended double-
3561 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3562 negated before being returned. `zSign' is ignored if the result is a NaN.
3563 The addition is performed according to the IEC/IEEE Standard for Binary
3564 Floating-Point Arithmetic.
3565 -------------------------------------------------------------------------------
3567 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3569 int32 aExp
, bExp
, zExp
;
3570 bits64 aSig
, bSig
, zSig0
, zSig1
;
3573 aSig
= extractFloatx80Frac( a
);
3574 aExp
= extractFloatx80Exp( a
);
3575 bSig
= extractFloatx80Frac( b
);
3576 bExp
= extractFloatx80Exp( b
);
3577 expDiff
= aExp
- bExp
;
3578 if ( 0 < expDiff
) {
3579 if ( aExp
== 0x7FFF ) {
3580 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3583 if ( bExp
== 0 ) --expDiff
;
3584 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3587 else if ( expDiff
< 0 ) {
3588 if ( bExp
== 0x7FFF ) {
3589 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3590 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3592 if ( aExp
== 0 ) ++expDiff
;
3593 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3597 if ( aExp
== 0x7FFF ) {
3598 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3599 return propagateFloatx80NaN( a
, b
);
3604 zSig0
= aSig
+ bSig
;
3606 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3612 zSig0
= aSig
+ bSig
;
3613 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3615 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3616 zSig0
|= LIT64( 0x8000000000000000 );
3620 roundAndPackFloatx80(
3621 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3626 -------------------------------------------------------------------------------
3627 Returns the result of subtracting the absolute values of the extended
3628 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3629 difference is negated before being returned. `zSign' is ignored if the
3630 result is a NaN. The subtraction is performed according to the IEC/IEEE
3631 Standard for Binary Floating-Point Arithmetic.
3632 -------------------------------------------------------------------------------
3634 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3636 int32 aExp
, bExp
, zExp
;
3637 bits64 aSig
, bSig
, zSig0
, zSig1
;
3641 aSig
= extractFloatx80Frac( a
);
3642 aExp
= extractFloatx80Exp( a
);
3643 bSig
= extractFloatx80Frac( b
);
3644 bExp
= extractFloatx80Exp( b
);
3645 expDiff
= aExp
- bExp
;
3646 if ( 0 < expDiff
) goto aExpBigger
;
3647 if ( expDiff
< 0 ) goto bExpBigger
;
3648 if ( aExp
== 0x7FFF ) {
3649 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3650 return propagateFloatx80NaN( a
, b
);
3652 float_raise( float_flag_invalid
);
3653 z
.low
= floatx80_default_nan_low
;
3654 z
.high
= floatx80_default_nan_high
;
3662 if ( bSig
< aSig
) goto aBigger
;
3663 if ( aSig
< bSig
) goto bBigger
;
3664 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
3666 if ( bExp
== 0x7FFF ) {
3667 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3668 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670 if ( aExp
== 0 ) ++expDiff
;
3671 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3673 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3676 goto normalizeRoundAndPack
;
3678 if ( aExp
== 0x7FFF ) {
3679 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3682 if ( bExp
== 0 ) --expDiff
;
3683 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3685 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3687 normalizeRoundAndPack
:
3689 normalizeRoundAndPackFloatx80(
3690 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3695 -------------------------------------------------------------------------------
3696 Returns the result of adding the extended double-precision floating-point
3697 values `a' and `b'. The operation is performed according to the IEC/IEEE
3698 Standard for Binary Floating-Point Arithmetic.
3699 -------------------------------------------------------------------------------
3701 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
3705 aSign
= extractFloatx80Sign( a
);
3706 bSign
= extractFloatx80Sign( b
);
3707 if ( aSign
== bSign
) {
3708 return addFloatx80Sigs( a
, b
, aSign
);
3711 return subFloatx80Sigs( a
, b
, aSign
);
3717 -------------------------------------------------------------------------------
3718 Returns the result of subtracting the extended double-precision floating-
3719 point values `a' and `b'. The operation is performed according to the
3720 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3721 -------------------------------------------------------------------------------
3723 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
3727 aSign
= extractFloatx80Sign( a
);
3728 bSign
= extractFloatx80Sign( b
);
3729 if ( aSign
== bSign
) {
3730 return subFloatx80Sigs( a
, b
, aSign
);
3733 return addFloatx80Sigs( a
, b
, aSign
);
3739 -------------------------------------------------------------------------------
3740 Returns the result of multiplying the extended double-precision floating-
3741 point values `a' and `b'. The operation is performed according to the
3742 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3743 -------------------------------------------------------------------------------
3745 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
3747 flag aSign
, bSign
, zSign
;
3748 int32 aExp
, bExp
, zExp
;
3749 bits64 aSig
, bSig
, zSig0
, zSig1
;
3752 aSig
= extractFloatx80Frac( a
);
3753 aExp
= extractFloatx80Exp( a
);
3754 aSign
= extractFloatx80Sign( a
);
3755 bSig
= extractFloatx80Frac( b
);
3756 bExp
= extractFloatx80Exp( b
);
3757 bSign
= extractFloatx80Sign( b
);
3758 zSign
= aSign
^ bSign
;
3759 if ( aExp
== 0x7FFF ) {
3760 if ( (bits64
) ( aSig
<<1 )
3761 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3762 return propagateFloatx80NaN( a
, b
);
3764 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3765 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3767 if ( bExp
== 0x7FFF ) {
3768 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3769 if ( ( aExp
| aSig
) == 0 ) {
3771 float_raise( float_flag_invalid
);
3772 z
.low
= floatx80_default_nan_low
;
3773 z
.high
= floatx80_default_nan_high
;
3776 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3779 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3780 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3783 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3784 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3786 zExp
= aExp
+ bExp
- 0x3FFE;
3787 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3788 if ( 0 < (sbits64
) zSig0
) {
3789 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3793 roundAndPackFloatx80(
3794 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3799 -------------------------------------------------------------------------------
3800 Returns the result of dividing the extended double-precision floating-point
3801 value `a' by the corresponding value `b'. The operation is performed
3802 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3803 -------------------------------------------------------------------------------
3805 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
3807 flag aSign
, bSign
, zSign
;
3808 int32 aExp
, bExp
, zExp
;
3809 bits64 aSig
, bSig
, zSig0
, zSig1
;
3810 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3813 aSig
= extractFloatx80Frac( a
);
3814 aExp
= extractFloatx80Exp( a
);
3815 aSign
= extractFloatx80Sign( a
);
3816 bSig
= extractFloatx80Frac( b
);
3817 bExp
= extractFloatx80Exp( b
);
3818 bSign
= extractFloatx80Sign( b
);
3819 zSign
= aSign
^ bSign
;
3820 if ( aExp
== 0x7FFF ) {
3821 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3822 if ( bExp
== 0x7FFF ) {
3823 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3826 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3828 if ( bExp
== 0x7FFF ) {
3829 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3830 return packFloatx80( zSign
, 0, 0 );
3834 if ( ( aExp
| aSig
) == 0 ) {
3836 float_raise( float_flag_invalid
);
3837 z
.low
= floatx80_default_nan_low
;
3838 z
.high
= floatx80_default_nan_high
;
3841 float_raise( float_flag_divbyzero
);
3842 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3844 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3847 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3848 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3850 zExp
= aExp
- bExp
+ 0x3FFE;
3852 if ( bSig
<= aSig
) {
3853 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3856 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3857 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3858 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3859 while ( (sbits64
) rem0
< 0 ) {
3861 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3863 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3864 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3865 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3866 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3867 while ( (sbits64
) rem1
< 0 ) {
3869 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3871 zSig1
|= ( ( rem1
| rem2
) != 0 );
3874 roundAndPackFloatx80(
3875 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3880 -------------------------------------------------------------------------------
3881 Returns the remainder of the extended double-precision floating-point value
3882 `a' with respect to the corresponding value `b'. The operation is performed
3883 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3884 -------------------------------------------------------------------------------
3886 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3888 flag aSign
, bSign
, zSign
;
3889 int32 aExp
, bExp
, expDiff
;
3890 bits64 aSig0
, aSig1
, bSig
;
3891 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3894 aSig0
= extractFloatx80Frac( a
);
3895 aExp
= extractFloatx80Exp( a
);
3896 aSign
= extractFloatx80Sign( a
);
3897 bSig
= extractFloatx80Frac( b
);
3898 bExp
= extractFloatx80Exp( b
);
3899 bSign
= extractFloatx80Sign( b
);
3900 if ( aExp
== 0x7FFF ) {
3901 if ( (bits64
) ( aSig0
<<1 )
3902 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3903 return propagateFloatx80NaN( a
, b
);
3907 if ( bExp
== 0x7FFF ) {
3908 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3914 float_raise( float_flag_invalid
);
3915 z
.low
= floatx80_default_nan_low
;
3916 z
.high
= floatx80_default_nan_high
;
3919 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3922 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3923 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3925 bSig
|= LIT64( 0x8000000000000000 );
3927 expDiff
= aExp
- bExp
;
3929 if ( expDiff
< 0 ) {
3930 if ( expDiff
< -1 ) return a
;
3931 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3934 q
= ( bSig
<= aSig0
);
3935 if ( q
) aSig0
-= bSig
;
3937 while ( 0 < expDiff
) {
3938 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3939 q
= ( 2 < q
) ? q
- 2 : 0;
3940 mul64To128( bSig
, q
, &term0
, &term1
);
3941 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3942 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3946 if ( 0 < expDiff
) {
3947 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3948 q
= ( 2 < q
) ? q
- 2 : 0;
3950 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3951 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3952 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3953 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3955 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3962 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3963 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3964 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3967 aSig0
= alternateASig0
;
3968 aSig1
= alternateASig1
;
3972 normalizeRoundAndPackFloatx80(
3973 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3978 -------------------------------------------------------------------------------
3979 Returns the square root of the extended double-precision floating-point
3980 value `a'. The operation is performed according to the IEC/IEEE Standard
3981 for Binary Floating-Point Arithmetic.
3982 -------------------------------------------------------------------------------
3984 floatx80
floatx80_sqrt( floatx80 a
)
3988 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
3989 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3992 aSig0
= extractFloatx80Frac( a
);
3993 aExp
= extractFloatx80Exp( a
);
3994 aSign
= extractFloatx80Sign( a
);
3995 if ( aExp
== 0x7FFF ) {
3996 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
3997 if ( ! aSign
) return a
;
4001 if ( ( aExp
| aSig0
) == 0 ) return a
;
4003 float_raise( float_flag_invalid
);
4004 z
.low
= floatx80_default_nan_low
;
4005 z
.high
= floatx80_default_nan_high
;
4009 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4010 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4012 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4013 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4014 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4015 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4016 doubleZSig0
= zSig0
<<1;
4017 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4018 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4019 while ( (sbits64
) rem0
< 0 ) {
4022 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4024 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4025 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4026 if ( zSig1
== 0 ) zSig1
= 1;
4027 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4028 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4029 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4030 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4031 while ( (sbits64
) rem1
< 0 ) {
4033 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4035 term2
|= doubleZSig0
;
4036 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4038 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4040 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4041 zSig0
|= doubleZSig0
;
4043 roundAndPackFloatx80(
4044 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
4049 -------------------------------------------------------------------------------
4050 Returns 1 if the extended double-precision floating-point value `a' is
4051 equal to the corresponding value `b', and 0 otherwise. The comparison is
4052 performed according to the IEC/IEEE Standard for Binary Floating-Point
4054 -------------------------------------------------------------------------------
4056 flag
floatx80_eq( floatx80 a
, floatx80 b
)
4059 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4060 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4061 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4062 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4064 if ( floatx80_is_signaling_nan( a
)
4065 || floatx80_is_signaling_nan( b
) ) {
4066 float_raise( float_flag_invalid
);
4072 && ( ( a
.high
== b
.high
)
4074 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4080 -------------------------------------------------------------------------------
4081 Returns 1 if the extended double-precision floating-point value `a' is
4082 less than or equal to the corresponding value `b', and 0 otherwise. The
4083 comparison is performed according to the IEC/IEEE Standard for Binary
4084 Floating-Point Arithmetic.
4085 -------------------------------------------------------------------------------
4087 flag
floatx80_le( floatx80 a
, floatx80 b
)
4091 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4092 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4093 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4094 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4096 float_raise( float_flag_invalid
);
4099 aSign
= extractFloatx80Sign( a
);
4100 bSign
= extractFloatx80Sign( b
);
4101 if ( aSign
!= bSign
) {
4104 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4108 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4109 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4114 -------------------------------------------------------------------------------
4115 Returns 1 if the extended double-precision floating-point value `a' is
4116 less than the corresponding value `b', and 0 otherwise. The comparison
4117 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4119 -------------------------------------------------------------------------------
4121 flag
floatx80_lt( floatx80 a
, floatx80 b
)
4125 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4126 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4127 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4128 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4130 float_raise( float_flag_invalid
);
4133 aSign
= extractFloatx80Sign( a
);
4134 bSign
= extractFloatx80Sign( b
);
4135 if ( aSign
!= bSign
) {
4138 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4142 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4143 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4148 -------------------------------------------------------------------------------
4149 Returns 1 if the extended double-precision floating-point value `a' is equal
4150 to the corresponding value `b', and 0 otherwise. The invalid exception is
4151 raised if either operand is a NaN. Otherwise, the comparison is performed
4152 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4153 -------------------------------------------------------------------------------
4155 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
4158 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4159 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4160 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4161 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4163 float_raise( float_flag_invalid
);
4168 && ( ( a
.high
== b
.high
)
4170 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4176 -------------------------------------------------------------------------------
4177 Returns 1 if the extended double-precision floating-point value `a' is less
4178 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4179 do not cause an exception. Otherwise, the comparison is performed according
4180 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181 -------------------------------------------------------------------------------
4183 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
4187 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4188 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4189 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4190 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4192 if ( floatx80_is_signaling_nan( a
)
4193 || floatx80_is_signaling_nan( b
) ) {
4194 float_raise( float_flag_invalid
);
4198 aSign
= extractFloatx80Sign( a
);
4199 bSign
= extractFloatx80Sign( b
);
4200 if ( aSign
!= bSign
) {
4203 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4207 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4208 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4213 -------------------------------------------------------------------------------
4214 Returns 1 if the extended double-precision floating-point value `a' is less
4215 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4216 an exception. Otherwise, the comparison is performed according to the
4217 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4218 -------------------------------------------------------------------------------
4220 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
4224 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4225 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4226 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4227 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4229 if ( floatx80_is_signaling_nan( a
)
4230 || floatx80_is_signaling_nan( b
) ) {
4231 float_raise( float_flag_invalid
);
4235 aSign
= extractFloatx80Sign( a
);
4236 bSign
= extractFloatx80Sign( b
);
4237 if ( aSign
!= bSign
) {
4240 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4244 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4245 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4254 -------------------------------------------------------------------------------
4255 Returns the result of converting the quadruple-precision floating-point
4256 value `a' to the 32-bit two's complement integer format. The conversion
4257 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4258 Arithmetic---which means in particular that the conversion is rounded
4259 according to the current rounding mode. If `a' is a NaN, the largest
4260 positive integer is returned. Otherwise, if the conversion overflows, the
4261 largest integer with the same sign as `a' is returned.
4262 -------------------------------------------------------------------------------
4264 int32
float128_to_int32( float128 a
)
4267 int32 aExp
, shiftCount
;
4268 bits64 aSig0
, aSig1
;
4270 aSig1
= extractFloat128Frac1( a
);
4271 aSig0
= extractFloat128Frac0( a
);
4272 aExp
= extractFloat128Exp( a
);
4273 aSign
= extractFloat128Sign( a
);
4274 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4275 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4276 aSig0
|= ( aSig1
!= 0 );
4277 shiftCount
= 0x4028 - aExp
;
4278 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4279 return roundAndPackInt32( aSign
, aSig0
);
4284 -------------------------------------------------------------------------------
4285 Returns the result of converting the quadruple-precision floating-point
4286 value `a' to the 32-bit two's complement integer format. The conversion
4287 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4288 Arithmetic, except that the conversion is always rounded toward zero. If
4289 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4290 conversion overflows, the largest integer with the same sign as `a' is
4292 -------------------------------------------------------------------------------
4294 int32
float128_to_int32_round_to_zero( float128 a
)
4297 int32 aExp
, shiftCount
;
4298 bits64 aSig0
, aSig1
, savedASig
;
4301 aSig1
= extractFloat128Frac1( a
);
4302 aSig0
= extractFloat128Frac0( a
);
4303 aExp
= extractFloat128Exp( a
);
4304 aSign
= extractFloat128Sign( a
);
4305 aSig0
|= ( aSig1
!= 0 );
4306 if ( 0x401E < aExp
) {
4307 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4310 else if ( aExp
< 0x3FFF ) {
4311 if ( aExp
|| aSig0
) float_exception_flags
|= float_flag_inexact
;
4314 aSig0
|= LIT64( 0x0001000000000000 );
4315 shiftCount
= 0x402F - aExp
;
4317 aSig0
>>= shiftCount
;
4319 if ( aSign
) z
= - z
;
4320 if ( ( z
< 0 ) ^ aSign
) {
4322 float_raise( float_flag_invalid
);
4323 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4325 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4326 float_exception_flags
|= float_flag_inexact
;
4333 -------------------------------------------------------------------------------
4334 Returns the result of converting the quadruple-precision floating-point
4335 value `a' to the 64-bit two's complement integer format. The conversion
4336 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337 Arithmetic---which means in particular that the conversion is rounded
4338 according to the current rounding mode. If `a' is a NaN, the largest
4339 positive integer is returned. Otherwise, if the conversion overflows, the
4340 largest integer with the same sign as `a' is returned.
4341 -------------------------------------------------------------------------------
4343 int64
float128_to_int64( float128 a
)
4346 int32 aExp
, shiftCount
;
4347 bits64 aSig0
, aSig1
;
4349 aSig1
= extractFloat128Frac1( a
);
4350 aSig0
= extractFloat128Frac0( a
);
4351 aExp
= extractFloat128Exp( a
);
4352 aSign
= extractFloat128Sign( a
);
4353 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4354 shiftCount
= 0x402F - aExp
;
4355 if ( shiftCount
<= 0 ) {
4356 if ( 0x403E < aExp
) {
4357 float_raise( float_flag_invalid
);
4359 || ( ( aExp
== 0x7FFF )
4360 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4363 return LIT64( 0x7FFFFFFFFFFFFFFF );
4365 return (sbits64
) LIT64( 0x8000000000000000 );
4367 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4370 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4372 return roundAndPackInt64( aSign
, aSig0
, aSig1
);
4377 -------------------------------------------------------------------------------
4378 Returns the result of converting the quadruple-precision floating-point
4379 value `a' to the 64-bit two's complement integer format. The conversion
4380 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4381 Arithmetic, except that the conversion is always rounded toward zero.
4382 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4383 the conversion overflows, the largest integer with the same sign as `a' is
4385 -------------------------------------------------------------------------------
4387 int64
float128_to_int64_round_to_zero( float128 a
)
4390 int32 aExp
, shiftCount
;
4391 bits64 aSig0
, aSig1
;
4394 aSig1
= extractFloat128Frac1( a
);
4395 aSig0
= extractFloat128Frac0( a
);
4396 aExp
= extractFloat128Exp( a
);
4397 aSign
= extractFloat128Sign( a
);
4398 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4399 shiftCount
= aExp
- 0x402F;
4400 if ( 0 < shiftCount
) {
4401 if ( 0x403E <= aExp
) {
4402 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4403 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4404 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4405 if ( aSig1
) float_exception_flags
|= float_flag_inexact
;
4408 float_raise( float_flag_invalid
);
4409 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4410 return LIT64( 0x7FFFFFFFFFFFFFFF );
4413 return (sbits64
) LIT64( 0x8000000000000000 );
4415 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4416 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4417 float_exception_flags
|= float_flag_inexact
;
4421 if ( aExp
< 0x3FFF ) {
4422 if ( aExp
| aSig0
| aSig1
) {
4423 float_exception_flags
|= float_flag_inexact
;
4427 z
= aSig0
>>( - shiftCount
);
4429 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4430 float_exception_flags
|= float_flag_inexact
;
4433 if ( aSign
) z
= - z
;
4439 * just like above - but do not care for overflow of signed results
4441 uint64
float128_to_uint64_round_to_zero( float128 a
)
4444 int32 aExp
, shiftCount
;
4445 bits64 aSig0
, aSig1
;
4448 aSig1
= extractFloat128Frac1( a
);
4449 aSig0
= extractFloat128Frac0( a
);
4450 aExp
= extractFloat128Exp( a
);
4451 aSign
= extractFloat128Sign( a
);
4452 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4453 shiftCount
= aExp
- 0x402F;
4454 if ( 0 < shiftCount
) {
4455 if ( 0x403F <= aExp
) {
4456 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4457 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4458 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4459 if ( aSig1
) float_exception_flags
|= float_flag_inexact
;
4462 float_raise( float_flag_invalid
);
4464 return LIT64( 0xFFFFFFFFFFFFFFFF );
4466 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4467 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4468 float_exception_flags
|= float_flag_inexact
;
4472 if ( aExp
< 0x3FFF ) {
4473 if ( aExp
| aSig0
| aSig1
) {
4474 float_exception_flags
|= float_flag_inexact
;
4478 z
= aSig0
>>( - shiftCount
);
4479 if (aSig1
|| ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4480 float_exception_flags
|= float_flag_inexact
;
4483 if ( aSign
) z
= - z
;
4489 -------------------------------------------------------------------------------
4490 Returns the result of converting the quadruple-precision floating-point
4491 value `a' to the single-precision floating-point format. The conversion
4492 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4494 -------------------------------------------------------------------------------
4496 float32
float128_to_float32( float128 a
)
4500 bits64 aSig0
, aSig1
;
4503 aSig1
= extractFloat128Frac1( a
);
4504 aSig0
= extractFloat128Frac0( a
);
4505 aExp
= extractFloat128Exp( a
);
4506 aSign
= extractFloat128Sign( a
);
4507 if ( aExp
== 0x7FFF ) {
4508 if ( aSig0
| aSig1
) {
4509 return commonNaNToFloat32( float128ToCommonNaN( a
) );
4511 return packFloat32( aSign
, 0xFF, 0 );
4513 aSig0
|= ( aSig1
!= 0 );
4514 shift64RightJamming( aSig0
, 18, &aSig0
);
4516 if ( aExp
|| zSig
) {
4520 return roundAndPackFloat32( aSign
, aExp
, zSig
);
4525 -------------------------------------------------------------------------------
4526 Returns the result of converting the quadruple-precision floating-point
4527 value `a' to the double-precision floating-point format. The conversion
4528 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4530 -------------------------------------------------------------------------------
4532 float64
float128_to_float64( float128 a
)
4536 bits64 aSig0
, aSig1
;
4538 aSig1
= extractFloat128Frac1( a
);
4539 aSig0
= extractFloat128Frac0( a
);
4540 aExp
= extractFloat128Exp( a
);
4541 aSign
= extractFloat128Sign( a
);
4542 if ( aExp
== 0x7FFF ) {
4543 if ( aSig0
| aSig1
) {
4544 return commonNaNToFloat64( float128ToCommonNaN( a
) );
4546 return packFloat64( aSign
, 0x7FF, 0 );
4548 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4549 aSig0
|= ( aSig1
!= 0 );
4550 if ( aExp
|| aSig0
) {
4551 aSig0
|= LIT64( 0x4000000000000000 );
4554 return roundAndPackFloat64( aSign
, aExp
, aSig0
);
4561 -------------------------------------------------------------------------------
4562 Returns the result of converting the quadruple-precision floating-point
4563 value `a' to the extended double-precision floating-point format. The
4564 conversion is performed according to the IEC/IEEE Standard for Binary
4565 Floating-Point Arithmetic.
4566 -------------------------------------------------------------------------------
4568 floatx80
float128_to_floatx80( float128 a
)
4572 bits64 aSig0
, aSig1
;
4574 aSig1
= extractFloat128Frac1( a
);
4575 aSig0
= extractFloat128Frac0( a
);
4576 aExp
= extractFloat128Exp( a
);
4577 aSign
= extractFloat128Sign( a
);
4578 if ( aExp
== 0x7FFF ) {
4579 if ( aSig0
| aSig1
) {
4580 return commonNaNToFloatx80( float128ToCommonNaN( a
) );
4582 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4585 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4586 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4589 aSig0
|= LIT64( 0x0001000000000000 );
4591 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4592 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1
);
4599 -------------------------------------------------------------------------------
4600 Rounds the quadruple-precision floating-point value `a' to an integer, and
4601 returns the result as a quadruple-precision floating-point value. The
4602 operation is performed according to the IEC/IEEE Standard for Binary
4603 Floating-Point Arithmetic.
4604 -------------------------------------------------------------------------------
4606 float128
float128_round_to_int( float128 a
)
4610 bits64 lastBitMask
, roundBitsMask
;
4614 aExp
= extractFloat128Exp( a
);
4615 if ( 0x402F <= aExp
) {
4616 if ( 0x406F <= aExp
) {
4617 if ( ( aExp
== 0x7FFF )
4618 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4620 return propagateFloat128NaN( a
, a
);
4625 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4626 roundBitsMask
= lastBitMask
- 1;
4628 roundingMode
= float_rounding_mode
;
4629 if ( roundingMode
== float_round_nearest_even
) {
4630 if ( lastBitMask
) {
4631 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4632 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4635 if ( (sbits64
) z
.low
< 0 ) {
4637 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4641 else if ( roundingMode
!= float_round_to_zero
) {
4642 if ( extractFloat128Sign( z
)
4643 ^ ( roundingMode
== float_round_up
) ) {
4644 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4647 z
.low
&= ~ roundBitsMask
;
4650 if ( aExp
< 0x3FFF ) {
4651 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4652 float_exception_flags
|= float_flag_inexact
;
4653 aSign
= extractFloat128Sign( a
);
4654 switch ( float_rounding_mode
) {
4655 case float_round_nearest_even
:
4656 if ( ( aExp
== 0x3FFE )
4657 && ( extractFloat128Frac0( a
)
4658 | extractFloat128Frac1( a
) )
4660 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4663 case float_round_to_zero
:
4665 case float_round_down
:
4667 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4668 : packFloat128( 0, 0, 0, 0 );
4669 case float_round_up
:
4671 aSign
? packFloat128( 1, 0, 0, 0 )
4672 : packFloat128( 0, 0x3FFF, 0, 0 );
4674 return packFloat128( aSign
, 0, 0, 0 );
4677 lastBitMask
<<= 0x402F - aExp
;
4678 roundBitsMask
= lastBitMask
- 1;
4681 roundingMode
= float_rounding_mode
;
4682 if ( roundingMode
== float_round_nearest_even
) {
4683 z
.high
+= lastBitMask
>>1;
4684 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4685 z
.high
&= ~ lastBitMask
;
4688 else if ( roundingMode
!= float_round_to_zero
) {
4689 if ( extractFloat128Sign( z
)
4690 ^ ( roundingMode
== float_round_up
) ) {
4691 z
.high
|= ( a
.low
!= 0 );
4692 z
.high
+= roundBitsMask
;
4695 z
.high
&= ~ roundBitsMask
;
4697 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4698 float_exception_flags
|= float_flag_inexact
;
4705 -------------------------------------------------------------------------------
4706 Returns the result of adding the absolute values of the quadruple-precision
4707 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4708 before being returned. `zSign' is ignored if the result is a NaN.
4709 The addition is performed according to the IEC/IEEE Standard for Binary
4710 Floating-Point Arithmetic.
4711 -------------------------------------------------------------------------------
4713 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4715 int32 aExp
, bExp
, zExp
;
4716 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4719 aSig1
= extractFloat128Frac1( a
);
4720 aSig0
= extractFloat128Frac0( a
);
4721 aExp
= extractFloat128Exp( a
);
4722 bSig1
= extractFloat128Frac1( b
);
4723 bSig0
= extractFloat128Frac0( b
);
4724 bExp
= extractFloat128Exp( b
);
4725 expDiff
= aExp
- bExp
;
4726 if ( 0 < expDiff
) {
4727 if ( aExp
== 0x7FFF ) {
4728 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4735 bSig0
|= LIT64( 0x0001000000000000 );
4737 shift128ExtraRightJamming(
4738 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4741 else if ( expDiff
< 0 ) {
4742 if ( bExp
== 0x7FFF ) {
4743 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4744 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4750 aSig0
|= LIT64( 0x0001000000000000 );
4752 shift128ExtraRightJamming(
4753 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4757 if ( aExp
== 0x7FFF ) {
4758 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4759 return propagateFloat128NaN( a
, b
);
4763 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4764 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4766 zSig0
|= LIT64( 0x0002000000000000 );
4770 aSig0
|= LIT64( 0x0001000000000000 );
4771 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4773 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4776 shift128ExtraRightJamming(
4777 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4779 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4784 -------------------------------------------------------------------------------
4785 Returns the result of subtracting the absolute values of the quadruple-
4786 precision floating-point values `a' and `b'. If `zSign' is 1, the
4787 difference is negated before being returned. `zSign' is ignored if the
4788 result is a NaN. The subtraction is performed according to the IEC/IEEE
4789 Standard for Binary Floating-Point Arithmetic.
4790 -------------------------------------------------------------------------------
4792 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4794 int32 aExp
, bExp
, zExp
;
4795 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4799 aSig1
= extractFloat128Frac1( a
);
4800 aSig0
= extractFloat128Frac0( a
);
4801 aExp
= extractFloat128Exp( a
);
4802 bSig1
= extractFloat128Frac1( b
);
4803 bSig0
= extractFloat128Frac0( b
);
4804 bExp
= extractFloat128Exp( b
);
4805 expDiff
= aExp
- bExp
;
4806 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4807 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4808 if ( 0 < expDiff
) goto aExpBigger
;
4809 if ( expDiff
< 0 ) goto bExpBigger
;
4810 if ( aExp
== 0x7FFF ) {
4811 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4812 return propagateFloat128NaN( a
, b
);
4814 float_raise( float_flag_invalid
);
4815 z
.low
= float128_default_nan_low
;
4816 z
.high
= float128_default_nan_high
;
4823 if ( bSig0
< aSig0
) goto aBigger
;
4824 if ( aSig0
< bSig0
) goto bBigger
;
4825 if ( bSig1
< aSig1
) goto aBigger
;
4826 if ( aSig1
< bSig1
) goto bBigger
;
4827 return packFloat128( float_rounding_mode
== float_round_down
, 0, 0, 0 );
4829 if ( bExp
== 0x7FFF ) {
4830 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4831 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4837 aSig0
|= LIT64( 0x4000000000000000 );
4839 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4840 bSig0
|= LIT64( 0x4000000000000000 );
4842 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4845 goto normalizeRoundAndPack
;
4847 if ( aExp
== 0x7FFF ) {
4848 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4855 bSig0
|= LIT64( 0x4000000000000000 );
4857 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4858 aSig0
|= LIT64( 0x4000000000000000 );
4860 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4862 normalizeRoundAndPack
:
4864 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1
);
4869 -------------------------------------------------------------------------------
4870 Returns the result of adding the quadruple-precision floating-point values
4871 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4872 for Binary Floating-Point Arithmetic.
4873 -------------------------------------------------------------------------------
4875 float128
float128_add( float128 a
, float128 b
)
4879 aSign
= extractFloat128Sign( a
);
4880 bSign
= extractFloat128Sign( b
);
4881 if ( aSign
== bSign
) {
4882 return addFloat128Sigs( a
, b
, aSign
);
4885 return subFloat128Sigs( a
, b
, aSign
);
4891 -------------------------------------------------------------------------------
4892 Returns the result of subtracting the quadruple-precision floating-point
4893 values `a' and `b'. The operation is performed according to the IEC/IEEE
4894 Standard for Binary Floating-Point Arithmetic.
4895 -------------------------------------------------------------------------------
4897 float128
float128_sub( float128 a
, float128 b
)
4901 aSign
= extractFloat128Sign( a
);
4902 bSign
= extractFloat128Sign( b
);
4903 if ( aSign
== bSign
) {
4904 return subFloat128Sigs( a
, b
, aSign
);
4907 return addFloat128Sigs( a
, b
, aSign
);
4913 -------------------------------------------------------------------------------
4914 Returns the result of multiplying the quadruple-precision floating-point
4915 values `a' and `b'. The operation is performed according to the IEC/IEEE
4916 Standard for Binary Floating-Point Arithmetic.
4917 -------------------------------------------------------------------------------
4919 float128
float128_mul( float128 a
, float128 b
)
4921 flag aSign
, bSign
, zSign
;
4922 int32 aExp
, bExp
, zExp
;
4923 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4926 aSig1
= extractFloat128Frac1( a
);
4927 aSig0
= extractFloat128Frac0( a
);
4928 aExp
= extractFloat128Exp( a
);
4929 aSign
= extractFloat128Sign( a
);
4930 bSig1
= extractFloat128Frac1( b
);
4931 bSig0
= extractFloat128Frac0( b
);
4932 bExp
= extractFloat128Exp( b
);
4933 bSign
= extractFloat128Sign( b
);
4934 zSign
= aSign
^ bSign
;
4935 if ( aExp
== 0x7FFF ) {
4936 if ( ( aSig0
| aSig1
)
4937 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4938 return propagateFloat128NaN( a
, b
);
4940 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4941 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4943 if ( bExp
== 0x7FFF ) {
4944 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4945 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4947 float_raise( float_flag_invalid
);
4948 z
.low
= float128_default_nan_low
;
4949 z
.high
= float128_default_nan_high
;
4952 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4955 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4956 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4959 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4960 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4962 zExp
= aExp
+ bExp
- 0x4000;
4963 aSig0
|= LIT64( 0x0001000000000000 );
4964 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
4965 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
4966 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4967 zSig2
|= ( zSig3
!= 0 );
4968 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
4969 shift128ExtraRightJamming(
4970 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4973 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4978 -------------------------------------------------------------------------------
4979 Returns the result of dividing the quadruple-precision floating-point value
4980 `a' by the corresponding value `b'. The operation is performed according to
4981 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4982 -------------------------------------------------------------------------------
4984 float128
float128_div( float128 a
, float128 b
)
4986 flag aSign
, bSign
, zSign
;
4987 int32 aExp
, bExp
, zExp
;
4988 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4989 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4992 aSig1
= extractFloat128Frac1( a
);
4993 aSig0
= extractFloat128Frac0( a
);
4994 aExp
= extractFloat128Exp( a
);
4995 aSign
= extractFloat128Sign( a
);
4996 bSig1
= extractFloat128Frac1( b
);
4997 bSig0
= extractFloat128Frac0( b
);
4998 bExp
= extractFloat128Exp( b
);
4999 bSign
= extractFloat128Sign( b
);
5000 zSign
= aSign
^ bSign
;
5001 if ( aExp
== 0x7FFF ) {
5002 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
5003 if ( bExp
== 0x7FFF ) {
5004 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5007 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5009 if ( bExp
== 0x7FFF ) {
5010 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5011 return packFloat128( zSign
, 0, 0, 0 );
5014 if ( ( bSig0
| bSig1
) == 0 ) {
5015 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5017 float_raise( float_flag_invalid
);
5018 z
.low
= float128_default_nan_low
;
5019 z
.high
= float128_default_nan_high
;
5022 float_raise( float_flag_divbyzero
);
5023 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5025 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5028 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5029 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5031 zExp
= aExp
- bExp
+ 0x3FFD;
5033 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5035 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5036 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5037 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5040 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5041 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5042 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5043 while ( (sbits64
) rem0
< 0 ) {
5045 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5047 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5048 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5049 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5050 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5051 while ( (sbits64
) rem1
< 0 ) {
5053 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5055 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5057 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5058 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
5063 -------------------------------------------------------------------------------
5064 Returns the remainder of the quadruple-precision floating-point value `a'
5065 with respect to the corresponding value `b'. The operation is performed
5066 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5067 -------------------------------------------------------------------------------
5069 float128
float128_rem( float128 a
, float128 b
)
5071 flag aSign
, bSign
, zSign
;
5072 int32 aExp
, bExp
, expDiff
;
5073 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5074 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5078 aSig1
= extractFloat128Frac1( a
);
5079 aSig0
= extractFloat128Frac0( a
);
5080 aExp
= extractFloat128Exp( a
);
5081 aSign
= extractFloat128Sign( a
);
5082 bSig1
= extractFloat128Frac1( b
);
5083 bSig0
= extractFloat128Frac0( b
);
5084 bExp
= extractFloat128Exp( b
);
5085 bSign
= extractFloat128Sign( b
);
5086 if ( aExp
== 0x7FFF ) {
5087 if ( ( aSig0
| aSig1
)
5088 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5089 return propagateFloat128NaN( a
, b
);
5093 if ( bExp
== 0x7FFF ) {
5094 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5098 if ( ( bSig0
| bSig1
) == 0 ) {
5100 float_raise( float_flag_invalid
);
5101 z
.low
= float128_default_nan_low
;
5102 z
.high
= float128_default_nan_high
;
5105 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5108 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5109 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5111 expDiff
= aExp
- bExp
;
5112 if ( expDiff
< -1 ) return a
;
5114 aSig0
| LIT64( 0x0001000000000000 ),
5116 15 - ( expDiff
< 0 ),
5121 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5122 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5123 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5125 while ( 0 < expDiff
) {
5126 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5127 q
= ( 4 < q
) ? q
- 4 : 0;
5128 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5129 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5130 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5131 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5134 if ( -64 < expDiff
) {
5135 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5136 q
= ( 4 < q
) ? q
- 4 : 0;
5138 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5140 if ( expDiff
< 0 ) {
5141 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5144 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5146 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5147 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5150 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5151 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5154 alternateASig0
= aSig0
;
5155 alternateASig1
= aSig1
;
5157 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5158 } while ( 0 <= (sbits64
) aSig0
);
5160 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5161 if ( ( sigMean0
< 0 )
5162 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5163 aSig0
= alternateASig0
;
5164 aSig1
= alternateASig1
;
5166 zSign
= ( (sbits64
) aSig0
< 0 );
5167 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5169 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
5174 -------------------------------------------------------------------------------
5175 Returns the square root of the quadruple-precision floating-point value `a'.
5176 The operation is performed according to the IEC/IEEE Standard for Binary
5177 Floating-Point Arithmetic.
5178 -------------------------------------------------------------------------------
5180 float128
float128_sqrt( float128 a
)
5184 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5185 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5188 aSig1
= extractFloat128Frac1( a
);
5189 aSig0
= extractFloat128Frac0( a
);
5190 aExp
= extractFloat128Exp( a
);
5191 aSign
= extractFloat128Sign( a
);
5192 if ( aExp
== 0x7FFF ) {
5193 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a
);
5194 if ( ! aSign
) return a
;
5198 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5200 float_raise( float_flag_invalid
);
5201 z
.low
= float128_default_nan_low
;
5202 z
.high
= float128_default_nan_high
;
5206 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5207 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5209 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5210 aSig0
|= LIT64( 0x0001000000000000 );
5211 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5212 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5213 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5214 doubleZSig0
= zSig0
<<1;
5215 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5216 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5217 while ( (sbits64
) rem0
< 0 ) {
5220 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5222 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5223 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5224 if ( zSig1
== 0 ) zSig1
= 1;
5225 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5226 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5227 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5228 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5229 while ( (sbits64
) rem1
< 0 ) {
5231 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5233 term2
|= doubleZSig0
;
5234 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5236 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5238 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5239 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2
);
5244 -------------------------------------------------------------------------------
5245 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5246 the corresponding value `b', and 0 otherwise. The comparison is performed
5247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5248 -------------------------------------------------------------------------------
5250 flag
float128_eq( float128 a
, float128 b
)
5253 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5254 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5255 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5256 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5258 if ( float128_is_signaling_nan( a
)
5259 || float128_is_signaling_nan( b
) ) {
5260 float_raise( float_flag_invalid
);
5266 && ( ( a
.high
== b
.high
)
5268 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5274 -------------------------------------------------------------------------------
5275 Returns 1 if the quadruple-precision floating-point value `a' is less than
5276 or equal to the corresponding value `b', and 0 otherwise. The comparison
5277 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5279 -------------------------------------------------------------------------------
5281 flag
float128_le( float128 a
, float128 b
)
5285 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5286 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5287 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5288 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5290 float_raise( float_flag_invalid
);
5293 aSign
= extractFloat128Sign( a
);
5294 bSign
= extractFloat128Sign( b
);
5295 if ( aSign
!= bSign
) {
5298 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5302 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5303 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5308 -------------------------------------------------------------------------------
5309 Returns 1 if the quadruple-precision floating-point value `a' is less than
5310 the corresponding value `b', and 0 otherwise. The comparison is performed
5311 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5312 -------------------------------------------------------------------------------
5314 flag
float128_lt( float128 a
, float128 b
)
5318 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5319 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5320 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5321 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5323 float_raise( float_flag_invalid
);
5326 aSign
= extractFloat128Sign( a
);
5327 bSign
= extractFloat128Sign( b
);
5328 if ( aSign
!= bSign
) {
5331 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5335 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5336 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5341 -------------------------------------------------------------------------------
5342 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5343 the corresponding value `b', and 0 otherwise. The invalid exception is
5344 raised if either operand is a NaN. Otherwise, the comparison is performed
5345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5346 -------------------------------------------------------------------------------
5348 flag
float128_eq_signaling( float128 a
, float128 b
)
5351 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5352 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5353 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5354 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5356 float_raise( float_flag_invalid
);
5361 && ( ( a
.high
== b
.high
)
5363 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5369 -------------------------------------------------------------------------------
5370 Returns 1 if the quadruple-precision floating-point value `a' is less than
5371 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5372 cause an exception. Otherwise, the comparison is performed according to the
5373 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5374 -------------------------------------------------------------------------------
5376 flag
float128_le_quiet( float128 a
, float128 b
)
5380 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5381 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5382 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5383 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5385 if ( float128_is_signaling_nan( a
)
5386 || float128_is_signaling_nan( b
) ) {
5387 float_raise( float_flag_invalid
);
5391 aSign
= extractFloat128Sign( a
);
5392 bSign
= extractFloat128Sign( b
);
5393 if ( aSign
!= bSign
) {
5396 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5400 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5401 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5406 -------------------------------------------------------------------------------
5407 Returns 1 if the quadruple-precision floating-point value `a' is less than
5408 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5409 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5410 Standard for Binary Floating-Point Arithmetic.
5411 -------------------------------------------------------------------------------
5413 flag
float128_lt_quiet( float128 a
, float128 b
)
5417 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5418 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5419 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5420 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5422 if ( float128_is_signaling_nan( a
)
5423 || float128_is_signaling_nan( b
) ) {
5424 float_raise( float_flag_invalid
);
5428 aSign
= extractFloat128Sign( a
);
5429 bSign
= extractFloat128Sign( b
);
5430 if ( aSign
!= bSign
) {
5433 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5437 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5438 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5445 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5448 * These two routines are not part of the original softfloat distribution.
5450 * They are based on the corresponding conversions to integer but return
5451 * unsigned numbers instead since these functions are required by GCC.
5453 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5455 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5459 -------------------------------------------------------------------------------
5460 Returns the result of converting the double-precision floating-point value
5461 `a' to the 32-bit unsigned integer format. The conversion is
5462 performed according to the IEC/IEEE Standard for Binary Floating-point
5463 Arithmetic, except that the conversion is always rounded toward zero. If
5464 `a' is a NaN, the largest positive integer is returned. If the conversion
5465 overflows, the largest integer positive is returned.
5466 -------------------------------------------------------------------------------
5468 uint32
float64_to_uint32_round_to_zero( float64 a
)
5471 int16 aExp
, shiftCount
;
5472 bits64 aSig
, savedASig
;
5475 aSig
= extractFloat64Frac( a
);
5476 aExp
= extractFloat64Exp( a
);
5477 aSign
= extractFloat64Sign( a
);
5480 float_raise( float_flag_invalid
);
5484 if ( 0x41E < aExp
) {
5485 float_raise( float_flag_invalid
);
5488 else if ( aExp
< 0x3FF ) {
5489 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
5492 aSig
|= LIT64( 0x0010000000000000 );
5493 shiftCount
= 0x433 - aExp
;
5495 aSig
>>= shiftCount
;
5497 if ( ( aSig
<<shiftCount
) != savedASig
) {
5498 float_exception_flags
|= float_flag_inexact
;
5505 -------------------------------------------------------------------------------
5506 Returns the result of converting the single-precision floating-point value
5507 `a' to the 32-bit unsigned integer format. The conversion is
5508 performed according to the IEC/IEEE Standard for Binary Floating-point
5509 Arithmetic, except that the conversion is always rounded toward zero. If
5510 `a' is a NaN, the largest positive integer is returned. If the conversion
5511 overflows, the largest positive integer is returned.
5512 -------------------------------------------------------------------------------
5514 uint32
float32_to_uint32_round_to_zero( float32 a
)
5517 int16 aExp
, shiftCount
;
5521 aSig
= extractFloat32Frac( a
);
5522 aExp
= extractFloat32Exp( a
);
5523 aSign
= extractFloat32Sign( a
);
5524 shiftCount
= aExp
- 0x9E;
5527 float_raise( float_flag_invalid
);
5530 if ( 0 < shiftCount
) {
5531 float_raise( float_flag_invalid
);
5534 else if ( aExp
<= 0x7E ) {
5535 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
5538 aSig
= ( aSig
| 0x800000 )<<8;
5539 z
= aSig
>>( - shiftCount
);
5540 if ( aSig
<<( shiftCount
& 31 ) ) {
5541 float_exception_flags
|= float_flag_inexact
;