1 /* $NetBSD: softfloat.c,v 1.4 2007/11/08 15:50:24 martin Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.4 2007/11/08 15:50:24 martin Exp $");
50 #endif /* LIBC_SCCS and not lint */
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
57 #include "softfloat.h"
60 * Conversions between floats as stored in memory and floats as
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a) (a)
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a) (a)
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
74 -------------------------------------------------------------------------------
76 fp_rnd float_rounding_mode
= float_round_nearest_even
;
77 fp_except float_exception_flags
= 0;
79 int8 floatx80_rounding_precision
= 80;
83 -------------------------------------------------------------------------------
84 Primitive arithmetic functions, including multi-word arithmetic, and
85 division and square root approximations. (Can be specialized to target if
87 -------------------------------------------------------------------------------
89 #include "softfloat-macros"
92 -------------------------------------------------------------------------------
93 Functions and definitions to determine: (1) whether tininess for underflow
94 is detected before or after rounding by default, (2) what (if anything)
95 happens when exceptions are raised, (3) how signaling NaNs are distinguished
96 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
97 are propagated from function inputs to output. These details are target-
99 -------------------------------------------------------------------------------
101 #include "softfloat-specialize"
103 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
105 -------------------------------------------------------------------------------
106 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
107 and 7, and returns the properly rounded 32-bit integer corresponding to the
108 input. If `zSign' is 1, the input is negated before being converted to an
109 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
110 is simply rounded to an integer, with the inexact exception raised if the
111 input cannot be represented exactly as an integer. However, if the fixed-
112 point input is too large, the invalid exception is raised and the largest
113 positive or negative integer is returned.
114 -------------------------------------------------------------------------------
116 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
119 flag roundNearestEven
;
120 int8 roundIncrement
, roundBits
;
123 roundingMode
= float_rounding_mode
;
124 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
125 roundIncrement
= 0x40;
126 if ( ! roundNearestEven
) {
127 if ( roundingMode
== float_round_to_zero
) {
131 roundIncrement
= 0x7F;
133 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
136 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
140 roundBits
= absZ
& 0x7F;
141 absZ
= ( absZ
+ roundIncrement
)>>7;
142 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
144 if ( zSign
) z
= - z
;
145 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
146 float_raise( float_flag_invalid
);
147 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
149 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
155 -------------------------------------------------------------------------------
156 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
157 `absZ1', with binary point between bits 63 and 64 (between the input words),
158 and returns the properly rounded 64-bit integer corresponding to the input.
159 If `zSign' is 1, the input is negated before being converted to an integer.
160 Ordinarily, the fixed-point input is simply rounded to an integer, with
161 the inexact exception raised if the input cannot be represented exactly as
162 an integer. However, if the fixed-point input is too large, the invalid
163 exception is raised and the largest positive or negative integer is
165 -------------------------------------------------------------------------------
167 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1
)
170 flag roundNearestEven
, increment
;
173 roundingMode
= float_rounding_mode
;
174 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
175 increment
= ( (sbits64
) absZ1
< 0 );
176 if ( ! roundNearestEven
) {
177 if ( roundingMode
== float_round_to_zero
) {
182 increment
= ( roundingMode
== float_round_down
) && absZ1
;
185 increment
= ( roundingMode
== float_round_up
) && absZ1
;
191 if ( absZ0
== 0 ) goto overflow
;
192 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
195 if ( zSign
) z
= - z
;
196 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
198 float_raise( float_flag_invalid
);
200 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
201 : LIT64( 0x7FFFFFFFFFFFFFFF );
203 if ( absZ1
) float_exception_flags
|= float_flag_inexact
;
210 -------------------------------------------------------------------------------
211 Returns the fraction bits of the single-precision floating-point value `a'.
212 -------------------------------------------------------------------------------
214 INLINE bits32
extractFloat32Frac( float32 a
)
217 return a
& 0x007FFFFF;
222 -------------------------------------------------------------------------------
223 Returns the exponent bits of the single-precision floating-point value `a'.
224 -------------------------------------------------------------------------------
226 INLINE int16
extractFloat32Exp( float32 a
)
229 return ( a
>>23 ) & 0xFF;
234 -------------------------------------------------------------------------------
235 Returns the sign bit of the single-precision floating-point value `a'.
236 -------------------------------------------------------------------------------
238 INLINE flag
extractFloat32Sign( float32 a
)
246 -------------------------------------------------------------------------------
247 Normalizes the subnormal single-precision floating-point value represented
248 by the denormalized significand `aSig'. The normalized exponent and
249 significand are stored at the locations pointed to by `zExpPtr' and
250 `zSigPtr', respectively.
251 -------------------------------------------------------------------------------
254 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
258 shiftCount
= countLeadingZeros32( aSig
) - 8;
259 *zSigPtr
= aSig
<<shiftCount
;
260 *zExpPtr
= 1 - shiftCount
;
265 -------------------------------------------------------------------------------
266 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
267 single-precision floating-point value, returning the result. After being
268 shifted into the proper positions, the three fields are simply added
269 together to form the result. This means that any integer portion of `zSig'
270 will be added into the exponent. Since a properly normalized significand
271 will have an integer portion equal to 1, the `zExp' input should be 1 less
272 than the desired result exponent whenever `zSig' is a complete, normalized
274 -------------------------------------------------------------------------------
276 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
279 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
284 -------------------------------------------------------------------------------
285 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
286 and significand `zSig', and returns the proper single-precision floating-
287 point value corresponding to the abstract input. Ordinarily, the abstract
288 value is simply rounded and packed into the single-precision format, with
289 the inexact exception raised if the abstract input cannot be represented
290 exactly. However, if the abstract value is too large, the overflow and
291 inexact exceptions are raised and an infinity or maximal finite value is
292 returned. If the abstract value is too small, the input value is rounded to
293 a subnormal number, and the underflow and inexact exceptions are raised if
294 the abstract input cannot be represented exactly as a subnormal single-
295 precision floating-point number.
296 The input significand `zSig' has its binary point between bits 30
297 and 29, which is 7 bits to the left of the usual location. This shifted
298 significand must be normalized or smaller. If `zSig' is not normalized,
299 `zExp' must be 0; in that case, the result returned is a subnormal number,
300 and it must not require rounding. In the usual case that `zSig' is
301 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
302 The handling of underflow and overflow follows the IEC/IEEE Standard for
303 Binary Floating-Point Arithmetic.
304 -------------------------------------------------------------------------------
306 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
309 flag roundNearestEven
;
310 int8 roundIncrement
, roundBits
;
313 roundingMode
= float_rounding_mode
;
314 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
315 roundIncrement
= 0x40;
316 if ( ! roundNearestEven
) {
317 if ( roundingMode
== float_round_to_zero
) {
321 roundIncrement
= 0x7F;
323 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
326 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
330 roundBits
= zSig
& 0x7F;
331 if ( 0xFD <= (bits16
) zExp
) {
333 || ( ( zExp
== 0xFD )
334 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
336 float_raise( float_flag_overflow
| float_flag_inexact
);
337 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
341 ( float_detect_tininess
== float_tininess_before_rounding
)
343 || ( zSig
+ roundIncrement
< 0x80000000 );
344 shift32RightJamming( zSig
, - zExp
, &zSig
);
346 roundBits
= zSig
& 0x7F;
347 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
350 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
351 zSig
= ( zSig
+ roundIncrement
)>>7;
352 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
353 if ( zSig
== 0 ) zExp
= 0;
354 return packFloat32( zSign
, zExp
, zSig
);
359 -------------------------------------------------------------------------------
360 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
361 and significand `zSig', and returns the proper single-precision floating-
362 point value corresponding to the abstract input. This routine is just like
363 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
364 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
365 floating-point exponent.
366 -------------------------------------------------------------------------------
369 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
373 shiftCount
= countLeadingZeros32( zSig
) - 1;
374 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
379 -------------------------------------------------------------------------------
380 Returns the fraction bits of the double-precision floating-point value `a'.
381 -------------------------------------------------------------------------------
383 INLINE bits64
extractFloat64Frac( float64 a
)
386 return FLOAT64_DEMANGLE(a
) & LIT64( 0x000FFFFFFFFFFFFF );
391 -------------------------------------------------------------------------------
392 Returns the exponent bits of the double-precision floating-point value `a'.
393 -------------------------------------------------------------------------------
395 INLINE int16
extractFloat64Exp( float64 a
)
398 return ( FLOAT64_DEMANGLE(a
)>>52 ) & 0x7FF;
403 -------------------------------------------------------------------------------
404 Returns the sign bit of the double-precision floating-point value `a'.
405 -------------------------------------------------------------------------------
407 INLINE flag
extractFloat64Sign( float64 a
)
410 return FLOAT64_DEMANGLE(a
)>>63;
415 -------------------------------------------------------------------------------
416 Normalizes the subnormal double-precision floating-point value represented
417 by the denormalized significand `aSig'. The normalized exponent and
418 significand are stored at the locations pointed to by `zExpPtr' and
419 `zSigPtr', respectively.
420 -------------------------------------------------------------------------------
423 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
427 shiftCount
= countLeadingZeros64( aSig
) - 11;
428 *zSigPtr
= aSig
<<shiftCount
;
429 *zExpPtr
= 1 - shiftCount
;
434 -------------------------------------------------------------------------------
435 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
436 double-precision floating-point value, returning the result. After being
437 shifted into the proper positions, the three fields are simply added
438 together to form the result. This means that any integer portion of `zSig'
439 will be added into the exponent. Since a properly normalized significand
440 will have an integer portion equal to 1, the `zExp' input should be 1 less
441 than the desired result exponent whenever `zSig' is a complete, normalized
443 -------------------------------------------------------------------------------
445 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
448 return FLOAT64_MANGLE( ( ( (bits64
) zSign
)<<63 ) +
449 ( ( (bits64
) zExp
)<<52 ) + zSig
);
454 -------------------------------------------------------------------------------
455 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
456 and significand `zSig', and returns the proper double-precision floating-
457 point value corresponding to the abstract input. Ordinarily, the abstract
458 value is simply rounded and packed into the double-precision format, with
459 the inexact exception raised if the abstract input cannot be represented
460 exactly. However, if the abstract value is too large, the overflow and
461 inexact exceptions are raised and an infinity or maximal finite value is
462 returned. If the abstract value is too small, the input value is rounded to
463 a subnormal number, and the underflow and inexact exceptions are raised if
464 the abstract input cannot be represented exactly as a subnormal double-
465 precision floating-point number.
466 The input significand `zSig' has its binary point between bits 62
467 and 61, which is 10 bits to the left of the usual location. This shifted
468 significand must be normalized or smaller. If `zSig' is not normalized,
469 `zExp' must be 0; in that case, the result returned is a subnormal number,
470 and it must not require rounding. In the usual case that `zSig' is
471 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
472 The handling of underflow and overflow follows the IEC/IEEE Standard for
473 Binary Floating-Point Arithmetic.
474 -------------------------------------------------------------------------------
476 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
479 flag roundNearestEven
;
480 int16 roundIncrement
, roundBits
;
483 roundingMode
= float_rounding_mode
;
484 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
485 roundIncrement
= 0x200;
486 if ( ! roundNearestEven
) {
487 if ( roundingMode
== float_round_to_zero
) {
491 roundIncrement
= 0x3FF;
493 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
496 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
500 roundBits
= zSig
& 0x3FF;
501 if ( 0x7FD <= (bits16
) zExp
) {
502 if ( ( 0x7FD < zExp
)
503 || ( ( zExp
== 0x7FD )
504 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
506 float_raise( float_flag_overflow
| float_flag_inexact
);
507 return FLOAT64_MANGLE(
508 FLOAT64_DEMANGLE(packFloat64( zSign
, 0x7FF, 0 )) -
509 ( roundIncrement
== 0 ));
513 ( float_detect_tininess
== float_tininess_before_rounding
)
515 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
516 shift64RightJamming( zSig
, - zExp
, &zSig
);
518 roundBits
= zSig
& 0x3FF;
519 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
522 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
523 zSig
= ( zSig
+ roundIncrement
)>>10;
524 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
525 if ( zSig
== 0 ) zExp
= 0;
526 return packFloat64( zSign
, zExp
, zSig
);
531 -------------------------------------------------------------------------------
532 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
533 and significand `zSig', and returns the proper double-precision floating-
534 point value corresponding to the abstract input. This routine is just like
535 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
536 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
537 floating-point exponent.
538 -------------------------------------------------------------------------------
541 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
545 shiftCount
= countLeadingZeros64( zSig
) - 1;
546 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
553 -------------------------------------------------------------------------------
554 Returns the fraction bits of the extended double-precision floating-point
556 -------------------------------------------------------------------------------
558 INLINE bits64
extractFloatx80Frac( floatx80 a
)
566 -------------------------------------------------------------------------------
567 Returns the exponent bits of the extended double-precision floating-point
569 -------------------------------------------------------------------------------
571 INLINE int32
extractFloatx80Exp( floatx80 a
)
574 return a
.high
& 0x7FFF;
579 -------------------------------------------------------------------------------
580 Returns the sign bit of the extended double-precision floating-point value
582 -------------------------------------------------------------------------------
584 INLINE flag
extractFloatx80Sign( floatx80 a
)
592 -------------------------------------------------------------------------------
593 Normalizes the subnormal extended double-precision floating-point value
594 represented by the denormalized significand `aSig'. The normalized exponent
595 and significand are stored at the locations pointed to by `zExpPtr' and
596 `zSigPtr', respectively.
597 -------------------------------------------------------------------------------
600 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
604 shiftCount
= countLeadingZeros64( aSig
);
605 *zSigPtr
= aSig
<<shiftCount
;
606 *zExpPtr
= 1 - shiftCount
;
611 -------------------------------------------------------------------------------
612 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
613 extended double-precision floating-point value, returning the result.
614 -------------------------------------------------------------------------------
616 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
621 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
627 -------------------------------------------------------------------------------
628 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 and returns the proper extended double-precision floating-point value
631 corresponding to the abstract input. Ordinarily, the abstract value is
632 rounded and packed into the extended double-precision format, with the
633 inexact exception raised if the abstract input cannot be represented
634 exactly. However, if the abstract value is too large, the overflow and
635 inexact exceptions are raised and an infinity or maximal finite value is
636 returned. If the abstract value is too small, the input value is rounded to
637 a subnormal number, and the underflow and inexact exceptions are raised if
638 the abstract input cannot be represented exactly as a subnormal extended
639 double-precision floating-point number.
640 If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 number of bits as single or double precision, respectively. Otherwise, the
642 result is rounded to the full precision of the extended double-precision
644 The input significand must be normalized or smaller. If the input
645 significand is not normalized, `zExp' must be 0; in that case, the result
646 returned is a subnormal number, and it must not require rounding. The
647 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 Floating-Point Arithmetic.
649 -------------------------------------------------------------------------------
652 roundAndPackFloatx80(
653 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
657 flag roundNearestEven
, increment
, isTiny
;
658 int64 roundIncrement
, roundMask
, roundBits
;
660 roundingMode
= float_rounding_mode
;
661 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
662 if ( roundingPrecision
== 80 ) goto precision80
;
663 if ( roundingPrecision
== 64 ) {
664 roundIncrement
= LIT64( 0x0000000000000400 );
665 roundMask
= LIT64( 0x00000000000007FF );
667 else if ( roundingPrecision
== 32 ) {
668 roundIncrement
= LIT64( 0x0000008000000000 );
669 roundMask
= LIT64( 0x000000FFFFFFFFFF );
674 zSig0
|= ( zSig1
!= 0 );
675 if ( ! roundNearestEven
) {
676 if ( roundingMode
== float_round_to_zero
) {
680 roundIncrement
= roundMask
;
682 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
685 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
689 roundBits
= zSig0
& roundMask
;
690 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
691 if ( ( 0x7FFE < zExp
)
692 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
698 ( float_detect_tininess
== float_tininess_before_rounding
)
700 || ( zSig0
<= zSig0
+ roundIncrement
);
701 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
703 roundBits
= zSig0
& roundMask
;
704 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
705 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
706 zSig0
+= roundIncrement
;
707 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
708 roundIncrement
= roundMask
+ 1;
709 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
710 roundMask
|= roundIncrement
;
712 zSig0
&= ~ roundMask
;
713 return packFloatx80( zSign
, zExp
, zSig0
);
716 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
717 zSig0
+= roundIncrement
;
718 if ( zSig0
< roundIncrement
) {
720 zSig0
= LIT64( 0x8000000000000000 );
722 roundIncrement
= roundMask
+ 1;
723 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
724 roundMask
|= roundIncrement
;
726 zSig0
&= ~ roundMask
;
727 if ( zSig0
== 0 ) zExp
= 0;
728 return packFloatx80( zSign
, zExp
, zSig0
);
730 increment
= ( (sbits64
) zSig1
< 0 );
731 if ( ! roundNearestEven
) {
732 if ( roundingMode
== float_round_to_zero
) {
737 increment
= ( roundingMode
== float_round_down
) && zSig1
;
740 increment
= ( roundingMode
== float_round_up
) && zSig1
;
744 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
745 if ( ( 0x7FFE < zExp
)
746 || ( ( zExp
== 0x7FFE )
747 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
753 float_raise( float_flag_overflow
| float_flag_inexact
);
754 if ( ( roundingMode
== float_round_to_zero
)
755 || ( zSign
&& ( roundingMode
== float_round_up
) )
756 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
758 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
760 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
764 ( float_detect_tininess
== float_tininess_before_rounding
)
767 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
768 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
770 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
771 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
772 if ( roundNearestEven
) {
773 increment
= ( (sbits64
) zSig1
< 0 );
777 increment
= ( roundingMode
== float_round_down
) && zSig1
;
780 increment
= ( roundingMode
== float_round_up
) && zSig1
;
786 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
787 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
789 return packFloatx80( zSign
, zExp
, zSig0
);
792 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
797 zSig0
= LIT64( 0x8000000000000000 );
800 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
804 if ( zSig0
== 0 ) zExp
= 0;
806 return packFloatx80( zSign
, zExp
, zSig0
);
811 -------------------------------------------------------------------------------
812 Takes an abstract floating-point value having sign `zSign', exponent
813 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 and returns the proper extended double-precision floating-point value
815 corresponding to the abstract input. This routine is just like
816 `roundAndPackFloatx80' except that the input significand does not have to be
818 -------------------------------------------------------------------------------
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
832 shiftCount
= countLeadingZeros64( zSig0
);
833 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
836 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
845 -------------------------------------------------------------------------------
846 Returns the least-significant 64 fraction bits of the quadruple-precision
847 floating-point value `a'.
848 -------------------------------------------------------------------------------
850 INLINE bits64
extractFloat128Frac1( float128 a
)
858 -------------------------------------------------------------------------------
859 Returns the most-significant 48 fraction bits of the quadruple-precision
860 floating-point value `a'.
861 -------------------------------------------------------------------------------
863 INLINE bits64
extractFloat128Frac0( float128 a
)
866 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
871 -------------------------------------------------------------------------------
872 Returns the exponent bits of the quadruple-precision floating-point value
874 -------------------------------------------------------------------------------
876 INLINE int32
extractFloat128Exp( float128 a
)
879 return ( a
.high
>>48 ) & 0x7FFF;
884 -------------------------------------------------------------------------------
885 Returns the sign bit of the quadruple-precision floating-point value `a'.
886 -------------------------------------------------------------------------------
888 INLINE flag
extractFloat128Sign( float128 a
)
896 -------------------------------------------------------------------------------
897 Normalizes the subnormal quadruple-precision floating-point value
898 represented by the denormalized significand formed by the concatenation of
899 `aSig0' and `aSig1'. The normalized exponent is stored at the location
900 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
901 significand are stored at the location pointed to by `zSig0Ptr', and the
902 least significant 64 bits of the normalized significand are stored at the
903 location pointed to by `zSig1Ptr'.
904 -------------------------------------------------------------------------------
907 normalizeFloat128Subnormal(
918 shiftCount
= countLeadingZeros64( aSig1
) - 15;
919 if ( shiftCount
< 0 ) {
920 *zSig0Ptr
= aSig1
>>( - shiftCount
);
921 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
924 *zSig0Ptr
= aSig1
<<shiftCount
;
927 *zExpPtr
= - shiftCount
- 63;
930 shiftCount
= countLeadingZeros64( aSig0
) - 15;
931 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
932 *zExpPtr
= 1 - shiftCount
;
938 -------------------------------------------------------------------------------
939 Packs the sign `zSign', the exponent `zExp', and the significand formed
940 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
941 floating-point value, returning the result. After being shifted into the
942 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
943 added together to form the most significant 32 bits of the result. This
944 means that any integer portion of `zSig0' will be added into the exponent.
945 Since a properly normalized significand will have an integer portion equal
946 to 1, the `zExp' input should be 1 less than the desired result exponent
947 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
949 -------------------------------------------------------------------------------
952 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
957 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
963 -------------------------------------------------------------------------------
964 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
965 and extended significand formed by the concatenation of `zSig0', `zSig1',
966 and `zSig2', and returns the proper quadruple-precision floating-point value
967 corresponding to the abstract input. Ordinarily, the abstract value is
968 simply rounded and packed into the quadruple-precision format, with the
969 inexact exception raised if the abstract input cannot be represented
970 exactly. However, if the abstract value is too large, the overflow and
971 inexact exceptions are raised and an infinity or maximal finite value is
972 returned. If the abstract value is too small, the input value is rounded to
973 a subnormal number, and the underflow and inexact exceptions are raised if
974 the abstract input cannot be represented exactly as a subnormal quadruple-
975 precision floating-point number.
976 The input significand must be normalized or smaller. If the input
977 significand is not normalized, `zExp' must be 0; in that case, the result
978 returned is a subnormal number, and it must not require rounding. In the
979 usual case that the input significand is normalized, `zExp' must be 1 less
980 than the ``true'' floating-point exponent. The handling of underflow and
981 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
982 -------------------------------------------------------------------------------
985 roundAndPackFloat128(
986 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2
)
989 flag roundNearestEven
, increment
, isTiny
;
991 roundingMode
= float_rounding_mode
;
992 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
993 increment
= ( (sbits64
) zSig2
< 0 );
994 if ( ! roundNearestEven
) {
995 if ( roundingMode
== float_round_to_zero
) {
1000 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1003 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1007 if ( 0x7FFD <= (bits32
) zExp
) {
1008 if ( ( 0x7FFD < zExp
)
1009 || ( ( zExp
== 0x7FFD )
1011 LIT64( 0x0001FFFFFFFFFFFF ),
1012 LIT64( 0xFFFFFFFFFFFFFFFF ),
1019 float_raise( float_flag_overflow
| float_flag_inexact
);
1020 if ( ( roundingMode
== float_round_to_zero
)
1021 || ( zSign
&& ( roundingMode
== float_round_up
) )
1022 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1028 LIT64( 0x0000FFFFFFFFFFFF ),
1029 LIT64( 0xFFFFFFFFFFFFFFFF )
1032 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1036 ( float_detect_tininess
== float_tininess_before_rounding
)
1042 LIT64( 0x0001FFFFFFFFFFFF ),
1043 LIT64( 0xFFFFFFFFFFFFFFFF )
1045 shift128ExtraRightJamming(
1046 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1048 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
1049 if ( roundNearestEven
) {
1050 increment
= ( (sbits64
) zSig2
< 0 );
1054 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1057 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1062 if ( zSig2
) float_exception_flags
|= float_flag_inexact
;
1064 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1065 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1068 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1070 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1075 -------------------------------------------------------------------------------
1076 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077 and significand formed by the concatenation of `zSig0' and `zSig1', and
1078 returns the proper quadruple-precision floating-point value corresponding
1079 to the abstract input. This routine is just like `roundAndPackFloat128'
1080 except that the input significand has fewer bits and does not have to be
1081 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1083 -------------------------------------------------------------------------------
1086 normalizeRoundAndPackFloat128(
1087 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
1097 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1098 if ( 0 <= shiftCount
) {
1100 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1103 shift128ExtraRightJamming(
1104 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1107 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1114 -------------------------------------------------------------------------------
1115 Returns the result of converting the 32-bit two's complement integer `a'
1116 to the single-precision floating-point format. The conversion is performed
1117 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1118 -------------------------------------------------------------------------------
1120 float32
int32_to_float32( int32 a
)
1124 if ( a
== 0 ) return 0;
1125 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1127 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
1132 -------------------------------------------------------------------------------
1133 Returns the result of converting the 32-bit two's complement integer `a'
1134 to the double-precision floating-point format. The conversion is performed
1135 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1136 -------------------------------------------------------------------------------
1138 float64
int32_to_float64( int32 a
)
1145 if ( a
== 0 ) return 0;
1147 absA
= zSign
? - a
: a
;
1148 shiftCount
= countLeadingZeros32( absA
) + 21;
1150 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1157 -------------------------------------------------------------------------------
1158 Returns the result of converting the 32-bit two's complement integer `a'
1159 to the extended double-precision floating-point format. The conversion
1160 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1162 -------------------------------------------------------------------------------
1164 floatx80
int32_to_floatx80( int32 a
)
1171 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1173 absA
= zSign
? - a
: a
;
1174 shiftCount
= countLeadingZeros32( absA
) + 32;
1176 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1185 -------------------------------------------------------------------------------
1186 Returns the result of converting the 32-bit two's complement integer `a' to
1187 the quadruple-precision floating-point format. The conversion is performed
1188 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1189 -------------------------------------------------------------------------------
1191 float128
int32_to_float128( int32 a
)
1198 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1200 absA
= zSign
? - a
: a
;
1201 shiftCount
= countLeadingZeros32( absA
) + 17;
1203 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1209 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1211 -------------------------------------------------------------------------------
1212 Returns the result of converting the 64-bit two's complement integer `a'
1213 to the single-precision floating-point format. The conversion is performed
1214 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1215 -------------------------------------------------------------------------------
1217 float32
int64_to_float32( int64 a
)
1223 if ( a
== 0 ) return 0;
1225 absA
= zSign
? - a
: a
;
1226 shiftCount
= countLeadingZeros64( absA
) - 40;
1227 if ( 0 <= shiftCount
) {
1228 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1232 if ( shiftCount
< 0 ) {
1233 shift64RightJamming( absA
, - shiftCount
, &absA
);
1236 absA
<<= shiftCount
;
1238 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA
);
1244 -------------------------------------------------------------------------------
1245 Returns the result of converting the 64-bit two's complement integer `a'
1246 to the double-precision floating-point format. The conversion is performed
1247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1248 -------------------------------------------------------------------------------
1250 float64
int64_to_float64( int64 a
)
1254 if ( a
== 0 ) return 0;
1255 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1256 return packFloat64( 1, 0x43E, 0 );
1259 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a
);
1266 -------------------------------------------------------------------------------
1267 Returns the result of converting the 64-bit two's complement integer `a'
1268 to the extended double-precision floating-point format. The conversion
1269 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1271 -------------------------------------------------------------------------------
1273 floatx80
int64_to_floatx80( int64 a
)
1279 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1281 absA
= zSign
? - a
: a
;
1282 shiftCount
= countLeadingZeros64( absA
);
1283 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1289 #endif /* !SOFTFLOAT_FOR_GCC */
1294 -------------------------------------------------------------------------------
1295 Returns the result of converting the 64-bit two's complement integer `a' to
1296 the quadruple-precision floating-point format. The conversion is performed
1297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1298 -------------------------------------------------------------------------------
1300 float128
int64_to_float128( int64 a
)
1306 bits64 zSig0
, zSig1
;
1308 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1310 absA
= zSign
? - a
: a
;
1311 shiftCount
= countLeadingZeros64( absA
) + 49;
1312 zExp
= 0x406E - shiftCount
;
1313 if ( 64 <= shiftCount
) {
1322 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1323 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1329 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1331 -------------------------------------------------------------------------------
1332 Returns the result of converting the single-precision floating-point value
1333 `a' to the 32-bit two's complement integer format. The conversion is
1334 performed according to the IEC/IEEE Standard for Binary Floating-Point
1335 Arithmetic---which means in particular that the conversion is rounded
1336 according to the current rounding mode. If `a' is a NaN, the largest
1337 positive integer is returned. Otherwise, if the conversion overflows, the
1338 largest integer with the same sign as `a' is returned.
1339 -------------------------------------------------------------------------------
1341 int32
float32_to_int32( float32 a
)
1344 int16 aExp
, shiftCount
;
1348 aSig
= extractFloat32Frac( a
);
1349 aExp
= extractFloat32Exp( a
);
1350 aSign
= extractFloat32Sign( a
);
1351 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1352 if ( aExp
) aSig
|= 0x00800000;
1353 shiftCount
= 0xAF - aExp
;
1356 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1357 return roundAndPackInt32( aSign
, aSig64
);
1360 #endif /* !SOFTFLOAT_FOR_GCC */
1363 -------------------------------------------------------------------------------
1364 Returns the result of converting the single-precision floating-point value
1365 `a' to the 32-bit two's complement integer format. The conversion is
1366 performed according to the IEC/IEEE Standard for Binary Floating-Point
1367 Arithmetic, except that the conversion is always rounded toward zero.
1368 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1369 the conversion overflows, the largest integer with the same sign as `a' is
1371 -------------------------------------------------------------------------------
1373 int32
float32_to_int32_round_to_zero( float32 a
)
1376 int16 aExp
, shiftCount
;
1380 aSig
= extractFloat32Frac( a
);
1381 aExp
= extractFloat32Exp( a
);
1382 aSign
= extractFloat32Sign( a
);
1383 shiftCount
= aExp
- 0x9E;
1384 if ( 0 <= shiftCount
) {
1385 if ( a
!= 0xCF000000 ) {
1386 float_raise( float_flag_invalid
);
1387 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1389 return (sbits32
) 0x80000000;
1391 else if ( aExp
<= 0x7E ) {
1392 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
1395 aSig
= ( aSig
| 0x00800000 )<<8;
1396 z
= aSig
>>( - shiftCount
);
1397 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1398 float_exception_flags
|= float_flag_inexact
;
1400 if ( aSign
) z
= - z
;
1405 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1407 -------------------------------------------------------------------------------
1408 Returns the result of converting the single-precision floating-point value
1409 `a' to the 64-bit two's complement integer format. The conversion is
1410 performed according to the IEC/IEEE Standard for Binary Floating-Point
1411 Arithmetic---which means in particular that the conversion is rounded
1412 according to the current rounding mode. If `a' is a NaN, the largest
1413 positive integer is returned. Otherwise, if the conversion overflows, the
1414 largest integer with the same sign as `a' is returned.
1415 -------------------------------------------------------------------------------
1417 int64
float32_to_int64( float32 a
)
1420 int16 aExp
, shiftCount
;
1422 bits64 aSig64
, aSigExtra
;
1424 aSig
= extractFloat32Frac( a
);
1425 aExp
= extractFloat32Exp( a
);
1426 aSign
= extractFloat32Sign( a
);
1427 shiftCount
= 0xBE - aExp
;
1428 if ( shiftCount
< 0 ) {
1429 float_raise( float_flag_invalid
);
1430 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1431 return LIT64( 0x7FFFFFFFFFFFFFFF );
1433 return (sbits64
) LIT64( 0x8000000000000000 );
1435 if ( aExp
) aSig
|= 0x00800000;
1438 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1439 return roundAndPackInt64( aSign
, aSig64
, aSigExtra
);
1444 -------------------------------------------------------------------------------
1445 Returns the result of converting the single-precision floating-point value
1446 `a' to the 64-bit two's complement integer format. The conversion is
1447 performed according to the IEC/IEEE Standard for Binary Floating-Point
1448 Arithmetic, except that the conversion is always rounded toward zero. If
1449 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1450 conversion overflows, the largest integer with the same sign as `a' is
1452 -------------------------------------------------------------------------------
1454 int64
float32_to_int64_round_to_zero( float32 a
)
1457 int16 aExp
, shiftCount
;
1462 aSig
= extractFloat32Frac( a
);
1463 aExp
= extractFloat32Exp( a
);
1464 aSign
= extractFloat32Sign( a
);
1465 shiftCount
= aExp
- 0xBE;
1466 if ( 0 <= shiftCount
) {
1467 if ( a
!= 0xDF000000 ) {
1468 float_raise( float_flag_invalid
);
1469 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1470 return LIT64( 0x7FFFFFFFFFFFFFFF );
1473 return (sbits64
) LIT64( 0x8000000000000000 );
1475 else if ( aExp
<= 0x7E ) {
1476 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
1479 aSig64
= aSig
| 0x00800000;
1481 z
= aSig64
>>( - shiftCount
);
1482 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1483 float_exception_flags
|= float_flag_inexact
;
1485 if ( aSign
) z
= - z
;
1489 #endif /* !SOFTFLOAT_FOR_GCC */
1492 -------------------------------------------------------------------------------
1493 Returns the result of converting the single-precision floating-point value
1494 `a' to the double-precision floating-point format. The conversion is
1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
1497 -------------------------------------------------------------------------------
1499 float64
float32_to_float64( float32 a
)
1505 aSig
= extractFloat32Frac( a
);
1506 aExp
= extractFloat32Exp( a
);
1507 aSign
= extractFloat32Sign( a
);
1508 if ( aExp
== 0xFF ) {
1509 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
1510 return packFloat64( aSign
, 0x7FF, 0 );
1513 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1514 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1517 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1524 -------------------------------------------------------------------------------
1525 Returns the result of converting the single-precision floating-point value
1526 `a' to the extended double-precision floating-point format. The conversion
1527 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1529 -------------------------------------------------------------------------------
1531 floatx80
float32_to_floatx80( float32 a
)
1537 aSig
= extractFloat32Frac( a
);
1538 aExp
= extractFloat32Exp( a
);
1539 aSign
= extractFloat32Sign( a
);
1540 if ( aExp
== 0xFF ) {
1541 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
1542 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1545 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1546 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1549 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1558 -------------------------------------------------------------------------------
1559 Returns the result of converting the single-precision floating-point value
1560 `a' to the double-precision floating-point format. The conversion is
1561 performed according to the IEC/IEEE Standard for Binary Floating-Point
1563 -------------------------------------------------------------------------------
1565 float128
float32_to_float128( float32 a
)
1571 aSig
= extractFloat32Frac( a
);
1572 aExp
= extractFloat32Exp( a
);
1573 aSign
= extractFloat32Sign( a
);
1574 if ( aExp
== 0xFF ) {
1575 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a
) );
1576 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1579 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1580 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1583 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1589 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1591 -------------------------------------------------------------------------------
1592 Rounds the single-precision floating-point value `a' to an integer, and
1593 returns the result as a single-precision floating-point value. The
1594 operation is performed according to the IEC/IEEE Standard for Binary
1595 Floating-Point Arithmetic.
1596 -------------------------------------------------------------------------------
1598 float32
float32_round_to_int( float32 a
)
1602 bits32 lastBitMask
, roundBitsMask
;
1606 aExp
= extractFloat32Exp( a
);
1607 if ( 0x96 <= aExp
) {
1608 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1609 return propagateFloat32NaN( a
, a
);
1613 if ( aExp
<= 0x7E ) {
1614 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
1615 float_exception_flags
|= float_flag_inexact
;
1616 aSign
= extractFloat32Sign( a
);
1617 switch ( float_rounding_mode
) {
1618 case float_round_nearest_even
:
1619 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1620 return packFloat32( aSign
, 0x7F, 0 );
1623 case float_round_to_zero
:
1625 case float_round_down
:
1626 return aSign
? 0xBF800000 : 0;
1627 case float_round_up
:
1628 return aSign
? 0x80000000 : 0x3F800000;
1630 return packFloat32( aSign
, 0, 0 );
1633 lastBitMask
<<= 0x96 - aExp
;
1634 roundBitsMask
= lastBitMask
- 1;
1636 roundingMode
= float_rounding_mode
;
1637 if ( roundingMode
== float_round_nearest_even
) {
1638 z
+= lastBitMask
>>1;
1639 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1641 else if ( roundingMode
!= float_round_to_zero
) {
1642 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1646 z
&= ~ roundBitsMask
;
1647 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1651 #endif /* !SOFTFLOAT_FOR_GCC */
1654 -------------------------------------------------------------------------------
1655 Returns the result of adding the absolute values of the single-precision
1656 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1657 before being returned. `zSign' is ignored if the result is a NaN.
1658 The addition is performed according to the IEC/IEEE Standard for Binary
1659 Floating-Point Arithmetic.
1660 -------------------------------------------------------------------------------
1662 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1664 int16 aExp
, bExp
, zExp
;
1665 bits32 aSig
, bSig
, zSig
;
1668 aSig
= extractFloat32Frac( a
);
1669 aExp
= extractFloat32Exp( a
);
1670 bSig
= extractFloat32Frac( b
);
1671 bExp
= extractFloat32Exp( b
);
1672 expDiff
= aExp
- bExp
;
1675 if ( 0 < expDiff
) {
1676 if ( aExp
== 0xFF ) {
1677 if ( aSig
) return propagateFloat32NaN( a
, b
);
1686 shift32RightJamming( bSig
, expDiff
, &bSig
);
1689 else if ( expDiff
< 0 ) {
1690 if ( bExp
== 0xFF ) {
1691 if ( bSig
) return propagateFloat32NaN( a
, b
);
1692 return packFloat32( zSign
, 0xFF, 0 );
1700 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1704 if ( aExp
== 0xFF ) {
1705 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1708 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1709 zSig
= 0x40000000 + aSig
+ bSig
;
1714 zSig
= ( aSig
+ bSig
)<<1;
1716 if ( (sbits32
) zSig
< 0 ) {
1721 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1726 -------------------------------------------------------------------------------
1727 Returns the result of subtracting the absolute values of the single-
1728 precision floating-point values `a' and `b'. If `zSign' is 1, the
1729 difference is negated before being returned. `zSign' is ignored if the
1730 result is a NaN. The subtraction is performed according to the IEC/IEEE
1731 Standard for Binary Floating-Point Arithmetic.
1732 -------------------------------------------------------------------------------
1734 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1736 int16 aExp
, bExp
, zExp
;
1737 bits32 aSig
, bSig
, zSig
;
1740 aSig
= extractFloat32Frac( a
);
1741 aExp
= extractFloat32Exp( a
);
1742 bSig
= extractFloat32Frac( b
);
1743 bExp
= extractFloat32Exp( b
);
1744 expDiff
= aExp
- bExp
;
1747 if ( 0 < expDiff
) goto aExpBigger
;
1748 if ( expDiff
< 0 ) goto bExpBigger
;
1749 if ( aExp
== 0xFF ) {
1750 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1751 float_raise( float_flag_invalid
);
1752 return float32_default_nan
;
1758 if ( bSig
< aSig
) goto aBigger
;
1759 if ( aSig
< bSig
) goto bBigger
;
1760 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
1762 if ( bExp
== 0xFF ) {
1763 if ( bSig
) return propagateFloat32NaN( a
, b
);
1764 return packFloat32( zSign
^ 1, 0xFF, 0 );
1772 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1778 goto normalizeRoundAndPack
;
1780 if ( aExp
== 0xFF ) {
1781 if ( aSig
) return propagateFloat32NaN( a
, b
);
1790 shift32RightJamming( bSig
, expDiff
, &bSig
);
1795 normalizeRoundAndPack
:
1797 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1802 -------------------------------------------------------------------------------
1803 Returns the result of adding the single-precision floating-point values `a'
1804 and `b'. The operation is performed according to the IEC/IEEE Standard for
1805 Binary Floating-Point Arithmetic.
1806 -------------------------------------------------------------------------------
1808 float32
float32_add( float32 a
, float32 b
)
1812 aSign
= extractFloat32Sign( a
);
1813 bSign
= extractFloat32Sign( b
);
1814 if ( aSign
== bSign
) {
1815 return addFloat32Sigs( a
, b
, aSign
);
1818 return subFloat32Sigs( a
, b
, aSign
);
1824 -------------------------------------------------------------------------------
1825 Returns the result of subtracting the single-precision floating-point values
1826 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1827 for Binary Floating-Point Arithmetic.
1828 -------------------------------------------------------------------------------
1830 float32
float32_sub( float32 a
, float32 b
)
1834 aSign
= extractFloat32Sign( a
);
1835 bSign
= extractFloat32Sign( b
);
1836 if ( aSign
== bSign
) {
1837 return subFloat32Sigs( a
, b
, aSign
);
1840 return addFloat32Sigs( a
, b
, aSign
);
1846 -------------------------------------------------------------------------------
1847 Returns the result of multiplying the single-precision floating-point values
1848 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1849 for Binary Floating-Point Arithmetic.
1850 -------------------------------------------------------------------------------
1852 float32
float32_mul( float32 a
, float32 b
)
1854 flag aSign
, bSign
, zSign
;
1855 int16 aExp
, bExp
, zExp
;
1860 aSig
= extractFloat32Frac( a
);
1861 aExp
= extractFloat32Exp( a
);
1862 aSign
= extractFloat32Sign( a
);
1863 bSig
= extractFloat32Frac( b
);
1864 bExp
= extractFloat32Exp( b
);
1865 bSign
= extractFloat32Sign( b
);
1866 zSign
= aSign
^ bSign
;
1867 if ( aExp
== 0xFF ) {
1868 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1869 return propagateFloat32NaN( a
, b
);
1871 if ( ( bExp
| bSig
) == 0 ) {
1872 float_raise( float_flag_invalid
);
1873 return float32_default_nan
;
1875 return packFloat32( zSign
, 0xFF, 0 );
1877 if ( bExp
== 0xFF ) {
1878 if ( bSig
) return propagateFloat32NaN( a
, b
);
1879 if ( ( aExp
| aSig
) == 0 ) {
1880 float_raise( float_flag_invalid
);
1881 return float32_default_nan
;
1883 return packFloat32( zSign
, 0xFF, 0 );
1886 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1887 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1890 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1891 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1893 zExp
= aExp
+ bExp
- 0x7F;
1894 aSig
= ( aSig
| 0x00800000 )<<7;
1895 bSig
= ( bSig
| 0x00800000 )<<8;
1896 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1898 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1902 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1907 -------------------------------------------------------------------------------
1908 Returns the result of dividing the single-precision floating-point value `a'
1909 by the corresponding value `b'. The operation is performed according to the
1910 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1911 -------------------------------------------------------------------------------
1913 float32
float32_div( float32 a
, float32 b
)
1915 flag aSign
, bSign
, zSign
;
1916 int16 aExp
, bExp
, zExp
;
1917 bits32 aSig
, bSig
, zSig
;
1919 aSig
= extractFloat32Frac( a
);
1920 aExp
= extractFloat32Exp( a
);
1921 aSign
= extractFloat32Sign( a
);
1922 bSig
= extractFloat32Frac( b
);
1923 bExp
= extractFloat32Exp( b
);
1924 bSign
= extractFloat32Sign( b
);
1925 zSign
= aSign
^ bSign
;
1926 if ( aExp
== 0xFF ) {
1927 if ( aSig
) return propagateFloat32NaN( a
, b
);
1928 if ( bExp
== 0xFF ) {
1929 if ( bSig
) return propagateFloat32NaN( a
, b
);
1930 float_raise( float_flag_invalid
);
1931 return float32_default_nan
;
1933 return packFloat32( zSign
, 0xFF, 0 );
1935 if ( bExp
== 0xFF ) {
1936 if ( bSig
) return propagateFloat32NaN( a
, b
);
1937 return packFloat32( zSign
, 0, 0 );
1941 if ( ( aExp
| aSig
) == 0 ) {
1942 float_raise( float_flag_invalid
);
1943 return float32_default_nan
;
1945 float_raise( float_flag_divbyzero
);
1946 return packFloat32( zSign
, 0xFF, 0 );
1948 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1951 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1952 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1954 zExp
= aExp
- bExp
+ 0x7D;
1955 aSig
= ( aSig
| 0x00800000 )<<7;
1956 bSig
= ( bSig
| 0x00800000 )<<8;
1957 if ( bSig
<= ( aSig
+ aSig
) ) {
1961 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1962 if ( ( zSig
& 0x3F ) == 0 ) {
1963 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1965 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1969 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1971 -------------------------------------------------------------------------------
1972 Returns the remainder of the single-precision floating-point value `a'
1973 with respect to the corresponding value `b'. The operation is performed
1974 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1975 -------------------------------------------------------------------------------
1977 float32
float32_rem( float32 a
, float32 b
)
1979 flag aSign
, bSign
, zSign
;
1980 int16 aExp
, bExp
, expDiff
;
1983 bits64 aSig64
, bSig64
, q64
;
1984 bits32 alternateASig
;
1987 aSig
= extractFloat32Frac( a
);
1988 aExp
= extractFloat32Exp( a
);
1989 aSign
= extractFloat32Sign( a
);
1990 bSig
= extractFloat32Frac( b
);
1991 bExp
= extractFloat32Exp( b
);
1992 bSign
= extractFloat32Sign( b
);
1993 if ( aExp
== 0xFF ) {
1994 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1995 return propagateFloat32NaN( a
, b
);
1997 float_raise( float_flag_invalid
);
1998 return float32_default_nan
;
2000 if ( bExp
== 0xFF ) {
2001 if ( bSig
) return propagateFloat32NaN( a
, b
);
2006 float_raise( float_flag_invalid
);
2007 return float32_default_nan
;
2009 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2012 if ( aSig
== 0 ) return a
;
2013 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2015 expDiff
= aExp
- bExp
;
2018 if ( expDiff
< 32 ) {
2021 if ( expDiff
< 0 ) {
2022 if ( expDiff
< -1 ) return a
;
2025 q
= ( bSig
<= aSig
);
2026 if ( q
) aSig
-= bSig
;
2027 if ( 0 < expDiff
) {
2028 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2031 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2039 if ( bSig
<= aSig
) aSig
-= bSig
;
2040 aSig64
= ( (bits64
) aSig
)<<40;
2041 bSig64
= ( (bits64
) bSig
)<<40;
2043 while ( 0 < expDiff
) {
2044 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2045 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2046 aSig64
= - ( ( bSig
* q64
)<<38 );
2050 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2051 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2052 q
= q64
>>( 64 - expDiff
);
2054 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2057 alternateASig
= aSig
;
2060 } while ( 0 <= (sbits32
) aSig
);
2061 sigMean
= aSig
+ alternateASig
;
2062 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2063 aSig
= alternateASig
;
2065 zSign
= ( (sbits32
) aSig
< 0 );
2066 if ( zSign
) aSig
= - aSig
;
2067 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
2070 #endif /* !SOFTFLOAT_FOR_GCC */
2072 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2074 -------------------------------------------------------------------------------
2075 Returns the square root of the single-precision floating-point value `a'.
2076 The operation is performed according to the IEC/IEEE Standard for Binary
2077 Floating-Point Arithmetic.
2078 -------------------------------------------------------------------------------
2080 float32
float32_sqrt( float32 a
)
2087 aSig
= extractFloat32Frac( a
);
2088 aExp
= extractFloat32Exp( a
);
2089 aSign
= extractFloat32Sign( a
);
2090 if ( aExp
== 0xFF ) {
2091 if ( aSig
) return propagateFloat32NaN( a
, 0 );
2092 if ( ! aSign
) return a
;
2093 float_raise( float_flag_invalid
);
2094 return float32_default_nan
;
2097 if ( ( aExp
| aSig
) == 0 ) return a
;
2098 float_raise( float_flag_invalid
);
2099 return float32_default_nan
;
2102 if ( aSig
== 0 ) return 0;
2103 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2105 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2106 aSig
= ( aSig
| 0x00800000 )<<8;
2107 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2108 if ( ( zSig
& 0x7F ) <= 5 ) {
2114 term
= ( (bits64
) zSig
) * zSig
;
2115 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2116 while ( (sbits64
) rem
< 0 ) {
2118 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2120 zSig
|= ( rem
!= 0 );
2122 shift32RightJamming( zSig
, 1, &zSig
);
2124 return roundAndPackFloat32( 0, zExp
, zSig
);
2127 #endif /* !SOFTFLOAT_FOR_GCC */
2130 -------------------------------------------------------------------------------
2131 Returns 1 if the single-precision floating-point value `a' is equal to
2132 the corresponding value `b', and 0 otherwise. The comparison is performed
2133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2134 -------------------------------------------------------------------------------
2136 flag
float32_eq( float32 a
, float32 b
)
2139 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2140 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2142 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2143 float_raise( float_flag_invalid
);
2147 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2152 -------------------------------------------------------------------------------
2153 Returns 1 if the single-precision floating-point value `a' is less than
2154 or equal to the corresponding value `b', and 0 otherwise. The comparison
2155 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2157 -------------------------------------------------------------------------------
2159 flag
float32_le( float32 a
, float32 b
)
2163 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2164 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2166 float_raise( float_flag_invalid
);
2169 aSign
= extractFloat32Sign( a
);
2170 bSign
= extractFloat32Sign( b
);
2171 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2172 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2177 -------------------------------------------------------------------------------
2178 Returns 1 if the single-precision floating-point value `a' is less than
2179 the corresponding value `b', and 0 otherwise. The comparison is performed
2180 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2181 -------------------------------------------------------------------------------
2183 flag
float32_lt( float32 a
, float32 b
)
2187 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2188 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2190 float_raise( float_flag_invalid
);
2193 aSign
= extractFloat32Sign( a
);
2194 bSign
= extractFloat32Sign( b
);
2195 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2196 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2200 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2202 -------------------------------------------------------------------------------
2203 Returns 1 if the single-precision floating-point value `a' is equal to
2204 the corresponding value `b', and 0 otherwise. The invalid exception is
2205 raised if either operand is a NaN. Otherwise, the comparison is performed
2206 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2207 -------------------------------------------------------------------------------
2209 flag
float32_eq_signaling( float32 a
, float32 b
)
2212 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2213 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2215 float_raise( float_flag_invalid
);
2218 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2223 -------------------------------------------------------------------------------
2224 Returns 1 if the single-precision floating-point value `a' is less than or
2225 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2226 cause an exception. Otherwise, the comparison is performed according to the
2227 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2228 -------------------------------------------------------------------------------
2230 flag
float32_le_quiet( float32 a
, float32 b
)
2234 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2235 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2237 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2238 float_raise( float_flag_invalid
);
2242 aSign
= extractFloat32Sign( a
);
2243 bSign
= extractFloat32Sign( b
);
2244 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2245 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is less than
2252 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2253 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2254 Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2257 flag
float32_lt_quiet( float32 a
, float32 b
)
2261 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2262 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2264 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2265 float_raise( float_flag_invalid
);
2269 aSign
= extractFloat32Sign( a
);
2270 bSign
= extractFloat32Sign( b
);
2271 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2272 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2275 #endif /* !SOFTFLOAT_FOR_GCC */
2277 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2279 -------------------------------------------------------------------------------
2280 Returns the result of converting the double-precision floating-point value
2281 `a' to the 32-bit two's complement integer format. The conversion is
2282 performed according to the IEC/IEEE Standard for Binary Floating-Point
2283 Arithmetic---which means in particular that the conversion is rounded
2284 according to the current rounding mode. If `a' is a NaN, the largest
2285 positive integer is returned. Otherwise, if the conversion overflows, the
2286 largest integer with the same sign as `a' is returned.
2287 -------------------------------------------------------------------------------
2289 int32
float64_to_int32( float64 a
)
2292 int16 aExp
, shiftCount
;
2295 aSig
= extractFloat64Frac( a
);
2296 aExp
= extractFloat64Exp( a
);
2297 aSign
= extractFloat64Sign( a
);
2298 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2299 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2300 shiftCount
= 0x42C - aExp
;
2301 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2302 return roundAndPackInt32( aSign
, aSig
);
2305 #endif /* !SOFTFLOAT_FOR_GCC */
2308 -------------------------------------------------------------------------------
2309 Returns the result of converting the double-precision floating-point value
2310 `a' to the 32-bit two's complement integer format. The conversion is
2311 performed according to the IEC/IEEE Standard for Binary Floating-Point
2312 Arithmetic, except that the conversion is always rounded toward zero.
2313 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2314 the conversion overflows, the largest integer with the same sign as `a' is
2316 -------------------------------------------------------------------------------
2318 int32
float64_to_int32_round_to_zero( float64 a
)
2321 int16 aExp
, shiftCount
;
2322 bits64 aSig
, savedASig
;
2325 aSig
= extractFloat64Frac( a
);
2326 aExp
= extractFloat64Exp( a
);
2327 aSign
= extractFloat64Sign( a
);
2328 if ( 0x41E < aExp
) {
2329 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2332 else if ( aExp
< 0x3FF ) {
2333 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2336 aSig
|= LIT64( 0x0010000000000000 );
2337 shiftCount
= 0x433 - aExp
;
2339 aSig
>>= shiftCount
;
2341 if ( aSign
) z
= - z
;
2342 if ( ( z
< 0 ) ^ aSign
) {
2344 float_raise( float_flag_invalid
);
2345 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2347 if ( ( aSig
<<shiftCount
) != savedASig
) {
2348 float_exception_flags
|= float_flag_inexact
;
2354 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 64-bit two's complement integer format. The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic---which means in particular that the conversion is rounded
2361 according to the current rounding mode. If `a' is a NaN, the largest
2362 positive integer is returned. Otherwise, if the conversion overflows, the
2363 largest integer with the same sign as `a' is returned.
2364 -------------------------------------------------------------------------------
2366 int64
float64_to_int64( float64 a
)
2369 int16 aExp
, shiftCount
;
2370 bits64 aSig
, aSigExtra
;
2372 aSig
= extractFloat64Frac( a
);
2373 aExp
= extractFloat64Exp( a
);
2374 aSign
= extractFloat64Sign( a
);
2375 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2376 shiftCount
= 0x433 - aExp
;
2377 if ( shiftCount
<= 0 ) {
2378 if ( 0x43E < aExp
) {
2379 float_raise( float_flag_invalid
);
2381 || ( ( aExp
== 0x7FF )
2382 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2384 return LIT64( 0x7FFFFFFFFFFFFFFF );
2386 return (sbits64
) LIT64( 0x8000000000000000 );
2389 aSig
<<= - shiftCount
;
2392 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2394 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
2399 -------------------------------------------------------------------------------
2400 Returns the result of converting the double-precision floating-point value
2401 `a' to the 64-bit two's complement integer format. The conversion is
2402 performed according to the IEC/IEEE Standard for Binary Floating-Point
2403 Arithmetic, except that the conversion is always rounded toward zero.
2404 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2405 the conversion overflows, the largest integer with the same sign as `a' is
2407 -------------------------------------------------------------------------------
2409 int64
float64_to_int64_round_to_zero( float64 a
)
2412 int16 aExp
, shiftCount
;
2416 aSig
= extractFloat64Frac( a
);
2417 aExp
= extractFloat64Exp( a
);
2418 aSign
= extractFloat64Sign( a
);
2419 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2420 shiftCount
= aExp
- 0x433;
2421 if ( 0 <= shiftCount
) {
2422 if ( 0x43E <= aExp
) {
2423 if ( a
!= LIT64( 0xC3E0000000000000 ) ) {
2424 float_raise( float_flag_invalid
);
2426 || ( ( aExp
== 0x7FF )
2427 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2429 return LIT64( 0x7FFFFFFFFFFFFFFF );
2432 return (sbits64
) LIT64( 0x8000000000000000 );
2434 z
= aSig
<<shiftCount
;
2437 if ( aExp
< 0x3FE ) {
2438 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
2441 z
= aSig
>>( - shiftCount
);
2442 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2443 float_exception_flags
|= float_flag_inexact
;
2446 if ( aSign
) z
= - z
;
2450 #endif /* !SOFTFLOAT_FOR_GCC */
2453 -------------------------------------------------------------------------------
2454 Returns the result of converting the double-precision floating-point value
2455 `a' to the single-precision floating-point format. The conversion is
2456 performed according to the IEC/IEEE Standard for Binary Floating-Point
2458 -------------------------------------------------------------------------------
2460 float32
float64_to_float32( float64 a
)
2467 aSig
= extractFloat64Frac( a
);
2468 aExp
= extractFloat64Exp( a
);
2469 aSign
= extractFloat64Sign( a
);
2470 if ( aExp
== 0x7FF ) {
2471 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
2472 return packFloat32( aSign
, 0xFF, 0 );
2474 shift64RightJamming( aSig
, 22, &aSig
);
2476 if ( aExp
|| zSig
) {
2480 return roundAndPackFloat32( aSign
, aExp
, zSig
);
2487 -------------------------------------------------------------------------------
2488 Returns the result of converting the double-precision floating-point value
2489 `a' to the extended double-precision floating-point format. The conversion
2490 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2492 -------------------------------------------------------------------------------
2494 floatx80
float64_to_floatx80( float64 a
)
2500 aSig
= extractFloat64Frac( a
);
2501 aExp
= extractFloat64Exp( a
);
2502 aSign
= extractFloat64Sign( a
);
2503 if ( aExp
== 0x7FF ) {
2504 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
2505 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2508 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2509 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2513 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2522 -------------------------------------------------------------------------------
2523 Returns the result of converting the double-precision floating-point value
2524 `a' to the quadruple-precision floating-point format. The conversion is
2525 performed according to the IEC/IEEE Standard for Binary Floating-Point
2527 -------------------------------------------------------------------------------
2529 float128
float64_to_float128( float64 a
)
2533 bits64 aSig
, zSig0
, zSig1
;
2535 aSig
= extractFloat64Frac( a
);
2536 aExp
= extractFloat64Exp( a
);
2537 aSign
= extractFloat64Sign( a
);
2538 if ( aExp
== 0x7FF ) {
2539 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a
) );
2540 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2543 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2544 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2547 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2548 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2554 #ifndef SOFTFLOAT_FOR_GCC
2556 -------------------------------------------------------------------------------
2557 Rounds the double-precision floating-point value `a' to an integer, and
2558 returns the result as a double-precision floating-point value. The
2559 operation is performed according to the IEC/IEEE Standard for Binary
2560 Floating-Point Arithmetic.
2561 -------------------------------------------------------------------------------
2563 float64
float64_round_to_int( float64 a
)
2567 bits64 lastBitMask
, roundBitsMask
;
2571 aExp
= extractFloat64Exp( a
);
2572 if ( 0x433 <= aExp
) {
2573 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2574 return propagateFloat64NaN( a
, a
);
2578 if ( aExp
< 0x3FF ) {
2579 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
2580 float_exception_flags
|= float_flag_inexact
;
2581 aSign
= extractFloat64Sign( a
);
2582 switch ( float_rounding_mode
) {
2583 case float_round_nearest_even
:
2584 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2585 return packFloat64( aSign
, 0x3FF, 0 );
2588 case float_round_to_zero
:
2590 case float_round_down
:
2591 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
2592 case float_round_up
:
2594 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2596 return packFloat64( aSign
, 0, 0 );
2599 lastBitMask
<<= 0x433 - aExp
;
2600 roundBitsMask
= lastBitMask
- 1;
2602 roundingMode
= float_rounding_mode
;
2603 if ( roundingMode
== float_round_nearest_even
) {
2604 z
+= lastBitMask
>>1;
2605 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2607 else if ( roundingMode
!= float_round_to_zero
) {
2608 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2612 z
&= ~ roundBitsMask
;
2613 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
2620 -------------------------------------------------------------------------------
2621 Returns the result of adding the absolute values of the double-precision
2622 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2623 before being returned. `zSign' is ignored if the result is a NaN.
2624 The addition is performed according to the IEC/IEEE Standard for Binary
2625 Floating-Point Arithmetic.
2626 -------------------------------------------------------------------------------
2628 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2630 int16 aExp
, bExp
, zExp
;
2631 bits64 aSig
, bSig
, zSig
;
2634 aSig
= extractFloat64Frac( a
);
2635 aExp
= extractFloat64Exp( a
);
2636 bSig
= extractFloat64Frac( b
);
2637 bExp
= extractFloat64Exp( b
);
2638 expDiff
= aExp
- bExp
;
2641 if ( 0 < expDiff
) {
2642 if ( aExp
== 0x7FF ) {
2643 if ( aSig
) return propagateFloat64NaN( a
, b
);
2650 bSig
|= LIT64( 0x2000000000000000 );
2652 shift64RightJamming( bSig
, expDiff
, &bSig
);
2655 else if ( expDiff
< 0 ) {
2656 if ( bExp
== 0x7FF ) {
2657 if ( bSig
) return propagateFloat64NaN( a
, b
);
2658 return packFloat64( zSign
, 0x7FF, 0 );
2664 aSig
|= LIT64( 0x2000000000000000 );
2666 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2670 if ( aExp
== 0x7FF ) {
2671 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2674 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2675 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2679 aSig
|= LIT64( 0x2000000000000000 );
2680 zSig
= ( aSig
+ bSig
)<<1;
2682 if ( (sbits64
) zSig
< 0 ) {
2687 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2692 -------------------------------------------------------------------------------
2693 Returns the result of subtracting the absolute values of the double-
2694 precision floating-point values `a' and `b'. If `zSign' is 1, the
2695 difference is negated before being returned. `zSign' is ignored if the
2696 result is a NaN. The subtraction is performed according to the IEC/IEEE
2697 Standard for Binary Floating-Point Arithmetic.
2698 -------------------------------------------------------------------------------
2700 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2702 int16 aExp
, bExp
, zExp
;
2703 bits64 aSig
, bSig
, zSig
;
2706 aSig
= extractFloat64Frac( a
);
2707 aExp
= extractFloat64Exp( a
);
2708 bSig
= extractFloat64Frac( b
);
2709 bExp
= extractFloat64Exp( b
);
2710 expDiff
= aExp
- bExp
;
2713 if ( 0 < expDiff
) goto aExpBigger
;
2714 if ( expDiff
< 0 ) goto bExpBigger
;
2715 if ( aExp
== 0x7FF ) {
2716 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2717 float_raise( float_flag_invalid
);
2718 return float64_default_nan
;
2724 if ( bSig
< aSig
) goto aBigger
;
2725 if ( aSig
< bSig
) goto bBigger
;
2726 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0 );
2728 if ( bExp
== 0x7FF ) {
2729 if ( bSig
) return propagateFloat64NaN( a
, b
);
2730 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2736 aSig
|= LIT64( 0x4000000000000000 );
2738 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2739 bSig
|= LIT64( 0x4000000000000000 );
2744 goto normalizeRoundAndPack
;
2746 if ( aExp
== 0x7FF ) {
2747 if ( aSig
) return propagateFloat64NaN( a
, b
);
2754 bSig
|= LIT64( 0x4000000000000000 );
2756 shift64RightJamming( bSig
, expDiff
, &bSig
);
2757 aSig
|= LIT64( 0x4000000000000000 );
2761 normalizeRoundAndPack
:
2763 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2768 -------------------------------------------------------------------------------
2769 Returns the result of adding the double-precision floating-point values `a'
2770 and `b'. The operation is performed according to the IEC/IEEE Standard for
2771 Binary Floating-Point Arithmetic.
2772 -------------------------------------------------------------------------------
2774 float64
float64_add( float64 a
, float64 b
)
2778 aSign
= extractFloat64Sign( a
);
2779 bSign
= extractFloat64Sign( b
);
2780 if ( aSign
== bSign
) {
2781 return addFloat64Sigs( a
, b
, aSign
);
2784 return subFloat64Sigs( a
, b
, aSign
);
2790 -------------------------------------------------------------------------------
2791 Returns the result of subtracting the double-precision floating-point values
2792 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2793 for Binary Floating-Point Arithmetic.
2794 -------------------------------------------------------------------------------
2796 float64
float64_sub( float64 a
, float64 b
)
2800 aSign
= extractFloat64Sign( a
);
2801 bSign
= extractFloat64Sign( b
);
2802 if ( aSign
== bSign
) {
2803 return subFloat64Sigs( a
, b
, aSign
);
2806 return addFloat64Sigs( a
, b
, aSign
);
2812 -------------------------------------------------------------------------------
2813 Returns the result of multiplying the double-precision floating-point values
2814 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2815 for Binary Floating-Point Arithmetic.
2816 -------------------------------------------------------------------------------
2818 float64
float64_mul( float64 a
, float64 b
)
2820 flag aSign
, bSign
, zSign
;
2821 int16 aExp
, bExp
, zExp
;
2822 bits64 aSig
, bSig
, zSig0
, zSig1
;
2824 aSig
= extractFloat64Frac( a
);
2825 aExp
= extractFloat64Exp( a
);
2826 aSign
= extractFloat64Sign( a
);
2827 bSig
= extractFloat64Frac( b
);
2828 bExp
= extractFloat64Exp( b
);
2829 bSign
= extractFloat64Sign( b
);
2830 zSign
= aSign
^ bSign
;
2831 if ( aExp
== 0x7FF ) {
2832 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2833 return propagateFloat64NaN( a
, b
);
2835 if ( ( bExp
| bSig
) == 0 ) {
2836 float_raise( float_flag_invalid
);
2837 return float64_default_nan
;
2839 return packFloat64( zSign
, 0x7FF, 0 );
2841 if ( bExp
== 0x7FF ) {
2842 if ( bSig
) return propagateFloat64NaN( a
, b
);
2843 if ( ( aExp
| aSig
) == 0 ) {
2844 float_raise( float_flag_invalid
);
2845 return float64_default_nan
;
2847 return packFloat64( zSign
, 0x7FF, 0 );
2850 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2851 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2854 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2855 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2857 zExp
= aExp
+ bExp
- 0x3FF;
2858 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2859 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2860 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2861 zSig0
|= ( zSig1
!= 0 );
2862 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2866 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2871 -------------------------------------------------------------------------------
2872 Returns the result of dividing the double-precision floating-point value `a'
2873 by the corresponding value `b'. The operation is performed according to
2874 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2875 -------------------------------------------------------------------------------
2877 float64
float64_div( float64 a
, float64 b
)
2879 flag aSign
, bSign
, zSign
;
2880 int16 aExp
, bExp
, zExp
;
2881 bits64 aSig
, bSig
, zSig
;
2883 bits64 term0
, term1
;
2885 aSig
= extractFloat64Frac( a
);
2886 aExp
= extractFloat64Exp( a
);
2887 aSign
= extractFloat64Sign( a
);
2888 bSig
= extractFloat64Frac( b
);
2889 bExp
= extractFloat64Exp( b
);
2890 bSign
= extractFloat64Sign( b
);
2891 zSign
= aSign
^ bSign
;
2892 if ( aExp
== 0x7FF ) {
2893 if ( aSig
) return propagateFloat64NaN( a
, b
);
2894 if ( bExp
== 0x7FF ) {
2895 if ( bSig
) return propagateFloat64NaN( a
, b
);
2896 float_raise( float_flag_invalid
);
2897 return float64_default_nan
;
2899 return packFloat64( zSign
, 0x7FF, 0 );
2901 if ( bExp
== 0x7FF ) {
2902 if ( bSig
) return propagateFloat64NaN( a
, b
);
2903 return packFloat64( zSign
, 0, 0 );
2907 if ( ( aExp
| aSig
) == 0 ) {
2908 float_raise( float_flag_invalid
);
2909 return float64_default_nan
;
2911 float_raise( float_flag_divbyzero
);
2912 return packFloat64( zSign
, 0x7FF, 0 );
2914 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2917 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2918 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2920 zExp
= aExp
- bExp
+ 0x3FD;
2921 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2922 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2923 if ( bSig
<= ( aSig
+ aSig
) ) {
2927 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2928 if ( ( zSig
& 0x1FF ) <= 2 ) {
2929 mul64To128( bSig
, zSig
, &term0
, &term1
);
2930 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2931 while ( (sbits64
) rem0
< 0 ) {
2933 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2935 zSig
|= ( rem1
!= 0 );
2937 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2941 #ifndef SOFTFLOAT_FOR_GCC
2943 -------------------------------------------------------------------------------
2944 Returns the remainder of the double-precision floating-point value `a'
2945 with respect to the corresponding value `b'. The operation is performed
2946 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2947 -------------------------------------------------------------------------------
2949 float64
float64_rem( float64 a
, float64 b
)
2951 flag aSign
, bSign
, zSign
;
2952 int16 aExp
, bExp
, expDiff
;
2954 bits64 q
, alternateASig
;
2957 aSig
= extractFloat64Frac( a
);
2958 aExp
= extractFloat64Exp( a
);
2959 aSign
= extractFloat64Sign( a
);
2960 bSig
= extractFloat64Frac( b
);
2961 bExp
= extractFloat64Exp( b
);
2962 bSign
= extractFloat64Sign( b
);
2963 if ( aExp
== 0x7FF ) {
2964 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2965 return propagateFloat64NaN( a
, b
);
2967 float_raise( float_flag_invalid
);
2968 return float64_default_nan
;
2970 if ( bExp
== 0x7FF ) {
2971 if ( bSig
) return propagateFloat64NaN( a
, b
);
2976 float_raise( float_flag_invalid
);
2977 return float64_default_nan
;
2979 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2982 if ( aSig
== 0 ) return a
;
2983 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2985 expDiff
= aExp
- bExp
;
2986 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2987 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2988 if ( expDiff
< 0 ) {
2989 if ( expDiff
< -1 ) return a
;
2992 q
= ( bSig
<= aSig
);
2993 if ( q
) aSig
-= bSig
;
2995 while ( 0 < expDiff
) {
2996 q
= estimateDiv128To64( aSig
, 0, bSig
);
2997 q
= ( 2 < q
) ? q
- 2 : 0;
2998 aSig
= - ( ( bSig
>>2 ) * q
);
3002 if ( 0 < expDiff
) {
3003 q
= estimateDiv128To64( aSig
, 0, bSig
);
3004 q
= ( 2 < q
) ? q
- 2 : 0;
3007 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3014 alternateASig
= aSig
;
3017 } while ( 0 <= (sbits64
) aSig
);
3018 sigMean
= aSig
+ alternateASig
;
3019 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3020 aSig
= alternateASig
;
3022 zSign
= ( (sbits64
) aSig
< 0 );
3023 if ( zSign
) aSig
= - aSig
;
3024 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
3029 -------------------------------------------------------------------------------
3030 Returns the square root of the double-precision floating-point value `a'.
3031 The operation is performed according to the IEC/IEEE Standard for Binary
3032 Floating-Point Arithmetic.
3033 -------------------------------------------------------------------------------
3035 float64
float64_sqrt( float64 a
)
3039 bits64 aSig
, zSig
, doubleZSig
;
3040 bits64 rem0
, rem1
, term0
, term1
;
3042 aSig
= extractFloat64Frac( a
);
3043 aExp
= extractFloat64Exp( a
);
3044 aSign
= extractFloat64Sign( a
);
3045 if ( aExp
== 0x7FF ) {
3046 if ( aSig
) return propagateFloat64NaN( a
, a
);
3047 if ( ! aSign
) return a
;
3048 float_raise( float_flag_invalid
);
3049 return float64_default_nan
;
3052 if ( ( aExp
| aSig
) == 0 ) return a
;
3053 float_raise( float_flag_invalid
);
3054 return float64_default_nan
;
3057 if ( aSig
== 0 ) return 0;
3058 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3060 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3061 aSig
|= LIT64( 0x0010000000000000 );
3062 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3063 aSig
<<= 9 - ( aExp
& 1 );
3064 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3065 if ( ( zSig
& 0x1FF ) <= 5 ) {
3066 doubleZSig
= zSig
<<1;
3067 mul64To128( zSig
, zSig
, &term0
, &term1
);
3068 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3069 while ( (sbits64
) rem0
< 0 ) {
3072 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3074 zSig
|= ( ( rem0
| rem1
) != 0 );
3076 return roundAndPackFloat64( 0, zExp
, zSig
);
3082 -------------------------------------------------------------------------------
3083 Returns 1 if the double-precision floating-point value `a' is equal to the
3084 corresponding value `b', and 0 otherwise. The comparison is performed
3085 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3086 -------------------------------------------------------------------------------
3088 flag
float64_eq( float64 a
, float64 b
)
3091 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3092 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3094 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3095 float_raise( float_flag_invalid
);
3099 return ( a
== b
) ||
3100 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) == 0 );
3105 -------------------------------------------------------------------------------
3106 Returns 1 if the double-precision floating-point value `a' is less than or
3107 equal to the corresponding value `b', and 0 otherwise. The comparison is
3108 performed according to the IEC/IEEE Standard for Binary Floating-Point
3110 -------------------------------------------------------------------------------
3112 flag
float64_le( float64 a
, float64 b
)
3116 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3117 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3119 float_raise( float_flag_invalid
);
3122 aSign
= extractFloat64Sign( a
);
3123 bSign
= extractFloat64Sign( b
);
3124 if ( aSign
!= bSign
)
3126 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) ==
3128 return ( a
== b
) ||
3129 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3134 -------------------------------------------------------------------------------
3135 Returns 1 if the double-precision floating-point value `a' is less than
3136 the corresponding value `b', and 0 otherwise. The comparison is performed
3137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3138 -------------------------------------------------------------------------------
3140 flag
float64_lt( float64 a
, float64 b
)
3144 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3145 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3147 float_raise( float_flag_invalid
);
3150 aSign
= extractFloat64Sign( a
);
3151 bSign
= extractFloat64Sign( b
);
3152 if ( aSign
!= bSign
)
3154 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) !=
3156 return ( a
!= b
) &&
3157 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3161 #ifndef SOFTFLOAT_FOR_GCC
3163 -------------------------------------------------------------------------------
3164 Returns 1 if the double-precision floating-point value `a' is equal to the
3165 corresponding value `b', and 0 otherwise. The invalid exception is raised
3166 if either operand is a NaN. Otherwise, the comparison is performed
3167 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3168 -------------------------------------------------------------------------------
3170 flag
float64_eq_signaling( float64 a
, float64 b
)
3173 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3174 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3176 float_raise( float_flag_invalid
);
3179 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3184 -------------------------------------------------------------------------------
3185 Returns 1 if the double-precision floating-point value `a' is less than or
3186 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3187 cause an exception. Otherwise, the comparison is performed according to the
3188 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3189 -------------------------------------------------------------------------------
3191 flag
float64_le_quiet( float64 a
, float64 b
)
3195 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3196 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3198 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3199 float_raise( float_flag_invalid
);
3203 aSign
= extractFloat64Sign( a
);
3204 bSign
= extractFloat64Sign( b
);
3205 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3206 return ( a
== b
) || ( aSign
^ ( a
< b
) );
3211 -------------------------------------------------------------------------------
3212 Returns 1 if the double-precision floating-point value `a' is less than
3213 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3214 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3215 Standard for Binary Floating-Point Arithmetic.
3216 -------------------------------------------------------------------------------
3218 flag
float64_lt_quiet( float64 a
, float64 b
)
3222 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3223 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3225 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3226 float_raise( float_flag_invalid
);
3230 aSign
= extractFloat64Sign( a
);
3231 bSign
= extractFloat64Sign( b
);
3232 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
3233 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3241 -------------------------------------------------------------------------------
3242 Returns the result of converting the extended double-precision floating-
3243 point value `a' to the 32-bit two's complement integer format. The
3244 conversion is performed according to the IEC/IEEE Standard for Binary
3245 Floating-Point Arithmetic---which means in particular that the conversion
3246 is rounded according to the current rounding mode. If `a' is a NaN, the
3247 largest positive integer is returned. Otherwise, if the conversion
3248 overflows, the largest integer with the same sign as `a' is returned.
3249 -------------------------------------------------------------------------------
3251 int32
floatx80_to_int32( floatx80 a
)
3254 int32 aExp
, shiftCount
;
3257 aSig
= extractFloatx80Frac( a
);
3258 aExp
= extractFloatx80Exp( a
);
3259 aSign
= extractFloatx80Sign( a
);
3260 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3261 shiftCount
= 0x4037 - aExp
;
3262 if ( shiftCount
<= 0 ) shiftCount
= 1;
3263 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3264 return roundAndPackInt32( aSign
, aSig
);
3269 -------------------------------------------------------------------------------
3270 Returns the result of converting the extended double-precision floating-
3271 point value `a' to the 32-bit two's complement integer format. The
3272 conversion is performed according to the IEC/IEEE Standard for Binary
3273 Floating-Point Arithmetic, except that the conversion is always rounded
3274 toward zero. If `a' is a NaN, the largest positive integer is returned.
3275 Otherwise, if the conversion overflows, the largest integer with the same
3276 sign as `a' is returned.
3277 -------------------------------------------------------------------------------
3279 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
3282 int32 aExp
, shiftCount
;
3283 bits64 aSig
, savedASig
;
3286 aSig
= extractFloatx80Frac( a
);
3287 aExp
= extractFloatx80Exp( a
);
3288 aSign
= extractFloatx80Sign( a
);
3289 if ( 0x401E < aExp
) {
3290 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3293 else if ( aExp
< 0x3FFF ) {
3294 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
3297 shiftCount
= 0x403E - aExp
;
3299 aSig
>>= shiftCount
;
3301 if ( aSign
) z
= - z
;
3302 if ( ( z
< 0 ) ^ aSign
) {
3304 float_raise( float_flag_invalid
);
3305 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3307 if ( ( aSig
<<shiftCount
) != savedASig
) {
3308 float_exception_flags
|= float_flag_inexact
;
3315 -------------------------------------------------------------------------------
3316 Returns the result of converting the extended double-precision floating-
3317 point value `a' to the 64-bit two's complement integer format. The
3318 conversion is performed according to the IEC/IEEE Standard for Binary
3319 Floating-Point Arithmetic---which means in particular that the conversion
3320 is rounded according to the current rounding mode. If `a' is a NaN,
3321 the largest positive integer is returned. Otherwise, if the conversion
3322 overflows, the largest integer with the same sign as `a' is returned.
3323 -------------------------------------------------------------------------------
3325 int64
floatx80_to_int64( floatx80 a
)
3328 int32 aExp
, shiftCount
;
3329 bits64 aSig
, aSigExtra
;
3331 aSig
= extractFloatx80Frac( a
);
3332 aExp
= extractFloatx80Exp( a
);
3333 aSign
= extractFloatx80Sign( a
);
3334 shiftCount
= 0x403E - aExp
;
3335 if ( shiftCount
<= 0 ) {
3337 float_raise( float_flag_invalid
);
3339 || ( ( aExp
== 0x7FFF )
3340 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3342 return LIT64( 0x7FFFFFFFFFFFFFFF );
3344 return (sbits64
) LIT64( 0x8000000000000000 );
3349 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3351 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
3356 -------------------------------------------------------------------------------
3357 Returns the result of converting the extended double-precision floating-
3358 point value `a' to the 64-bit two's complement integer format. The
3359 conversion is performed according to the IEC/IEEE Standard for Binary
3360 Floating-Point Arithmetic, except that the conversion is always rounded
3361 toward zero. If `a' is a NaN, the largest positive integer is returned.
3362 Otherwise, if the conversion overflows, the largest integer with the same
3363 sign as `a' is returned.
3364 -------------------------------------------------------------------------------
3366 int64
floatx80_to_int64_round_to_zero( floatx80 a
)
3369 int32 aExp
, shiftCount
;
3373 aSig
= extractFloatx80Frac( a
);
3374 aExp
= extractFloatx80Exp( a
);
3375 aSign
= extractFloatx80Sign( a
);
3376 shiftCount
= aExp
- 0x403E;
3377 if ( 0 <= shiftCount
) {
3378 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3379 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3380 float_raise( float_flag_invalid
);
3381 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3382 return LIT64( 0x7FFFFFFFFFFFFFFF );
3385 return (sbits64
) LIT64( 0x8000000000000000 );
3387 else if ( aExp
< 0x3FFF ) {
3388 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
3391 z
= aSig
>>( - shiftCount
);
3392 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3393 float_exception_flags
|= float_flag_inexact
;
3395 if ( aSign
) z
= - z
;
3401 -------------------------------------------------------------------------------
3402 Returns the result of converting the extended double-precision floating-
3403 point value `a' to the single-precision floating-point format. The
3404 conversion is performed according to the IEC/IEEE Standard for Binary
3405 Floating-Point Arithmetic.
3406 -------------------------------------------------------------------------------
3408 float32
floatx80_to_float32( floatx80 a
)
3414 aSig
= extractFloatx80Frac( a
);
3415 aExp
= extractFloatx80Exp( a
);
3416 aSign
= extractFloatx80Sign( a
);
3417 if ( aExp
== 0x7FFF ) {
3418 if ( (bits64
) ( aSig
<<1 ) ) {
3419 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
3421 return packFloat32( aSign
, 0xFF, 0 );
3423 shift64RightJamming( aSig
, 33, &aSig
);
3424 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3425 return roundAndPackFloat32( aSign
, aExp
, aSig
);
3430 -------------------------------------------------------------------------------
3431 Returns the result of converting the extended double-precision floating-
3432 point value `a' to the double-precision floating-point format. The
3433 conversion is performed according to the IEC/IEEE Standard for Binary
3434 Floating-Point Arithmetic.
3435 -------------------------------------------------------------------------------
3437 float64
floatx80_to_float64( floatx80 a
)
3443 aSig
= extractFloatx80Frac( a
);
3444 aExp
= extractFloatx80Exp( a
);
3445 aSign
= extractFloatx80Sign( a
);
3446 if ( aExp
== 0x7FFF ) {
3447 if ( (bits64
) ( aSig
<<1 ) ) {
3448 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
3450 return packFloat64( aSign
, 0x7FF, 0 );
3452 shift64RightJamming( aSig
, 1, &zSig
);
3453 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3454 return roundAndPackFloat64( aSign
, aExp
, zSig
);
3461 -------------------------------------------------------------------------------
3462 Returns the result of converting the extended double-precision floating-
3463 point value `a' to the quadruple-precision floating-point format. The
3464 conversion is performed according to the IEC/IEEE Standard for Binary
3465 Floating-Point Arithmetic.
3466 -------------------------------------------------------------------------------
3468 float128
floatx80_to_float128( floatx80 a
)
3472 bits64 aSig
, zSig0
, zSig1
;
3474 aSig
= extractFloatx80Frac( a
);
3475 aExp
= extractFloatx80Exp( a
);
3476 aSign
= extractFloatx80Sign( a
);
3477 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3478 return commonNaNToFloat128( floatx80ToCommonNaN( a
) );
3480 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3481 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3488 -------------------------------------------------------------------------------
3489 Rounds the extended double-precision floating-point value `a' to an integer,
3490 and returns the result as an extended quadruple-precision floating-point
3491 value. The operation is performed according to the IEC/IEEE Standard for
3492 Binary Floating-Point Arithmetic.
3493 -------------------------------------------------------------------------------
3495 floatx80
floatx80_round_to_int( floatx80 a
)
3499 bits64 lastBitMask
, roundBitsMask
;
3503 aExp
= extractFloatx80Exp( a
);
3504 if ( 0x403E <= aExp
) {
3505 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3506 return propagateFloatx80NaN( a
, a
);
3510 if ( aExp
< 0x3FFF ) {
3512 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3515 float_exception_flags
|= float_flag_inexact
;
3516 aSign
= extractFloatx80Sign( a
);
3517 switch ( float_rounding_mode
) {
3518 case float_round_nearest_even
:
3519 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3522 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3525 case float_round_to_zero
:
3527 case float_round_down
:
3530 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3531 : packFloatx80( 0, 0, 0 );
3532 case float_round_up
:
3534 aSign
? packFloatx80( 1, 0, 0 )
3535 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3537 return packFloatx80( aSign
, 0, 0 );
3540 lastBitMask
<<= 0x403E - aExp
;
3541 roundBitsMask
= lastBitMask
- 1;
3543 roundingMode
= float_rounding_mode
;
3544 if ( roundingMode
== float_round_nearest_even
) {
3545 z
.low
+= lastBitMask
>>1;
3546 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3548 else if ( roundingMode
!= float_round_to_zero
) {
3549 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3550 z
.low
+= roundBitsMask
;
3553 z
.low
&= ~ roundBitsMask
;
3556 z
.low
= LIT64( 0x8000000000000000 );
3558 if ( z
.low
!= a
.low
) float_exception_flags
|= float_flag_inexact
;
3564 -------------------------------------------------------------------------------
3565 Returns the result of adding the absolute values of the extended double-
3566 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3567 negated before being returned. `zSign' is ignored if the result is a NaN.
3568 The addition is performed according to the IEC/IEEE Standard for Binary
3569 Floating-Point Arithmetic.
3570 -------------------------------------------------------------------------------
3572 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3574 int32 aExp
, bExp
, zExp
;
3575 bits64 aSig
, bSig
, zSig0
, zSig1
;
3578 aSig
= extractFloatx80Frac( a
);
3579 aExp
= extractFloatx80Exp( a
);
3580 bSig
= extractFloatx80Frac( b
);
3581 bExp
= extractFloatx80Exp( b
);
3582 expDiff
= aExp
- bExp
;
3583 if ( 0 < expDiff
) {
3584 if ( aExp
== 0x7FFF ) {
3585 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3588 if ( bExp
== 0 ) --expDiff
;
3589 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3592 else if ( expDiff
< 0 ) {
3593 if ( bExp
== 0x7FFF ) {
3594 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3595 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3597 if ( aExp
== 0 ) ++expDiff
;
3598 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3602 if ( aExp
== 0x7FFF ) {
3603 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3604 return propagateFloatx80NaN( a
, b
);
3609 zSig0
= aSig
+ bSig
;
3611 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3617 zSig0
= aSig
+ bSig
;
3618 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3620 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3621 zSig0
|= LIT64( 0x8000000000000000 );
3625 roundAndPackFloatx80(
3626 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3631 -------------------------------------------------------------------------------
3632 Returns the result of subtracting the absolute values of the extended
3633 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3634 difference is negated before being returned. `zSign' is ignored if the
3635 result is a NaN. The subtraction is performed according to the IEC/IEEE
3636 Standard for Binary Floating-Point Arithmetic.
3637 -------------------------------------------------------------------------------
3639 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3641 int32 aExp
, bExp
, zExp
;
3642 bits64 aSig
, bSig
, zSig0
, zSig1
;
3646 aSig
= extractFloatx80Frac( a
);
3647 aExp
= extractFloatx80Exp( a
);
3648 bSig
= extractFloatx80Frac( b
);
3649 bExp
= extractFloatx80Exp( b
);
3650 expDiff
= aExp
- bExp
;
3651 if ( 0 < expDiff
) goto aExpBigger
;
3652 if ( expDiff
< 0 ) goto bExpBigger
;
3653 if ( aExp
== 0x7FFF ) {
3654 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3655 return propagateFloatx80NaN( a
, b
);
3657 float_raise( float_flag_invalid
);
3658 z
.low
= floatx80_default_nan_low
;
3659 z
.high
= floatx80_default_nan_high
;
3667 if ( bSig
< aSig
) goto aBigger
;
3668 if ( aSig
< bSig
) goto bBigger
;
3669 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
3671 if ( bExp
== 0x7FFF ) {
3672 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3673 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3675 if ( aExp
== 0 ) ++expDiff
;
3676 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3678 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3681 goto normalizeRoundAndPack
;
3683 if ( aExp
== 0x7FFF ) {
3684 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3687 if ( bExp
== 0 ) --expDiff
;
3688 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3690 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3692 normalizeRoundAndPack
:
3694 normalizeRoundAndPackFloatx80(
3695 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3700 -------------------------------------------------------------------------------
3701 Returns the result of adding the extended double-precision floating-point
3702 values `a' and `b'. The operation is performed according to the IEC/IEEE
3703 Standard for Binary Floating-Point Arithmetic.
3704 -------------------------------------------------------------------------------
3706 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
3710 aSign
= extractFloatx80Sign( a
);
3711 bSign
= extractFloatx80Sign( b
);
3712 if ( aSign
== bSign
) {
3713 return addFloatx80Sigs( a
, b
, aSign
);
3716 return subFloatx80Sigs( a
, b
, aSign
);
3722 -------------------------------------------------------------------------------
3723 Returns the result of subtracting the extended double-precision floating-
3724 point values `a' and `b'. The operation is performed according to the
3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3726 -------------------------------------------------------------------------------
3728 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
3732 aSign
= extractFloatx80Sign( a
);
3733 bSign
= extractFloatx80Sign( b
);
3734 if ( aSign
== bSign
) {
3735 return subFloatx80Sigs( a
, b
, aSign
);
3738 return addFloatx80Sigs( a
, b
, aSign
);
3744 -------------------------------------------------------------------------------
3745 Returns the result of multiplying the extended double-precision floating-
3746 point values `a' and `b'. The operation is performed according to the
3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3750 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
3752 flag aSign
, bSign
, zSign
;
3753 int32 aExp
, bExp
, zExp
;
3754 bits64 aSig
, bSig
, zSig0
, zSig1
;
3757 aSig
= extractFloatx80Frac( a
);
3758 aExp
= extractFloatx80Exp( a
);
3759 aSign
= extractFloatx80Sign( a
);
3760 bSig
= extractFloatx80Frac( b
);
3761 bExp
= extractFloatx80Exp( b
);
3762 bSign
= extractFloatx80Sign( b
);
3763 zSign
= aSign
^ bSign
;
3764 if ( aExp
== 0x7FFF ) {
3765 if ( (bits64
) ( aSig
<<1 )
3766 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3767 return propagateFloatx80NaN( a
, b
);
3769 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3770 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3772 if ( bExp
== 0x7FFF ) {
3773 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3774 if ( ( aExp
| aSig
) == 0 ) {
3776 float_raise( float_flag_invalid
);
3777 z
.low
= floatx80_default_nan_low
;
3778 z
.high
= floatx80_default_nan_high
;
3781 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3784 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3785 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3788 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3789 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3791 zExp
= aExp
+ bExp
- 0x3FFE;
3792 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3793 if ( 0 < (sbits64
) zSig0
) {
3794 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3798 roundAndPackFloatx80(
3799 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3804 -------------------------------------------------------------------------------
3805 Returns the result of dividing the extended double-precision floating-point
3806 value `a' by the corresponding value `b'. The operation is performed
3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 -------------------------------------------------------------------------------
3810 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
3812 flag aSign
, bSign
, zSign
;
3813 int32 aExp
, bExp
, zExp
;
3814 bits64 aSig
, bSig
, zSig0
, zSig1
;
3815 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3818 aSig
= extractFloatx80Frac( a
);
3819 aExp
= extractFloatx80Exp( a
);
3820 aSign
= extractFloatx80Sign( a
);
3821 bSig
= extractFloatx80Frac( b
);
3822 bExp
= extractFloatx80Exp( b
);
3823 bSign
= extractFloatx80Sign( b
);
3824 zSign
= aSign
^ bSign
;
3825 if ( aExp
== 0x7FFF ) {
3826 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3827 if ( bExp
== 0x7FFF ) {
3828 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3831 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3833 if ( bExp
== 0x7FFF ) {
3834 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3835 return packFloatx80( zSign
, 0, 0 );
3839 if ( ( aExp
| aSig
) == 0 ) {
3841 float_raise( float_flag_invalid
);
3842 z
.low
= floatx80_default_nan_low
;
3843 z
.high
= floatx80_default_nan_high
;
3846 float_raise( float_flag_divbyzero
);
3847 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3849 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3852 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3853 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3855 zExp
= aExp
- bExp
+ 0x3FFE;
3857 if ( bSig
<= aSig
) {
3858 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3861 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3862 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3863 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3864 while ( (sbits64
) rem0
< 0 ) {
3866 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3868 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3869 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3870 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3871 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3872 while ( (sbits64
) rem1
< 0 ) {
3874 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3876 zSig1
|= ( ( rem1
| rem2
) != 0 );
3879 roundAndPackFloatx80(
3880 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3885 -------------------------------------------------------------------------------
3886 Returns the remainder of the extended double-precision floating-point value
3887 `a' with respect to the corresponding value `b'. The operation is performed
3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3889 -------------------------------------------------------------------------------
3891 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3893 flag aSign
, bSign
, zSign
;
3894 int32 aExp
, bExp
, expDiff
;
3895 bits64 aSig0
, aSig1
, bSig
;
3896 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3899 aSig0
= extractFloatx80Frac( a
);
3900 aExp
= extractFloatx80Exp( a
);
3901 aSign
= extractFloatx80Sign( a
);
3902 bSig
= extractFloatx80Frac( b
);
3903 bExp
= extractFloatx80Exp( b
);
3904 bSign
= extractFloatx80Sign( b
);
3905 if ( aExp
== 0x7FFF ) {
3906 if ( (bits64
) ( aSig0
<<1 )
3907 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3908 return propagateFloatx80NaN( a
, b
);
3912 if ( bExp
== 0x7FFF ) {
3913 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3919 float_raise( float_flag_invalid
);
3920 z
.low
= floatx80_default_nan_low
;
3921 z
.high
= floatx80_default_nan_high
;
3924 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3927 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3928 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3930 bSig
|= LIT64( 0x8000000000000000 );
3932 expDiff
= aExp
- bExp
;
3934 if ( expDiff
< 0 ) {
3935 if ( expDiff
< -1 ) return a
;
3936 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3939 q
= ( bSig
<= aSig0
);
3940 if ( q
) aSig0
-= bSig
;
3942 while ( 0 < expDiff
) {
3943 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3944 q
= ( 2 < q
) ? q
- 2 : 0;
3945 mul64To128( bSig
, q
, &term0
, &term1
);
3946 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3947 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3951 if ( 0 < expDiff
) {
3952 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3953 q
= ( 2 < q
) ? q
- 2 : 0;
3955 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3956 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3957 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3958 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3960 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3967 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3968 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3969 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3972 aSig0
= alternateASig0
;
3973 aSig1
= alternateASig1
;
3977 normalizeRoundAndPackFloatx80(
3978 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3983 -------------------------------------------------------------------------------
3984 Returns the square root of the extended double-precision floating-point
3985 value `a'. The operation is performed according to the IEC/IEEE Standard
3986 for Binary Floating-Point Arithmetic.
3987 -------------------------------------------------------------------------------
3989 floatx80
floatx80_sqrt( floatx80 a
)
3993 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
3994 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3997 aSig0
= extractFloatx80Frac( a
);
3998 aExp
= extractFloatx80Exp( a
);
3999 aSign
= extractFloatx80Sign( a
);
4000 if ( aExp
== 0x7FFF ) {
4001 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
4002 if ( ! aSign
) return a
;
4006 if ( ( aExp
| aSig0
) == 0 ) return a
;
4008 float_raise( float_flag_invalid
);
4009 z
.low
= floatx80_default_nan_low
;
4010 z
.high
= floatx80_default_nan_high
;
4014 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4015 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4017 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4018 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4019 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4020 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4021 doubleZSig0
= zSig0
<<1;
4022 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4023 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4024 while ( (sbits64
) rem0
< 0 ) {
4027 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4029 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4030 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4031 if ( zSig1
== 0 ) zSig1
= 1;
4032 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4033 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4034 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4035 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4036 while ( (sbits64
) rem1
< 0 ) {
4038 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4040 term2
|= doubleZSig0
;
4041 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4043 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4045 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4046 zSig0
|= doubleZSig0
;
4048 roundAndPackFloatx80(
4049 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
4054 -------------------------------------------------------------------------------
4055 Returns 1 if the extended double-precision floating-point value `a' is
4056 equal to the corresponding value `b', and 0 otherwise. The comparison is
4057 performed according to the IEC/IEEE Standard for Binary Floating-Point
4059 -------------------------------------------------------------------------------
4061 flag
floatx80_eq( floatx80 a
, floatx80 b
)
4064 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4065 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4066 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4067 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4069 if ( floatx80_is_signaling_nan( a
)
4070 || floatx80_is_signaling_nan( b
) ) {
4071 float_raise( float_flag_invalid
);
4077 && ( ( a
.high
== b
.high
)
4079 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4085 -------------------------------------------------------------------------------
4086 Returns 1 if the extended double-precision floating-point value `a' is
4087 less than or equal to the corresponding value `b', and 0 otherwise. The
4088 comparison is performed according to the IEC/IEEE Standard for Binary
4089 Floating-Point Arithmetic.
4090 -------------------------------------------------------------------------------
4092 flag
floatx80_le( floatx80 a
, floatx80 b
)
4096 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4097 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4098 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4099 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4101 float_raise( float_flag_invalid
);
4104 aSign
= extractFloatx80Sign( a
);
4105 bSign
= extractFloatx80Sign( b
);
4106 if ( aSign
!= bSign
) {
4109 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4113 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4114 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4119 -------------------------------------------------------------------------------
4120 Returns 1 if the extended double-precision floating-point value `a' is
4121 less than the corresponding value `b', and 0 otherwise. The comparison
4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4124 -------------------------------------------------------------------------------
4126 flag
floatx80_lt( floatx80 a
, floatx80 b
)
4130 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4131 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4132 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4133 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4135 float_raise( float_flag_invalid
);
4138 aSign
= extractFloatx80Sign( a
);
4139 bSign
= extractFloatx80Sign( b
);
4140 if ( aSign
!= bSign
) {
4143 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4147 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4148 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4153 -------------------------------------------------------------------------------
4154 Returns 1 if the extended double-precision floating-point value `a' is equal
4155 to the corresponding value `b', and 0 otherwise. The invalid exception is
4156 raised if either operand is a NaN. Otherwise, the comparison is performed
4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4158 -------------------------------------------------------------------------------
4160 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
4163 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4164 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4165 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4166 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4168 float_raise( float_flag_invalid
);
4173 && ( ( a
.high
== b
.high
)
4175 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4181 -------------------------------------------------------------------------------
4182 Returns 1 if the extended double-precision floating-point value `a' is less
4183 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4184 do not cause an exception. Otherwise, the comparison is performed according
4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4186 -------------------------------------------------------------------------------
4188 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
4192 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4193 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4194 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4195 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4197 if ( floatx80_is_signaling_nan( a
)
4198 || floatx80_is_signaling_nan( b
) ) {
4199 float_raise( float_flag_invalid
);
4203 aSign
= extractFloatx80Sign( a
);
4204 bSign
= extractFloatx80Sign( b
);
4205 if ( aSign
!= bSign
) {
4208 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4212 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4213 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4218 -------------------------------------------------------------------------------
4219 Returns 1 if the extended double-precision floating-point value `a' is less
4220 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4221 an exception. Otherwise, the comparison is performed according to the
4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4223 -------------------------------------------------------------------------------
4225 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
4229 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4230 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4231 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4232 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4234 if ( floatx80_is_signaling_nan( a
)
4235 || floatx80_is_signaling_nan( b
) ) {
4236 float_raise( float_flag_invalid
);
4240 aSign
= extractFloatx80Sign( a
);
4241 bSign
= extractFloatx80Sign( b
);
4242 if ( aSign
!= bSign
) {
4245 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4249 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4250 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4259 -------------------------------------------------------------------------------
4260 Returns the result of converting the quadruple-precision floating-point
4261 value `a' to the 32-bit two's complement integer format. The conversion
4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4263 Arithmetic---which means in particular that the conversion is rounded
4264 according to the current rounding mode. If `a' is a NaN, the largest
4265 positive integer is returned. Otherwise, if the conversion overflows, the
4266 largest integer with the same sign as `a' is returned.
4267 -------------------------------------------------------------------------------
4269 int32
float128_to_int32( float128 a
)
4272 int32 aExp
, shiftCount
;
4273 bits64 aSig0
, aSig1
;
4275 aSig1
= extractFloat128Frac1( a
);
4276 aSig0
= extractFloat128Frac0( a
);
4277 aExp
= extractFloat128Exp( a
);
4278 aSign
= extractFloat128Sign( a
);
4279 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4280 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4281 aSig0
|= ( aSig1
!= 0 );
4282 shiftCount
= 0x4028 - aExp
;
4283 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4284 return roundAndPackInt32( aSign
, aSig0
);
4289 -------------------------------------------------------------------------------
4290 Returns the result of converting the quadruple-precision floating-point
4291 value `a' to the 32-bit two's complement integer format. The conversion
4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 Arithmetic, except that the conversion is always rounded toward zero. If
4294 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4295 conversion overflows, the largest integer with the same sign as `a' is
4297 -------------------------------------------------------------------------------
4299 int32
float128_to_int32_round_to_zero( float128 a
)
4302 int32 aExp
, shiftCount
;
4303 bits64 aSig0
, aSig1
, savedASig
;
4306 aSig1
= extractFloat128Frac1( a
);
4307 aSig0
= extractFloat128Frac0( a
);
4308 aExp
= extractFloat128Exp( a
);
4309 aSign
= extractFloat128Sign( a
);
4310 aSig0
|= ( aSig1
!= 0 );
4311 if ( 0x401E < aExp
) {
4312 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4315 else if ( aExp
< 0x3FFF ) {
4316 if ( aExp
|| aSig0
) float_exception_flags
|= float_flag_inexact
;
4319 aSig0
|= LIT64( 0x0001000000000000 );
4320 shiftCount
= 0x402F - aExp
;
4322 aSig0
>>= shiftCount
;
4324 if ( aSign
) z
= - z
;
4325 if ( ( z
< 0 ) ^ aSign
) {
4327 float_raise( float_flag_invalid
);
4328 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4330 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4331 float_exception_flags
|= float_flag_inexact
;
4338 -------------------------------------------------------------------------------
4339 Returns the result of converting the quadruple-precision floating-point
4340 value `a' to the 64-bit two's complement integer format. The conversion
4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 Arithmetic---which means in particular that the conversion is rounded
4343 according to the current rounding mode. If `a' is a NaN, the largest
4344 positive integer is returned. Otherwise, if the conversion overflows, the
4345 largest integer with the same sign as `a' is returned.
4346 -------------------------------------------------------------------------------
4348 int64
float128_to_int64( float128 a
)
4351 int32 aExp
, shiftCount
;
4352 bits64 aSig0
, aSig1
;
4354 aSig1
= extractFloat128Frac1( a
);
4355 aSig0
= extractFloat128Frac0( a
);
4356 aExp
= extractFloat128Exp( a
);
4357 aSign
= extractFloat128Sign( a
);
4358 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4359 shiftCount
= 0x402F - aExp
;
4360 if ( shiftCount
<= 0 ) {
4361 if ( 0x403E < aExp
) {
4362 float_raise( float_flag_invalid
);
4364 || ( ( aExp
== 0x7FFF )
4365 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4368 return LIT64( 0x7FFFFFFFFFFFFFFF );
4370 return (sbits64
) LIT64( 0x8000000000000000 );
4372 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4375 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4377 return roundAndPackInt64( aSign
, aSig0
, aSig1
);
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format. The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic, except that the conversion is always rounded toward zero.
4387 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4388 the conversion overflows, the largest integer with the same sign as `a' is
4390 -------------------------------------------------------------------------------
4392 int64
float128_to_int64_round_to_zero( float128 a
)
4395 int32 aExp
, shiftCount
;
4396 bits64 aSig0
, aSig1
;
4399 aSig1
= extractFloat128Frac1( a
);
4400 aSig0
= extractFloat128Frac0( a
);
4401 aExp
= extractFloat128Exp( a
);
4402 aSign
= extractFloat128Sign( a
);
4403 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4404 shiftCount
= aExp
- 0x402F;
4405 if ( 0 < shiftCount
) {
4406 if ( 0x403E <= aExp
) {
4407 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4408 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4409 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4410 if ( aSig1
) float_exception_flags
|= float_flag_inexact
;
4413 float_raise( float_flag_invalid
);
4414 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4415 return LIT64( 0x7FFFFFFFFFFFFFFF );
4418 return (sbits64
) LIT64( 0x8000000000000000 );
4420 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4421 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4422 float_exception_flags
|= float_flag_inexact
;
4426 if ( aExp
< 0x3FFF ) {
4427 if ( aExp
| aSig0
| aSig1
) {
4428 float_exception_flags
|= float_flag_inexact
;
4432 z
= aSig0
>>( - shiftCount
);
4434 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4435 float_exception_flags
|= float_flag_inexact
;
4438 if ( aSign
) z
= - z
;
4444 * just like above - but do not care for overflow of signed results
4446 uint64
float128_to_uint64_round_to_zero( float128 a
)
4449 int32 aExp
, shiftCount
;
4450 bits64 aSig0
, aSig1
;
4453 aSig1
= extractFloat128Frac1( a
);
4454 aSig0
= extractFloat128Frac0( a
);
4455 aExp
= extractFloat128Exp( a
);
4456 aSign
= extractFloat128Sign( a
);
4457 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4458 shiftCount
= aExp
- 0x402F;
4459 if ( 0 < shiftCount
) {
4460 if ( 0x403F <= aExp
) {
4461 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4462 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4463 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4464 if ( aSig1
) float_exception_flags
|= float_flag_inexact
;
4467 float_raise( float_flag_invalid
);
4469 return LIT64( 0xFFFFFFFFFFFFFFFF );
4471 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4472 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4473 float_exception_flags
|= float_flag_inexact
;
4477 if ( aExp
< 0x3FFF ) {
4478 if ( aExp
| aSig0
| aSig1
) {
4479 float_exception_flags
|= float_flag_inexact
;
4483 z
= aSig0
>>( - shiftCount
);
4484 if (aSig1
|| ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4485 float_exception_flags
|= float_flag_inexact
;
4488 if ( aSign
) z
= - z
;
4494 -------------------------------------------------------------------------------
4495 Returns the result of converting the quadruple-precision floating-point
4496 value `a' to the single-precision floating-point format. The conversion
4497 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4499 -------------------------------------------------------------------------------
4501 float32
float128_to_float32( float128 a
)
4505 bits64 aSig0
, aSig1
;
4508 aSig1
= extractFloat128Frac1( a
);
4509 aSig0
= extractFloat128Frac0( a
);
4510 aExp
= extractFloat128Exp( a
);
4511 aSign
= extractFloat128Sign( a
);
4512 if ( aExp
== 0x7FFF ) {
4513 if ( aSig0
| aSig1
) {
4514 return commonNaNToFloat32( float128ToCommonNaN( a
) );
4516 return packFloat32( aSign
, 0xFF, 0 );
4518 aSig0
|= ( aSig1
!= 0 );
4519 shift64RightJamming( aSig0
, 18, &aSig0
);
4521 if ( aExp
|| zSig
) {
4525 return roundAndPackFloat32( aSign
, aExp
, zSig
);
4530 -------------------------------------------------------------------------------
4531 Returns the result of converting the quadruple-precision floating-point
4532 value `a' to the double-precision floating-point format. The conversion
4533 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4535 -------------------------------------------------------------------------------
4537 float64
float128_to_float64( float128 a
)
4541 bits64 aSig0
, aSig1
;
4543 aSig1
= extractFloat128Frac1( a
);
4544 aSig0
= extractFloat128Frac0( a
);
4545 aExp
= extractFloat128Exp( a
);
4546 aSign
= extractFloat128Sign( a
);
4547 if ( aExp
== 0x7FFF ) {
4548 if ( aSig0
| aSig1
) {
4549 return commonNaNToFloat64( float128ToCommonNaN( a
) );
4551 return packFloat64( aSign
, 0x7FF, 0 );
4553 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4554 aSig0
|= ( aSig1
!= 0 );
4555 if ( aExp
|| aSig0
) {
4556 aSig0
|= LIT64( 0x4000000000000000 );
4559 return roundAndPackFloat64( aSign
, aExp
, aSig0
);
4566 -------------------------------------------------------------------------------
4567 Returns the result of converting the quadruple-precision floating-point
4568 value `a' to the extended double-precision floating-point format. The
4569 conversion is performed according to the IEC/IEEE Standard for Binary
4570 Floating-Point Arithmetic.
4571 -------------------------------------------------------------------------------
4573 floatx80
float128_to_floatx80( float128 a
)
4577 bits64 aSig0
, aSig1
;
4579 aSig1
= extractFloat128Frac1( a
);
4580 aSig0
= extractFloat128Frac0( a
);
4581 aExp
= extractFloat128Exp( a
);
4582 aSign
= extractFloat128Sign( a
);
4583 if ( aExp
== 0x7FFF ) {
4584 if ( aSig0
| aSig1
) {
4585 return commonNaNToFloatx80( float128ToCommonNaN( a
) );
4587 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4590 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4591 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4594 aSig0
|= LIT64( 0x0001000000000000 );
4596 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4597 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1
);
4604 -------------------------------------------------------------------------------
4605 Rounds the quadruple-precision floating-point value `a' to an integer, and
4606 returns the result as a quadruple-precision floating-point value. The
4607 operation is performed according to the IEC/IEEE Standard for Binary
4608 Floating-Point Arithmetic.
4609 -------------------------------------------------------------------------------
4611 float128
float128_round_to_int( float128 a
)
4615 bits64 lastBitMask
, roundBitsMask
;
4619 aExp
= extractFloat128Exp( a
);
4620 if ( 0x402F <= aExp
) {
4621 if ( 0x406F <= aExp
) {
4622 if ( ( aExp
== 0x7FFF )
4623 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4625 return propagateFloat128NaN( a
, a
);
4630 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4631 roundBitsMask
= lastBitMask
- 1;
4633 roundingMode
= float_rounding_mode
;
4634 if ( roundingMode
== float_round_nearest_even
) {
4635 if ( lastBitMask
) {
4636 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4637 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4640 if ( (sbits64
) z
.low
< 0 ) {
4642 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4646 else if ( roundingMode
!= float_round_to_zero
) {
4647 if ( extractFloat128Sign( z
)
4648 ^ ( roundingMode
== float_round_up
) ) {
4649 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4652 z
.low
&= ~ roundBitsMask
;
4655 if ( aExp
< 0x3FFF ) {
4656 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4657 float_exception_flags
|= float_flag_inexact
;
4658 aSign
= extractFloat128Sign( a
);
4659 switch ( float_rounding_mode
) {
4660 case float_round_nearest_even
:
4661 if ( ( aExp
== 0x3FFE )
4662 && ( extractFloat128Frac0( a
)
4663 | extractFloat128Frac1( a
) )
4665 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4668 case float_round_to_zero
:
4670 case float_round_down
:
4672 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4673 : packFloat128( 0, 0, 0, 0 );
4674 case float_round_up
:
4676 aSign
? packFloat128( 1, 0, 0, 0 )
4677 : packFloat128( 0, 0x3FFF, 0, 0 );
4679 return packFloat128( aSign
, 0, 0, 0 );
4682 lastBitMask
<<= 0x402F - aExp
;
4683 roundBitsMask
= lastBitMask
- 1;
4686 roundingMode
= float_rounding_mode
;
4687 if ( roundingMode
== float_round_nearest_even
) {
4688 z
.high
+= lastBitMask
>>1;
4689 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4690 z
.high
&= ~ lastBitMask
;
4693 else if ( roundingMode
!= float_round_to_zero
) {
4694 if ( extractFloat128Sign( z
)
4695 ^ ( roundingMode
== float_round_up
) ) {
4696 z
.high
|= ( a
.low
!= 0 );
4697 z
.high
+= roundBitsMask
;
4700 z
.high
&= ~ roundBitsMask
;
4702 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4703 float_exception_flags
|= float_flag_inexact
;
4710 -------------------------------------------------------------------------------
4711 Returns the result of adding the absolute values of the quadruple-precision
4712 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4713 before being returned. `zSign' is ignored if the result is a NaN.
4714 The addition is performed according to the IEC/IEEE Standard for Binary
4715 Floating-Point Arithmetic.
4716 -------------------------------------------------------------------------------
4718 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4720 int32 aExp
, bExp
, zExp
;
4721 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4724 aSig1
= extractFloat128Frac1( a
);
4725 aSig0
= extractFloat128Frac0( a
);
4726 aExp
= extractFloat128Exp( a
);
4727 bSig1
= extractFloat128Frac1( b
);
4728 bSig0
= extractFloat128Frac0( b
);
4729 bExp
= extractFloat128Exp( b
);
4730 expDiff
= aExp
- bExp
;
4731 if ( 0 < expDiff
) {
4732 if ( aExp
== 0x7FFF ) {
4733 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4740 bSig0
|= LIT64( 0x0001000000000000 );
4742 shift128ExtraRightJamming(
4743 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4746 else if ( expDiff
< 0 ) {
4747 if ( bExp
== 0x7FFF ) {
4748 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4749 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4755 aSig0
|= LIT64( 0x0001000000000000 );
4757 shift128ExtraRightJamming(
4758 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4762 if ( aExp
== 0x7FFF ) {
4763 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4764 return propagateFloat128NaN( a
, b
);
4768 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4769 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4771 zSig0
|= LIT64( 0x0002000000000000 );
4775 aSig0
|= LIT64( 0x0001000000000000 );
4776 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4778 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4781 shift128ExtraRightJamming(
4782 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4784 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4789 -------------------------------------------------------------------------------
4790 Returns the result of subtracting the absolute values of the quadruple-
4791 precision floating-point values `a' and `b'. If `zSign' is 1, the
4792 difference is negated before being returned. `zSign' is ignored if the
4793 result is a NaN. The subtraction is performed according to the IEC/IEEE
4794 Standard for Binary Floating-Point Arithmetic.
4795 -------------------------------------------------------------------------------
4797 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4799 int32 aExp
, bExp
, zExp
;
4800 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4804 aSig1
= extractFloat128Frac1( a
);
4805 aSig0
= extractFloat128Frac0( a
);
4806 aExp
= extractFloat128Exp( a
);
4807 bSig1
= extractFloat128Frac1( b
);
4808 bSig0
= extractFloat128Frac0( b
);
4809 bExp
= extractFloat128Exp( b
);
4810 expDiff
= aExp
- bExp
;
4811 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4812 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4813 if ( 0 < expDiff
) goto aExpBigger
;
4814 if ( expDiff
< 0 ) goto bExpBigger
;
4815 if ( aExp
== 0x7FFF ) {
4816 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4817 return propagateFloat128NaN( a
, b
);
4819 float_raise( float_flag_invalid
);
4820 z
.low
= float128_default_nan_low
;
4821 z
.high
= float128_default_nan_high
;
4828 if ( bSig0
< aSig0
) goto aBigger
;
4829 if ( aSig0
< bSig0
) goto bBigger
;
4830 if ( bSig1
< aSig1
) goto aBigger
;
4831 if ( aSig1
< bSig1
) goto bBigger
;
4832 return packFloat128( float_rounding_mode
== float_round_down
, 0, 0, 0 );
4834 if ( bExp
== 0x7FFF ) {
4835 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4836 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4842 aSig0
|= LIT64( 0x4000000000000000 );
4844 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4845 bSig0
|= LIT64( 0x4000000000000000 );
4847 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4850 goto normalizeRoundAndPack
;
4852 if ( aExp
== 0x7FFF ) {
4853 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4860 bSig0
|= LIT64( 0x4000000000000000 );
4862 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4863 aSig0
|= LIT64( 0x4000000000000000 );
4865 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4867 normalizeRoundAndPack
:
4869 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1
);
4874 -------------------------------------------------------------------------------
4875 Returns the result of adding the quadruple-precision floating-point values
4876 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4877 for Binary Floating-Point Arithmetic.
4878 -------------------------------------------------------------------------------
4880 float128
float128_add( float128 a
, float128 b
)
4884 aSign
= extractFloat128Sign( a
);
4885 bSign
= extractFloat128Sign( b
);
4886 if ( aSign
== bSign
) {
4887 return addFloat128Sigs( a
, b
, aSign
);
4890 return subFloat128Sigs( a
, b
, aSign
);
4896 -------------------------------------------------------------------------------
4897 Returns the result of subtracting the quadruple-precision floating-point
4898 values `a' and `b'. The operation is performed according to the IEC/IEEE
4899 Standard for Binary Floating-Point Arithmetic.
4900 -------------------------------------------------------------------------------
4902 float128
float128_sub( float128 a
, float128 b
)
4906 aSign
= extractFloat128Sign( a
);
4907 bSign
= extractFloat128Sign( b
);
4908 if ( aSign
== bSign
) {
4909 return subFloat128Sigs( a
, b
, aSign
);
4912 return addFloat128Sigs( a
, b
, aSign
);
4918 -------------------------------------------------------------------------------
4919 Returns the result of multiplying the quadruple-precision floating-point
4920 values `a' and `b'. The operation is performed according to the IEC/IEEE
4921 Standard for Binary Floating-Point Arithmetic.
4922 -------------------------------------------------------------------------------
4924 float128
float128_mul( float128 a
, float128 b
)
4926 flag aSign
, bSign
, zSign
;
4927 int32 aExp
, bExp
, zExp
;
4928 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4931 aSig1
= extractFloat128Frac1( a
);
4932 aSig0
= extractFloat128Frac0( a
);
4933 aExp
= extractFloat128Exp( a
);
4934 aSign
= extractFloat128Sign( a
);
4935 bSig1
= extractFloat128Frac1( b
);
4936 bSig0
= extractFloat128Frac0( b
);
4937 bExp
= extractFloat128Exp( b
);
4938 bSign
= extractFloat128Sign( b
);
4939 zSign
= aSign
^ bSign
;
4940 if ( aExp
== 0x7FFF ) {
4941 if ( ( aSig0
| aSig1
)
4942 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4943 return propagateFloat128NaN( a
, b
);
4945 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4946 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4948 if ( bExp
== 0x7FFF ) {
4949 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4950 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4952 float_raise( float_flag_invalid
);
4953 z
.low
= float128_default_nan_low
;
4954 z
.high
= float128_default_nan_high
;
4957 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4960 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4961 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4964 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4965 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4967 zExp
= aExp
+ bExp
- 0x4000;
4968 aSig0
|= LIT64( 0x0001000000000000 );
4969 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
4970 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
4971 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4972 zSig2
|= ( zSig3
!= 0 );
4973 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
4974 shift128ExtraRightJamming(
4975 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4978 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4983 -------------------------------------------------------------------------------
4984 Returns the result of dividing the quadruple-precision floating-point value
4985 `a' by the corresponding value `b'. The operation is performed according to
4986 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4987 -------------------------------------------------------------------------------
4989 float128
float128_div( float128 a
, float128 b
)
4991 flag aSign
, bSign
, zSign
;
4992 int32 aExp
, bExp
, zExp
;
4993 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4994 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4997 aSig1
= extractFloat128Frac1( a
);
4998 aSig0
= extractFloat128Frac0( a
);
4999 aExp
= extractFloat128Exp( a
);
5000 aSign
= extractFloat128Sign( a
);
5001 bSig1
= extractFloat128Frac1( b
);
5002 bSig0
= extractFloat128Frac0( b
);
5003 bExp
= extractFloat128Exp( b
);
5004 bSign
= extractFloat128Sign( b
);
5005 zSign
= aSign
^ bSign
;
5006 if ( aExp
== 0x7FFF ) {
5007 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
5008 if ( bExp
== 0x7FFF ) {
5009 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5012 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5014 if ( bExp
== 0x7FFF ) {
5015 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5016 return packFloat128( zSign
, 0, 0, 0 );
5019 if ( ( bSig0
| bSig1
) == 0 ) {
5020 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5022 float_raise( float_flag_invalid
);
5023 z
.low
= float128_default_nan_low
;
5024 z
.high
= float128_default_nan_high
;
5027 float_raise( float_flag_divbyzero
);
5028 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5030 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5033 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5034 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5036 zExp
= aExp
- bExp
+ 0x3FFD;
5038 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5040 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5041 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5042 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5045 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5046 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5047 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5048 while ( (sbits64
) rem0
< 0 ) {
5050 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5052 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5053 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5054 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5055 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5056 while ( (sbits64
) rem1
< 0 ) {
5058 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5060 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5062 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5063 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
5068 -------------------------------------------------------------------------------
5069 Returns the remainder of the quadruple-precision floating-point value `a'
5070 with respect to the corresponding value `b'. The operation is performed
5071 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5072 -------------------------------------------------------------------------------
5074 float128
float128_rem( float128 a
, float128 b
)
5076 flag aSign
, bSign
, zSign
;
5077 int32 aExp
, bExp
, expDiff
;
5078 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5079 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5083 aSig1
= extractFloat128Frac1( a
);
5084 aSig0
= extractFloat128Frac0( a
);
5085 aExp
= extractFloat128Exp( a
);
5086 aSign
= extractFloat128Sign( a
);
5087 bSig1
= extractFloat128Frac1( b
);
5088 bSig0
= extractFloat128Frac0( b
);
5089 bExp
= extractFloat128Exp( b
);
5090 bSign
= extractFloat128Sign( b
);
5091 if ( aExp
== 0x7FFF ) {
5092 if ( ( aSig0
| aSig1
)
5093 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5094 return propagateFloat128NaN( a
, b
);
5098 if ( bExp
== 0x7FFF ) {
5099 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5103 if ( ( bSig0
| bSig1
) == 0 ) {
5105 float_raise( float_flag_invalid
);
5106 z
.low
= float128_default_nan_low
;
5107 z
.high
= float128_default_nan_high
;
5110 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5113 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5114 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5116 expDiff
= aExp
- bExp
;
5117 if ( expDiff
< -1 ) return a
;
5119 aSig0
| LIT64( 0x0001000000000000 ),
5121 15 - ( expDiff
< 0 ),
5126 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5127 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5128 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5130 while ( 0 < expDiff
) {
5131 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5132 q
= ( 4 < q
) ? q
- 4 : 0;
5133 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5134 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5135 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5136 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5139 if ( -64 < expDiff
) {
5140 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5141 q
= ( 4 < q
) ? q
- 4 : 0;
5143 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5145 if ( expDiff
< 0 ) {
5146 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5149 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5151 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5152 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5155 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5156 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5159 alternateASig0
= aSig0
;
5160 alternateASig1
= aSig1
;
5162 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5163 } while ( 0 <= (sbits64
) aSig0
);
5165 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5166 if ( ( sigMean0
< 0 )
5167 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5168 aSig0
= alternateASig0
;
5169 aSig1
= alternateASig1
;
5171 zSign
= ( (sbits64
) aSig0
< 0 );
5172 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5174 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
5179 -------------------------------------------------------------------------------
5180 Returns the square root of the quadruple-precision floating-point value `a'.
5181 The operation is performed according to the IEC/IEEE Standard for Binary
5182 Floating-Point Arithmetic.
5183 -------------------------------------------------------------------------------
5185 float128
float128_sqrt( float128 a
)
5189 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5190 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5193 aSig1
= extractFloat128Frac1( a
);
5194 aSig0
= extractFloat128Frac0( a
);
5195 aExp
= extractFloat128Exp( a
);
5196 aSign
= extractFloat128Sign( a
);
5197 if ( aExp
== 0x7FFF ) {
5198 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a
);
5199 if ( ! aSign
) return a
;
5203 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5205 float_raise( float_flag_invalid
);
5206 z
.low
= float128_default_nan_low
;
5207 z
.high
= float128_default_nan_high
;
5211 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5212 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5214 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5215 aSig0
|= LIT64( 0x0001000000000000 );
5216 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5217 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5218 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5219 doubleZSig0
= zSig0
<<1;
5220 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5221 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5222 while ( (sbits64
) rem0
< 0 ) {
5225 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5227 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5228 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5229 if ( zSig1
== 0 ) zSig1
= 1;
5230 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5231 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5232 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5233 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5234 while ( (sbits64
) rem1
< 0 ) {
5236 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5238 term2
|= doubleZSig0
;
5239 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5241 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5243 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5244 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2
);
5249 -------------------------------------------------------------------------------
5250 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5251 the corresponding value `b', and 0 otherwise. The comparison is performed
5252 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5253 -------------------------------------------------------------------------------
5255 flag
float128_eq( float128 a
, float128 b
)
5258 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5259 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5260 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5261 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5263 if ( float128_is_signaling_nan( a
)
5264 || float128_is_signaling_nan( b
) ) {
5265 float_raise( float_flag_invalid
);
5271 && ( ( a
.high
== b
.high
)
5273 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5279 -------------------------------------------------------------------------------
5280 Returns 1 if the quadruple-precision floating-point value `a' is less than
5281 or equal to the corresponding value `b', and 0 otherwise. The comparison
5282 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5284 -------------------------------------------------------------------------------
5286 flag
float128_le( float128 a
, float128 b
)
5290 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5291 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5292 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5293 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5295 float_raise( float_flag_invalid
);
5298 aSign
= extractFloat128Sign( a
);
5299 bSign
= extractFloat128Sign( b
);
5300 if ( aSign
!= bSign
) {
5303 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5307 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5308 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5313 -------------------------------------------------------------------------------
5314 Returns 1 if the quadruple-precision floating-point value `a' is less than
5315 the corresponding value `b', and 0 otherwise. The comparison is performed
5316 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5317 -------------------------------------------------------------------------------
5319 flag
float128_lt( float128 a
, float128 b
)
5323 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5324 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5325 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5326 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5328 float_raise( float_flag_invalid
);
5331 aSign
= extractFloat128Sign( a
);
5332 bSign
= extractFloat128Sign( b
);
5333 if ( aSign
!= bSign
) {
5336 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5340 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5341 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5346 -------------------------------------------------------------------------------
5347 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5348 the corresponding value `b', and 0 otherwise. The invalid exception is
5349 raised if either operand is a NaN. Otherwise, the comparison is performed
5350 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5351 -------------------------------------------------------------------------------
5353 flag
float128_eq_signaling( float128 a
, float128 b
)
5356 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5357 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5358 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5359 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5361 float_raise( float_flag_invalid
);
5366 && ( ( a
.high
== b
.high
)
5368 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5374 -------------------------------------------------------------------------------
5375 Returns 1 if the quadruple-precision floating-point value `a' is less than
5376 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5377 cause an exception. Otherwise, the comparison is performed according to the
5378 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5379 -------------------------------------------------------------------------------
5381 flag
float128_le_quiet( float128 a
, float128 b
)
5385 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5386 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5387 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5388 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5390 if ( float128_is_signaling_nan( a
)
5391 || float128_is_signaling_nan( b
) ) {
5392 float_raise( float_flag_invalid
);
5396 aSign
= extractFloat128Sign( a
);
5397 bSign
= extractFloat128Sign( b
);
5398 if ( aSign
!= bSign
) {
5401 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5405 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5406 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5411 -------------------------------------------------------------------------------
5412 Returns 1 if the quadruple-precision floating-point value `a' is less than
5413 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5414 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5415 Standard for Binary Floating-Point Arithmetic.
5416 -------------------------------------------------------------------------------
5418 flag
float128_lt_quiet( float128 a
, float128 b
)
5422 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5423 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5424 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5425 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5427 if ( float128_is_signaling_nan( a
)
5428 || float128_is_signaling_nan( b
) ) {
5429 float_raise( float_flag_invalid
);
5433 aSign
= extractFloat128Sign( a
);
5434 bSign
= extractFloat128Sign( b
);
5435 if ( aSign
!= bSign
) {
5438 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5442 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5443 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5450 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5453 * These two routines are not part of the original softfloat distribution.
5455 * They are based on the corresponding conversions to integer but return
5456 * unsigned numbers instead since these functions are required by GCC.
5458 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5460 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5464 -------------------------------------------------------------------------------
5465 Returns the result of converting the double-precision floating-point value
5466 `a' to the 32-bit unsigned integer format. The conversion is
5467 performed according to the IEC/IEEE Standard for Binary Floating-point
5468 Arithmetic, except that the conversion is always rounded toward zero. If
5469 `a' is a NaN, the largest positive integer is returned. If the conversion
5470 overflows, the largest integer positive is returned.
5471 -------------------------------------------------------------------------------
5473 uint32
float64_to_uint32_round_to_zero( float64 a
)
5476 int16 aExp
, shiftCount
;
5477 bits64 aSig
, savedASig
;
5480 aSig
= extractFloat64Frac( a
);
5481 aExp
= extractFloat64Exp( a
);
5482 aSign
= extractFloat64Sign( a
);
5485 float_raise( float_flag_invalid
);
5489 if ( 0x41E < aExp
) {
5490 float_raise( float_flag_invalid
);
5493 else if ( aExp
< 0x3FF ) {
5494 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
5497 aSig
|= LIT64( 0x0010000000000000 );
5498 shiftCount
= 0x433 - aExp
;
5500 aSig
>>= shiftCount
;
5502 if ( ( aSig
<<shiftCount
) != savedASig
) {
5503 float_exception_flags
|= float_flag_inexact
;
5510 -------------------------------------------------------------------------------
5511 Returns the result of converting the single-precision floating-point value
5512 `a' to the 32-bit unsigned integer format. The conversion is
5513 performed according to the IEC/IEEE Standard for Binary Floating-point
5514 Arithmetic, except that the conversion is always rounded toward zero. If
5515 `a' is a NaN, the largest positive integer is returned. If the conversion
5516 overflows, the largest positive integer is returned.
5517 -------------------------------------------------------------------------------
5519 uint32
float32_to_uint32_round_to_zero( float32 a
)
5522 int16 aExp
, shiftCount
;
5526 aSig
= extractFloat32Frac( a
);
5527 aExp
= extractFloat32Exp( a
);
5528 aSign
= extractFloat32Sign( a
);
5529 shiftCount
= aExp
- 0x9E;
5532 float_raise( float_flag_invalid
);
5535 if ( 0 < shiftCount
) {
5536 float_raise( float_flag_invalid
);
5539 else if ( aExp
<= 0x7E ) {
5540 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
5543 aSig
= ( aSig
| 0x800000 )<<8;
5544 z
= aSig
>>( - shiftCount
);
5545 if ( aSig
<<( shiftCount
& 31 ) ) {
5546 float_exception_flags
|= float_flag_inexact
;