1 /* $NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 #include <sys/cdefs.h>
48 #if defined(LIBC_SCCS) && !defined(lint)
49 __RCSID("$NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $");
50 #endif /* LIBC_SCCS and not lint */
52 #ifdef SOFTFLOAT_FOR_GCC
53 #include "softfloat-for-gcc.h"
57 #include "softfloat.h"
60 * Conversions between floats as stored in memory and floats as
63 #ifndef FLOAT64_DEMANGLE
64 #define FLOAT64_DEMANGLE(a) (a)
66 #ifndef FLOAT64_MANGLE
67 #define FLOAT64_MANGLE(a) (a)
71 -------------------------------------------------------------------------------
72 Floating-point rounding mode, extended double-precision rounding precision,
74 -------------------------------------------------------------------------------
76 #ifndef set_float_rounding_mode
77 fp_rnd float_rounding_mode
= float_round_nearest_even
;
78 fp_except float_exception_flags
= 0;
80 #ifndef set_float_exception_inexact_flag
81 #define set_float_exception_inexact_flag() \
82 ((void)(float_exception_flags |= float_flag_inexact))
85 int8 floatx80_rounding_precision
= 80;
89 -------------------------------------------------------------------------------
90 Primitive arithmetic functions, including multi-word arithmetic, and
91 division and square root approximations. (Can be specialized to target if
93 -------------------------------------------------------------------------------
95 #include "softfloat-macros"
98 -------------------------------------------------------------------------------
99 Functions and definitions to determine: (1) whether tininess for underflow
100 is detected before or after rounding by default, (2) what (if anything)
101 happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 are propagated from function inputs to output. These details are target-
105 -------------------------------------------------------------------------------
107 #include "softfloat-specialize"
109 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
111 -------------------------------------------------------------------------------
112 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
113 and 7, and returns the properly rounded 32-bit integer corresponding to the
114 input. If `zSign' is 1, the input is negated before being converted to an
115 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
116 is simply rounded to an integer, with the inexact exception raised if the
117 input cannot be represented exactly as an integer. However, if the fixed-
118 point input is too large, the invalid exception is raised and the largest
119 positive or negative integer is returned.
120 -------------------------------------------------------------------------------
122 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
125 flag roundNearestEven
;
126 int8 roundIncrement
, roundBits
;
129 roundingMode
= float_rounding_mode
;
130 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
131 roundIncrement
= 0x40;
132 if ( ! roundNearestEven
) {
133 if ( roundingMode
== float_round_to_zero
) {
137 roundIncrement
= 0x7F;
139 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
142 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
146 roundBits
= (int8
)(absZ
& 0x7F);
147 absZ
= ( absZ
+ roundIncrement
)>>7;
148 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
150 if ( zSign
) z
= - z
;
151 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
152 float_raise( float_flag_invalid
);
153 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
155 if ( roundBits
) set_float_exception_inexact_flag();
161 -------------------------------------------------------------------------------
162 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
163 `absZ1', with binary point between bits 63 and 64 (between the input words),
164 and returns the properly rounded 64-bit integer corresponding to the input.
165 If `zSign' is 1, the input is negated before being converted to an integer.
166 Ordinarily, the fixed-point input is simply rounded to an integer, with
167 the inexact exception raised if the input cannot be represented exactly as
168 an integer. However, if the fixed-point input is too large, the invalid
169 exception is raised and the largest positive or negative integer is
171 -------------------------------------------------------------------------------
173 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1
)
176 flag roundNearestEven
, increment
;
179 roundingMode
= float_rounding_mode
;
180 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
181 increment
= ( (sbits64
) absZ1
< 0 );
182 if ( ! roundNearestEven
) {
183 if ( roundingMode
== float_round_to_zero
) {
188 increment
= ( roundingMode
== float_round_down
) && absZ1
;
191 increment
= ( roundingMode
== float_round_up
) && absZ1
;
197 if ( absZ0
== 0 ) goto overflow
;
198 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
201 if ( zSign
) z
= - z
;
202 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
204 float_raise( float_flag_invalid
);
206 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
207 : LIT64( 0x7FFFFFFFFFFFFFFF );
209 if ( absZ1
) set_float_exception_inexact_flag();
216 -------------------------------------------------------------------------------
217 Returns the fraction bits of the single-precision floating-point value `a'.
218 -------------------------------------------------------------------------------
220 INLINE bits32
extractFloat32Frac( float32 a
)
223 return a
& 0x007FFFFF;
228 -------------------------------------------------------------------------------
229 Returns the exponent bits of the single-precision floating-point value `a'.
230 -------------------------------------------------------------------------------
232 INLINE int16
extractFloat32Exp( float32 a
)
235 return ( a
>>23 ) & 0xFF;
240 -------------------------------------------------------------------------------
241 Returns the sign bit of the single-precision floating-point value `a'.
242 -------------------------------------------------------------------------------
244 INLINE flag
extractFloat32Sign( float32 a
)
252 -------------------------------------------------------------------------------
253 Normalizes the subnormal single-precision floating-point value represented
254 by the denormalized significand `aSig'. The normalized exponent and
255 significand are stored at the locations pointed to by `zExpPtr' and
256 `zSigPtr', respectively.
257 -------------------------------------------------------------------------------
260 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
264 shiftCount
= countLeadingZeros32( aSig
) - 8;
265 *zSigPtr
= aSig
<<shiftCount
;
266 *zExpPtr
= 1 - shiftCount
;
271 -------------------------------------------------------------------------------
272 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
273 single-precision floating-point value, returning the result. After being
274 shifted into the proper positions, the three fields are simply added
275 together to form the result. This means that any integer portion of `zSig'
276 will be added into the exponent. Since a properly normalized significand
277 will have an integer portion equal to 1, the `zExp' input should be 1 less
278 than the desired result exponent whenever `zSig' is a complete, normalized
280 -------------------------------------------------------------------------------
282 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
285 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
290 -------------------------------------------------------------------------------
291 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
292 and significand `zSig', and returns the proper single-precision floating-
293 point value corresponding to the abstract input. Ordinarily, the abstract
294 value is simply rounded and packed into the single-precision format, with
295 the inexact exception raised if the abstract input cannot be represented
296 exactly. However, if the abstract value is too large, the overflow and
297 inexact exceptions are raised and an infinity or maximal finite value is
298 returned. If the abstract value is too small, the input value is rounded to
299 a subnormal number, and the underflow and inexact exceptions are raised if
300 the abstract input cannot be represented exactly as a subnormal single-
301 precision floating-point number.
302 The input significand `zSig' has its binary point between bits 30
303 and 29, which is 7 bits to the left of the usual location. This shifted
304 significand must be normalized or smaller. If `zSig' is not normalized,
305 `zExp' must be 0; in that case, the result returned is a subnormal number,
306 and it must not require rounding. In the usual case that `zSig' is
307 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
308 The handling of underflow and overflow follows the IEC/IEEE Standard for
309 Binary Floating-Point Arithmetic.
310 -------------------------------------------------------------------------------
312 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
315 flag roundNearestEven
;
316 int8 roundIncrement
, roundBits
;
319 roundingMode
= float_rounding_mode
;
320 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
321 roundIncrement
= 0x40;
322 if ( ! roundNearestEven
) {
323 if ( roundingMode
== float_round_to_zero
) {
327 roundIncrement
= 0x7F;
329 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
332 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
336 roundBits
= zSig
& 0x7F;
337 if ( 0xFD <= (bits16
) zExp
) {
339 || ( ( zExp
== 0xFD )
340 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
342 float_raise( float_flag_overflow
| float_flag_inexact
);
343 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
347 ( float_detect_tininess
== float_tininess_before_rounding
)
349 || ( zSig
+ roundIncrement
< 0x80000000U
);
350 shift32RightJamming( zSig
, - zExp
, &zSig
);
352 roundBits
= zSig
& 0x7F;
353 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
356 if ( roundBits
) set_float_exception_inexact_flag();
357 zSig
= ( zSig
+ roundIncrement
)>>7;
358 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
359 if ( zSig
== 0 ) zExp
= 0;
360 return packFloat32( zSign
, zExp
, zSig
);
365 -------------------------------------------------------------------------------
366 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
367 and significand `zSig', and returns the proper single-precision floating-
368 point value corresponding to the abstract input. This routine is just like
369 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
370 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
371 floating-point exponent.
372 -------------------------------------------------------------------------------
375 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
379 shiftCount
= countLeadingZeros32( zSig
) - 1;
380 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
385 -------------------------------------------------------------------------------
386 Returns the fraction bits of the double-precision floating-point value `a'.
387 -------------------------------------------------------------------------------
389 INLINE bits64
extractFloat64Frac( float64 a
)
392 return FLOAT64_DEMANGLE(a
) & LIT64( 0x000FFFFFFFFFFFFF );
397 -------------------------------------------------------------------------------
398 Returns the exponent bits of the double-precision floating-point value `a'.
399 -------------------------------------------------------------------------------
401 INLINE int16
extractFloat64Exp( float64 a
)
404 return (int16
)((FLOAT64_DEMANGLE(a
) >> 52) & 0x7FF);
409 -------------------------------------------------------------------------------
410 Returns the sign bit of the double-precision floating-point value `a'.
411 -------------------------------------------------------------------------------
413 INLINE flag
extractFloat64Sign( float64 a
)
416 return (flag
)(FLOAT64_DEMANGLE(a
) >> 63);
421 -------------------------------------------------------------------------------
422 Normalizes the subnormal double-precision floating-point value represented
423 by the denormalized significand `aSig'. The normalized exponent and
424 significand are stored at the locations pointed to by `zExpPtr' and
425 `zSigPtr', respectively.
426 -------------------------------------------------------------------------------
429 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
433 shiftCount
= countLeadingZeros64( aSig
) - 11;
434 *zSigPtr
= aSig
<<shiftCount
;
435 *zExpPtr
= 1 - shiftCount
;
440 -------------------------------------------------------------------------------
441 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
442 double-precision floating-point value, returning the result. After being
443 shifted into the proper positions, the three fields are simply added
444 together to form the result. This means that any integer portion of `zSig'
445 will be added into the exponent. Since a properly normalized significand
446 will have an integer portion equal to 1, the `zExp' input should be 1 less
447 than the desired result exponent whenever `zSig' is a complete, normalized
449 -------------------------------------------------------------------------------
451 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
454 return FLOAT64_MANGLE( ( ( (bits64
) zSign
)<<63 ) +
455 ( ( (bits64
) zExp
)<<52 ) + zSig
);
460 -------------------------------------------------------------------------------
461 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
462 and significand `zSig', and returns the proper double-precision floating-
463 point value corresponding to the abstract input. Ordinarily, the abstract
464 value is simply rounded and packed into the double-precision format, with
465 the inexact exception raised if the abstract input cannot be represented
466 exactly. However, if the abstract value is too large, the overflow and
467 inexact exceptions are raised and an infinity or maximal finite value is
468 returned. If the abstract value is too small, the input value is rounded to
469 a subnormal number, and the underflow and inexact exceptions are raised if
470 the abstract input cannot be represented exactly as a subnormal double-
471 precision floating-point number.
472 The input significand `zSig' has its binary point between bits 62
473 and 61, which is 10 bits to the left of the usual location. This shifted
474 significand must be normalized or smaller. If `zSig' is not normalized,
475 `zExp' must be 0; in that case, the result returned is a subnormal number,
476 and it must not require rounding. In the usual case that `zSig' is
477 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
478 The handling of underflow and overflow follows the IEC/IEEE Standard for
479 Binary Floating-Point Arithmetic.
480 -------------------------------------------------------------------------------
482 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
485 flag roundNearestEven
;
486 int16 roundIncrement
, roundBits
;
489 roundingMode
= float_rounding_mode
;
490 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
491 roundIncrement
= 0x200;
492 if ( ! roundNearestEven
) {
493 if ( roundingMode
== float_round_to_zero
) {
497 roundIncrement
= 0x3FF;
499 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
502 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
506 roundBits
= (int16
)(zSig
& 0x3FF);
507 if ( 0x7FD <= (bits16
) zExp
) {
508 if ( ( 0x7FD < zExp
)
509 || ( ( zExp
== 0x7FD )
510 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
512 float_raise( float_flag_overflow
| float_flag_inexact
);
513 return FLOAT64_MANGLE(
514 FLOAT64_DEMANGLE(packFloat64( zSign
, 0x7FF, 0 )) -
515 ( roundIncrement
== 0 ));
519 ( float_detect_tininess
== float_tininess_before_rounding
)
521 || ( zSig
+ roundIncrement
< (bits64
)LIT64( 0x8000000000000000 ) );
522 shift64RightJamming( zSig
, - zExp
, &zSig
);
524 roundBits
= (int16
)(zSig
& 0x3FF);
525 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
528 if ( roundBits
) set_float_exception_inexact_flag();
529 zSig
= ( zSig
+ roundIncrement
)>>10;
530 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
531 if ( zSig
== 0 ) zExp
= 0;
532 return packFloat64( zSign
, zExp
, zSig
);
537 -------------------------------------------------------------------------------
538 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 and significand `zSig', and returns the proper double-precision floating-
540 point value corresponding to the abstract input. This routine is just like
541 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 floating-point exponent.
544 -------------------------------------------------------------------------------
547 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
551 shiftCount
= countLeadingZeros64( zSig
) - 1;
552 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
/* NOTE(review): the FLOATX80 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Returns the fraction bits of the extended double-precision floating-point
value `a'.
-------------------------------------------------------------------------------
*/
INLINE bits64 extractFloatx80Frac( floatx80 a )
{

    /* The significand is the `low' word, as stored by `packFloatx80'. */
    return a.low;

}
#endif /* FLOATX80 */
/* NOTE(review): the FLOATX80 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Returns the exponent bits of the extended double-precision floating-point
value `a', i.e. the low 15 bits of its `high' word.
-------------------------------------------------------------------------------
*/
INLINE int32 extractFloatx80Exp( floatx80 a )
{

    return a.high & 0x7FFF;

}
#endif /* FLOATX80 */
/* NOTE(review): the FLOATX80 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Returns the sign bit of the extended double-precision floating-point value
`a'.
-------------------------------------------------------------------------------
*/
INLINE flag extractFloatx80Sign( floatx80 a )
{

    /* The sign is bit 15 of the `high' word, matching `packFloatx80'. */
    return a.high>>15;

}
#endif /* FLOATX80 */
/* NOTE(review): the FLOATX80 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Normalizes the subnormal extended double-precision floating-point value
represented by the denormalized significand `aSig'.  The normalized exponent
and significand are stored at the locations pointed to by `zExpPtr' and
`zSigPtr', respectively.
-------------------------------------------------------------------------------
*/
static void
 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
{
    int8 shiftCount;

    /* Shift the leading 1 all the way up to bit 63. */
    shiftCount = countLeadingZeros64( aSig );
    *zSigPtr = aSig<<shiftCount;
    *zExpPtr = 1 - shiftCount;

}
#endif /* FLOATX80 */
/* NOTE(review): the FLOATX80 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
extended double-precision floating-point value, returning the result.
-------------------------------------------------------------------------------
*/
INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
{
    floatx80 z;

    z.low = zSig;
    z.high = ( ( (bits16) zSign )<<15 ) + zExp;
    return z;

}
#endif /* FLOATX80 */
/* NOTE(review): the FLOATX80 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent `zExp',
and extended significand formed by the concatenation of `zSig0' and `zSig1',
and returns the proper extended double-precision floating-point value
corresponding to the abstract input.  Ordinarily, the abstract value is
rounded and packed into the extended double-precision format, with the
inexact exception raised if the abstract input cannot be represented
exactly.  However, if the abstract value is too large, the overflow and
inexact exceptions are raised and an infinity or maximal finite value is
returned.  If the abstract value is too small, the input value is rounded to
a subnormal number, and the underflow and inexact exceptions are raised if
the abstract input cannot be represented exactly as a subnormal extended
double-precision floating-point number.
    If `roundingPrecision' is 32 or 64, the result is rounded to the same
number of bits as single or double precision, respectively.  Otherwise, the
result is rounded to the full precision of the extended double-precision
format.
    The input significand must be normalized or smaller.  If the input
significand is not normalized, `zExp' must be 0; in that case, the result
returned is a subnormal number, and it must not require rounding.  The
handling of underflow and overflow follows the IEC/IEEE Standard for Binary
Floating-Point Arithmetic.
-------------------------------------------------------------------------------
*/
static floatx80
 roundAndPackFloatx80(
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 )
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64 roundIncrement, roundMask, roundBits;

    roundingMode = float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /*
     * Reduced precision: every bit to be discarded lives in `zSig0'
     * under `roundMask'; fold `zSig1' in as a sticky bit.
     */
    zSig0 |= ( zSig1 != 0 );
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            roundIncrement = 0;
        }
        else {
            roundIncrement = roundMask;
            if ( zSign ) {
                if ( roundingMode == float_round_up ) roundIncrement = 0;
            }
            else {
                if ( roundingMode == float_round_down ) roundIncrement = 0;
            }
        }
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned compare catches zExp >= 0x7FFE and zExp <= 0. */
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            isTiny =
                   ( float_detect_tininess == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if ( isTiny && roundBits ) float_raise( float_flag_underflow );
            if ( roundBits ) set_float_exception_inexact_flag();
            zSig0 += roundIncrement;
            /* Rounding carried into bit 63: exponent becomes 1. */
            if ( (sbits64) zSig0 < 0 ) zExp = 1;
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                /* Exact tie: also clear the lowest kept bit. */
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( roundBits ) set_float_exception_inexact_flag();
    zSig0 += roundIncrement;
    if ( zSig0 < roundIncrement ) {
        /* The significand wrapped: renormalize to 1.0 * 2^(zExp+1). */
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        /* Exact tie: also clear the lowest kept bit. */
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full precision: `zSig0' is the result significand and `zSig1'
     * holds the bits below it.
     */
    increment = ( (sbits64) zSig1 < 0 );
    if ( ! roundNearestEven ) {
        if ( roundingMode == float_round_to_zero ) {
            increment = 0;
        }
        else {
            if ( zSign ) {
                increment = ( roundingMode == float_round_down ) && zSig1;
            }
            else {
                increment = ( roundingMode == float_round_up ) && zSig1;
            }
        }
    }
    if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            roundMask = 0;
 overflow:
            float_raise( float_flag_overflow | float_flag_inexact );
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                /* Rounding points toward zero here: return the largest
                   finite value at the active precision. */
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            isTiny =
                   ( float_detect_tininess == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
            if ( zSig1 ) set_float_exception_inexact_flag();
            /* Re-derive the increment from the shifted low word. */
            if ( roundNearestEven ) {
                increment = ( (sbits64) zSig1 < 0 );
            }
            else {
                if ( zSign ) {
                    increment = ( roundingMode == float_round_down ) && zSig1;
                }
                else {
                    increment = ( roundingMode == float_round_up ) && zSig1;
                }
            }
            if ( increment ) {
                ++zSig0;
                zSig0 &=
                    ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                if ( (sbits64) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if ( zSig1 ) set_float_exception_inexact_flag();
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The significand wrapped: renormalize. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
#endif /* FLOATX80 */
/* NOTE(review): the FLOATX80 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOATX80
/*
-------------------------------------------------------------------------------
Takes an abstract floating-point value having sign `zSign', exponent
`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
and returns the proper extended double-precision floating-point value
corresponding to the abstract input.  This routine is just like
`roundAndPackFloatx80' except that the input significand does not have to be
normalized.
-------------------------------------------------------------------------------
*/
static floatx80
 normalizeRoundAndPackFloatx80(
     int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 )
{
    int8 shiftCount;

    /* An empty high word is a shift by a whole 64-bit word. */
    if ( zSig0 == 0 ) {
        zSig0 = zSig1;
        zSig1 = 0;
        zExp -= 64;
    }
    shiftCount = countLeadingZeros64( zSig0 );
    shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
    zExp -= shiftCount;
    return
        roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );

}
#endif /* FLOATX80 */
/* NOTE(review): the FLOAT128 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the least-significant 64 fraction bits of the quadruple-precision
floating-point value `a'.
-------------------------------------------------------------------------------
*/
INLINE bits64 extractFloat128Frac1( float128 a )
{

    /* These bits are the `low' word, as stored by `packFloat128'. */
    return a.low;

}
#endif /* FLOAT128 */
/* NOTE(review): the FLOAT128 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the most-significant 48 fraction bits of the quadruple-precision
floating-point value `a', i.e. the low 48 bits of its `high' word.
-------------------------------------------------------------------------------
*/
INLINE bits64 extractFloat128Frac0( float128 a )
{

    return a.high & LIT64( 0x0000FFFFFFFFFFFF );

}
#endif /* FLOAT128 */
/* NOTE(review): the FLOAT128 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the exponent bits of the quadruple-precision floating-point value
`a', i.e. the 15-bit field starting at bit 48 of its `high' word.
-------------------------------------------------------------------------------
*/
INLINE int32 extractFloat128Exp( float128 a )
{

    return (int32)((a.high >> 48) & 0x7FFF);

}
#endif /* FLOAT128 */
/* NOTE(review): the FLOAT128 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Returns the sign bit of the quadruple-precision floating-point value `a',
i.e. bit 63 of its `high' word.
-------------------------------------------------------------------------------
*/
INLINE flag extractFloat128Sign( float128 a )
{

    return (flag)(a.high >> 63);

}
#endif /* FLOAT128 */
/* NOTE(review): the FLOAT128 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Normalizes the subnormal quadruple-precision floating-point value
represented by the denormalized significand formed by the concatenation of
`aSig0' and `aSig1'.  The normalized exponent is stored at the location
pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
significand are stored at the location pointed to by `zSig0Ptr', and the
least significant 64 bits of the normalized significand are stored at the
location pointed to by `zSig1Ptr'.
-------------------------------------------------------------------------------
*/
static void
 normalizeFloat128Subnormal(
     bits64 aSig0,
     bits64 aSig1,
     int32 *zExpPtr,
     bits64 *zSig0Ptr,
     bits64 *zSig1Ptr
 )
{
    int8 shiftCount;

    if ( aSig0 == 0 ) {
        /* All significant bits are in the low word.  The target is 15
           leading zeros in the high output word. */
        shiftCount = countLeadingZeros64( aSig1 ) - 15;
        if ( shiftCount < 0 ) {
            /* Fewer than 15 leading zeros: the value straddles both
               output words. */
            *zSig0Ptr = aSig1>>( - shiftCount );
            *zSig1Ptr = aSig1<<( shiftCount & 63 );
        }
        else {
            *zSig0Ptr = aSig1<<shiftCount;
            *zSig1Ptr = 0;
        }
        *zExpPtr = - shiftCount - 63;
    }
    else {
        shiftCount = countLeadingZeros64( aSig0 ) - 15;
        shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
        *zExpPtr = 1 - shiftCount;
    }

}
#endif /* FLOAT128 */
/* NOTE(review): the FLOAT128 guard is not visible in this extract; it is
   assumed here so the definition drops out when the type is absent. */
#ifdef FLOAT128
/*
-------------------------------------------------------------------------------
Packs the sign `zSign', the exponent `zExp', and the significand formed
by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
floating-point value, returning the result.  After being shifted into the
proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
added together to form the most significant 64 bits of the result.  This
means that any integer portion of `zSig0' will be added into the exponent.
Since a properly normalized significand will have an integer portion equal
to 1, the `zExp' input should be 1 less than the desired result exponent
whenever `zSig0' and `zSig1' concatenated form a complete, normalized
significand.
-------------------------------------------------------------------------------
*/
INLINE float128
 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
{
    float128 z;

    z.low = zSig1;
    z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
    return z;

}
#endif /* FLOAT128 */
969 -------------------------------------------------------------------------------
970 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
971 and extended significand formed by the concatenation of `zSig0', `zSig1',
972 and `zSig2', and returns the proper quadruple-precision floating-point value
973 corresponding to the abstract input. Ordinarily, the abstract value is
974 simply rounded and packed into the quadruple-precision format, with the
975 inexact exception raised if the abstract input cannot be represented
976 exactly. However, if the abstract value is too large, the overflow and
977 inexact exceptions are raised and an infinity or maximal finite value is
978 returned. If the abstract value is too small, the input value is rounded to
979 a subnormal number, and the underflow and inexact exceptions are raised if
980 the abstract input cannot be represented exactly as a subnormal quadruple-
981 precision floating-point number.
982 The input significand must be normalized or smaller. If the input
983 significand is not normalized, `zExp' must be 0; in that case, the result
984 returned is a subnormal number, and it must not require rounding. In the
985 usual case that the input significand is normalized, `zExp' must be 1 less
986 than the ``true'' floating-point exponent. The handling of underflow and
987 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
988 -------------------------------------------------------------------------------
991 roundAndPackFloat128(
992 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2
)
995 flag roundNearestEven
, increment
, isTiny
;
997 roundingMode
= float_rounding_mode
;
998 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
999 increment
= ( (sbits64
) zSig2
< 0 );
1000 if ( ! roundNearestEven
) {
1001 if ( roundingMode
== float_round_to_zero
) {
1006 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1009 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1013 if ( 0x7FFD <= (bits32
) zExp
) {
1014 if ( ( 0x7FFD < zExp
)
1015 || ( ( zExp
== 0x7FFD )
1017 LIT64( 0x0001FFFFFFFFFFFF ),
1018 LIT64( 0xFFFFFFFFFFFFFFFF ),
1025 float_raise( float_flag_overflow
| float_flag_inexact
);
1026 if ( ( roundingMode
== float_round_to_zero
)
1027 || ( zSign
&& ( roundingMode
== float_round_up
) )
1028 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1034 LIT64( 0x0000FFFFFFFFFFFF ),
1035 LIT64( 0xFFFFFFFFFFFFFFFF )
1038 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1042 ( float_detect_tininess
== float_tininess_before_rounding
)
1048 LIT64( 0x0001FFFFFFFFFFFF ),
1049 LIT64( 0xFFFFFFFFFFFFFFFF )
1051 shift128ExtraRightJamming(
1052 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1054 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
1055 if ( roundNearestEven
) {
1056 increment
= ( (sbits64
) zSig2
< 0 );
1060 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1063 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1068 if ( zSig2
) set_float_exception_inexact_flag();
1070 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1071 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1074 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1076 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1081 -------------------------------------------------------------------------------
1082 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1083 and significand formed by the concatenation of `zSig0' and `zSig1', and
1084 returns the proper quadruple-precision floating-point value corresponding
1085 to the abstract input. This routine is just like `roundAndPackFloat128'
1086 except that the input significand has fewer bits and does not have to be
1087 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-point exponent.
1089 -------------------------------------------------------------------------------
1092 normalizeRoundAndPackFloat128(
1093 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
1103 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1104 if ( 0 <= shiftCount
) {
1106 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1109 shift128ExtraRightJamming(
1110 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1113 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1120 -------------------------------------------------------------------------------
1121 Returns the result of converting the 32-bit two's complement integer `a'
1122 to the single-precision floating-point format. The conversion is performed
1123 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1124 -------------------------------------------------------------------------------
1126 float32
int32_to_float32( int32 a
)
1130 if ( a
== 0 ) return 0;
1131 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1133 return normalizeRoundAndPackFloat32(zSign
, 0x9C, (uint32
)(zSign
? - a
: a
));
1137 float32
uint32_to_float32( uint32 a
)
1139 if ( a
== 0 ) return 0;
1140 if ( a
& (bits32
) 0x80000000 )
1141 return normalizeRoundAndPackFloat32( 0, 0x9D, a
>> 1 );
1142 return normalizeRoundAndPackFloat32( 0, 0x9C, a
);
1147 -------------------------------------------------------------------------------
1148 Returns the result of converting the 32-bit two's complement integer `a'
1149 to the double-precision floating-point format. The conversion is performed
1150 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151 -------------------------------------------------------------------------------
1153 float64
int32_to_float64( int32 a
)
1160 if ( a
== 0 ) return 0;
1162 absA
= zSign
? - a
: a
;
1163 shiftCount
= countLeadingZeros32( absA
) + 21;
1165 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1169 float64
uint32_to_float64( uint32 a
)
1174 if ( a
== 0 ) return 0;
1175 shiftCount
= countLeadingZeros32( a
) + 21;
1176 return packFloat64( 0, 0x432 - shiftCount
, zSig
<<shiftCount
);
1183 -------------------------------------------------------------------------------
1184 Returns the result of converting the 32-bit two's complement integer `a'
1185 to the extended double-precision floating-point format. The conversion
1186 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1188 -------------------------------------------------------------------------------
1190 floatx80
int32_to_floatx80( int32 a
)
1197 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1199 absA
= zSign
? - a
: a
;
1200 shiftCount
= countLeadingZeros32( absA
) + 32;
1202 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1206 floatx80
uint32_to_floatx80( uint32 a
)
1211 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1212 shiftCount
= countLeadingZeros32( a
) + 32;
1213 return packFloatx80( 0, 0x403E - shiftCount
, zSig
<<shiftCount
);
1222 -------------------------------------------------------------------------------
1223 Returns the result of converting the 32-bit two's complement integer `a' to
1224 the quadruple-precision floating-point format. The conversion is performed
1225 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1226 -------------------------------------------------------------------------------
1228 float128
int32_to_float128( int32 a
)
1235 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1237 absA
= zSign
? - a
: a
;
1238 shiftCount
= countLeadingZeros32( absA
) + 17;
1240 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1244 float128
uint32_to_float128( uint32 a
)
1249 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1250 shiftCount
= countLeadingZeros32( a
) + 17;
1251 return packFloat128( 0, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1257 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1259 -------------------------------------------------------------------------------
1260 Returns the result of converting the 64-bit two's complement integer `a'
1261 to the single-precision floating-point format. The conversion is performed
1262 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1263 -------------------------------------------------------------------------------
1265 float32
int64_to_float32( int64 a
)
1271 if ( a
== 0 ) return 0;
1273 absA
= zSign
? - a
: a
;
1274 shiftCount
= countLeadingZeros64( absA
) - 40;
1275 if ( 0 <= shiftCount
) {
1276 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1280 if ( shiftCount
< 0 ) {
1281 shift64RightJamming( absA
, - shiftCount
, &absA
);
1284 absA
<<= shiftCount
;
1286 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA
);
1292 -------------------------------------------------------------------------------
1293 Returns the result of converting the 64-bit two's complement integer `a'
1294 to the double-precision floating-point format. The conversion is performed
1295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 -------------------------------------------------------------------------------
1298 float64
int64_to_float64( int64 a
)
1302 if ( a
== 0 ) return 0;
1303 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1304 return packFloat64( 1, 0x43E, 0 );
1307 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a
);
1314 -------------------------------------------------------------------------------
1315 Returns the result of converting the 64-bit two's complement integer `a'
1316 to the extended double-precision floating-point format. The conversion
1317 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1319 -------------------------------------------------------------------------------
1321 floatx80
int64_to_floatx80( int64 a
)
1327 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1329 absA
= zSign
? - a
: a
;
1330 shiftCount
= countLeadingZeros64( absA
);
1331 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1337 #endif /* !SOFTFLOAT_FOR_GCC */
1342 -------------------------------------------------------------------------------
1343 Returns the result of converting the 64-bit two's complement integer `a' to
1344 the quadruple-precision floating-point format. The conversion is performed
1345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1346 -------------------------------------------------------------------------------
1348 float128
int64_to_float128( int64 a
)
1354 bits64 zSig0
, zSig1
;
1356 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1358 absA
= zSign
? - a
: a
;
1359 shiftCount
= countLeadingZeros64( absA
) + 49;
1360 zExp
= 0x406E - shiftCount
;
1361 if ( 64 <= shiftCount
) {
1370 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1371 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1377 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1379 -------------------------------------------------------------------------------
1380 Returns the result of converting the single-precision floating-point value
1381 `a' to the 32-bit two's complement integer format. The conversion is
1382 performed according to the IEC/IEEE Standard for Binary Floating-Point
1383 Arithmetic---which means in particular that the conversion is rounded
1384 according to the current rounding mode. If `a' is a NaN, the largest
1385 positive integer is returned. Otherwise, if the conversion overflows, the
1386 largest integer with the same sign as `a' is returned.
1387 -------------------------------------------------------------------------------
1389 int32
float32_to_int32( float32 a
)
1392 int16 aExp
, shiftCount
;
1396 aSig
= extractFloat32Frac( a
);
1397 aExp
= extractFloat32Exp( a
);
1398 aSign
= extractFloat32Sign( a
);
1399 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1400 if ( aExp
) aSig
|= 0x00800000;
1401 shiftCount
= 0xAF - aExp
;
1404 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1405 return roundAndPackInt32( aSign
, aSig64
);
1408 #endif /* !SOFTFLOAT_FOR_GCC */
1411 -------------------------------------------------------------------------------
1412 Returns the result of converting the single-precision floating-point value
1413 `a' to the 32-bit two's complement integer format. The conversion is
1414 performed according to the IEC/IEEE Standard for Binary Floating-Point
1415 Arithmetic, except that the conversion is always rounded toward zero.
1416 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1417 the conversion overflows, the largest integer with the same sign as `a' is
1419 -------------------------------------------------------------------------------
1421 int32
float32_to_int32_round_to_zero( float32 a
)
1424 int16 aExp
, shiftCount
;
1428 aSig
= extractFloat32Frac( a
);
1429 aExp
= extractFloat32Exp( a
);
1430 aSign
= extractFloat32Sign( a
);
1431 shiftCount
= aExp
- 0x9E;
1432 if ( 0 <= shiftCount
) {
1433 if ( a
!= 0xCF000000 ) {
1434 float_raise( float_flag_invalid
);
1435 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1437 return (sbits32
) 0x80000000;
1439 else if ( aExp
<= 0x7E ) {
1440 if ( aExp
| aSig
) set_float_exception_inexact_flag();
1443 aSig
= ( aSig
| 0x00800000 )<<8;
1444 z
= aSig
>>( - shiftCount
);
1445 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1446 set_float_exception_inexact_flag();
1448 if ( aSign
) z
= - z
;
1453 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1455 -------------------------------------------------------------------------------
1456 Returns the result of converting the single-precision floating-point value
1457 `a' to the 64-bit two's complement integer format. The conversion is
1458 performed according to the IEC/IEEE Standard for Binary Floating-Point
1459 Arithmetic---which means in particular that the conversion is rounded
1460 according to the current rounding mode. If `a' is a NaN, the largest
1461 positive integer is returned. Otherwise, if the conversion overflows, the
1462 largest integer with the same sign as `a' is returned.
1463 -------------------------------------------------------------------------------
1465 int64
float32_to_int64( float32 a
)
1468 int16 aExp
, shiftCount
;
1470 bits64 aSig64
, aSigExtra
;
1472 aSig
= extractFloat32Frac( a
);
1473 aExp
= extractFloat32Exp( a
);
1474 aSign
= extractFloat32Sign( a
);
1475 shiftCount
= 0xBE - aExp
;
1476 if ( shiftCount
< 0 ) {
1477 float_raise( float_flag_invalid
);
1478 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1479 return LIT64( 0x7FFFFFFFFFFFFFFF );
1481 return (sbits64
) LIT64( 0x8000000000000000 );
1483 if ( aExp
) aSig
|= 0x00800000;
1486 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1487 return roundAndPackInt64( aSign
, aSig64
, aSigExtra
);
1492 -------------------------------------------------------------------------------
1493 Returns the result of converting the single-precision floating-point value
1494 `a' to the 64-bit two's complement integer format. The conversion is
1495 performed according to the IEC/IEEE Standard for Binary Floating-Point
1496 Arithmetic, except that the conversion is always rounded toward zero. If
1497 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1498 conversion overflows, the largest integer with the same sign as `a' is
1500 -------------------------------------------------------------------------------
1502 int64
float32_to_int64_round_to_zero( float32 a
)
1505 int16 aExp
, shiftCount
;
1510 aSig
= extractFloat32Frac( a
);
1511 aExp
= extractFloat32Exp( a
);
1512 aSign
= extractFloat32Sign( a
);
1513 shiftCount
= aExp
- 0xBE;
1514 if ( 0 <= shiftCount
) {
1515 if ( a
!= 0xDF000000 ) {
1516 float_raise( float_flag_invalid
);
1517 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1518 return LIT64( 0x7FFFFFFFFFFFFFFF );
1521 return (sbits64
) LIT64( 0x8000000000000000 );
1523 else if ( aExp
<= 0x7E ) {
1524 if ( aExp
| aSig
) set_float_exception_inexact_flag();
1527 aSig64
= aSig
| 0x00800000;
1529 z
= aSig64
>>( - shiftCount
);
1530 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1531 set_float_exception_inexact_flag();
1533 if ( aSign
) z
= - z
;
1537 #endif /* !SOFTFLOAT_FOR_GCC */
1540 -------------------------------------------------------------------------------
1541 Returns the result of converting the single-precision floating-point value
1542 `a' to the double-precision floating-point format. The conversion is
1543 performed according to the IEC/IEEE Standard for Binary Floating-Point
1545 -------------------------------------------------------------------------------
1547 float64
float32_to_float64( float32 a
)
1553 aSig
= extractFloat32Frac( a
);
1554 aExp
= extractFloat32Exp( a
);
1555 aSign
= extractFloat32Sign( a
);
1556 if ( aExp
== 0xFF ) {
1557 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
1558 return packFloat64( aSign
, 0x7FF, 0 );
1561 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1562 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1565 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1572 -------------------------------------------------------------------------------
1573 Returns the result of converting the single-precision floating-point value
1574 `a' to the extended double-precision floating-point format. The conversion
1575 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1577 -------------------------------------------------------------------------------
1579 floatx80
float32_to_floatx80( float32 a
)
1585 aSig
= extractFloat32Frac( a
);
1586 aExp
= extractFloat32Exp( a
);
1587 aSign
= extractFloat32Sign( a
);
1588 if ( aExp
== 0xFF ) {
1589 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
1590 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1593 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1594 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1597 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1606 -------------------------------------------------------------------------------
1607 Returns the result of converting the single-precision floating-point value
1608 `a' to the quadruple-precision floating-point format. The conversion is
1609 performed according to the IEC/IEEE Standard for Binary Floating-Point
1611 -------------------------------------------------------------------------------
1613 float128
float32_to_float128( float32 a
)
1619 aSig
= extractFloat32Frac( a
);
1620 aExp
= extractFloat32Exp( a
);
1621 aSign
= extractFloat32Sign( a
);
1622 if ( aExp
== 0xFF ) {
1623 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a
) );
1624 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1627 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1628 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1631 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1637 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1639 -------------------------------------------------------------------------------
1640 Rounds the single-precision floating-point value `a' to an integer, and
1641 returns the result as a single-precision floating-point value. The
1642 operation is performed according to the IEC/IEEE Standard for Binary
1643 Floating-Point Arithmetic.
1644 -------------------------------------------------------------------------------
1646 float32
float32_round_to_int( float32 a
)
1650 bits32 lastBitMask
, roundBitsMask
;
1654 aExp
= extractFloat32Exp( a
);
1655 if ( 0x96 <= aExp
) {
1656 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1657 return propagateFloat32NaN( a
, a
);
1661 if ( aExp
<= 0x7E ) {
1662 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
1663 set_float_exception_inexact_flag();
1664 aSign
= extractFloat32Sign( a
);
1665 switch ( float_rounding_mode
) {
1666 case float_round_nearest_even
:
1667 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1668 return packFloat32( aSign
, 0x7F, 0 );
1671 case float_round_to_zero
:
1673 case float_round_down
:
1674 return aSign
? 0xBF800000 : 0;
1675 case float_round_up
:
1676 return aSign
? 0x80000000 : 0x3F800000;
1678 return packFloat32( aSign
, 0, 0 );
1681 lastBitMask
<<= 0x96 - aExp
;
1682 roundBitsMask
= lastBitMask
- 1;
1684 roundingMode
= float_rounding_mode
;
1685 if ( roundingMode
== float_round_nearest_even
) {
1686 z
+= lastBitMask
>>1;
1687 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1689 else if ( roundingMode
!= float_round_to_zero
) {
1690 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1694 z
&= ~ roundBitsMask
;
1695 if ( z
!= a
) set_float_exception_inexact_flag();
1699 #endif /* !SOFTFLOAT_FOR_GCC */
1702 -------------------------------------------------------------------------------
1703 Returns the result of adding the absolute values of the single-precision
1704 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1705 before being returned. `zSign' is ignored if the result is a NaN.
1706 The addition is performed according to the IEC/IEEE Standard for Binary
1707 Floating-Point Arithmetic.
1708 -------------------------------------------------------------------------------
1710 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1712 int16 aExp
, bExp
, zExp
;
1713 bits32 aSig
, bSig
, zSig
;
1716 aSig
= extractFloat32Frac( a
);
1717 aExp
= extractFloat32Exp( a
);
1718 bSig
= extractFloat32Frac( b
);
1719 bExp
= extractFloat32Exp( b
);
1720 expDiff
= aExp
- bExp
;
1723 if ( 0 < expDiff
) {
1724 if ( aExp
== 0xFF ) {
1725 if ( aSig
) return propagateFloat32NaN( a
, b
);
1734 shift32RightJamming( bSig
, expDiff
, &bSig
);
1737 else if ( expDiff
< 0 ) {
1738 if ( bExp
== 0xFF ) {
1739 if ( bSig
) return propagateFloat32NaN( a
, b
);
1740 return packFloat32( zSign
, 0xFF, 0 );
1748 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1752 if ( aExp
== 0xFF ) {
1753 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1756 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1757 zSig
= 0x40000000 + aSig
+ bSig
;
1762 zSig
= ( aSig
+ bSig
)<<1;
1764 if ( (sbits32
) zSig
< 0 ) {
1769 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1774 -------------------------------------------------------------------------------
1775 Returns the result of subtracting the absolute values of the single-
1776 precision floating-point values `a' and `b'. If `zSign' is 1, the
1777 difference is negated before being returned. `zSign' is ignored if the
1778 result is a NaN. The subtraction is performed according to the IEC/IEEE
1779 Standard for Binary Floating-Point Arithmetic.
1780 -------------------------------------------------------------------------------
1782 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1784 int16 aExp
, bExp
, zExp
;
1785 bits32 aSig
, bSig
, zSig
;
1788 aSig
= extractFloat32Frac( a
);
1789 aExp
= extractFloat32Exp( a
);
1790 bSig
= extractFloat32Frac( b
);
1791 bExp
= extractFloat32Exp( b
);
1792 expDiff
= aExp
- bExp
;
1795 if ( 0 < expDiff
) goto aExpBigger
;
1796 if ( expDiff
< 0 ) goto bExpBigger
;
1797 if ( aExp
== 0xFF ) {
1798 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1799 float_raise( float_flag_invalid
);
1800 return float32_default_nan
;
1806 if ( bSig
< aSig
) goto aBigger
;
1807 if ( aSig
< bSig
) goto bBigger
;
1808 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
1810 if ( bExp
== 0xFF ) {
1811 if ( bSig
) return propagateFloat32NaN( a
, b
);
1812 return packFloat32( zSign
^ 1, 0xFF, 0 );
1820 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1826 goto normalizeRoundAndPack
;
1828 if ( aExp
== 0xFF ) {
1829 if ( aSig
) return propagateFloat32NaN( a
, b
);
1838 shift32RightJamming( bSig
, expDiff
, &bSig
);
1843 normalizeRoundAndPack
:
1845 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1850 -------------------------------------------------------------------------------
1851 Returns the result of adding the single-precision floating-point values `a'
1852 and `b'. The operation is performed according to the IEC/IEEE Standard for
1853 Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1856 float32
float32_add( float32 a
, float32 b
)
1860 aSign
= extractFloat32Sign( a
);
1861 bSign
= extractFloat32Sign( b
);
1862 if ( aSign
== bSign
) {
1863 return addFloat32Sigs( a
, b
, aSign
);
1866 return subFloat32Sigs( a
, b
, aSign
);
1872 -------------------------------------------------------------------------------
1873 Returns the result of subtracting the single-precision floating-point values
1874 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1875 for Binary Floating-Point Arithmetic.
1876 -------------------------------------------------------------------------------
1878 float32
float32_sub( float32 a
, float32 b
)
1882 aSign
= extractFloat32Sign( a
);
1883 bSign
= extractFloat32Sign( b
);
1884 if ( aSign
== bSign
) {
1885 return subFloat32Sigs( a
, b
, aSign
);
1888 return addFloat32Sigs( a
, b
, aSign
);
1894 -------------------------------------------------------------------------------
1895 Returns the result of multiplying the single-precision floating-point values
1896 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1897 for Binary Floating-Point Arithmetic.
1898 -------------------------------------------------------------------------------
1900 float32
float32_mul( float32 a
, float32 b
)
1902 flag aSign
, bSign
, zSign
;
1903 int16 aExp
, bExp
, zExp
;
1908 aSig
= extractFloat32Frac( a
);
1909 aExp
= extractFloat32Exp( a
);
1910 aSign
= extractFloat32Sign( a
);
1911 bSig
= extractFloat32Frac( b
);
1912 bExp
= extractFloat32Exp( b
);
1913 bSign
= extractFloat32Sign( b
);
1914 zSign
= aSign
^ bSign
;
1915 if ( aExp
== 0xFF ) {
1916 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1917 return propagateFloat32NaN( a
, b
);
1919 if ( ( bExp
| bSig
) == 0 ) {
1920 float_raise( float_flag_invalid
);
1921 return float32_default_nan
;
1923 return packFloat32( zSign
, 0xFF, 0 );
1925 if ( bExp
== 0xFF ) {
1926 if ( bSig
) return propagateFloat32NaN( a
, b
);
1927 if ( ( aExp
| aSig
) == 0 ) {
1928 float_raise( float_flag_invalid
);
1929 return float32_default_nan
;
1931 return packFloat32( zSign
, 0xFF, 0 );
1934 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1935 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1938 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1939 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1941 zExp
= aExp
+ bExp
- 0x7F;
1942 aSig
= ( aSig
| 0x00800000 )<<7;
1943 bSig
= ( bSig
| 0x00800000 )<<8;
1944 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1945 zSig
= (bits32
)zSig64
;
1946 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1950 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1955 -------------------------------------------------------------------------------
1956 Returns the result of dividing the single-precision floating-point value `a'
1957 by the corresponding value `b'. The operation is performed according to the
1958 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1959 -------------------------------------------------------------------------------
1961 float32
float32_div( float32 a
, float32 b
)
1963 flag aSign
, bSign
, zSign
;
1964 int16 aExp
, bExp
, zExp
;
1965 bits32 aSig
, bSig
, zSig
;
1967 aSig
= extractFloat32Frac( a
);
1968 aExp
= extractFloat32Exp( a
);
1969 aSign
= extractFloat32Sign( a
);
1970 bSig
= extractFloat32Frac( b
);
1971 bExp
= extractFloat32Exp( b
);
1972 bSign
= extractFloat32Sign( b
);
1973 zSign
= aSign
^ bSign
;
1974 if ( aExp
== 0xFF ) {
1975 if ( aSig
) return propagateFloat32NaN( a
, b
);
1976 if ( bExp
== 0xFF ) {
1977 if ( bSig
) return propagateFloat32NaN( a
, b
);
1978 float_raise( float_flag_invalid
);
1979 return float32_default_nan
;
1981 return packFloat32( zSign
, 0xFF, 0 );
1983 if ( bExp
== 0xFF ) {
1984 if ( bSig
) return propagateFloat32NaN( a
, b
);
1985 return packFloat32( zSign
, 0, 0 );
1989 if ( ( aExp
| aSig
) == 0 ) {
1990 float_raise( float_flag_invalid
);
1991 return float32_default_nan
;
1993 float_raise( float_flag_divbyzero
);
1994 return packFloat32( zSign
, 0xFF, 0 );
1996 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1999 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2000 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2002 zExp
= aExp
- bExp
+ 0x7D;
2003 aSig
= ( aSig
| 0x00800000 )<<7;
2004 bSig
= ( bSig
| 0x00800000 )<<8;
2005 if ( bSig
<= ( aSig
+ aSig
) ) {
2009 zSig
= (bits32
)((((bits64
) aSig
) << 32) / bSig
);
2010 if ( ( zSig
& 0x3F ) == 0 ) {
2011 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
2013 return roundAndPackFloat32( zSign
, zExp
, zSig
);
2017 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2019 -------------------------------------------------------------------------------
2020 Returns the remainder of the single-precision floating-point value `a'
2021 with respect to the corresponding value `b'. The operation is performed
2022 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2023 -------------------------------------------------------------------------------
2025 float32
float32_rem( float32 a
, float32 b
)
2027 flag aSign
, bSign
, zSign
;
2028 int16 aExp
, bExp
, expDiff
;
2031 bits64 aSig64
, bSig64
, q64
;
2032 bits32 alternateASig
;
2035 aSig
= extractFloat32Frac( a
);
2036 aExp
= extractFloat32Exp( a
);
2037 aSign
= extractFloat32Sign( a
);
2038 bSig
= extractFloat32Frac( b
);
2039 bExp
= extractFloat32Exp( b
);
2040 bSign
= extractFloat32Sign( b
);
2041 if ( aExp
== 0xFF ) {
2042 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2043 return propagateFloat32NaN( a
, b
);
2045 float_raise( float_flag_invalid
);
2046 return float32_default_nan
;
2048 if ( bExp
== 0xFF ) {
2049 if ( bSig
) return propagateFloat32NaN( a
, b
);
2054 float_raise( float_flag_invalid
);
2055 return float32_default_nan
;
2057 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2060 if ( aSig
== 0 ) return a
;
2061 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2063 expDiff
= aExp
- bExp
;
2066 if ( expDiff
< 32 ) {
2069 if ( expDiff
< 0 ) {
2070 if ( expDiff
< -1 ) return a
;
2073 q
= ( bSig
<= aSig
);
2074 if ( q
) aSig
-= bSig
;
2075 if ( 0 < expDiff
) {
2076 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2079 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2087 if ( bSig
<= aSig
) aSig
-= bSig
;
2088 aSig64
= ( (bits64
) aSig
)<<40;
2089 bSig64
= ( (bits64
) bSig
)<<40;
2091 while ( 0 < expDiff
) {
2092 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2093 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2094 aSig64
= - ( ( bSig
* q64
)<<38 );
2098 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2099 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2100 q
= q64
>>( 64 - expDiff
);
2102 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2105 alternateASig
= aSig
;
2108 } while ( 0 <= (sbits32
) aSig
);
2109 sigMean
= aSig
+ alternateASig
;
2110 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2111 aSig
= alternateASig
;
2113 zSign
= ( (sbits32
) aSig
< 0 );
2114 if ( zSign
) aSig
= - aSig
;
2115 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
2118 #endif /* !SOFTFLOAT_FOR_GCC */
2120 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2122 -------------------------------------------------------------------------------
2123 Returns the square root of the single-precision floating-point value `a'.
2124 The operation is performed according to the IEC/IEEE Standard for Binary
2125 Floating-Point Arithmetic.
2126 -------------------------------------------------------------------------------
2128 float32
float32_sqrt( float32 a
)
2135 aSig
= extractFloat32Frac( a
);
2136 aExp
= extractFloat32Exp( a
);
2137 aSign
= extractFloat32Sign( a
);
2138 if ( aExp
== 0xFF ) {
2139 if ( aSig
) return propagateFloat32NaN( a
, 0 );
2140 if ( ! aSign
) return a
;
2141 float_raise( float_flag_invalid
);
2142 return float32_default_nan
;
2145 if ( ( aExp
| aSig
) == 0 ) return a
;
2146 float_raise( float_flag_invalid
);
2147 return float32_default_nan
;
2150 if ( aSig
== 0 ) return 0;
2151 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2153 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2154 aSig
= ( aSig
| 0x00800000 )<<8;
2155 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2156 if ( ( zSig
& 0x7F ) <= 5 ) {
2162 term
= ( (bits64
) zSig
) * zSig
;
2163 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2164 while ( (sbits64
) rem
< 0 ) {
2166 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2168 zSig
|= ( rem
!= 0 );
2170 shift32RightJamming( zSig
, 1, &zSig
);
2172 return roundAndPackFloat32( 0, zExp
, zSig
);
2175 #endif /* !SOFTFLOAT_FOR_GCC */
2178 -------------------------------------------------------------------------------
2179 Returns 1 if the single-precision floating-point value `a' is equal to
2180 the corresponding value `b', and 0 otherwise. The comparison is performed
2181 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2182 -------------------------------------------------------------------------------
2184 flag
float32_eq( float32 a
, float32 b
)
2187 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2188 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2190 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2191 float_raise( float_flag_invalid
);
2195 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2200 -------------------------------------------------------------------------------
2201 Returns 1 if the single-precision floating-point value `a' is less than
2202 or equal to the corresponding value `b', and 0 otherwise. The comparison
2203 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2205 -------------------------------------------------------------------------------
2207 flag
float32_le( float32 a
, float32 b
)
2211 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2212 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2214 float_raise( float_flag_invalid
);
2217 aSign
= extractFloat32Sign( a
);
2218 bSign
= extractFloat32Sign( b
);
2219 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2220 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2225 -------------------------------------------------------------------------------
2226 Returns 1 if the single-precision floating-point value `a' is less than
2227 the corresponding value `b', and 0 otherwise. The comparison is performed
2228 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2229 -------------------------------------------------------------------------------
2231 flag
float32_lt( float32 a
, float32 b
)
2235 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2236 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2238 float_raise( float_flag_invalid
);
2241 aSign
= extractFloat32Sign( a
);
2242 bSign
= extractFloat32Sign( b
);
2243 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2244 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2248 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2250 -------------------------------------------------------------------------------
2251 Returns 1 if the single-precision floating-point value `a' is equal to
2252 the corresponding value `b', and 0 otherwise. The invalid exception is
2253 raised if either operand is a NaN. Otherwise, the comparison is performed
2254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2255 -------------------------------------------------------------------------------
2257 flag
float32_eq_signaling( float32 a
, float32 b
)
2260 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2261 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2263 float_raise( float_flag_invalid
);
2266 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2271 -------------------------------------------------------------------------------
2272 Returns 1 if the single-precision floating-point value `a' is less than or
2273 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2274 cause an exception. Otherwise, the comparison is performed according to the
2275 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2276 -------------------------------------------------------------------------------
2278 flag
float32_le_quiet( float32 a
, float32 b
)
2282 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2283 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2285 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2286 float_raise( float_flag_invalid
);
2290 aSign
= extractFloat32Sign( a
);
2291 bSign
= extractFloat32Sign( b
);
2292 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2293 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2298 -------------------------------------------------------------------------------
2299 Returns 1 if the single-precision floating-point value `a' is less than
2300 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2301 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2302 Standard for Binary Floating-Point Arithmetic.
2303 -------------------------------------------------------------------------------
2305 flag
float32_lt_quiet( float32 a
, float32 b
)
2309 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2310 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2312 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2313 float_raise( float_flag_invalid
);
2317 aSign
= extractFloat32Sign( a
);
2318 bSign
= extractFloat32Sign( b
);
2319 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2320 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2323 #endif /* !SOFTFLOAT_FOR_GCC */
2325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2327 -------------------------------------------------------------------------------
2328 Returns the result of converting the double-precision floating-point value
2329 `a' to the 32-bit two's complement integer format. The conversion is
2330 performed according to the IEC/IEEE Standard for Binary Floating-Point
2331 Arithmetic---which means in particular that the conversion is rounded
2332 according to the current rounding mode. If `a' is a NaN, the largest
2333 positive integer is returned. Otherwise, if the conversion overflows, the
2334 largest integer with the same sign as `a' is returned.
2335 -------------------------------------------------------------------------------
2337 int32
float64_to_int32( float64 a
)
2340 int16 aExp
, shiftCount
;
2343 aSig
= extractFloat64Frac( a
);
2344 aExp
= extractFloat64Exp( a
);
2345 aSign
= extractFloat64Sign( a
);
2346 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2347 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2348 shiftCount
= 0x42C - aExp
;
2349 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2350 return roundAndPackInt32( aSign
, aSig
);
2353 #endif /* !SOFTFLOAT_FOR_GCC */
2356 -------------------------------------------------------------------------------
2357 Returns the result of converting the double-precision floating-point value
2358 `a' to the 32-bit two's complement integer format. The conversion is
2359 performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 Arithmetic, except that the conversion is always rounded toward zero.
2361 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2362 the conversion overflows, the largest integer with the same sign as `a' is
2364 -------------------------------------------------------------------------------
2366 int32
float64_to_int32_round_to_zero( float64 a
)
2369 int16 aExp
, shiftCount
;
2370 bits64 aSig
, savedASig
;
2373 aSig
= extractFloat64Frac( a
);
2374 aExp
= extractFloat64Exp( a
);
2375 aSign
= extractFloat64Sign( a
);
2376 if ( 0x41E < aExp
) {
2377 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2380 else if ( aExp
< 0x3FF ) {
2381 if ( aExp
|| aSig
) set_float_exception_inexact_flag();
2384 aSig
|= LIT64( 0x0010000000000000 );
2385 shiftCount
= 0x433 - aExp
;
2387 aSig
>>= shiftCount
;
2389 if ( aSign
) z
= - z
;
2390 if ( ( z
< 0 ) ^ aSign
) {
2392 float_raise( float_flag_invalid
);
2393 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2395 if ( ( aSig
<<shiftCount
) != savedASig
) {
2396 set_float_exception_inexact_flag();
2402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2404 -------------------------------------------------------------------------------
2405 Returns the result of converting the double-precision floating-point value
2406 `a' to the 64-bit two's complement integer format. The conversion is
2407 performed according to the IEC/IEEE Standard for Binary Floating-Point
2408 Arithmetic---which means in particular that the conversion is rounded
2409 according to the current rounding mode. If `a' is a NaN, the largest
2410 positive integer is returned. Otherwise, if the conversion overflows, the
2411 largest integer with the same sign as `a' is returned.
2412 -------------------------------------------------------------------------------
2414 int64
float64_to_int64( float64 a
)
2417 int16 aExp
, shiftCount
;
2418 bits64 aSig
, aSigExtra
;
2420 aSig
= extractFloat64Frac( a
);
2421 aExp
= extractFloat64Exp( a
);
2422 aSign
= extractFloat64Sign( a
);
2423 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2424 shiftCount
= 0x433 - aExp
;
2425 if ( shiftCount
<= 0 ) {
2426 if ( 0x43E < aExp
) {
2427 float_raise( float_flag_invalid
);
2429 || ( ( aExp
== 0x7FF )
2430 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2432 return LIT64( 0x7FFFFFFFFFFFFFFF );
2434 return (sbits64
) LIT64( 0x8000000000000000 );
2437 aSig
<<= - shiftCount
;
2440 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2442 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
2447 -------------------------------------------------------------------------------
2448 Returns the result of converting the double-precision floating-point value
2449 `a' to the 64-bit two's complement integer format. The conversion is
2450 performed according to the IEC/IEEE Standard for Binary Floating-Point
2451 Arithmetic, except that the conversion is always rounded toward zero.
2452 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2453 the conversion overflows, the largest integer with the same sign as `a' is
2455 -------------------------------------------------------------------------------
2457 int64
float64_to_int64_round_to_zero( float64 a
)
2460 int16 aExp
, shiftCount
;
2464 aSig
= extractFloat64Frac( a
);
2465 aExp
= extractFloat64Exp( a
);
2466 aSign
= extractFloat64Sign( a
);
2467 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2468 shiftCount
= aExp
- 0x433;
2469 if ( 0 <= shiftCount
) {
2470 if ( 0x43E <= aExp
) {
2471 if ( a
!= LIT64( 0xC3E0000000000000 ) ) {
2472 float_raise( float_flag_invalid
);
2474 || ( ( aExp
== 0x7FF )
2475 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2477 return LIT64( 0x7FFFFFFFFFFFFFFF );
2480 return (sbits64
) LIT64( 0x8000000000000000 );
2482 z
= aSig
<<shiftCount
;
2485 if ( aExp
< 0x3FE ) {
2486 if ( aExp
| aSig
) set_float_exception_inexact_flag();
2489 z
= aSig
>>( - shiftCount
);
2490 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2491 set_float_exception_inexact_flag();
2494 if ( aSign
) z
= - z
;
2498 #endif /* !SOFTFLOAT_FOR_GCC */
2501 -------------------------------------------------------------------------------
2502 Returns the result of converting the double-precision floating-point value
2503 `a' to the single-precision floating-point format. The conversion is
2504 performed according to the IEC/IEEE Standard for Binary Floating-Point
2506 -------------------------------------------------------------------------------
2508 float32
float64_to_float32( float64 a
)
2515 aSig
= extractFloat64Frac( a
);
2516 aExp
= extractFloat64Exp( a
);
2517 aSign
= extractFloat64Sign( a
);
2518 if ( aExp
== 0x7FF ) {
2519 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
2520 return packFloat32( aSign
, 0xFF, 0 );
2522 shift64RightJamming( aSig
, 22, &aSig
);
2523 zSig
= (bits32
)aSig
;
2524 if ( aExp
|| zSig
) {
2528 return roundAndPackFloat32( aSign
, aExp
, zSig
);
2535 -------------------------------------------------------------------------------
2536 Returns the result of converting the double-precision floating-point value
2537 `a' to the extended double-precision floating-point format. The conversion
2538 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2540 -------------------------------------------------------------------------------
2542 floatx80
float64_to_floatx80( float64 a
)
2548 aSig
= extractFloat64Frac( a
);
2549 aExp
= extractFloat64Exp( a
);
2550 aSign
= extractFloat64Sign( a
);
2551 if ( aExp
== 0x7FF ) {
2552 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
2553 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2556 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2557 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2561 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2570 -------------------------------------------------------------------------------
2571 Returns the result of converting the double-precision floating-point value
2572 `a' to the quadruple-precision floating-point format. The conversion is
2573 performed according to the IEC/IEEE Standard for Binary Floating-Point
2575 -------------------------------------------------------------------------------
2577 float128
float64_to_float128( float64 a
)
2581 bits64 aSig
, zSig0
, zSig1
;
2583 aSig
= extractFloat64Frac( a
);
2584 aExp
= extractFloat64Exp( a
);
2585 aSign
= extractFloat64Sign( a
);
2586 if ( aExp
== 0x7FF ) {
2587 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a
) );
2588 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2591 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2592 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2595 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2596 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2602 #ifndef SOFTFLOAT_FOR_GCC
2604 -------------------------------------------------------------------------------
2605 Rounds the double-precision floating-point value `a' to an integer, and
2606 returns the result as a double-precision floating-point value. The
2607 operation is performed according to the IEC/IEEE Standard for Binary
2608 Floating-Point Arithmetic.
2609 -------------------------------------------------------------------------------
2611 float64
float64_round_to_int( float64 a
)
2615 bits64 lastBitMask
, roundBitsMask
;
2619 aExp
= extractFloat64Exp( a
);
2620 if ( 0x433 <= aExp
) {
2621 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2622 return propagateFloat64NaN( a
, a
);
2626 if ( aExp
< 0x3FF ) {
2627 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
2628 set_float_exception_inexact_flag();
2629 aSign
= extractFloat64Sign( a
);
2630 switch ( float_rounding_mode
) {
2631 case float_round_nearest_even
:
2632 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2633 return packFloat64( aSign
, 0x3FF, 0 );
2636 case float_round_to_zero
:
2638 case float_round_down
:
2639 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
2640 case float_round_up
:
2642 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2644 return packFloat64( aSign
, 0, 0 );
2647 lastBitMask
<<= 0x433 - aExp
;
2648 roundBitsMask
= lastBitMask
- 1;
2650 roundingMode
= float_rounding_mode
;
2651 if ( roundingMode
== float_round_nearest_even
) {
2652 z
+= lastBitMask
>>1;
2653 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2655 else if ( roundingMode
!= float_round_to_zero
) {
2656 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2660 z
&= ~ roundBitsMask
;
2661 if ( z
!= a
) set_float_exception_inexact_flag();
2668 -------------------------------------------------------------------------------
2669 Returns the result of adding the absolute values of the double-precision
2670 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2671 before being returned. `zSign' is ignored if the result is a NaN.
2672 The addition is performed according to the IEC/IEEE Standard for Binary
2673 Floating-Point Arithmetic.
2674 -------------------------------------------------------------------------------
2676 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2678 int16 aExp
, bExp
, zExp
;
2679 bits64 aSig
, bSig
, zSig
;
2682 aSig
= extractFloat64Frac( a
);
2683 aExp
= extractFloat64Exp( a
);
2684 bSig
= extractFloat64Frac( b
);
2685 bExp
= extractFloat64Exp( b
);
2686 expDiff
= aExp
- bExp
;
2689 if ( 0 < expDiff
) {
2690 if ( aExp
== 0x7FF ) {
2691 if ( aSig
) return propagateFloat64NaN( a
, b
);
2698 bSig
|= LIT64( 0x2000000000000000 );
2700 shift64RightJamming( bSig
, expDiff
, &bSig
);
2703 else if ( expDiff
< 0 ) {
2704 if ( bExp
== 0x7FF ) {
2705 if ( bSig
) return propagateFloat64NaN( a
, b
);
2706 return packFloat64( zSign
, 0x7FF, 0 );
2712 aSig
|= LIT64( 0x2000000000000000 );
2714 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2718 if ( aExp
== 0x7FF ) {
2719 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2722 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2723 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2727 aSig
|= LIT64( 0x2000000000000000 );
2728 zSig
= ( aSig
+ bSig
)<<1;
2730 if ( (sbits64
) zSig
< 0 ) {
2735 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2740 -------------------------------------------------------------------------------
2741 Returns the result of subtracting the absolute values of the double-
2742 precision floating-point values `a' and `b'. If `zSign' is 1, the
2743 difference is negated before being returned. `zSign' is ignored if the
2744 result is a NaN. The subtraction is performed according to the IEC/IEEE
2745 Standard for Binary Floating-Point Arithmetic.
2746 -------------------------------------------------------------------------------
2748 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2750 int16 aExp
, bExp
, zExp
;
2751 bits64 aSig
, bSig
, zSig
;
2754 aSig
= extractFloat64Frac( a
);
2755 aExp
= extractFloat64Exp( a
);
2756 bSig
= extractFloat64Frac( b
);
2757 bExp
= extractFloat64Exp( b
);
2758 expDiff
= aExp
- bExp
;
2761 if ( 0 < expDiff
) goto aExpBigger
;
2762 if ( expDiff
< 0 ) goto bExpBigger
;
2763 if ( aExp
== 0x7FF ) {
2764 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2765 float_raise( float_flag_invalid
);
2766 return float64_default_nan
;
2772 if ( bSig
< aSig
) goto aBigger
;
2773 if ( aSig
< bSig
) goto bBigger
;
2774 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0 );
2776 if ( bExp
== 0x7FF ) {
2777 if ( bSig
) return propagateFloat64NaN( a
, b
);
2778 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2784 aSig
|= LIT64( 0x4000000000000000 );
2786 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2787 bSig
|= LIT64( 0x4000000000000000 );
2792 goto normalizeRoundAndPack
;
2794 if ( aExp
== 0x7FF ) {
2795 if ( aSig
) return propagateFloat64NaN( a
, b
);
2802 bSig
|= LIT64( 0x4000000000000000 );
2804 shift64RightJamming( bSig
, expDiff
, &bSig
);
2805 aSig
|= LIT64( 0x4000000000000000 );
2809 normalizeRoundAndPack
:
2811 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2816 -------------------------------------------------------------------------------
2817 Returns the result of adding the double-precision floating-point values `a'
2818 and `b'. The operation is performed according to the IEC/IEEE Standard for
2819 Binary Floating-Point Arithmetic.
2820 -------------------------------------------------------------------------------
2822 float64
float64_add( float64 a
, float64 b
)
2826 aSign
= extractFloat64Sign( a
);
2827 bSign
= extractFloat64Sign( b
);
2828 if ( aSign
== bSign
) {
2829 return addFloat64Sigs( a
, b
, aSign
);
2832 return subFloat64Sigs( a
, b
, aSign
);
2838 -------------------------------------------------------------------------------
2839 Returns the result of subtracting the double-precision floating-point values
2840 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2841 for Binary Floating-Point Arithmetic.
2842 -------------------------------------------------------------------------------
2844 float64
float64_sub( float64 a
, float64 b
)
2848 aSign
= extractFloat64Sign( a
);
2849 bSign
= extractFloat64Sign( b
);
2850 if ( aSign
== bSign
) {
2851 return subFloat64Sigs( a
, b
, aSign
);
2854 return addFloat64Sigs( a
, b
, aSign
);
2860 -------------------------------------------------------------------------------
2861 Returns the result of multiplying the double-precision floating-point values
2862 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2863 for Binary Floating-Point Arithmetic.
2864 -------------------------------------------------------------------------------
2866 float64
float64_mul( float64 a
, float64 b
)
2868 flag aSign
, bSign
, zSign
;
2869 int16 aExp
, bExp
, zExp
;
2870 bits64 aSig
, bSig
, zSig0
, zSig1
;
2872 aSig
= extractFloat64Frac( a
);
2873 aExp
= extractFloat64Exp( a
);
2874 aSign
= extractFloat64Sign( a
);
2875 bSig
= extractFloat64Frac( b
);
2876 bExp
= extractFloat64Exp( b
);
2877 bSign
= extractFloat64Sign( b
);
2878 zSign
= aSign
^ bSign
;
2879 if ( aExp
== 0x7FF ) {
2880 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2881 return propagateFloat64NaN( a
, b
);
2883 if ( ( bExp
| bSig
) == 0 ) {
2884 float_raise( float_flag_invalid
);
2885 return float64_default_nan
;
2887 return packFloat64( zSign
, 0x7FF, 0 );
2889 if ( bExp
== 0x7FF ) {
2890 if ( bSig
) return propagateFloat64NaN( a
, b
);
2891 if ( ( aExp
| aSig
) == 0 ) {
2892 float_raise( float_flag_invalid
);
2893 return float64_default_nan
;
2895 return packFloat64( zSign
, 0x7FF, 0 );
2898 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2899 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2902 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2903 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2905 zExp
= aExp
+ bExp
- 0x3FF;
2906 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2907 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2908 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2909 zSig0
|= ( zSig1
!= 0 );
2910 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2914 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2919 -------------------------------------------------------------------------------
2920 Returns the result of dividing the double-precision floating-point value `a'
2921 by the corresponding value `b'. The operation is performed according to
2922 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2923 -------------------------------------------------------------------------------
2925 float64
float64_div( float64 a
, float64 b
)
2927 flag aSign
, bSign
, zSign
;
2928 int16 aExp
, bExp
, zExp
;
2929 bits64 aSig
, bSig
, zSig
;
2931 bits64 term0
, term1
;
2933 aSig
= extractFloat64Frac( a
);
2934 aExp
= extractFloat64Exp( a
);
2935 aSign
= extractFloat64Sign( a
);
2936 bSig
= extractFloat64Frac( b
);
2937 bExp
= extractFloat64Exp( b
);
2938 bSign
= extractFloat64Sign( b
);
2939 zSign
= aSign
^ bSign
;
2940 if ( aExp
== 0x7FF ) {
2941 if ( aSig
) return propagateFloat64NaN( a
, b
);
2942 if ( bExp
== 0x7FF ) {
2943 if ( bSig
) return propagateFloat64NaN( a
, b
);
2944 float_raise( float_flag_invalid
);
2945 return float64_default_nan
;
2947 return packFloat64( zSign
, 0x7FF, 0 );
2949 if ( bExp
== 0x7FF ) {
2950 if ( bSig
) return propagateFloat64NaN( a
, b
);
2951 return packFloat64( zSign
, 0, 0 );
2955 if ( ( aExp
| aSig
) == 0 ) {
2956 float_raise( float_flag_invalid
);
2957 return float64_default_nan
;
2959 float_raise( float_flag_divbyzero
);
2960 return packFloat64( zSign
, 0x7FF, 0 );
2962 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2965 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2966 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2968 zExp
= aExp
- bExp
+ 0x3FD;
2969 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2970 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2971 if ( bSig
<= ( aSig
+ aSig
) ) {
2975 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2976 if ( ( zSig
& 0x1FF ) <= 2 ) {
2977 mul64To128( bSig
, zSig
, &term0
, &term1
);
2978 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2979 while ( (sbits64
) rem0
< 0 ) {
2981 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2983 zSig
|= ( rem1
!= 0 );
2985 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2989 #ifndef SOFTFLOAT_FOR_GCC
2991 -------------------------------------------------------------------------------
2992 Returns the remainder of the double-precision floating-point value `a'
2993 with respect to the corresponding value `b'. The operation is performed
2994 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2995 -------------------------------------------------------------------------------
2997 float64
float64_rem( float64 a
, float64 b
)
2999 flag aSign
, bSign
, zSign
;
3000 int16 aExp
, bExp
, expDiff
;
3002 bits64 q
, alternateASig
;
3005 aSig
= extractFloat64Frac( a
);
3006 aExp
= extractFloat64Exp( a
);
3007 aSign
= extractFloat64Sign( a
);
3008 bSig
= extractFloat64Frac( b
);
3009 bExp
= extractFloat64Exp( b
);
3010 bSign
= extractFloat64Sign( b
);
3011 if ( aExp
== 0x7FF ) {
3012 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3013 return propagateFloat64NaN( a
, b
);
3015 float_raise( float_flag_invalid
);
3016 return float64_default_nan
;
3018 if ( bExp
== 0x7FF ) {
3019 if ( bSig
) return propagateFloat64NaN( a
, b
);
3024 float_raise( float_flag_invalid
);
3025 return float64_default_nan
;
3027 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3030 if ( aSig
== 0 ) return a
;
3031 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3033 expDiff
= aExp
- bExp
;
3034 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3035 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3036 if ( expDiff
< 0 ) {
3037 if ( expDiff
< -1 ) return a
;
3040 q
= ( bSig
<= aSig
);
3041 if ( q
) aSig
-= bSig
;
3043 while ( 0 < expDiff
) {
3044 q
= estimateDiv128To64( aSig
, 0, bSig
);
3045 q
= ( 2 < q
) ? q
- 2 : 0;
3046 aSig
= - ( ( bSig
>>2 ) * q
);
3050 if ( 0 < expDiff
) {
3051 q
= estimateDiv128To64( aSig
, 0, bSig
);
3052 q
= ( 2 < q
) ? q
- 2 : 0;
3055 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3062 alternateASig
= aSig
;
3065 } while ( 0 <= (sbits64
) aSig
);
3066 sigMean
= aSig
+ alternateASig
;
3067 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3068 aSig
= alternateASig
;
3070 zSign
= ( (sbits64
) aSig
< 0 );
3071 if ( zSign
) aSig
= - aSig
;
3072 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
3077 -------------------------------------------------------------------------------
3078 Returns the square root of the double-precision floating-point value `a'.
3079 The operation is performed according to the IEC/IEEE Standard for Binary
3080 Floating-Point Arithmetic.
3081 -------------------------------------------------------------------------------
3083 float64
float64_sqrt( float64 a
)
3087 bits64 aSig
, zSig
, doubleZSig
;
3088 bits64 rem0
, rem1
, term0
, term1
;
3090 aSig
= extractFloat64Frac( a
);
3091 aExp
= extractFloat64Exp( a
);
3092 aSign
= extractFloat64Sign( a
);
3093 if ( aExp
== 0x7FF ) {
3094 if ( aSig
) return propagateFloat64NaN( a
, a
);
3095 if ( ! aSign
) return a
;
3096 float_raise( float_flag_invalid
);
3097 return float64_default_nan
;
3100 if ( ( aExp
| aSig
) == 0 ) return a
;
3101 float_raise( float_flag_invalid
);
3102 return float64_default_nan
;
3105 if ( aSig
== 0 ) return 0;
3106 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3108 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3109 aSig
|= LIT64( 0x0010000000000000 );
3110 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3111 aSig
<<= 9 - ( aExp
& 1 );
3112 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3113 if ( ( zSig
& 0x1FF ) <= 5 ) {
3114 doubleZSig
= zSig
<<1;
3115 mul64To128( zSig
, zSig
, &term0
, &term1
);
3116 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3117 while ( (sbits64
) rem0
< 0 ) {
3120 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3122 zSig
|= ( ( rem0
| rem1
) != 0 );
3124 return roundAndPackFloat64( 0, zExp
, zSig
);
3130 -------------------------------------------------------------------------------
3131 Returns 1 if the double-precision floating-point value `a' is equal to the
3132 corresponding value `b', and 0 otherwise. The comparison is performed
3133 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3134 -------------------------------------------------------------------------------
3136 flag
float64_eq( float64 a
, float64 b
)
3139 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3140 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3142 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3143 float_raise( float_flag_invalid
);
3147 return ( a
== b
) ||
3148 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) == 0 );
3153 -------------------------------------------------------------------------------
3154 Returns 1 if the double-precision floating-point value `a' is less than or
3155 equal to the corresponding value `b', and 0 otherwise. The comparison is
3156 performed according to the IEC/IEEE Standard for Binary Floating-Point
3158 -------------------------------------------------------------------------------
3160 flag
float64_le( float64 a
, float64 b
)
3164 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3165 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3167 float_raise( float_flag_invalid
);
3170 aSign
= extractFloat64Sign( a
);
3171 bSign
= extractFloat64Sign( b
);
3172 if ( aSign
!= bSign
)
3174 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) ==
3176 return ( a
== b
) ||
3177 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3182 -------------------------------------------------------------------------------
3183 Returns 1 if the double-precision floating-point value `a' is less than
3184 the corresponding value `b', and 0 otherwise. The comparison is performed
3185 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3186 -------------------------------------------------------------------------------
3188 flag
float64_lt( float64 a
, float64 b
)
3192 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3193 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3195 float_raise( float_flag_invalid
);
3198 aSign
= extractFloat64Sign( a
);
3199 bSign
= extractFloat64Sign( b
);
3200 if ( aSign
!= bSign
)
3202 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) !=
3204 return ( a
!= b
) &&
3205 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
3209 #ifndef SOFTFLOAT_FOR_GCC
3211 -------------------------------------------------------------------------------
3212 Returns 1 if the double-precision floating-point value `a' is equal to the
3213 corresponding value `b', and 0 otherwise. The invalid exception is raised
3214 if either operand is a NaN. Otherwise, the comparison is performed
3215 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3216 -------------------------------------------------------------------------------
3218 flag
float64_eq_signaling( float64 a
, float64 b
)
3221 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3222 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3224 float_raise( float_flag_invalid
);
3227 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3232 -------------------------------------------------------------------------------
3233 Returns 1 if the double-precision floating-point value `a' is less than or
3234 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3235 cause an exception. Otherwise, the comparison is performed according to the
3236 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3237 -------------------------------------------------------------------------------
3239 flag
float64_le_quiet( float64 a
, float64 b
)
3243 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3244 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3246 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3247 float_raise( float_flag_invalid
);
3251 aSign
= extractFloat64Sign( a
);
3252 bSign
= extractFloat64Sign( b
);
3253 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3254 return ( a
== b
) || ( aSign
^ ( a
< b
) );
3259 -------------------------------------------------------------------------------
3260 Returns 1 if the double-precision floating-point value `a' is less than
3261 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3262 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3263 Standard for Binary Floating-Point Arithmetic.
3264 -------------------------------------------------------------------------------
3266 flag
float64_lt_quiet( float64 a
, float64 b
)
3270 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3271 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3273 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3274 float_raise( float_flag_invalid
);
3278 aSign
= extractFloat64Sign( a
);
3279 bSign
= extractFloat64Sign( b
);
3280 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
3281 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3289 -------------------------------------------------------------------------------
3290 Returns the result of converting the extended double-precision floating-
3291 point value `a' to the 32-bit two's complement integer format. The
3292 conversion is performed according to the IEC/IEEE Standard for Binary
3293 Floating-Point Arithmetic---which means in particular that the conversion
3294 is rounded according to the current rounding mode. If `a' is a NaN, the
3295 largest positive integer is returned. Otherwise, if the conversion
3296 overflows, the largest integer with the same sign as `a' is returned.
3297 -------------------------------------------------------------------------------
3299 int32
floatx80_to_int32( floatx80 a
)
3302 int32 aExp
, shiftCount
;
3305 aSig
= extractFloatx80Frac( a
);
3306 aExp
= extractFloatx80Exp( a
);
3307 aSign
= extractFloatx80Sign( a
);
3308 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3309 shiftCount
= 0x4037 - aExp
;
3310 if ( shiftCount
<= 0 ) shiftCount
= 1;
3311 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3312 return roundAndPackInt32( aSign
, aSig
);
3317 -------------------------------------------------------------------------------
3318 Returns the result of converting the extended double-precision floating-
3319 point value `a' to the 32-bit two's complement integer format. The
3320 conversion is performed according to the IEC/IEEE Standard for Binary
3321 Floating-Point Arithmetic, except that the conversion is always rounded
3322 toward zero. If `a' is a NaN, the largest positive integer is returned.
3323 Otherwise, if the conversion overflows, the largest integer with the same
3324 sign as `a' is returned.
3325 -------------------------------------------------------------------------------
3327 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
3330 int32 aExp
, shiftCount
;
3331 bits64 aSig
, savedASig
;
3334 aSig
= extractFloatx80Frac( a
);
3335 aExp
= extractFloatx80Exp( a
);
3336 aSign
= extractFloatx80Sign( a
);
3337 if ( 0x401E < aExp
) {
3338 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3341 else if ( aExp
< 0x3FFF ) {
3342 if ( aExp
|| aSig
) set_float_exception_inexact_flag();
3345 shiftCount
= 0x403E - aExp
;
3347 aSig
>>= shiftCount
;
3349 if ( aSign
) z
= - z
;
3350 if ( ( z
< 0 ) ^ aSign
) {
3352 float_raise( float_flag_invalid
);
3353 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3355 if ( ( aSig
<<shiftCount
) != savedASig
) {
3356 set_float_exception_inexact_flag();
3363 -------------------------------------------------------------------------------
3364 Returns the result of converting the extended double-precision floating-
3365 point value `a' to the 64-bit two's complement integer format. The
3366 conversion is performed according to the IEC/IEEE Standard for Binary
3367 Floating-Point Arithmetic---which means in particular that the conversion
3368 is rounded according to the current rounding mode. If `a' is a NaN,
3369 the largest positive integer is returned. Otherwise, if the conversion
3370 overflows, the largest integer with the same sign as `a' is returned.
3371 -------------------------------------------------------------------------------
3373 int64
floatx80_to_int64( floatx80 a
)
3376 int32 aExp
, shiftCount
;
3377 bits64 aSig
, aSigExtra
;
3379 aSig
= extractFloatx80Frac( a
);
3380 aExp
= extractFloatx80Exp( a
);
3381 aSign
= extractFloatx80Sign( a
);
3382 shiftCount
= 0x403E - aExp
;
3383 if ( shiftCount
<= 0 ) {
3385 float_raise( float_flag_invalid
);
3387 || ( ( aExp
== 0x7FFF )
3388 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3390 return LIT64( 0x7FFFFFFFFFFFFFFF );
3392 return (sbits64
) LIT64( 0x8000000000000000 );
3397 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3399 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
3404 -------------------------------------------------------------------------------
3405 Returns the result of converting the extended double-precision floating-
3406 point value `a' to the 64-bit two's complement integer format. The
3407 conversion is performed according to the IEC/IEEE Standard for Binary
3408 Floating-Point Arithmetic, except that the conversion is always rounded
3409 toward zero. If `a' is a NaN, the largest positive integer is returned.
3410 Otherwise, if the conversion overflows, the largest integer with the same
3411 sign as `a' is returned.
3412 -------------------------------------------------------------------------------
3414 int64
floatx80_to_int64_round_to_zero( floatx80 a
)
3417 int32 aExp
, shiftCount
;
3421 aSig
= extractFloatx80Frac( a
);
3422 aExp
= extractFloatx80Exp( a
);
3423 aSign
= extractFloatx80Sign( a
);
3424 shiftCount
= aExp
- 0x403E;
3425 if ( 0 <= shiftCount
) {
3426 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3427 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3428 float_raise( float_flag_invalid
);
3429 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3430 return LIT64( 0x7FFFFFFFFFFFFFFF );
3433 return (sbits64
) LIT64( 0x8000000000000000 );
3435 else if ( aExp
< 0x3FFF ) {
3436 if ( aExp
| aSig
) set_float_exception_inexact_flag();
3439 z
= aSig
>>( - shiftCount
);
3440 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3441 set_float_exception_inexact_flag();
3443 if ( aSign
) z
= - z
;
3449 -------------------------------------------------------------------------------
3450 Returns the result of converting the extended double-precision floating-
3451 point value `a' to the single-precision floating-point format. The
3452 conversion is performed according to the IEC/IEEE Standard for Binary
3453 Floating-Point Arithmetic.
3454 -------------------------------------------------------------------------------
3456 float32
floatx80_to_float32( floatx80 a
)
3462 aSig
= extractFloatx80Frac( a
);
3463 aExp
= extractFloatx80Exp( a
);
3464 aSign
= extractFloatx80Sign( a
);
3465 if ( aExp
== 0x7FFF ) {
3466 if ( (bits64
) ( aSig
<<1 ) ) {
3467 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
3469 return packFloat32( aSign
, 0xFF, 0 );
3471 shift64RightJamming( aSig
, 33, &aSig
);
3472 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3473 return roundAndPackFloat32( aSign
, aExp
, aSig
);
3478 -------------------------------------------------------------------------------
3479 Returns the result of converting the extended double-precision floating-
3480 point value `a' to the double-precision floating-point format. The
3481 conversion is performed according to the IEC/IEEE Standard for Binary
3482 Floating-Point Arithmetic.
3483 -------------------------------------------------------------------------------
3485 float64
floatx80_to_float64( floatx80 a
)
3491 aSig
= extractFloatx80Frac( a
);
3492 aExp
= extractFloatx80Exp( a
);
3493 aSign
= extractFloatx80Sign( a
);
3494 if ( aExp
== 0x7FFF ) {
3495 if ( (bits64
) ( aSig
<<1 ) ) {
3496 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
3498 return packFloat64( aSign
, 0x7FF, 0 );
3500 shift64RightJamming( aSig
, 1, &zSig
);
3501 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3502 return roundAndPackFloat64( aSign
, aExp
, zSig
);
3509 -------------------------------------------------------------------------------
3510 Returns the result of converting the extended double-precision floating-
3511 point value `a' to the quadruple-precision floating-point format. The
3512 conversion is performed according to the IEC/IEEE Standard for Binary
3513 Floating-Point Arithmetic.
3514 -------------------------------------------------------------------------------
3516 float128
floatx80_to_float128( floatx80 a
)
3520 bits64 aSig
, zSig0
, zSig1
;
3522 aSig
= extractFloatx80Frac( a
);
3523 aExp
= extractFloatx80Exp( a
);
3524 aSign
= extractFloatx80Sign( a
);
3525 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3526 return commonNaNToFloat128( floatx80ToCommonNaN( a
) );
3528 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3529 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3536 -------------------------------------------------------------------------------
3537 Rounds the extended double-precision floating-point value `a' to an integer,
3538 and returns the result as an extended quadruple-precision floating-point
3539 value. The operation is performed according to the IEC/IEEE Standard for
3540 Binary Floating-Point Arithmetic.
3541 -------------------------------------------------------------------------------
3543 floatx80
floatx80_round_to_int( floatx80 a
)
3547 bits64 lastBitMask
, roundBitsMask
;
3551 aExp
= extractFloatx80Exp( a
);
3552 if ( 0x403E <= aExp
) {
3553 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3554 return propagateFloatx80NaN( a
, a
);
3558 if ( aExp
< 0x3FFF ) {
3560 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3563 set_float_exception_inexact_flag();
3564 aSign
= extractFloatx80Sign( a
);
3565 switch ( float_rounding_mode
) {
3566 case float_round_nearest_even
:
3567 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3570 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3573 case float_round_to_zero
:
3575 case float_round_down
:
3578 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3579 : packFloatx80( 0, 0, 0 );
3580 case float_round_up
:
3582 aSign
? packFloatx80( 1, 0, 0 )
3583 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3585 return packFloatx80( aSign
, 0, 0 );
3588 lastBitMask
<<= 0x403E - aExp
;
3589 roundBitsMask
= lastBitMask
- 1;
3591 roundingMode
= float_rounding_mode
;
3592 if ( roundingMode
== float_round_nearest_even
) {
3593 z
.low
+= lastBitMask
>>1;
3594 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3596 else if ( roundingMode
!= float_round_to_zero
) {
3597 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3598 z
.low
+= roundBitsMask
;
3601 z
.low
&= ~ roundBitsMask
;
3604 z
.low
= LIT64( 0x8000000000000000 );
3606 if ( z
.low
!= a
.low
) set_float_exception_inexact_flag();
3612 -------------------------------------------------------------------------------
3613 Returns the result of adding the absolute values of the extended double-
3614 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3615 negated before being returned. `zSign' is ignored if the result is a NaN.
3616 The addition is performed according to the IEC/IEEE Standard for Binary
3617 Floating-Point Arithmetic.
3618 -------------------------------------------------------------------------------
3620 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3622 int32 aExp
, bExp
, zExp
;
3623 bits64 aSig
, bSig
, zSig0
, zSig1
;
3626 aSig
= extractFloatx80Frac( a
);
3627 aExp
= extractFloatx80Exp( a
);
3628 bSig
= extractFloatx80Frac( b
);
3629 bExp
= extractFloatx80Exp( b
);
3630 expDiff
= aExp
- bExp
;
3631 if ( 0 < expDiff
) {
3632 if ( aExp
== 0x7FFF ) {
3633 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3636 if ( bExp
== 0 ) --expDiff
;
3637 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3640 else if ( expDiff
< 0 ) {
3641 if ( bExp
== 0x7FFF ) {
3642 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3643 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3645 if ( aExp
== 0 ) ++expDiff
;
3646 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3650 if ( aExp
== 0x7FFF ) {
3651 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3652 return propagateFloatx80NaN( a
, b
);
3657 zSig0
= aSig
+ bSig
;
3659 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3665 zSig0
= aSig
+ bSig
;
3666 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3668 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3669 zSig0
|= LIT64( 0x8000000000000000 );
3673 roundAndPackFloatx80(
3674 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3679 -------------------------------------------------------------------------------
3680 Returns the result of subtracting the absolute values of the extended
3681 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3682 difference is negated before being returned. `zSign' is ignored if the
3683 result is a NaN. The subtraction is performed according to the IEC/IEEE
3684 Standard for Binary Floating-Point Arithmetic.
3685 -------------------------------------------------------------------------------
3687 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3689 int32 aExp
, bExp
, zExp
;
3690 bits64 aSig
, bSig
, zSig0
, zSig1
;
3694 aSig
= extractFloatx80Frac( a
);
3695 aExp
= extractFloatx80Exp( a
);
3696 bSig
= extractFloatx80Frac( b
);
3697 bExp
= extractFloatx80Exp( b
);
3698 expDiff
= aExp
- bExp
;
3699 if ( 0 < expDiff
) goto aExpBigger
;
3700 if ( expDiff
< 0 ) goto bExpBigger
;
3701 if ( aExp
== 0x7FFF ) {
3702 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3703 return propagateFloatx80NaN( a
, b
);
3705 float_raise( float_flag_invalid
);
3706 z
.low
= floatx80_default_nan_low
;
3707 z
.high
= floatx80_default_nan_high
;
3715 if ( bSig
< aSig
) goto aBigger
;
3716 if ( aSig
< bSig
) goto bBigger
;
3717 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
3719 if ( bExp
== 0x7FFF ) {
3720 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3721 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3723 if ( aExp
== 0 ) ++expDiff
;
3724 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3726 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3729 goto normalizeRoundAndPack
;
3731 if ( aExp
== 0x7FFF ) {
3732 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3735 if ( bExp
== 0 ) --expDiff
;
3736 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3738 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3740 normalizeRoundAndPack
:
3742 normalizeRoundAndPackFloatx80(
3743 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3748 -------------------------------------------------------------------------------
3749 Returns the result of adding the extended double-precision floating-point
3750 values `a' and `b'. The operation is performed according to the IEC/IEEE
3751 Standard for Binary Floating-Point Arithmetic.
3752 -------------------------------------------------------------------------------
3754 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
3758 aSign
= extractFloatx80Sign( a
);
3759 bSign
= extractFloatx80Sign( b
);
3760 if ( aSign
== bSign
) {
3761 return addFloatx80Sigs( a
, b
, aSign
);
3764 return subFloatx80Sigs( a
, b
, aSign
);
3770 -------------------------------------------------------------------------------
3771 Returns the result of subtracting the extended double-precision floating-
3772 point values `a' and `b'. The operation is performed according to the
3773 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3774 -------------------------------------------------------------------------------
3776 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
3780 aSign
= extractFloatx80Sign( a
);
3781 bSign
= extractFloatx80Sign( b
);
3782 if ( aSign
== bSign
) {
3783 return subFloatx80Sigs( a
, b
, aSign
);
3786 return addFloatx80Sigs( a
, b
, aSign
);
3792 -------------------------------------------------------------------------------
3793 Returns the result of multiplying the extended double-precision floating-
3794 point values `a' and `b'. The operation is performed according to the
3795 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3796 -------------------------------------------------------------------------------
3798 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
3800 flag aSign
, bSign
, zSign
;
3801 int32 aExp
, bExp
, zExp
;
3802 bits64 aSig
, bSig
, zSig0
, zSig1
;
3805 aSig
= extractFloatx80Frac( a
);
3806 aExp
= extractFloatx80Exp( a
);
3807 aSign
= extractFloatx80Sign( a
);
3808 bSig
= extractFloatx80Frac( b
);
3809 bExp
= extractFloatx80Exp( b
);
3810 bSign
= extractFloatx80Sign( b
);
3811 zSign
= aSign
^ bSign
;
3812 if ( aExp
== 0x7FFF ) {
3813 if ( (bits64
) ( aSig
<<1 )
3814 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3815 return propagateFloatx80NaN( a
, b
);
3817 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3818 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3820 if ( bExp
== 0x7FFF ) {
3821 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3822 if ( ( aExp
| aSig
) == 0 ) {
3824 float_raise( float_flag_invalid
);
3825 z
.low
= floatx80_default_nan_low
;
3826 z
.high
= floatx80_default_nan_high
;
3829 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3832 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3833 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3836 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3837 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3839 zExp
= aExp
+ bExp
- 0x3FFE;
3840 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3841 if ( 0 < (sbits64
) zSig0
) {
3842 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3846 roundAndPackFloatx80(
3847 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3852 -------------------------------------------------------------------------------
3853 Returns the result of dividing the extended double-precision floating-point
3854 value `a' by the corresponding value `b'. The operation is performed
3855 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3856 -------------------------------------------------------------------------------
3858 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
3860 flag aSign
, bSign
, zSign
;
3861 int32 aExp
, bExp
, zExp
;
3862 bits64 aSig
, bSig
, zSig0
, zSig1
;
3863 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3866 aSig
= extractFloatx80Frac( a
);
3867 aExp
= extractFloatx80Exp( a
);
3868 aSign
= extractFloatx80Sign( a
);
3869 bSig
= extractFloatx80Frac( b
);
3870 bExp
= extractFloatx80Exp( b
);
3871 bSign
= extractFloatx80Sign( b
);
3872 zSign
= aSign
^ bSign
;
3873 if ( aExp
== 0x7FFF ) {
3874 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3875 if ( bExp
== 0x7FFF ) {
3876 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3879 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3881 if ( bExp
== 0x7FFF ) {
3882 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3883 return packFloatx80( zSign
, 0, 0 );
3887 if ( ( aExp
| aSig
) == 0 ) {
3889 float_raise( float_flag_invalid
);
3890 z
.low
= floatx80_default_nan_low
;
3891 z
.high
= floatx80_default_nan_high
;
3894 float_raise( float_flag_divbyzero
);
3895 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3897 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3900 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3901 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3903 zExp
= aExp
- bExp
+ 0x3FFE;
3905 if ( bSig
<= aSig
) {
3906 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3909 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3910 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3911 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3912 while ( (sbits64
) rem0
< 0 ) {
3914 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3916 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3917 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3918 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3919 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3920 while ( (sbits64
) rem1
< 0 ) {
3922 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3924 zSig1
|= ( ( rem1
| rem2
) != 0 );
3927 roundAndPackFloatx80(
3928 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3933 -------------------------------------------------------------------------------
3934 Returns the remainder of the extended double-precision floating-point value
3935 `a' with respect to the corresponding value `b'. The operation is performed
3936 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3937 -------------------------------------------------------------------------------
3939 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3941 flag aSign
, bSign
, zSign
;
3942 int32 aExp
, bExp
, expDiff
;
3943 bits64 aSig0
, aSig1
, bSig
;
3944 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3947 aSig0
= extractFloatx80Frac( a
);
3948 aExp
= extractFloatx80Exp( a
);
3949 aSign
= extractFloatx80Sign( a
);
3950 bSig
= extractFloatx80Frac( b
);
3951 bExp
= extractFloatx80Exp( b
);
3952 bSign
= extractFloatx80Sign( b
);
3953 if ( aExp
== 0x7FFF ) {
3954 if ( (bits64
) ( aSig0
<<1 )
3955 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3956 return propagateFloatx80NaN( a
, b
);
3960 if ( bExp
== 0x7FFF ) {
3961 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3967 float_raise( float_flag_invalid
);
3968 z
.low
= floatx80_default_nan_low
;
3969 z
.high
= floatx80_default_nan_high
;
3972 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3975 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3976 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3978 bSig
|= LIT64( 0x8000000000000000 );
3980 expDiff
= aExp
- bExp
;
3982 if ( expDiff
< 0 ) {
3983 if ( expDiff
< -1 ) return a
;
3984 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3987 q
= ( bSig
<= aSig0
);
3988 if ( q
) aSig0
-= bSig
;
3990 while ( 0 < expDiff
) {
3991 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3992 q
= ( 2 < q
) ? q
- 2 : 0;
3993 mul64To128( bSig
, q
, &term0
, &term1
);
3994 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3995 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3999 if ( 0 < expDiff
) {
4000 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4001 q
= ( 2 < q
) ? q
- 2 : 0;
4003 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4004 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4005 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4006 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4008 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4015 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4016 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4017 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4020 aSig0
= alternateASig0
;
4021 aSig1
= alternateASig1
;
4025 normalizeRoundAndPackFloatx80(
4026 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
4031 -------------------------------------------------------------------------------
4032 Returns the square root of the extended double-precision floating-point
4033 value `a'. The operation is performed according to the IEC/IEEE Standard
4034 for Binary Floating-Point Arithmetic.
4035 -------------------------------------------------------------------------------
4037 floatx80
floatx80_sqrt( floatx80 a
)
4041 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4042 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4045 aSig0
= extractFloatx80Frac( a
);
4046 aExp
= extractFloatx80Exp( a
);
4047 aSign
= extractFloatx80Sign( a
);
4048 if ( aExp
== 0x7FFF ) {
4049 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
4050 if ( ! aSign
) return a
;
4054 if ( ( aExp
| aSig0
) == 0 ) return a
;
4056 float_raise( float_flag_invalid
);
4057 z
.low
= floatx80_default_nan_low
;
4058 z
.high
= floatx80_default_nan_high
;
4062 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4063 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4065 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4066 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4067 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4068 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4069 doubleZSig0
= zSig0
<<1;
4070 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4071 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4072 while ( (sbits64
) rem0
< 0 ) {
4075 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4077 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4078 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4079 if ( zSig1
== 0 ) zSig1
= 1;
4080 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4081 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4082 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4083 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4084 while ( (sbits64
) rem1
< 0 ) {
4086 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4088 term2
|= doubleZSig0
;
4089 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4091 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4093 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4094 zSig0
|= doubleZSig0
;
4096 roundAndPackFloatx80(
4097 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
4102 -------------------------------------------------------------------------------
4103 Returns 1 if the extended double-precision floating-point value `a' is
4104 equal to the corresponding value `b', and 0 otherwise. The comparison is
4105 performed according to the IEC/IEEE Standard for Binary Floating-Point
4107 -------------------------------------------------------------------------------
4109 flag
floatx80_eq( floatx80 a
, floatx80 b
)
4112 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4113 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4114 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4115 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4117 if ( floatx80_is_signaling_nan( a
)
4118 || floatx80_is_signaling_nan( b
) ) {
4119 float_raise( float_flag_invalid
);
4125 && ( ( a
.high
== b
.high
)
4127 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4133 -------------------------------------------------------------------------------
4134 Returns 1 if the extended double-precision floating-point value `a' is
4135 less than or equal to the corresponding value `b', and 0 otherwise. The
4136 comparison is performed according to the IEC/IEEE Standard for Binary
4137 Floating-Point Arithmetic.
4138 -------------------------------------------------------------------------------
4140 flag
floatx80_le( floatx80 a
, floatx80 b
)
4144 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4145 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4146 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4147 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4149 float_raise( float_flag_invalid
);
4152 aSign
= extractFloatx80Sign( a
);
4153 bSign
= extractFloatx80Sign( b
);
4154 if ( aSign
!= bSign
) {
4157 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4161 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4162 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4167 -------------------------------------------------------------------------------
4168 Returns 1 if the extended double-precision floating-point value `a' is
4169 less than the corresponding value `b', and 0 otherwise. The comparison
4170 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4172 -------------------------------------------------------------------------------
4174 flag
floatx80_lt( floatx80 a
, floatx80 b
)
4178 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4179 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4180 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4181 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4183 float_raise( float_flag_invalid
);
4186 aSign
= extractFloatx80Sign( a
);
4187 bSign
= extractFloatx80Sign( b
);
4188 if ( aSign
!= bSign
) {
4191 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4195 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4196 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4201 -------------------------------------------------------------------------------
4202 Returns 1 if the extended double-precision floating-point value `a' is equal
4203 to the corresponding value `b', and 0 otherwise. The invalid exception is
4204 raised if either operand is a NaN. Otherwise, the comparison is performed
4205 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4206 -------------------------------------------------------------------------------
4208 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
4211 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4212 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4213 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4214 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4216 float_raise( float_flag_invalid
);
4221 && ( ( a
.high
== b
.high
)
4223 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4229 -------------------------------------------------------------------------------
4230 Returns 1 if the extended double-precision floating-point value `a' is less
4231 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4232 do not cause an exception. Otherwise, the comparison is performed according
4233 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4234 -------------------------------------------------------------------------------
4236 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
4240 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4241 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4242 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4243 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4245 if ( floatx80_is_signaling_nan( a
)
4246 || floatx80_is_signaling_nan( b
) ) {
4247 float_raise( float_flag_invalid
);
4251 aSign
= extractFloatx80Sign( a
);
4252 bSign
= extractFloatx80Sign( b
);
4253 if ( aSign
!= bSign
) {
4256 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4260 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4261 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4266 -------------------------------------------------------------------------------
4267 Returns 1 if the extended double-precision floating-point value `a' is less
4268 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4269 an exception. Otherwise, the comparison is performed according to the
4270 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4271 -------------------------------------------------------------------------------
4273 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
4277 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4278 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4279 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4280 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4282 if ( floatx80_is_signaling_nan( a
)
4283 || floatx80_is_signaling_nan( b
) ) {
4284 float_raise( float_flag_invalid
);
4288 aSign
= extractFloatx80Sign( a
);
4289 bSign
= extractFloatx80Sign( b
);
4290 if ( aSign
!= bSign
) {
4293 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4297 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4298 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4307 -------------------------------------------------------------------------------
4308 Returns the result of converting the quadruple-precision floating-point
4309 value `a' to the 32-bit two's complement integer format. The conversion
4310 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4311 Arithmetic---which means in particular that the conversion is rounded
4312 according to the current rounding mode. If `a' is a NaN, the largest
4313 positive integer is returned. Otherwise, if the conversion overflows, the
4314 largest integer with the same sign as `a' is returned.
4315 -------------------------------------------------------------------------------
4317 int32
float128_to_int32( float128 a
)
4320 int32 aExp
, shiftCount
;
4321 bits64 aSig0
, aSig1
;
4323 aSig1
= extractFloat128Frac1( a
);
4324 aSig0
= extractFloat128Frac0( a
);
4325 aExp
= extractFloat128Exp( a
);
4326 aSign
= extractFloat128Sign( a
);
4327 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4328 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4329 aSig0
|= ( aSig1
!= 0 );
4330 shiftCount
= 0x4028 - aExp
;
4331 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4332 return roundAndPackInt32( aSign
, aSig0
);
4337 -------------------------------------------------------------------------------
4338 Returns the result of converting the quadruple-precision floating-point
4339 value `a' to the 32-bit two's complement integer format. The conversion
4340 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4341 Arithmetic, except that the conversion is always rounded toward zero. If
4342 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4343 conversion overflows, the largest integer with the same sign as `a' is
4345 -------------------------------------------------------------------------------
4347 int32
float128_to_int32_round_to_zero( float128 a
)
4350 int32 aExp
, shiftCount
;
4351 bits64 aSig0
, aSig1
, savedASig
;
4354 aSig1
= extractFloat128Frac1( a
);
4355 aSig0
= extractFloat128Frac0( a
);
4356 aExp
= extractFloat128Exp( a
);
4357 aSign
= extractFloat128Sign( a
);
4358 aSig0
|= ( aSig1
!= 0 );
4359 if ( 0x401E < aExp
) {
4360 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4363 else if ( aExp
< 0x3FFF ) {
4364 if ( aExp
|| aSig0
) set_float_exception_inexact_flag();
4367 aSig0
|= LIT64( 0x0001000000000000 );
4368 shiftCount
= 0x402F - aExp
;
4370 aSig0
>>= shiftCount
;
4372 if ( aSign
) z
= - z
;
4373 if ( ( z
< 0 ) ^ aSign
) {
4375 float_raise( float_flag_invalid
);
4376 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4378 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4379 set_float_exception_inexact_flag();
4386 -------------------------------------------------------------------------------
4387 Returns the result of converting the quadruple-precision floating-point
4388 value `a' to the 64-bit two's complement integer format. The conversion
4389 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4390 Arithmetic---which means in particular that the conversion is rounded
4391 according to the current rounding mode. If `a' is a NaN, the largest
4392 positive integer is returned. Otherwise, if the conversion overflows, the
4393 largest integer with the same sign as `a' is returned.
4394 -------------------------------------------------------------------------------
4396 int64
float128_to_int64( float128 a
)
4399 int32 aExp
, shiftCount
;
4400 bits64 aSig0
, aSig1
;
4402 aSig1
= extractFloat128Frac1( a
);
4403 aSig0
= extractFloat128Frac0( a
);
4404 aExp
= extractFloat128Exp( a
);
4405 aSign
= extractFloat128Sign( a
);
4406 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4407 shiftCount
= 0x402F - aExp
;
4408 if ( shiftCount
<= 0 ) {
4409 if ( 0x403E < aExp
) {
4410 float_raise( float_flag_invalid
);
4412 || ( ( aExp
== 0x7FFF )
4413 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4416 return LIT64( 0x7FFFFFFFFFFFFFFF );
4418 return (sbits64
) LIT64( 0x8000000000000000 );
4420 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4423 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4425 return roundAndPackInt64( aSign
, aSig0
, aSig1
);
4430 -------------------------------------------------------------------------------
4431 Returns the result of converting the quadruple-precision floating-point
4432 value `a' to the 64-bit two's complement integer format. The conversion
4433 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4434 Arithmetic, except that the conversion is always rounded toward zero.
4435 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4436 the conversion overflows, the largest integer with the same sign as `a' is
4438 -------------------------------------------------------------------------------
4440 int64
float128_to_int64_round_to_zero( float128 a
)
4443 int32 aExp
, shiftCount
;
4444 bits64 aSig0
, aSig1
;
4447 aSig1
= extractFloat128Frac1( a
);
4448 aSig0
= extractFloat128Frac0( a
);
4449 aExp
= extractFloat128Exp( a
);
4450 aSign
= extractFloat128Sign( a
);
4451 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4452 shiftCount
= aExp
- 0x402F;
4453 if ( 0 < shiftCount
) {
4454 if ( 0x403E <= aExp
) {
4455 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4456 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4457 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4458 if ( aSig1
) set_float_exception_inexact_flag();
4461 float_raise( float_flag_invalid
);
4462 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4463 return LIT64( 0x7FFFFFFFFFFFFFFF );
4466 return (sbits64
) LIT64( 0x8000000000000000 );
4468 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4469 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4470 set_float_exception_inexact_flag();
4474 if ( aExp
< 0x3FFF ) {
4475 if ( aExp
| aSig0
| aSig1
) {
4476 set_float_exception_inexact_flag();
4480 z
= aSig0
>>( - shiftCount
);
4482 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4483 set_float_exception_inexact_flag();
4486 if ( aSign
) z
= - z
;
#if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
    && defined(SOFTFLOAT_NEED_FIXUNS)
/*
 * Converts the quadruple-precision value `a' to a 64-bit unsigned integer,
 * rounding toward zero.  Modelled on float128_to_int64_round_to_zero, but
 * without the signed-overflow handling: a negative input is converted by
 * magnitude and then negated modulo 2^64 without raising invalid.  When the
 * exponent field is 0x403F or larger (which includes infinities and NaNs)
 * the invalid exception is raised and all-ones is returned.  The inexact
 * flag is set when fraction bits are discarded.
 */
uint64 float128_to_uint64_round_to_zero(float128 a)
{
    flag aSign;
    int32 aExp, shiftCount;
    bits64 aSig0, aSig1;
    uint64 z;

    aSig1 = extractFloat128Frac1(a);
    aSig0 = extractFloat128Frac0(a);
    aExp = extractFloat128Exp(a);
    aSign = extractFloat128Sign(a);
    /* Non-zero exponent field: insert the implicit leading bit. */
    if (aExp) {
        aSig0 |= LIT64(0x0001000000000000);
    }

    shiftCount = aExp - 0x402F;
    if (0 < shiftCount) {
        if (0x403F <= aExp) {
            /*
             * The signed variant special-cases a.high == 0xC03E...; that
             * encoding has exponent field 0x403E and can never reach this
             * branch, so the check was dead code and has been dropped.
             */
            float_raise(float_flag_invalid);
            return LIT64(0xFFFFFFFFFFFFFFFF);
        }
        z = (aSig0 << shiftCount) | (aSig1 >> ((-shiftCount) & 63));
        if ((bits64) (aSig1 << shiftCount)) {
            set_float_exception_inexact_flag();
        }
    }
    else {
        if (aExp < 0x3FFF) {
            /* Truncates to zero; inexact unless the input is zero. */
            if (aExp | aSig0 | aSig1) {
                set_float_exception_inexact_flag();
            }
            return 0;
        }
        z = aSig0 >> (-shiftCount);
        if (aSig1 || (shiftCount && (bits64) (aSig0 << (shiftCount & 63)))) {
            set_float_exception_inexact_flag();
        }
    }
    /* Unsigned negation wraps modulo 2^64. */
    if (aSign) {
        z = -z;
    }
    return z;

}
#endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4545 -------------------------------------------------------------------------------
4546 Returns the result of converting the quadruple-precision floating-point
4547 value `a' to the single-precision floating-point format. The conversion
4548 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4550 -------------------------------------------------------------------------------
4552 float32
float128_to_float32( float128 a
)
4556 bits64 aSig0
, aSig1
;
4559 aSig1
= extractFloat128Frac1( a
);
4560 aSig0
= extractFloat128Frac0( a
);
4561 aExp
= extractFloat128Exp( a
);
4562 aSign
= extractFloat128Sign( a
);
4563 if ( aExp
== 0x7FFF ) {
4564 if ( aSig0
| aSig1
) {
4565 return commonNaNToFloat32( float128ToCommonNaN( a
) );
4567 return packFloat32( aSign
, 0xFF, 0 );
4569 aSig0
|= ( aSig1
!= 0 );
4570 shift64RightJamming( aSig0
, 18, &aSig0
);
4571 zSig
= (bits32
)aSig0
;
4572 if ( aExp
|| zSig
) {
4576 return roundAndPackFloat32( aSign
, aExp
, zSig
);
4581 -------------------------------------------------------------------------------
4582 Returns the result of converting the quadruple-precision floating-point
4583 value `a' to the double-precision floating-point format. The conversion
4584 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4586 -------------------------------------------------------------------------------
4588 float64
float128_to_float64( float128 a
)
4592 bits64 aSig0
, aSig1
;
4594 aSig1
= extractFloat128Frac1( a
);
4595 aSig0
= extractFloat128Frac0( a
);
4596 aExp
= extractFloat128Exp( a
);
4597 aSign
= extractFloat128Sign( a
);
4598 if ( aExp
== 0x7FFF ) {
4599 if ( aSig0
| aSig1
) {
4600 return commonNaNToFloat64( float128ToCommonNaN( a
) );
4602 return packFloat64( aSign
, 0x7FF, 0 );
4604 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4605 aSig0
|= ( aSig1
!= 0 );
4606 if ( aExp
|| aSig0
) {
4607 aSig0
|= LIT64( 0x4000000000000000 );
4610 return roundAndPackFloat64( aSign
, aExp
, aSig0
);
4617 -------------------------------------------------------------------------------
4618 Returns the result of converting the quadruple-precision floating-point
4619 value `a' to the extended double-precision floating-point format. The
4620 conversion is performed according to the IEC/IEEE Standard for Binary
4621 Floating-Point Arithmetic.
4622 -------------------------------------------------------------------------------
4624 floatx80
float128_to_floatx80( float128 a
)
4628 bits64 aSig0
, aSig1
;
4630 aSig1
= extractFloat128Frac1( a
);
4631 aSig0
= extractFloat128Frac0( a
);
4632 aExp
= extractFloat128Exp( a
);
4633 aSign
= extractFloat128Sign( a
);
4634 if ( aExp
== 0x7FFF ) {
4635 if ( aSig0
| aSig1
) {
4636 return commonNaNToFloatx80( float128ToCommonNaN( a
) );
4638 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4641 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4642 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4645 aSig0
|= LIT64( 0x0001000000000000 );
4647 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4648 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1
);
4655 -------------------------------------------------------------------------------
4656 Rounds the quadruple-precision floating-point value `a' to an integer, and
4657 returns the result as a quadruple-precision floating-point value. The
4658 operation is performed according to the IEC/IEEE Standard for Binary
4659 Floating-Point Arithmetic.
4660 -------------------------------------------------------------------------------
4662 float128
float128_round_to_int( float128 a
)
4666 bits64 lastBitMask
, roundBitsMask
;
4670 aExp
= extractFloat128Exp( a
);
4671 if ( 0x402F <= aExp
) {
4672 if ( 0x406F <= aExp
) {
4673 if ( ( aExp
== 0x7FFF )
4674 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4676 return propagateFloat128NaN( a
, a
);
4681 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4682 roundBitsMask
= lastBitMask
- 1;
4684 roundingMode
= float_rounding_mode
;
4685 if ( roundingMode
== float_round_nearest_even
) {
4686 if ( lastBitMask
) {
4687 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4688 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4691 if ( (sbits64
) z
.low
< 0 ) {
4693 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4697 else if ( roundingMode
!= float_round_to_zero
) {
4698 if ( extractFloat128Sign( z
)
4699 ^ ( roundingMode
== float_round_up
) ) {
4700 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4703 z
.low
&= ~ roundBitsMask
;
4706 if ( aExp
< 0x3FFF ) {
4707 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4708 set_float_exception_inexact_flag();
4709 aSign
= extractFloat128Sign( a
);
4710 switch ( float_rounding_mode
) {
4711 case float_round_nearest_even
:
4712 if ( ( aExp
== 0x3FFE )
4713 && ( extractFloat128Frac0( a
)
4714 | extractFloat128Frac1( a
) )
4716 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4719 case float_round_to_zero
:
4721 case float_round_down
:
4723 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4724 : packFloat128( 0, 0, 0, 0 );
4725 case float_round_up
:
4727 aSign
? packFloat128( 1, 0, 0, 0 )
4728 : packFloat128( 0, 0x3FFF, 0, 0 );
4730 return packFloat128( aSign
, 0, 0, 0 );
4733 lastBitMask
<<= 0x402F - aExp
;
4734 roundBitsMask
= lastBitMask
- 1;
4737 roundingMode
= float_rounding_mode
;
4738 if ( roundingMode
== float_round_nearest_even
) {
4739 z
.high
+= lastBitMask
>>1;
4740 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4741 z
.high
&= ~ lastBitMask
;
4744 else if ( roundingMode
!= float_round_to_zero
) {
4745 if ( extractFloat128Sign( z
)
4746 ^ ( roundingMode
== float_round_up
) ) {
4747 z
.high
|= ( a
.low
!= 0 );
4748 z
.high
+= roundBitsMask
;
4751 z
.high
&= ~ roundBitsMask
;
4753 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4754 set_float_exception_inexact_flag();
4761 -------------------------------------------------------------------------------
4762 Returns the result of adding the absolute values of the quadruple-precision
4763 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4764 before being returned. `zSign' is ignored if the result is a NaN.
4765 The addition is performed according to the IEC/IEEE Standard for Binary
4766 Floating-Point Arithmetic.
4767 -------------------------------------------------------------------------------
4769 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4771 int32 aExp
, bExp
, zExp
;
4772 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4775 aSig1
= extractFloat128Frac1( a
);
4776 aSig0
= extractFloat128Frac0( a
);
4777 aExp
= extractFloat128Exp( a
);
4778 bSig1
= extractFloat128Frac1( b
);
4779 bSig0
= extractFloat128Frac0( b
);
4780 bExp
= extractFloat128Exp( b
);
4781 expDiff
= aExp
- bExp
;
4782 if ( 0 < expDiff
) {
4783 if ( aExp
== 0x7FFF ) {
4784 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4791 bSig0
|= LIT64( 0x0001000000000000 );
4793 shift128ExtraRightJamming(
4794 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4797 else if ( expDiff
< 0 ) {
4798 if ( bExp
== 0x7FFF ) {
4799 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4800 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4806 aSig0
|= LIT64( 0x0001000000000000 );
4808 shift128ExtraRightJamming(
4809 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4813 if ( aExp
== 0x7FFF ) {
4814 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4815 return propagateFloat128NaN( a
, b
);
4819 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4820 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4822 zSig0
|= LIT64( 0x0002000000000000 );
4826 aSig0
|= LIT64( 0x0001000000000000 );
4827 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4829 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4832 shift128ExtraRightJamming(
4833 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4835 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4840 -------------------------------------------------------------------------------
4841 Returns the result of subtracting the absolute values of the quadruple-
4842 precision floating-point values `a' and `b'. If `zSign' is 1, the
4843 difference is negated before being returned. `zSign' is ignored if the
4844 result is a NaN. The subtraction is performed according to the IEC/IEEE
4845 Standard for Binary Floating-Point Arithmetic.
4846 -------------------------------------------------------------------------------
4848 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4850 int32 aExp
, bExp
, zExp
;
4851 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4855 aSig1
= extractFloat128Frac1( a
);
4856 aSig0
= extractFloat128Frac0( a
);
4857 aExp
= extractFloat128Exp( a
);
4858 bSig1
= extractFloat128Frac1( b
);
4859 bSig0
= extractFloat128Frac0( b
);
4860 bExp
= extractFloat128Exp( b
);
4861 expDiff
= aExp
- bExp
;
4862 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4863 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4864 if ( 0 < expDiff
) goto aExpBigger
;
4865 if ( expDiff
< 0 ) goto bExpBigger
;
4866 if ( aExp
== 0x7FFF ) {
4867 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4868 return propagateFloat128NaN( a
, b
);
4870 float_raise( float_flag_invalid
);
4871 z
.low
= float128_default_nan_low
;
4872 z
.high
= float128_default_nan_high
;
4879 if ( bSig0
< aSig0
) goto aBigger
;
4880 if ( aSig0
< bSig0
) goto bBigger
;
4881 if ( bSig1
< aSig1
) goto aBigger
;
4882 if ( aSig1
< bSig1
) goto bBigger
;
4883 return packFloat128( float_rounding_mode
== float_round_down
, 0, 0, 0 );
4885 if ( bExp
== 0x7FFF ) {
4886 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4887 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4893 aSig0
|= LIT64( 0x4000000000000000 );
4895 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4896 bSig0
|= LIT64( 0x4000000000000000 );
4898 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4901 goto normalizeRoundAndPack
;
4903 if ( aExp
== 0x7FFF ) {
4904 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4911 bSig0
|= LIT64( 0x4000000000000000 );
4913 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4914 aSig0
|= LIT64( 0x4000000000000000 );
4916 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4918 normalizeRoundAndPack
:
4920 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1
);
4925 -------------------------------------------------------------------------------
4926 Returns the result of adding the quadruple-precision floating-point values
4927 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4928 for Binary Floating-Point Arithmetic.
4929 -------------------------------------------------------------------------------
4931 float128
float128_add( float128 a
, float128 b
)
4935 aSign
= extractFloat128Sign( a
);
4936 bSign
= extractFloat128Sign( b
);
4937 if ( aSign
== bSign
) {
4938 return addFloat128Sigs( a
, b
, aSign
);
4941 return subFloat128Sigs( a
, b
, aSign
);
4947 -------------------------------------------------------------------------------
4948 Returns the result of subtracting the quadruple-precision floating-point
4949 values `a' and `b'. The operation is performed according to the IEC/IEEE
4950 Standard for Binary Floating-Point Arithmetic.
4951 -------------------------------------------------------------------------------
4953 float128
float128_sub( float128 a
, float128 b
)
4957 aSign
= extractFloat128Sign( a
);
4958 bSign
= extractFloat128Sign( b
);
4959 if ( aSign
== bSign
) {
4960 return subFloat128Sigs( a
, b
, aSign
);
4963 return addFloat128Sigs( a
, b
, aSign
);
4969 -------------------------------------------------------------------------------
4970 Returns the result of multiplying the quadruple-precision floating-point
4971 values `a' and `b'. The operation is performed according to the IEC/IEEE
4972 Standard for Binary Floating-Point Arithmetic.
4973 -------------------------------------------------------------------------------
4975 float128
float128_mul( float128 a
, float128 b
)
4977 flag aSign
, bSign
, zSign
;
4978 int32 aExp
, bExp
, zExp
;
4979 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4982 aSig1
= extractFloat128Frac1( a
);
4983 aSig0
= extractFloat128Frac0( a
);
4984 aExp
= extractFloat128Exp( a
);
4985 aSign
= extractFloat128Sign( a
);
4986 bSig1
= extractFloat128Frac1( b
);
4987 bSig0
= extractFloat128Frac0( b
);
4988 bExp
= extractFloat128Exp( b
);
4989 bSign
= extractFloat128Sign( b
);
4990 zSign
= aSign
^ bSign
;
4991 if ( aExp
== 0x7FFF ) {
4992 if ( ( aSig0
| aSig1
)
4993 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4994 return propagateFloat128NaN( a
, b
);
4996 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4997 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4999 if ( bExp
== 0x7FFF ) {
5000 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5001 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5003 float_raise( float_flag_invalid
);
5004 z
.low
= float128_default_nan_low
;
5005 z
.high
= float128_default_nan_high
;
5008 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5011 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5012 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5015 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5016 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5018 zExp
= aExp
+ bExp
- 0x4000;
5019 aSig0
|= LIT64( 0x0001000000000000 );
5020 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5021 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5022 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5023 zSig2
|= ( zSig3
!= 0 );
5024 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5025 shift128ExtraRightJamming(
5026 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5029 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
5034 -------------------------------------------------------------------------------
5035 Returns the result of dividing the quadruple-precision floating-point value
5036 `a' by the corresponding value `b'. The operation is performed according to
5037 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5038 -------------------------------------------------------------------------------
5040 float128
float128_div( float128 a
, float128 b
)
5042 flag aSign
, bSign
, zSign
;
5043 int32 aExp
, bExp
, zExp
;
5044 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5045 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5048 aSig1
= extractFloat128Frac1( a
);
5049 aSig0
= extractFloat128Frac0( a
);
5050 aExp
= extractFloat128Exp( a
);
5051 aSign
= extractFloat128Sign( a
);
5052 bSig1
= extractFloat128Frac1( b
);
5053 bSig0
= extractFloat128Frac0( b
);
5054 bExp
= extractFloat128Exp( b
);
5055 bSign
= extractFloat128Sign( b
);
5056 zSign
= aSign
^ bSign
;
5057 if ( aExp
== 0x7FFF ) {
5058 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
5059 if ( bExp
== 0x7FFF ) {
5060 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5063 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5065 if ( bExp
== 0x7FFF ) {
5066 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5067 return packFloat128( zSign
, 0, 0, 0 );
5070 if ( ( bSig0
| bSig1
) == 0 ) {
5071 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5073 float_raise( float_flag_invalid
);
5074 z
.low
= float128_default_nan_low
;
5075 z
.high
= float128_default_nan_high
;
5078 float_raise( float_flag_divbyzero
);
5079 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5081 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5084 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5085 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5087 zExp
= aExp
- bExp
+ 0x3FFD;
5089 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5091 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5092 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5093 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5096 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5097 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5098 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5099 while ( (sbits64
) rem0
< 0 ) {
5101 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5103 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5104 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5105 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5106 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5107 while ( (sbits64
) rem1
< 0 ) {
5109 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5111 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5113 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5114 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
5119 -------------------------------------------------------------------------------
5120 Returns the remainder of the quadruple-precision floating-point value `a'
5121 with respect to the corresponding value `b'. The operation is performed
5122 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5123 -------------------------------------------------------------------------------
5125 float128
float128_rem( float128 a
, float128 b
)
5128 int32 aExp
, bExp
, expDiff
;
5129 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5130 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5134 aSig1
= extractFloat128Frac1( a
);
5135 aSig0
= extractFloat128Frac0( a
);
5136 aExp
= extractFloat128Exp( a
);
5137 aSign
= extractFloat128Sign( a
);
5138 bSig1
= extractFloat128Frac1( b
);
5139 bSig0
= extractFloat128Frac0( b
);
5140 bExp
= extractFloat128Exp( b
);
5141 if ( aExp
== 0x7FFF ) {
5142 if ( ( aSig0
| aSig1
)
5143 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5144 return propagateFloat128NaN( a
, b
);
5148 if ( bExp
== 0x7FFF ) {
5149 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
5153 if ( ( bSig0
| bSig1
) == 0 ) {
5155 float_raise( float_flag_invalid
);
5156 z
.low
= float128_default_nan_low
;
5157 z
.high
= float128_default_nan_high
;
5160 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5163 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5164 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5166 expDiff
= aExp
- bExp
;
5167 if ( expDiff
< -1 ) return a
;
5169 aSig0
| LIT64( 0x0001000000000000 ),
5171 15 - ( expDiff
< 0 ),
5176 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5177 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5178 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5180 while ( 0 < expDiff
) {
5181 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5182 q
= ( 4 < q
) ? q
- 4 : 0;
5183 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5184 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5185 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5186 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5189 if ( -64 < expDiff
) {
5190 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5191 q
= ( 4 < q
) ? q
- 4 : 0;
5193 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5195 if ( expDiff
< 0 ) {
5196 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5199 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5201 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5202 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5205 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5206 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5209 alternateASig0
= aSig0
;
5210 alternateASig1
= aSig1
;
5212 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5213 } while ( 0 <= (sbits64
) aSig0
);
5215 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5216 if ( ( sigMean0
< 0 )
5217 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5218 aSig0
= alternateASig0
;
5219 aSig1
= alternateASig1
;
5221 zSign
= ( (sbits64
) aSig0
< 0 );
5222 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5224 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
5229 -------------------------------------------------------------------------------
5230 Returns the square root of the quadruple-precision floating-point value `a'.
5231 The operation is performed according to the IEC/IEEE Standard for Binary
5232 Floating-Point Arithmetic.
5233 -------------------------------------------------------------------------------
5235 float128
float128_sqrt( float128 a
)
5239 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5240 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5243 aSig1
= extractFloat128Frac1( a
);
5244 aSig0
= extractFloat128Frac0( a
);
5245 aExp
= extractFloat128Exp( a
);
5246 aSign
= extractFloat128Sign( a
);
5247 if ( aExp
== 0x7FFF ) {
5248 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a
);
5249 if ( ! aSign
) return a
;
5253 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5255 float_raise( float_flag_invalid
);
5256 z
.low
= float128_default_nan_low
;
5257 z
.high
= float128_default_nan_high
;
5261 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5262 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5264 zExp
= (int32
) ( (aExp
- 0x3FFF) >> 1) + 0x3FFE;
5265 aSig0
|= LIT64( 0x0001000000000000 );
5266 zSig0
= estimateSqrt32((int16
)aExp
, (bits32
)(aSig0
>>17));
5267 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5268 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5269 doubleZSig0
= zSig0
<<1;
5270 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5271 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5272 while ( (sbits64
) rem0
< 0 ) {
5275 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5277 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5278 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5279 if ( zSig1
== 0 ) zSig1
= 1;
5280 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5281 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5282 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5283 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5284 while ( (sbits64
) rem1
< 0 ) {
5286 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5288 term2
|= doubleZSig0
;
5289 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5291 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5293 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5294 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2
);
5299 -------------------------------------------------------------------------------
5300 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5301 the corresponding value `b', and 0 otherwise. The comparison is performed
5302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 -------------------------------------------------------------------------------
5305 flag
float128_eq( float128 a
, float128 b
)
5308 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5309 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5310 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5311 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5313 if ( float128_is_signaling_nan( a
)
5314 || float128_is_signaling_nan( b
) ) {
5315 float_raise( float_flag_invalid
);
5321 && ( ( a
.high
== b
.high
)
5323 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5329 -------------------------------------------------------------------------------
5330 Returns 1 if the quadruple-precision floating-point value `a' is less than
5331 or equal to the corresponding value `b', and 0 otherwise. The comparison
5332 is performed according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5334 -------------------------------------------------------------------------------
/*
 * float128_le: less-than-or-equal comparison of two float128 operands,
 * returning a flag.  In the visible lines float_raise( float_flag_invalid )
 * follows the NaN screen directly, with no signaling-NaN test in between.
 * NOTE(review): this listing omits several source lines of the function
 * (declarations, braces, returns); comments describe only visible lines.
 */
5336 flag
float128_le( float128 a
, float128 b
)
/* NaN screen: exponent field equal to 0x7FFF together with a nonzero
   fraction (Frac0 | Frac1), tested for a and then for b. */
5340 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5341 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5342 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5343 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
/* NOTE(review): presumably executed only when the NaN screen matched;
   the enclosing brace is absent from this listing - confirm. */
5345 float_raise( float_flag_invalid
);
/* Pull the sign of each operand. */
5348 aSign
= extractFloat128Sign( a
);
5349 bSign
= extractFloat128Sign( b
);
/* Operands of differing sign are handled separately. */
5350 if ( aSign
!= bSign
) {
/* Visible term: OR of a.low, b.low and both high words shifted left by
   one, i.e. every bit of the two operands except bit 63 of the high
   words.  Presumably compared against zero on a line absent from this
   listing, so that zeros of opposite sign compare as equal - confirm. */
5353 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
/* Same sign: compare the raw (high, low) pairs with le128; when aSign is
   set the operands are passed in swapped order. */
5357 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5358 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5363 -------------------------------------------------------------------------------
5364 Returns 1 if the quadruple-precision floating-point value `a' is less than
5365 the corresponding value `b', and 0 otherwise. The comparison is performed
5366 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5367 -------------------------------------------------------------------------------
/*
 * float128_lt: strict less-than comparison of two float128 operands,
 * returning a flag.  In the visible lines float_raise( float_flag_invalid )
 * follows the NaN screen directly, with no signaling-NaN test in between.
 * NOTE(review): this listing omits several source lines of the function
 * (declarations, braces, returns); comments describe only visible lines.
 */
5369 flag
float128_lt( float128 a
, float128 b
)
/* NaN screen: exponent field equal to 0x7FFF together with a nonzero
   fraction (Frac0 | Frac1), tested for a and then for b. */
5373 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5374 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5375 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5376 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
/* NOTE(review): presumably executed only when the NaN screen matched;
   the enclosing brace is absent from this listing - confirm. */
5378 float_raise( float_flag_invalid
);
/* Pull the sign of each operand. */
5381 aSign
= extractFloat128Sign( a
);
5382 bSign
= extractFloat128Sign( b
);
/* Operands of differing sign are handled separately. */
5383 if ( aSign
!= bSign
) {
/* Visible term: OR of a.low, b.low and both high words shifted left by
   one, i.e. every bit of the two operands except bit 63 of the high
   words.  Presumably tested for being nonzero on a line absent from this
   listing, so that zeros of opposite sign are not ordered - confirm. */
5386 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
/* Same sign: compare the raw (high, low) pairs with lt128; when aSign is
   set the operands are passed in swapped order. */
5390 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5391 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5396 -------------------------------------------------------------------------------
5397 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5398 the corresponding value `b', and 0 otherwise. The invalid exception is
5399 raised if either operand is a NaN. Otherwise, the comparison is performed
5400 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5401 -------------------------------------------------------------------------------
/*
 * float128_eq_signaling: equality comparison of two float128 operands,
 * returning a flag.  In the visible lines float_raise( float_flag_invalid )
 * follows the NaN screen directly, with no signaling-NaN test in between.
 * NOTE(review): this listing omits several source lines of the function
 * (braces, returns, part of the result expression); comments describe only
 * visible lines.
 */
5403 flag
float128_eq_signaling( float128 a
, float128 b
)
/* NaN screen: exponent field equal to 0x7FFF together with a nonzero
   fraction (Frac0 | Frac1), tested for a and then for b. */
5406 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5407 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5408 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5409 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
/* NOTE(review): presumably executed only when the NaN screen matched;
   the enclosing brace is absent from this listing - confirm. */
5411 float_raise( float_flag_invalid
);
/* Visible term of the result expression: the high words are identical. */
5416 && ( ( a
.high
== b
.high
)
/* Visible term: ( a.high | b.high ) << 1 is zero, i.e. every bit of both
   high words other than bit 63 is clear (bit 63 is presumably the sign
   bit, which would let zeros of either sign match - confirm). */
5418 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5424 -------------------------------------------------------------------------------
5425 Returns 1 if the quadruple-precision floating-point value `a' is less than
5426 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5427 cause an exception. Otherwise, the comparison is performed according to the
5428 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5429 -------------------------------------------------------------------------------
/*
 * float128_le_quiet: less-than-or-equal comparison of two float128
 * operands, returning a flag.  The only exception raise visible in this
 * block is float_flag_invalid, placed after the signaling-NaN test.
 * NOTE(review): this listing omits several source lines of the function
 * (declarations, braces, returns); comments describe only visible lines.
 */
5431 flag
float128_le_quiet( float128 a
, float128 b
)
/* NaN screen: exponent field equal to 0x7FFF together with a nonzero
   fraction (Frac0 | Frac1), tested for a and then for b. */
5435 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5436 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5437 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5438 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
/* The invalid flag is raised only if one operand is a signaling NaN.
   NOTE(review): presumably nested inside the NaN screen above; the line
   closing that condition is absent from this listing - confirm. */
5440 if ( float128_is_signaling_nan( a
)
5441 || float128_is_signaling_nan( b
) ) {
5442 float_raise( float_flag_invalid
);
/* Pull the sign of each operand. */
5446 aSign
= extractFloat128Sign( a
);
5447 bSign
= extractFloat128Sign( b
);
/* Operands of differing sign are handled separately. */
5448 if ( aSign
!= bSign
) {
/* Visible term: OR of a.low, b.low and both high words shifted left by
   one, i.e. every bit of the two operands except bit 63 of the high
   words.  Presumably compared against zero on a line absent from this
   listing, so that zeros of opposite sign compare as equal - confirm. */
5451 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
/* Same sign: compare the raw (high, low) pairs with le128; when aSign is
   set the operands are passed in swapped order. */
5455 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5456 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5461 -------------------------------------------------------------------------------
5462 Returns 1 if the quadruple-precision floating-point value `a' is less than
5463 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5464 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5465 Standard for Binary Floating-Point Arithmetic.
5466 -------------------------------------------------------------------------------
/*
 * float128_lt_quiet: strict less-than comparison of two float128 operands,
 * returning a flag.  The only exception raise visible in this block is
 * float_flag_invalid, placed after the signaling-NaN test.
 * NOTE(review): this listing omits several source lines of the function
 * (declarations, braces, returns); comments describe only visible lines.
 */
5468 flag
float128_lt_quiet( float128 a
, float128 b
)
/* NaN screen: exponent field equal to 0x7FFF together with a nonzero
   fraction (Frac0 | Frac1), tested for a and then for b. */
5472 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5473 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5474 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5475 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
/* The invalid flag is raised only if one operand is a signaling NaN.
   NOTE(review): presumably nested inside the NaN screen above; the line
   closing that condition is absent from this listing - confirm. */
5477 if ( float128_is_signaling_nan( a
)
5478 || float128_is_signaling_nan( b
) ) {
5479 float_raise( float_flag_invalid
);
/* Pull the sign of each operand. */
5483 aSign
= extractFloat128Sign( a
);
5484 bSign
= extractFloat128Sign( b
);
/* Operands of differing sign are handled separately. */
5485 if ( aSign
!= bSign
) {
/* Visible term: OR of a.low, b.low and both high words shifted left by
   one, i.e. every bit of the two operands except bit 63 of the high
   words.  Presumably tested for being nonzero on a line absent from this
   listing, so that zeros of opposite sign are not ordered - confirm. */
5488 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
/* Same sign: compare the raw (high, low) pairs with lt128; when aSign is
   set the operands are passed in swapped order. */
5492 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5493 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5500 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5503 * These two routines are not part of the original softfloat distribution.
5505 * They are based on the corresponding conversions to integer but return
5506 * unsigned numbers instead since these functions are required by GCC.
5508 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5510 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5514 -------------------------------------------------------------------------------
5515 Returns the result of converting the double-precision floating-point value
5516 `a' to the 32-bit unsigned integer format. The conversion is
5517 performed according to the IEC/IEEE Standard for Binary Floating-point
5518 Arithmetic, except that the conversion is always rounded toward zero. If
5519 `a' is a NaN, the largest positive integer is returned. If the conversion
5520 overflows, the largest positive integer is returned.
5521 -------------------------------------------------------------------------------
/*
 * float64_to_uint32_round_to_zero: converts a float64 to uint32 by
 * right-shifting the significand, so fractional bits are discarded.
 * NOTE(review): this listing omits several source lines of the function
 * (some declarations, the guard of the first float_raise, the assignments
 * to savedASig and the result, and every return); comments describe only
 * the lines that are visible.
 */
5523 uint32
float64_to_uint32_round_to_zero( float64 a
)
5526 int16 aExp
, shiftCount
;
5527 bits64 aSig
, savedASig
;
/* Split the operand into fraction, exponent and sign fields. */
5530 aSig
= extractFloat64Frac( a
);
5531 aExp
= extractFloat64Exp( a
);
5532 aSign
= extractFloat64Sign( a
);
/* NOTE(review): the condition guarding this raise is not in the listing;
   presumably it fires for a negative operand (aSign set) - confirm. */
5535 float_raise( float_flag_invalid
);
/* Exponent field above 0x41E raises invalid.  Assuming extractFloat64Exp
   yields the biased exponent (bias 0x3FF), 0x41E corresponds to 2^31, so
   anything larger cannot fit in 32 bits - TODO confirm. */
5539 if ( 0x41E < aExp
) {
5540 float_raise( float_flag_invalid
);
/* Exponent field below 0x3FF: under the same assumption the magnitude is
   below 1.  The inexact flag is set unless both exponent and fraction
   are zero. */
5543 else if ( aExp
< 0x3FF ) {
5544 if ( aExp
|| aSig
) set_float_exception_inexact_flag();
/* Set bit 52 of the significand (presumably the implicit leading one of
   a normalized double - confirm). */
5547 aSig
|= LIT64( 0x0010000000000000 );
/* 0x433 is 0x3FF + 52; with aExp in 0x3FF..0x41E on this path the shift
   count lies in 21..52. */
5548 shiftCount
= 0x433 - aExp
;
/* Drop the low shiftCount bits of the significand. */
5550 aSig
>>= shiftCount
;
/* Shifting back and comparing with savedASig detects whether any nonzero
   bits were dropped; if so the inexact flag is set.  NOTE(review):
   savedASig is assigned on a line absent from this listing. */
5552 if ( ( aSig
<<shiftCount
) != savedASig
) {
5553 set_float_exception_inexact_flag();
5560 -------------------------------------------------------------------------------
5561 Returns the result of converting the single-precision floating-point value
5562 `a' to the 32-bit unsigned integer format. The conversion is
5563 performed according to the IEC/IEEE Standard for Binary Floating-point
5564 Arithmetic, except that the conversion is always rounded toward zero. If
5565 `a' is a NaN, the largest positive integer is returned. If the conversion
5566 overflows, the largest positive integer is returned.
5567 -------------------------------------------------------------------------------
5569 uint32
float32_to_uint32_round_to_zero( float32 a
)
5572 int16 aExp
, shiftCount
;
5576 aSig
= extractFloat32Frac( a
);
5577 aExp
= extractFloat32Exp( a
);
5578 aSign
= extractFloat32Sign( a
);
5579 shiftCount
= aExp
- 0x9E;
5582 float_raise( float_flag_invalid
);
5585 if ( 0 < shiftCount
) {
5586 float_raise( float_flag_invalid
);
5589 else if ( aExp
<= 0x7E ) {
5590 if ( aExp
| aSig
) set_float_exception_inexact_flag();
5593 aSig
= ( aSig
| 0x800000 )<<8;
5594 z
= aSig
>>( - shiftCount
);
5595 if ( aSig
<<( shiftCount
& 31 ) ) {
5596 set_float_exception_inexact_flag();