2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
32 #include "softfloat.h"
35 -------------------------------------------------------------------------------
36 Floating-point rounding mode, extended double-precision rounding precision,
38 -------------------------------------------------------------------------------
40 int8 float_rounding_mode
= float_round_nearest_even
;
41 int8 floatx80_rounding_precision
= 80;
42 int8 float_exception_flags
= 0;
45 -------------------------------------------------------------------------------
46 Primitive arithmetic functions, including multi-word arithmetic, and
47 division and square root approximations. (Can be specialized to target if
49 -------------------------------------------------------------------------------
51 #include "softfloat-macros"
54 -------------------------------------------------------------------------------
55 Functions and definitions to determine: (1) whether tininess for underflow
56 is detected before or after rounding by default, (2) what (if anything)
57 happens when exceptions are raised, (3) how signaling NaNs are distinguished
58 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
59 are propagated from function inputs to output. These details are target-
61 -------------------------------------------------------------------------------
63 #include "softfloat-specialize"
66 -------------------------------------------------------------------------------
67 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
68 and 7, and returns the properly rounded 32-bit integer corresponding to the
69 input. If `zSign' is nonzero, the input is negated before being converted
70 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
71 input is simply rounded to an integer, with the inexact exception raised if
72 the input cannot be represented exactly as an integer. If the fixed-point
73 input is too large, however, the invalid exception is raised and the largest
74 positive or negative integer is returned.
75 -------------------------------------------------------------------------------
77 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
80 flag roundNearestEven
;
81 int8 roundIncrement
, roundBits
;
84 roundingMode
= float_rounding_mode
;
85 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
86 roundIncrement
= 0x40;
87 if ( ! roundNearestEven
) {
88 if ( roundingMode
== float_round_to_zero
) {
92 roundIncrement
= 0x7F;
94 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
97 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
101 roundBits
= absZ
& 0x7F;
102 absZ
= ( absZ
+ roundIncrement
)>>7;
103 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
105 if ( zSign
) z
= - z
;
106 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
107 float_exception_flags
|= float_flag_invalid
;
108 return zSign
? 0x80000000 : 0x7FFFFFFF;
110 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
116 -------------------------------------------------------------------------------
117 Returns the fraction bits of the single-precision floating-point value `a'.
118 -------------------------------------------------------------------------------
120 INLINE bits32
extractFloat32Frac( float32 a
)
123 return a
& 0x007FFFFF;
128 -------------------------------------------------------------------------------
129 Returns the exponent bits of the single-precision floating-point value `a'.
130 -------------------------------------------------------------------------------
132 INLINE int16
extractFloat32Exp( float32 a
)
135 return ( a
>>23 ) & 0xFF;
140 -------------------------------------------------------------------------------
141 Returns the sign bit of the single-precision floating-point value `a'.
142 -------------------------------------------------------------------------------
144 INLINE flag
extractFloat32Sign( float32 a
)
152 -------------------------------------------------------------------------------
153 Normalizes the subnormal single-precision floating-point value represented
154 by the denormalized significand `aSig'. The normalized exponent and
155 significand are stored at the locations pointed to by `zExpPtr' and
156 `zSigPtr', respectively.
157 -------------------------------------------------------------------------------
160 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
164 shiftCount
= countLeadingZeros32( aSig
) - 8;
165 *zSigPtr
= aSig
<<shiftCount
;
166 *zExpPtr
= 1 - shiftCount
;
171 -------------------------------------------------------------------------------
172 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
173 single-precision floating-point value, returning the result. After being
174 shifted into the proper positions, the three fields are simply added
175 together to form the result. This means that any integer portion of `zSig'
176 will be added into the exponent. Since a properly normalized significand
177 will have an integer portion equal to 1, the `zExp' input should be 1 less
178 than the desired result exponent whenever `zSig' is a complete, normalized
180 -------------------------------------------------------------------------------
182 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
186 __asm__("@ packFloat32;
191 : "g" (f
), "g" (zSign
), "g" (zExp
), "g" (zSig
)
195 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
200 -------------------------------------------------------------------------------
201 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
202 and significand `zSig', and returns the proper single-precision floating-
203 point value corresponding to the abstract input. Ordinarily, the abstract
204 value is simply rounded and packed into the single-precision format, with
205 the inexact exception raised if the abstract input cannot be represented
206 exactly. If the abstract value is too large, however, the overflow and
207 inexact exceptions are raised and an infinity or maximal finite value is
208 returned. If the abstract value is too small, the input value is rounded to
209 a subnormal number, and the underflow and inexact exceptions are raised if
210 the abstract input cannot be represented exactly as a subnormal single-
211 precision floating-point number.
212 The input significand `zSig' has its binary point between bits 30
213 and 29, which is 7 bits to the left of the usual location. This shifted
214 significand must be normalized or smaller. If `zSig' is not normalized,
215 `zExp' must be 0; in that case, the result returned is a subnormal number,
216 and it must not require rounding. In the usual case that `zSig' is
217 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
218 The handling of underflow and overflow follows the IEC/IEEE Standard for
219 Binary Floating-point Arithmetic.
220 -------------------------------------------------------------------------------
222 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
225 flag roundNearestEven
;
226 int8 roundIncrement
, roundBits
;
229 roundingMode
= float_rounding_mode
;
230 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
231 roundIncrement
= 0x40;
232 if ( ! roundNearestEven
) {
233 if ( roundingMode
== float_round_to_zero
) {
237 roundIncrement
= 0x7F;
239 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
242 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
246 roundBits
= zSig
& 0x7F;
247 if ( 0xFD <= (bits16
) zExp
) {
249 || ( ( zExp
== 0xFD )
250 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
252 float_raise( float_flag_overflow
| float_flag_inexact
);
253 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
257 ( float_detect_tininess
== float_tininess_before_rounding
)
259 || ( zSig
+ roundIncrement
< 0x80000000 );
260 shift32RightJamming( zSig
, - zExp
, &zSig
);
262 roundBits
= zSig
& 0x7F;
263 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
266 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
267 zSig
= ( zSig
+ roundIncrement
)>>7;
268 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
269 if ( zSig
== 0 ) zExp
= 0;
270 return packFloat32( zSign
, zExp
, zSig
);
275 -------------------------------------------------------------------------------
276 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
277 and significand `zSig', and returns the proper single-precision floating-
278 point value corresponding to the abstract input. This routine is just like
279 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
280 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
282 -------------------------------------------------------------------------------
285 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
289 shiftCount
= countLeadingZeros32( zSig
) - 1;
290 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
295 -------------------------------------------------------------------------------
296 Returns the fraction bits of the double-precision floating-point value `a'.
297 -------------------------------------------------------------------------------
299 INLINE bits64
extractFloat64Frac( float64 a
)
302 return a
& LIT64( 0x000FFFFFFFFFFFFF );
307 -------------------------------------------------------------------------------
308 Returns the exponent bits of the double-precision floating-point value `a'.
309 -------------------------------------------------------------------------------
311 INLINE int16
extractFloat64Exp( float64 a
)
314 return ( a
>>52 ) & 0x7FF;
319 -------------------------------------------------------------------------------
320 Returns the sign bit of the double-precision floating-point value `a'.
321 -------------------------------------------------------------------------------
323 INLINE flag
extractFloat64Sign( float64 a
)
331 -------------------------------------------------------------------------------
332 Normalizes the subnormal double-precision floating-point value represented
333 by the denormalized significand `aSig'. The normalized exponent and
334 significand are stored at the locations pointed to by `zExpPtr' and
335 `zSigPtr', respectively.
336 -------------------------------------------------------------------------------
339 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
343 shiftCount
= countLeadingZeros64( aSig
) - 11;
344 *zSigPtr
= aSig
<<shiftCount
;
345 *zExpPtr
= 1 - shiftCount
;
350 -------------------------------------------------------------------------------
351 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
352 double-precision floating-point value, returning the result. After being
353 shifted into the proper positions, the three fields are simply added
354 together to form the result. This means that any integer portion of `zSig'
355 will be added into the exponent. Since a properly normalized significand
356 will have an integer portion equal to 1, the `zExp' input should be 1 less
357 than the desired result exponent whenever `zSig' is a complete, normalized
359 -------------------------------------------------------------------------------
361 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
364 return ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
;
369 -------------------------------------------------------------------------------
370 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371 and significand `zSig', and returns the proper double-precision floating-
372 point value corresponding to the abstract input. Ordinarily, the abstract
373 value is simply rounded and packed into the double-precision format, with
374 the inexact exception raised if the abstract input cannot be represented
375 exactly. If the abstract value is too large, however, the overflow and
376 inexact exceptions are raised and an infinity or maximal finite value is
377 returned. If the abstract value is too small, the input value is rounded to
378 a subnormal number, and the underflow and inexact exceptions are raised if
379 the abstract input cannot be represented exactly as a subnormal double-
380 precision floating-point number.
381 The input significand `zSig' has its binary point between bits 62
382 and 61, which is 10 bits to the left of the usual location. This shifted
383 significand must be normalized or smaller. If `zSig' is not normalized,
384 `zExp' must be 0; in that case, the result returned is a subnormal number,
385 and it must not require rounding. In the usual case that `zSig' is
386 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
387 The handling of underflow and overflow follows the IEC/IEEE Standard for
388 Binary Floating-point Arithmetic.
389 -------------------------------------------------------------------------------
391 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
394 flag roundNearestEven
;
395 int16 roundIncrement
, roundBits
;
398 roundingMode
= float_rounding_mode
;
399 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
400 roundIncrement
= 0x200;
401 if ( ! roundNearestEven
) {
402 if ( roundingMode
== float_round_to_zero
) {
406 roundIncrement
= 0x3FF;
408 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
411 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
415 roundBits
= zSig
& 0x3FF;
416 if ( 0x7FD <= (bits16
) zExp
) {
417 if ( ( 0x7FD < zExp
)
418 || ( ( zExp
== 0x7FD )
419 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
422 //__asm__("mov %0, lr" :: "g" (lr));
423 //fp_printk("roundAndPackFloat64 called from 0x%08x\n",lr);
424 float_raise( float_flag_overflow
| float_flag_inexact
);
425 return packFloat64( zSign
, 0x7FF, 0 ) - ( roundIncrement
== 0 );
429 ( float_detect_tininess
== float_tininess_before_rounding
)
431 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
432 shift64RightJamming( zSig
, - zExp
, &zSig
);
434 roundBits
= zSig
& 0x3FF;
435 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
438 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
439 zSig
= ( zSig
+ roundIncrement
)>>10;
440 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
441 if ( zSig
== 0 ) zExp
= 0;
442 return packFloat64( zSign
, zExp
, zSig
);
447 -------------------------------------------------------------------------------
448 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
449 and significand `zSig', and returns the proper double-precision floating-
450 point value corresponding to the abstract input. This routine is just like
451 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
452 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
454 -------------------------------------------------------------------------------
457 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
461 shiftCount
= countLeadingZeros64( zSig
) - 1;
462 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
469 -------------------------------------------------------------------------------
470 Returns the fraction bits of the extended double-precision floating-point
472 -------------------------------------------------------------------------------
474 INLINE bits64
extractFloatx80Frac( floatx80 a
)
482 -------------------------------------------------------------------------------
483 Returns the exponent bits of the extended double-precision floating-point
485 -------------------------------------------------------------------------------
487 INLINE int32
extractFloatx80Exp( floatx80 a
)
490 return a
.high
& 0x7FFF;
495 -------------------------------------------------------------------------------
496 Returns the sign bit of the extended double-precision floating-point value
498 -------------------------------------------------------------------------------
500 INLINE flag
extractFloatx80Sign( floatx80 a
)
508 -------------------------------------------------------------------------------
509 Normalizes the subnormal extended double-precision floating-point value
510 represented by the denormalized significand `aSig'. The normalized exponent
511 and significand are stored at the locations pointed to by `zExpPtr' and
512 `zSigPtr', respectively.
513 -------------------------------------------------------------------------------
516 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
520 shiftCount
= countLeadingZeros64( aSig
);
521 *zSigPtr
= aSig
<<shiftCount
;
522 *zExpPtr
= 1 - shiftCount
;
527 -------------------------------------------------------------------------------
528 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
529 extended double-precision floating-point value, returning the result.
530 -------------------------------------------------------------------------------
532 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
537 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
543 -------------------------------------------------------------------------------
544 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
545 and extended significand formed by the concatenation of `zSig0' and `zSig1',
546 and returns the proper extended double-precision floating-point value
547 corresponding to the abstract input. Ordinarily, the abstract value is
548 rounded and packed into the extended double-precision format, with the
549 inexact exception raised if the abstract input cannot be represented
550 exactly. If the abstract value is too large, however, the overflow and
551 inexact exceptions are raised and an infinity or maximal finite value is
552 returned. If the abstract value is too small, the input value is rounded to
553 a subnormal number, and the underflow and inexact exceptions are raised if
554 the abstract input cannot be represented exactly as a subnormal extended
555 double-precision floating-point number.
556 If `roundingPrecision' is 32 or 64, the result is rounded to the same
557 number of bits as single or double precision, respectively. Otherwise, the
558 result is rounded to the full precision of the extended double-precision
560 The input significand must be normalized or smaller. If the input
561 significand is not normalized, `zExp' must be 0; in that case, the result
562 returned is a subnormal number, and it must not require rounding. The
563 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
564 Floating-point Arithmetic.
565 -------------------------------------------------------------------------------
568 roundAndPackFloatx80(
569 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
573 flag roundNearestEven
, increment
, isTiny
;
574 int64 roundIncrement
, roundMask
, roundBits
;
576 roundingMode
= float_rounding_mode
;
577 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
578 if ( roundingPrecision
== 80 ) goto precision80
;
579 if ( roundingPrecision
== 64 ) {
580 roundIncrement
= LIT64( 0x0000000000000400 );
581 roundMask
= LIT64( 0x00000000000007FF );
583 else if ( roundingPrecision
== 32 ) {
584 roundIncrement
= LIT64( 0x0000008000000000 );
585 roundMask
= LIT64( 0x000000FFFFFFFFFF );
590 zSig0
|= ( zSig1
!= 0 );
591 if ( ! roundNearestEven
) {
592 if ( roundingMode
== float_round_to_zero
) {
596 roundIncrement
= roundMask
;
598 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
601 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
605 roundBits
= zSig0
& roundMask
;
606 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
607 if ( ( 0x7FFE < zExp
)
608 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
614 ( float_detect_tininess
== float_tininess_before_rounding
)
616 || ( zSig0
<= zSig0
+ roundIncrement
);
617 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
619 roundBits
= zSig0
& roundMask
;
620 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
621 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
622 zSig0
+= roundIncrement
;
623 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
624 roundIncrement
= roundMask
+ 1;
625 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
626 roundMask
|= roundIncrement
;
628 zSig0
&= ~ roundMask
;
629 return packFloatx80( zSign
, zExp
, zSig0
);
632 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
633 zSig0
+= roundIncrement
;
634 if ( zSig0
< roundIncrement
) {
636 zSig0
= LIT64( 0x8000000000000000 );
638 roundIncrement
= roundMask
+ 1;
639 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
640 roundMask
|= roundIncrement
;
642 zSig0
&= ~ roundMask
;
643 if ( zSig0
== 0 ) zExp
= 0;
644 return packFloatx80( zSign
, zExp
, zSig0
);
646 increment
= ( (sbits64
) zSig1
< 0 );
647 if ( ! roundNearestEven
) {
648 if ( roundingMode
== float_round_to_zero
) {
653 increment
= ( roundingMode
== float_round_down
) && zSig1
;
656 increment
= ( roundingMode
== float_round_up
) && zSig1
;
660 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
661 if ( ( 0x7FFE < zExp
)
662 || ( ( zExp
== 0x7FFE )
663 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
669 float_raise( float_flag_overflow
| float_flag_inexact
);
670 if ( ( roundingMode
== float_round_to_zero
)
671 || ( zSign
&& ( roundingMode
== float_round_up
) )
672 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
674 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
676 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
680 ( float_detect_tininess
== float_tininess_before_rounding
)
683 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
684 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
686 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
687 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
688 if ( roundNearestEven
) {
689 increment
= ( (sbits64
) zSig1
< 0 );
693 increment
= ( roundingMode
== float_round_down
) && zSig1
;
696 increment
= ( roundingMode
== float_round_up
) && zSig1
;
701 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
702 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
704 return packFloatx80( zSign
, zExp
, zSig0
);
707 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
712 zSig0
= LIT64( 0x8000000000000000 );
715 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
719 if ( zSig0
== 0 ) zExp
= 0;
722 return packFloatx80( zSign
, zExp
, zSig0
);
726 -------------------------------------------------------------------------------
727 Takes an abstract floating-point value having sign `zSign', exponent
728 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
729 and returns the proper extended double-precision floating-point value
730 corresponding to the abstract input. This routine is just like
731 `roundAndPackFloatx80' except that the input significand does not have to be
733 -------------------------------------------------------------------------------
736 normalizeRoundAndPackFloatx80(
737 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
747 shiftCount
= countLeadingZeros64( zSig0
);
748 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
751 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
760 -------------------------------------------------------------------------------
761 Returns the least-significant 64 fraction bits of the quadruple-precision
762 floating-point value `a'.
763 -------------------------------------------------------------------------------
765 INLINE bits64
extractFloat128Frac1( float128 a
)
773 -------------------------------------------------------------------------------
774 Returns the most-significant 48 fraction bits of the quadruple-precision
775 floating-point value `a'.
776 -------------------------------------------------------------------------------
778 INLINE bits64
extractFloat128Frac0( float128 a
)
781 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
786 -------------------------------------------------------------------------------
787 Returns the exponent bits of the quadruple-precision floating-point value
789 -------------------------------------------------------------------------------
791 INLINE int32
extractFloat128Exp( float128 a
)
794 return ( a
.high
>>48 ) & 0x7FFF;
799 -------------------------------------------------------------------------------
800 Returns the sign bit of the quadruple-precision floating-point value `a'.
801 -------------------------------------------------------------------------------
803 INLINE flag
extractFloat128Sign( float128 a
)
811 -------------------------------------------------------------------------------
812 Normalizes the subnormal quadruple-precision floating-point value
813 represented by the denormalized significand formed by the concatenation of
814 `aSig0' and `aSig1'. The normalized exponent is stored at the location
815 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
816 significand are stored at the location pointed to by `zSig0Ptr', and the
817 least significant 64 bits of the normalized significand are stored at the
818 location pointed to by `zSig1Ptr'.
819 -------------------------------------------------------------------------------
822 normalizeFloat128Subnormal(
833 shiftCount
= countLeadingZeros64( aSig1
) - 15;
834 if ( shiftCount
< 0 ) {
835 *zSig0Ptr
= aSig1
>>( - shiftCount
);
836 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
839 *zSig0Ptr
= aSig1
<<shiftCount
;
842 *zExpPtr
= - shiftCount
- 63;
845 shiftCount
= countLeadingZeros64( aSig0
) - 15;
846 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
847 *zExpPtr
= 1 - shiftCount
;
853 -------------------------------------------------------------------------------
854 Packs the sign `zSign', the exponent `zExp', and the significand formed
855 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
856 floating-point value, returning the result. After being shifted into the
857 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
858 added together to form the most significant 32 bits of the result. This
859 means that any integer portion of `zSig0' will be added into the exponent.
860 Since a properly normalized significand will have an integer portion equal
861 to 1, the `zExp' input should be 1 less than the desired result exponent
862 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
864 -------------------------------------------------------------------------------
867 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
872 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
878 -------------------------------------------------------------------------------
879 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
880 and extended significand formed by the concatenation of `zSig0', `zSig1',
881 and `zSig2', and returns the proper quadruple-precision floating-point value
882 corresponding to the abstract input. Ordinarily, the abstract value is
883 simply rounded and packed into the quadruple-precision format, with the
884 inexact exception raised if the abstract input cannot be represented
885 exactly. If the abstract value is too large, however, the overflow and
886 inexact exceptions are raised and an infinity or maximal finite value is
887 returned. If the abstract value is too small, the input value is rounded to
888 a subnormal number, and the underflow and inexact exceptions are raised if
889 the abstract input cannot be represented exactly as a subnormal quadruple-
890 precision floating-point number.
891 The input significand must be normalized or smaller. If the input
892 significand is not normalized, `zExp' must be 0; in that case, the result
893 returned is a subnormal number, and it must not require rounding. In the
894 usual case that the input significand is normalized, `zExp' must be 1 less
895 than the ``true'' floating-point exponent. The handling of underflow and
896 overflow follows the IEC/IEEE Standard for Binary Floating-point Arithmetic.
897 -------------------------------------------------------------------------------
900 roundAndPackFloat128(
901 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2
)
904 flag roundNearestEven
, increment
, isTiny
;
906 roundingMode
= float_rounding_mode
;
907 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
908 increment
= ( (sbits64
) zSig2
< 0 );
909 if ( ! roundNearestEven
) {
910 if ( roundingMode
== float_round_to_zero
) {
915 increment
= ( roundingMode
== float_round_down
) && zSig2
;
918 increment
= ( roundingMode
== float_round_up
) && zSig2
;
922 if ( 0x7FFD <= (bits32
) zExp
) {
923 if ( ( 0x7FFD < zExp
)
924 || ( ( zExp
== 0x7FFD )
926 LIT64( 0x0001FFFFFFFFFFFF ),
927 LIT64( 0xFFFFFFFFFFFFFFFF ),
934 float_raise( float_flag_overflow
| float_flag_inexact
);
935 if ( ( roundingMode
== float_round_to_zero
)
936 || ( zSign
&& ( roundingMode
== float_round_up
) )
937 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
943 LIT64( 0x0000FFFFFFFFFFFF ),
944 LIT64( 0xFFFFFFFFFFFFFFFF )
947 return packFloat128( zSign
, 0x7FFF, 0, 0 );
951 ( float_detect_tininess
== float_tininess_before_rounding
)
957 LIT64( 0x0001FFFFFFFFFFFF ),
958 LIT64( 0xFFFFFFFFFFFFFFFF )
960 shift128ExtraRightJamming(
961 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
963 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
964 if ( roundNearestEven
) {
965 increment
= ( (sbits64
) zSig2
< 0 );
969 increment
= ( roundingMode
== float_round_down
) && zSig2
;
972 increment
= ( roundingMode
== float_round_up
) && zSig2
;
977 if ( zSig2
) float_exception_flags
|= float_flag_inexact
;
979 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
980 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
983 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
985 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
990 -------------------------------------------------------------------------------
991 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
992 and significand formed by the concatenation of `zSig0' and `zSig1', and
993 returns the proper quadruple-precision floating-point value corresponding to
994 the abstract input. This routine is just like `roundAndPackFloat128' except
995 that the input significand has fewer bits and does not have to be normalized
996 in any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
998 -------------------------------------------------------------------------------
1001 normalizeRoundAndPackFloat128(
1002 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
1012 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1013 if ( 0 <= shiftCount
) {
1015 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1018 shift128ExtraRightJamming(
1019 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1022 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1029 -------------------------------------------------------------------------------
1030 Returns the result of converting the 32-bit two's complement integer `a' to
1031 the single-precision floating-point format. The conversion is performed
1032 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1033 -------------------------------------------------------------------------------
1035 float32
int32_to_float32( int32 a
)
1039 if ( a
== 0 ) return 0;
1040 if ( a
== 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1042 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
1047 -------------------------------------------------------------------------------
1048 Returns the result of converting the 32-bit two's complement integer `a' to
1049 the double-precision floating-point format. The conversion is performed
1050 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1051 -------------------------------------------------------------------------------
1053 float64
int32_to_float64( int32 a
)
1060 if ( a
== 0 ) return 0;
1062 absA
= aSign
? - a
: a
;
1063 shiftCount
= countLeadingZeros32( absA
) + 21;
1065 return packFloat64( aSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1072 -------------------------------------------------------------------------------
1073 Returns the result of converting the 32-bit two's complement integer `a'
1074 to the extended double-precision floating-point format. The conversion
1075 is performed according to the IEC/IEEE Standard for Binary Floating-point
1077 -------------------------------------------------------------------------------
1079 floatx80
int32_to_floatx80( int32 a
)
1086 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1088 absA
= zSign
? - a
: a
;
1089 shiftCount
= countLeadingZeros32( absA
) + 32;
1091 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1100 -------------------------------------------------------------------------------
1101 Returns the result of converting the 32-bit two's complement integer `a' to
1102 the quadruple-precision floating-point format. The conversion is performed
1103 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1104 -------------------------------------------------------------------------------
1106 float128
int32_to_float128( int32 a
)
1113 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1115 absA
= zSign
? - a
: a
;
1116 shiftCount
= countLeadingZeros32( absA
) + 17;
1118 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1125 -------------------------------------------------------------------------------
1126 Returns the result of converting the single-precision floating-point value
1127 `a' to the 32-bit two's complement integer format. The conversion is
1128 performed according to the IEC/IEEE Standard for Binary Floating-point
1129 Arithmetic---which means in particular that the conversion is rounded
1130 according to the current rounding mode. If `a' is a NaN, the largest
1131 positive integer is returned. Otherwise, if the conversion overflows, the
1132 largest integer with the same sign as `a' is returned.
1133 -------------------------------------------------------------------------------
1135 int32
float32_to_int32( float32 a
)
1138 int16 aExp
, shiftCount
;
1142 aSig
= extractFloat32Frac( a
);
1143 aExp
= extractFloat32Exp( a
);
1144 aSign
= extractFloat32Sign( a
);
1145 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1146 if ( aExp
) aSig
|= 0x00800000;
1147 shiftCount
= 0xAF - aExp
;
1150 if ( 0 < shiftCount
) shift64RightJamming( zSig
, shiftCount
, &zSig
);
1151 return roundAndPackInt32( aSign
, zSig
);
1156 -------------------------------------------------------------------------------
1157 Returns the result of converting the single-precision floating-point value
1158 `a' to the 32-bit two's complement integer format. The conversion is
1159 performed according to the IEC/IEEE Standard for Binary Floating-point
1160 Arithmetic, except that the conversion is always rounded toward zero. If
1161 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1162 conversion overflows, the largest integer with the same sign as `a' is
1164 -------------------------------------------------------------------------------
1166 int32
float32_to_int32_round_to_zero( float32 a
)
1169 int16 aExp
, shiftCount
;
1173 aSig
= extractFloat32Frac( a
);
1174 aExp
= extractFloat32Exp( a
);
1175 aSign
= extractFloat32Sign( a
);
1176 shiftCount
= aExp
- 0x9E;
1177 if ( 0 <= shiftCount
) {
1178 if ( a
== 0xCF000000 ) return 0x80000000;
1179 float_raise( float_flag_invalid
);
1180 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1183 else if ( aExp
<= 0x7E ) {
1184 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
1187 aSig
= ( aSig
| 0x00800000 )<<8;
1188 z
= aSig
>>( - shiftCount
);
1189 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1190 float_exception_flags
|= float_flag_inexact
;
1192 return aSign
? - z
: z
;
1197 -------------------------------------------------------------------------------
1198 Returns the result of converting the single-precision floating-point value
1199 `a' to the double-precision floating-point format. The conversion is
1200 performed according to the IEC/IEEE Standard for Binary Floating-point
1202 -------------------------------------------------------------------------------
1204 float64
float32_to_float64( float32 a
)
1210 aSig
= extractFloat32Frac( a
);
1211 aExp
= extractFloat32Exp( a
);
1212 aSign
= extractFloat32Sign( a
);
1213 if ( aExp
== 0xFF ) {
1214 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
1215 return packFloat64( aSign
, 0x7FF, 0 );
1218 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1219 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1222 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1229 -------------------------------------------------------------------------------
1230 Returns the result of converting the single-precision floating-point value
1231 `a' to the extended double-precision floating-point format. The conversion
1232 is performed according to the IEC/IEEE Standard for Binary Floating-point
1234 -------------------------------------------------------------------------------
1236 floatx80
float32_to_floatx80( float32 a
)
1242 aSig
= extractFloat32Frac( a
);
1243 aExp
= extractFloat32Exp( a
);
1244 aSign
= extractFloat32Sign( a
);
1245 if ( aExp
== 0xFF ) {
1246 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
1247 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1250 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1251 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1254 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1263 -------------------------------------------------------------------------------
1264 Returns the result of converting the single-precision floating-point value
1265 `a' to the double-precision floating-point format. The conversion is
1266 performed according to the IEC/IEEE Standard for Binary Floating-point
1268 -------------------------------------------------------------------------------
1270 float128
float32_to_float128( float32 a
)
1276 aSig
= extractFloat32Frac( a
);
1277 aExp
= extractFloat32Exp( a
);
1278 aSign
= extractFloat32Sign( a
);
1279 if ( aExp
== 0xFF ) {
1280 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a
) );
1281 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1284 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1285 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1288 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1295 -------------------------------------------------------------------------------
1296 Rounds the single-precision floating-point value `a' to an integer, and
1297 returns the result as a single-precision floating-point value. The
1298 operation is performed according to the IEC/IEEE Standard for Binary
1299 Floating-point Arithmetic.
1300 -------------------------------------------------------------------------------
1302 float32
float32_round_to_int( float32 a
)
1306 bits32 lastBitMask
, roundBitsMask
;
1310 aExp
= extractFloat32Exp( a
);
1311 if ( 0x96 <= aExp
) {
1312 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1313 return propagateFloat32NaN( a
, a
);
1317 if ( aExp
<= 0x7E ) {
1318 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
1319 float_exception_flags
|= float_flag_inexact
;
1320 aSign
= extractFloat32Sign( a
);
1321 switch ( float_rounding_mode
) {
1322 case float_round_nearest_even
:
1323 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1324 return packFloat32( aSign
, 0x7F, 0 );
1327 case float_round_down
:
1328 return aSign
? 0xBF800000 : 0;
1329 case float_round_up
:
1330 return aSign
? 0x80000000 : 0x3F800000;
1332 return packFloat32( aSign
, 0, 0 );
1335 lastBitMask
<<= 0x96 - aExp
;
1336 roundBitsMask
= lastBitMask
- 1;
1338 roundingMode
= float_rounding_mode
;
1339 if ( roundingMode
== float_round_nearest_even
) {
1340 z
+= lastBitMask
>>1;
1341 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1343 else if ( roundingMode
!= float_round_to_zero
) {
1344 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1348 z
&= ~ roundBitsMask
;
1349 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1355 -------------------------------------------------------------------------------
1356 Returns the result of adding the absolute values of the single-precision
1357 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1358 before being returned. `zSign' is ignored if the result is a NaN. The
1359 addition is performed according to the IEC/IEEE Standard for Binary
1360 Floating-point Arithmetic.
1361 -------------------------------------------------------------------------------
1363 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1365 int16 aExp
, bExp
, zExp
;
1366 bits32 aSig
, bSig
, zSig
;
1369 aSig
= extractFloat32Frac( a
);
1370 aExp
= extractFloat32Exp( a
);
1371 bSig
= extractFloat32Frac( b
);
1372 bExp
= extractFloat32Exp( b
);
1373 expDiff
= aExp
- bExp
;
1376 if ( 0 < expDiff
) {
1377 if ( aExp
== 0xFF ) {
1378 if ( aSig
) return propagateFloat32NaN( a
, b
);
1387 shift32RightJamming( bSig
, expDiff
, &bSig
);
1390 else if ( expDiff
< 0 ) {
1391 if ( bExp
== 0xFF ) {
1392 if ( bSig
) return propagateFloat32NaN( a
, b
);
1393 return packFloat32( zSign
, 0xFF, 0 );
1401 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1405 if ( aExp
== 0xFF ) {
1406 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1409 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1410 zSig
= 0x40000000 + aSig
+ bSig
;
1415 zSig
= ( aSig
+ bSig
)<<1;
1417 if ( (sbits32
) zSig
< 0 ) {
1422 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1427 -------------------------------------------------------------------------------
1428 Returns the result of subtracting the absolute values of the single-
1429 precision floating-point values `a' and `b'. If `zSign' is true, the
1430 difference is negated before being returned. `zSign' is ignored if the
1431 result is a NaN. The subtraction is performed according to the IEC/IEEE
1432 Standard for Binary Floating-point Arithmetic.
1433 -------------------------------------------------------------------------------
1435 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1437 int16 aExp
, bExp
, zExp
;
1438 bits32 aSig
, bSig
, zSig
;
1441 aSig
= extractFloat32Frac( a
);
1442 aExp
= extractFloat32Exp( a
);
1443 bSig
= extractFloat32Frac( b
);
1444 bExp
= extractFloat32Exp( b
);
1445 expDiff
= aExp
- bExp
;
1448 if ( 0 < expDiff
) goto aExpBigger
;
1449 if ( expDiff
< 0 ) goto bExpBigger
;
1450 if ( aExp
== 0xFF ) {
1451 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1452 float_raise( float_flag_invalid
);
1453 return float32_default_nan
;
1459 if ( bSig
< aSig
) goto aBigger
;
1460 if ( aSig
< bSig
) goto bBigger
;
1461 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
1463 if ( bExp
== 0xFF ) {
1464 if ( bSig
) return propagateFloat32NaN( a
, b
);
1465 return packFloat32( zSign
^ 1, 0xFF, 0 );
1473 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1479 goto normalizeRoundAndPack
;
1481 if ( aExp
== 0xFF ) {
1482 if ( aSig
) return propagateFloat32NaN( a
, b
);
1491 shift32RightJamming( bSig
, expDiff
, &bSig
);
1496 normalizeRoundAndPack
:
1498 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1503 -------------------------------------------------------------------------------
1504 Returns the result of adding the single-precision floating-point values `a'
1505 and `b'. The operation is performed according to the IEC/IEEE Standard for
1506 Binary Floating-point Arithmetic.
1507 -------------------------------------------------------------------------------
1509 float32
float32_add( float32 a
, float32 b
)
1513 aSign
= extractFloat32Sign( a
);
1514 bSign
= extractFloat32Sign( b
);
1515 if ( aSign
== bSign
) {
1516 return addFloat32Sigs( a
, b
, aSign
);
1519 return subFloat32Sigs( a
, b
, aSign
);
1525 -------------------------------------------------------------------------------
1526 Returns the result of subtracting the single-precision floating-point values
1527 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1528 for Binary Floating-point Arithmetic.
1529 -------------------------------------------------------------------------------
1531 float32
float32_sub( float32 a
, float32 b
)
1535 aSign
= extractFloat32Sign( a
);
1536 bSign
= extractFloat32Sign( b
);
1537 if ( aSign
== bSign
) {
1538 return subFloat32Sigs( a
, b
, aSign
);
1541 return addFloat32Sigs( a
, b
, aSign
);
1547 -------------------------------------------------------------------------------
1548 Returns the result of multiplying the single-precision floating-point values
1549 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1550 for Binary Floating-point Arithmetic.
1551 -------------------------------------------------------------------------------
1553 float32
float32_mul( float32 a
, float32 b
)
1555 flag aSign
, bSign
, zSign
;
1556 int16 aExp
, bExp
, zExp
;
1561 aSig
= extractFloat32Frac( a
);
1562 aExp
= extractFloat32Exp( a
);
1563 aSign
= extractFloat32Sign( a
);
1564 bSig
= extractFloat32Frac( b
);
1565 bExp
= extractFloat32Exp( b
);
1566 bSign
= extractFloat32Sign( b
);
1567 zSign
= aSign
^ bSign
;
1568 if ( aExp
== 0xFF ) {
1569 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1570 return propagateFloat32NaN( a
, b
);
1572 if ( ( bExp
| bSig
) == 0 ) {
1573 float_raise( float_flag_invalid
);
1574 return float32_default_nan
;
1576 return packFloat32( zSign
, 0xFF, 0 );
1578 if ( bExp
== 0xFF ) {
1579 if ( bSig
) return propagateFloat32NaN( a
, b
);
1580 if ( ( aExp
| aSig
) == 0 ) {
1581 float_raise( float_flag_invalid
);
1582 return float32_default_nan
;
1584 return packFloat32( zSign
, 0xFF, 0 );
1587 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1588 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1591 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1592 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1594 zExp
= aExp
+ bExp
- 0x7F;
1595 aSig
= ( aSig
| 0x00800000 )<<7;
1596 bSig
= ( bSig
| 0x00800000 )<<8;
1597 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1599 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1603 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1608 -------------------------------------------------------------------------------
1609 Returns the result of dividing the single-precision floating-point value `a'
1610 by the corresponding value `b'. The operation is performed according to the
1611 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1612 -------------------------------------------------------------------------------
1614 float32
float32_div( float32 a
, float32 b
)
1616 flag aSign
, bSign
, zSign
;
1617 int16 aExp
, bExp
, zExp
;
1618 bits32 aSig
, bSig
, zSig
;
1620 aSig
= extractFloat32Frac( a
);
1621 aExp
= extractFloat32Exp( a
);
1622 aSign
= extractFloat32Sign( a
);
1623 bSig
= extractFloat32Frac( b
);
1624 bExp
= extractFloat32Exp( b
);
1625 bSign
= extractFloat32Sign( b
);
1626 zSign
= aSign
^ bSign
;
1627 if ( aExp
== 0xFF ) {
1628 if ( aSig
) return propagateFloat32NaN( a
, b
);
1629 if ( bExp
== 0xFF ) {
1630 if ( bSig
) return propagateFloat32NaN( a
, b
);
1631 float_raise( float_flag_invalid
);
1632 return float32_default_nan
;
1634 return packFloat32( zSign
, 0xFF, 0 );
1636 if ( bExp
== 0xFF ) {
1637 if ( bSig
) return propagateFloat32NaN( a
, b
);
1638 return packFloat32( zSign
, 0, 0 );
1642 if ( ( aExp
| aSig
) == 0 ) {
1643 float_raise( float_flag_invalid
);
1644 return float32_default_nan
;
1646 float_raise( float_flag_divbyzero
);
1647 return packFloat32( zSign
, 0xFF, 0 );
1649 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1652 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1653 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1655 zExp
= aExp
- bExp
+ 0x7D;
1656 aSig
= ( aSig
| 0x00800000 )<<7;
1657 bSig
= ( bSig
| 0x00800000 )<<8;
1658 if ( bSig
<= ( aSig
+ aSig
) ) {
1662 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1663 if ( ( zSig
& 0x3F ) == 0 ) {
1664 zSig
|= ( ( (bits64
) bSig
) * zSig
!= ( (bits64
) aSig
)<<32 );
1666 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1671 -------------------------------------------------------------------------------
1672 Returns the remainder of the single-precision floating-point value `a'
1673 with respect to the corresponding value `b'. The operation is performed
1674 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1675 -------------------------------------------------------------------------------
1677 float32
float32_rem( float32 a
, float32 b
)
1679 flag aSign
, bSign
, zSign
;
1680 int16 aExp
, bExp
, expDiff
;
1683 bits64 aSig64
, bSig64
, q64
;
1684 bits32 alternateASig
;
1687 aSig
= extractFloat32Frac( a
);
1688 aExp
= extractFloat32Exp( a
);
1689 aSign
= extractFloat32Sign( a
);
1690 bSig
= extractFloat32Frac( b
);
1691 bExp
= extractFloat32Exp( b
);
1692 bSign
= extractFloat32Sign( b
);
1693 if ( aExp
== 0xFF ) {
1694 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1695 return propagateFloat32NaN( a
, b
);
1697 float_raise( float_flag_invalid
);
1698 return float32_default_nan
;
1700 if ( bExp
== 0xFF ) {
1701 if ( bSig
) return propagateFloat32NaN( a
, b
);
1706 float_raise( float_flag_invalid
);
1707 return float32_default_nan
;
1709 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1712 if ( aSig
== 0 ) return a
;
1713 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1715 expDiff
= aExp
- bExp
;
1718 if ( expDiff
< 32 ) {
1721 if ( expDiff
< 0 ) {
1722 if ( expDiff
< -1 ) return a
;
1725 q
= ( bSig
<= aSig
);
1726 if ( q
) aSig
-= bSig
;
1727 if ( 0 < expDiff
) {
1728 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1731 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1739 if ( bSig
<= aSig
) aSig
-= bSig
;
1740 aSig64
= ( (bits64
) aSig
)<<40;
1741 bSig64
= ( (bits64
) bSig
)<<40;
1743 while ( 0 < expDiff
) {
1744 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1745 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1746 aSig64
= - ( ( bSig
* q64
)<<38 );
1750 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1751 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1752 q
= q64
>>( 64 - expDiff
);
1754 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1757 alternateASig
= aSig
;
1760 } while ( 0 <= (sbits32
) aSig
);
1761 sigMean
= aSig
+ alternateASig
;
1762 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1763 aSig
= alternateASig
;
1765 zSign
= ( (sbits32
) aSig
< 0 );
1766 if ( zSign
) aSig
= - aSig
;
1767 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
1772 -------------------------------------------------------------------------------
1773 Returns the square root of the single-precision floating-point value `a'.
1774 The operation is performed according to the IEC/IEEE Standard for Binary
1775 Floating-point Arithmetic.
1776 -------------------------------------------------------------------------------
1778 float32
float32_sqrt( float32 a
)
1785 aSig
= extractFloat32Frac( a
);
1786 aExp
= extractFloat32Exp( a
);
1787 aSign
= extractFloat32Sign( a
);
1788 if ( aExp
== 0xFF ) {
1789 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1790 if ( ! aSign
) return a
;
1791 float_raise( float_flag_invalid
);
1792 return float32_default_nan
;
1795 if ( ( aExp
| aSig
) == 0 ) return a
;
1796 float_raise( float_flag_invalid
);
1797 return float32_default_nan
;
1800 if ( aSig
== 0 ) return 0;
1801 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1803 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1804 aSig
= ( aSig
| 0x00800000 )<<8;
1805 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1806 if ( ( zSig
& 0x7F ) <= 5 ) {
1812 term
= ( (bits64
) zSig
) * zSig
;
1813 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
1814 while ( (sbits64
) rem
< 0 ) {
1816 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
1818 zSig
|= ( rem
!= 0 );
1821 shift32RightJamming( zSig
, 1, &zSig
);
1822 return roundAndPackFloat32( 0, zExp
, zSig
);
1827 -------------------------------------------------------------------------------
1828 Returns 1 if the single-precision floating-point value `a' is equal to the
1829 corresponding value `b', and 0 otherwise. The comparison is performed
1830 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1831 -------------------------------------------------------------------------------
1833 flag
float32_eq( float32 a
, float32 b
)
1836 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1837 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1839 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1840 float_raise( float_flag_invalid
);
1844 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1849 -------------------------------------------------------------------------------
1850 Returns 1 if the single-precision floating-point value `a' is less than or
1851 equal to the corresponding value `b', and 0 otherwise. The comparison is
1852 performed according to the IEC/IEEE Standard for Binary Floating-point
1854 -------------------------------------------------------------------------------
1856 flag
float32_le( float32 a
, float32 b
)
1860 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1861 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1863 float_raise( float_flag_invalid
);
1866 aSign
= extractFloat32Sign( a
);
1867 bSign
= extractFloat32Sign( b
);
1868 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1869 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1874 -------------------------------------------------------------------------------
1875 Returns 1 if the single-precision floating-point value `a' is less than
1876 the corresponding value `b', and 0 otherwise. The comparison is performed
1877 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1878 -------------------------------------------------------------------------------
1880 flag
float32_lt( float32 a
, float32 b
)
1884 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1885 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1887 float_raise( float_flag_invalid
);
1890 aSign
= extractFloat32Sign( a
);
1891 bSign
= extractFloat32Sign( b
);
1892 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1893 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1898 -------------------------------------------------------------------------------
1899 Returns 1 if the single-precision floating-point value `a' is equal to the
1900 corresponding value `b', and 0 otherwise. The invalid exception is raised
1901 if either operand is a NaN. Otherwise, the comparison is performed
1902 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1903 -------------------------------------------------------------------------------
1905 flag
float32_eq_signaling( float32 a
, float32 b
)
1908 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1909 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1911 float_raise( float_flag_invalid
);
1914 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1919 -------------------------------------------------------------------------------
1920 Returns 1 if the single-precision floating-point value `a' is less than or
1921 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1922 cause an exception. Otherwise, the comparison is performed according to the
1923 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1924 -------------------------------------------------------------------------------
1926 flag
float32_le_quiet( float32 a
, float32 b
)
1931 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1932 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1934 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1935 float_raise( float_flag_invalid
);
1939 aSign
= extractFloat32Sign( a
);
1940 bSign
= extractFloat32Sign( b
);
1941 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1942 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1947 -------------------------------------------------------------------------------
1948 Returns 1 if the single-precision floating-point value `a' is less than
1949 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1950 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1951 Standard for Binary Floating-point Arithmetic.
1952 -------------------------------------------------------------------------------
1954 flag
float32_lt_quiet( float32 a
, float32 b
)
1958 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1959 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1961 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1962 float_raise( float_flag_invalid
);
1966 aSign
= extractFloat32Sign( a
);
1967 bSign
= extractFloat32Sign( b
);
1968 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1969 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1974 -------------------------------------------------------------------------------
1975 Returns the result of converting the double-precision floating-point value
1976 `a' to the 32-bit two's complement integer format. The conversion is
1977 performed according to the IEC/IEEE Standard for Binary Floating-point
1978 Arithmetic---which means in particular that the conversion is rounded
1979 according to the current rounding mode. If `a' is a NaN, the largest
1980 positive integer is returned. Otherwise, if the conversion overflows, the
1981 largest integer with the same sign as `a' is returned.
1982 -------------------------------------------------------------------------------
1984 int32
float64_to_int32( float64 a
)
1987 int16 aExp
, shiftCount
;
1990 aSig
= extractFloat64Frac( a
);
1991 aExp
= extractFloat64Exp( a
);
1992 aSign
= extractFloat64Sign( a
);
1993 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1994 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1995 shiftCount
= 0x42C - aExp
;
1996 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1997 return roundAndPackInt32( aSign
, aSig
);
2002 -------------------------------------------------------------------------------
2003 Returns the result of converting the double-precision floating-point value
2004 `a' to the 32-bit two's complement integer format. The conversion is
2005 performed according to the IEC/IEEE Standard for Binary Floating-point
2006 Arithmetic, except that the conversion is always rounded toward zero. If
2007 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2008 conversion overflows, the largest integer with the same sign as `a' is
2010 -------------------------------------------------------------------------------
2012 int32
float64_to_int32_round_to_zero( float64 a
)
2015 int16 aExp
, shiftCount
;
2016 bits64 aSig
, savedASig
;
2019 aSig
= extractFloat64Frac( a
);
2020 aExp
= extractFloat64Exp( a
);
2021 aSign
= extractFloat64Sign( a
);
2022 shiftCount
= 0x433 - aExp
;
2023 if ( shiftCount
< 21 ) {
2024 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2027 else if ( 52 < shiftCount
) {
2028 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2031 aSig
|= LIT64( 0x0010000000000000 );
2033 aSig
>>= shiftCount
;
2035 if ( aSign
) z
= - z
;
2036 if ( ( z
< 0 ) ^ aSign
) {
2038 float_exception_flags
|= float_flag_invalid
;
2039 return aSign
? 0x80000000 : 0x7FFFFFFF;
2041 if ( ( aSig
<<shiftCount
) != savedASig
) {
2042 float_exception_flags
|= float_flag_inexact
;
2049 -------------------------------------------------------------------------------
2050 Returns the result of converting the double-precision floating-point value
2051 `a' to the 32-bit two's complement unsigned integer format. The conversion
2052 is performed according to the IEC/IEEE Standard for Binary Floating-point
2053 Arithmetic---which means in particular that the conversion is rounded
2054 according to the current rounding mode. If `a' is a NaN, the largest
2055 positive integer is returned. Otherwise, if the conversion overflows, the
2056 largest positive integer is returned.
2057 -------------------------------------------------------------------------------
2059 int32
float64_to_uint32( float64 a
)
2062 int16 aExp
, shiftCount
;
2065 aSig
= extractFloat64Frac( a
);
2066 aExp
= extractFloat64Exp( a
);
2067 aSign
= 0; //extractFloat64Sign( a );
2068 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2069 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2070 shiftCount
= 0x42C - aExp
;
2071 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2072 return roundAndPackInt32( aSign
, aSig
);
2076 -------------------------------------------------------------------------------
2077 Returns the result of converting the double-precision floating-point value
2078 `a' to the 32-bit two's complement integer format. The conversion is
2079 performed according to the IEC/IEEE Standard for Binary Floating-point
2080 Arithmetic, except that the conversion is always rounded toward zero. If
2081 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2082 conversion overflows, the largest positive integer is returned.
2083 -------------------------------------------------------------------------------
2085 int32
float64_to_uint32_round_to_zero( float64 a
)
2088 int16 aExp
, shiftCount
;
2089 bits64 aSig
, savedASig
;
2092 aSig
= extractFloat64Frac( a
);
2093 aExp
= extractFloat64Exp( a
);
2094 aSign
= extractFloat64Sign( a
);
2095 shiftCount
= 0x433 - aExp
;
2096 if ( shiftCount
< 21 ) {
2097 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2100 else if ( 52 < shiftCount
) {
2101 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2104 aSig
|= LIT64( 0x0010000000000000 );
2106 aSig
>>= shiftCount
;
2108 if ( aSign
) z
= - z
;
2109 if ( ( z
< 0 ) ^ aSign
) {
2111 float_exception_flags
|= float_flag_invalid
;
2112 return aSign
? 0x80000000 : 0x7FFFFFFF;
2114 if ( ( aSig
<<shiftCount
) != savedASig
) {
2115 float_exception_flags
|= float_flag_inexact
;
2121 -------------------------------------------------------------------------------
2122 Returns the result of converting the double-precision floating-point value
2123 `a' to the single-precision floating-point format. The conversion is
2124 performed according to the IEC/IEEE Standard for Binary Floating-point
2126 -------------------------------------------------------------------------------
2128 float32
float64_to_float32( float64 a
)
2135 aSig
= extractFloat64Frac( a
);
2136 aExp
= extractFloat64Exp( a
);
2137 aSign
= extractFloat64Sign( a
);
2138 if ( aExp
== 0x7FF ) {
2139 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
2140 return packFloat32( aSign
, 0xFF, 0 );
2142 shift64RightJamming( aSig
, 22, &aSig
);
2144 if ( aExp
|| zSig
) {
2148 return roundAndPackFloat32( aSign
, aExp
, zSig
);
2155 -------------------------------------------------------------------------------
2156 Returns the result of converting the double-precision floating-point value
2157 `a' to the extended double-precision floating-point format. The conversion
2158 is performed according to the IEC/IEEE Standard for Binary Floating-point
2160 -------------------------------------------------------------------------------
2162 floatx80
float64_to_floatx80( float64 a
)
2168 aSig
= extractFloat64Frac( a
);
2169 aExp
= extractFloat64Exp( a
);
2170 aSign
= extractFloat64Sign( a
);
2171 if ( aExp
== 0x7FF ) {
2172 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
2173 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2176 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2177 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2181 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2190 -------------------------------------------------------------------------------
2191 Returns the result of converting the double-precision floating-point value
2192 `a' to the quadruple-precision floating-point format. The conversion is
2193 performed according to the IEC/IEEE Standard for Binary Floating-point
2195 -------------------------------------------------------------------------------
2197 float128
float64_to_float128( float64 a
)
2201 bits64 aSig
, zSig0
, zSig1
;
2203 aSig
= extractFloat64Frac( a
);
2204 aExp
= extractFloat64Exp( a
);
2205 aSign
= extractFloat64Sign( a
);
2206 if ( aExp
== 0x7FF ) {
2207 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a
) );
2208 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2211 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2212 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2215 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2216 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2223 -------------------------------------------------------------------------------
2224 Rounds the double-precision floating-point value `a' to an integer, and
2225 returns the result as a double-precision floating-point value. The
2226 operation is performed according to the IEC/IEEE Standard for Binary
2227 Floating-point Arithmetic.
2228 -------------------------------------------------------------------------------
2230 float64
float64_round_to_int( float64 a
)
2234 bits64 lastBitMask
, roundBitsMask
;
2238 aExp
= extractFloat64Exp( a
);
2239 if ( 0x433 <= aExp
) {
2240 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2241 return propagateFloat64NaN( a
, a
);
2245 if ( aExp
<= 0x3FE ) {
2246 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
2247 float_exception_flags
|= float_flag_inexact
;
2248 aSign
= extractFloat64Sign( a
);
2249 switch ( float_rounding_mode
) {
2250 case float_round_nearest_even
:
2251 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2252 return packFloat64( aSign
, 0x3FF, 0 );
2255 case float_round_down
:
2256 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
2257 case float_round_up
:
2259 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2261 return packFloat64( aSign
, 0, 0 );
2264 lastBitMask
<<= 0x433 - aExp
;
2265 roundBitsMask
= lastBitMask
- 1;
2267 roundingMode
= float_rounding_mode
;
2268 if ( roundingMode
== float_round_nearest_even
) {
2269 z
+= lastBitMask
>>1;
2270 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2272 else if ( roundingMode
!= float_round_to_zero
) {
2273 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2277 z
&= ~ roundBitsMask
;
2278 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
2284 -------------------------------------------------------------------------------
2285 Returns the result of adding the absolute values of the double-precision
2286 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
2287 before being returned. `zSign' is ignored if the result is a NaN. The
2288 addition is performed according to the IEC/IEEE Standard for Binary
2289 Floating-point Arithmetic.
2290 -------------------------------------------------------------------------------
2292 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2294 int16 aExp
, bExp
, zExp
;
2295 bits64 aSig
, bSig
, zSig
;
2298 aSig
= extractFloat64Frac( a
);
2299 aExp
= extractFloat64Exp( a
);
2300 bSig
= extractFloat64Frac( b
);
2301 bExp
= extractFloat64Exp( b
);
2302 expDiff
= aExp
- bExp
;
2305 if ( 0 < expDiff
) {
2306 if ( aExp
== 0x7FF ) {
2307 if ( aSig
) return propagateFloat64NaN( a
, b
);
2314 bSig
|= LIT64( 0x2000000000000000 );
2316 shift64RightJamming( bSig
, expDiff
, &bSig
);
2319 else if ( expDiff
< 0 ) {
2320 if ( bExp
== 0x7FF ) {
2321 if ( bSig
) return propagateFloat64NaN( a
, b
);
2322 return packFloat64( zSign
, 0x7FF, 0 );
2328 aSig
|= LIT64( 0x2000000000000000 );
2330 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2334 if ( aExp
== 0x7FF ) {
2335 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2338 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2339 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2343 aSig
|= LIT64( 0x2000000000000000 );
2344 zSig
= ( aSig
+ bSig
)<<1;
2346 if ( (sbits64
) zSig
< 0 ) {
2351 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2356 -------------------------------------------------------------------------------
2357 Returns the result of subtracting the absolute values of the double-
2358 precision floating-point values `a' and `b'. If `zSign' is true, the
2359 difference is negated before being returned. `zSign' is ignored if the
2360 result is a NaN. The subtraction is performed according to the IEC/IEEE
2361 Standard for Binary Floating-point Arithmetic.
2362 -------------------------------------------------------------------------------
2364 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2366 int16 aExp
, bExp
, zExp
;
2367 bits64 aSig
, bSig
, zSig
;
2370 aSig
= extractFloat64Frac( a
);
2371 aExp
= extractFloat64Exp( a
);
2372 bSig
= extractFloat64Frac( b
);
2373 bExp
= extractFloat64Exp( b
);
2374 expDiff
= aExp
- bExp
;
2377 if ( 0 < expDiff
) goto aExpBigger
;
2378 if ( expDiff
< 0 ) goto bExpBigger
;
2379 if ( aExp
== 0x7FF ) {
2380 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2381 float_raise( float_flag_invalid
);
2382 return float64_default_nan
;
2388 if ( bSig
< aSig
) goto aBigger
;
2389 if ( aSig
< bSig
) goto bBigger
;
2390 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0 );
2392 if ( bExp
== 0x7FF ) {
2393 if ( bSig
) return propagateFloat64NaN( a
, b
);
2394 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2400 aSig
|= LIT64( 0x4000000000000000 );
2402 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2403 bSig
|= LIT64( 0x4000000000000000 );
2408 goto normalizeRoundAndPack
;
2410 if ( aExp
== 0x7FF ) {
2411 if ( aSig
) return propagateFloat64NaN( a
, b
);
2418 bSig
|= LIT64( 0x4000000000000000 );
2420 shift64RightJamming( bSig
, expDiff
, &bSig
);
2421 aSig
|= LIT64( 0x4000000000000000 );
2425 normalizeRoundAndPack
:
2427 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2432 -------------------------------------------------------------------------------
2433 Returns the result of adding the double-precision floating-point values `a'
2434 and `b'. The operation is performed according to the IEC/IEEE Standard for
2435 Binary Floating-point Arithmetic.
2436 -------------------------------------------------------------------------------
2438 float64
float64_add( float64 a
, float64 b
)
2442 aSign
= extractFloat64Sign( a
);
2443 bSign
= extractFloat64Sign( b
);
2444 if ( aSign
== bSign
) {
2445 return addFloat64Sigs( a
, b
, aSign
);
2448 return subFloat64Sigs( a
, b
, aSign
);
2454 -------------------------------------------------------------------------------
2455 Returns the result of subtracting the double-precision floating-point values
2456 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2457 for Binary Floating-point Arithmetic.
2458 -------------------------------------------------------------------------------
2460 float64
float64_sub( float64 a
, float64 b
)
2464 aSign
= extractFloat64Sign( a
);
2465 bSign
= extractFloat64Sign( b
);
2466 if ( aSign
== bSign
) {
2467 return subFloat64Sigs( a
, b
, aSign
);
2470 return addFloat64Sigs( a
, b
, aSign
);
2476 -------------------------------------------------------------------------------
2477 Returns the result of multiplying the double-precision floating-point values
2478 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2479 for Binary Floating-point Arithmetic.
2480 -------------------------------------------------------------------------------
2482 float64
float64_mul( float64 a
, float64 b
)
2484 flag aSign
, bSign
, zSign
;
2485 int16 aExp
, bExp
, zExp
;
2486 bits64 aSig
, bSig
, zSig0
, zSig1
;
2488 aSig
= extractFloat64Frac( a
);
2489 aExp
= extractFloat64Exp( a
);
2490 aSign
= extractFloat64Sign( a
);
2491 bSig
= extractFloat64Frac( b
);
2492 bExp
= extractFloat64Exp( b
);
2493 bSign
= extractFloat64Sign( b
);
2494 zSign
= aSign
^ bSign
;
2495 if ( aExp
== 0x7FF ) {
2496 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2497 return propagateFloat64NaN( a
, b
);
2499 if ( ( bExp
| bSig
) == 0 ) {
2500 float_raise( float_flag_invalid
);
2501 return float64_default_nan
;
2503 return packFloat64( zSign
, 0x7FF, 0 );
2505 if ( bExp
== 0x7FF ) {
2506 if ( bSig
) return propagateFloat64NaN( a
, b
);
2507 if ( ( aExp
| aSig
) == 0 ) {
2508 float_raise( float_flag_invalid
);
2509 return float64_default_nan
;
2511 return packFloat64( zSign
, 0x7FF, 0 );
2514 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2515 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2518 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2519 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2521 zExp
= aExp
+ bExp
- 0x3FF;
2522 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2523 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2524 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2525 zSig0
|= ( zSig1
!= 0 );
2526 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2530 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2535 -------------------------------------------------------------------------------
2536 Returns the result of dividing the double-precision floating-point value `a'
2537 by the corresponding value `b'. The operation is performed according to
2538 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2539 -------------------------------------------------------------------------------
2541 float64
float64_div( float64 a
, float64 b
)
2543 flag aSign
, bSign
, zSign
;
2544 int16 aExp
, bExp
, zExp
;
2545 bits64 aSig
, bSig
, zSig
;
2547 bits64 term0
, term1
;
2549 aSig
= extractFloat64Frac( a
);
2550 aExp
= extractFloat64Exp( a
);
2551 aSign
= extractFloat64Sign( a
);
2552 bSig
= extractFloat64Frac( b
);
2553 bExp
= extractFloat64Exp( b
);
2554 bSign
= extractFloat64Sign( b
);
2555 zSign
= aSign
^ bSign
;
2556 if ( aExp
== 0x7FF ) {
2557 if ( aSig
) return propagateFloat64NaN( a
, b
);
2558 if ( bExp
== 0x7FF ) {
2559 if ( bSig
) return propagateFloat64NaN( a
, b
);
2560 float_raise( float_flag_invalid
);
2561 return float64_default_nan
;
2563 return packFloat64( zSign
, 0x7FF, 0 );
2565 if ( bExp
== 0x7FF ) {
2566 if ( bSig
) return propagateFloat64NaN( a
, b
);
2567 return packFloat64( zSign
, 0, 0 );
2571 if ( ( aExp
| aSig
) == 0 ) {
2572 float_raise( float_flag_invalid
);
2573 return float64_default_nan
;
2575 float_raise( float_flag_divbyzero
);
2576 return packFloat64( zSign
, 0x7FF, 0 );
2578 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2581 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2582 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2584 zExp
= aExp
- bExp
+ 0x3FD;
2585 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2586 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2587 if ( bSig
<= ( aSig
+ aSig
) ) {
2591 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2592 if ( ( zSig
& 0x1FF ) <= 2 ) {
2593 mul64To128( bSig
, zSig
, &term0
, &term1
);
2594 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2595 while ( (sbits64
) rem0
< 0 ) {
2597 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2599 zSig
|= ( rem1
!= 0 );
2601 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2606 -------------------------------------------------------------------------------
2607 Returns the remainder of the double-precision floating-point value `a'
2608 with respect to the corresponding value `b'. The operation is performed
2609 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2610 -------------------------------------------------------------------------------
2612 float64
float64_rem( float64 a
, float64 b
)
2614 flag aSign
, bSign
, zSign
;
2615 int16 aExp
, bExp
, expDiff
;
2617 bits64 q
, alternateASig
;
2620 aSig
= extractFloat64Frac( a
);
2621 aExp
= extractFloat64Exp( a
);
2622 aSign
= extractFloat64Sign( a
);
2623 bSig
= extractFloat64Frac( b
);
2624 bExp
= extractFloat64Exp( b
);
2625 bSign
= extractFloat64Sign( b
);
2626 if ( aExp
== 0x7FF ) {
2627 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2628 return propagateFloat64NaN( a
, b
);
2630 float_raise( float_flag_invalid
);
2631 return float64_default_nan
;
2633 if ( bExp
== 0x7FF ) {
2634 if ( bSig
) return propagateFloat64NaN( a
, b
);
2639 float_raise( float_flag_invalid
);
2640 return float64_default_nan
;
2642 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2645 if ( aSig
== 0 ) return a
;
2646 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2648 expDiff
= aExp
- bExp
;
2649 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2650 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2651 if ( expDiff
< 0 ) {
2652 if ( expDiff
< -1 ) return a
;
2655 q
= ( bSig
<= aSig
);
2656 if ( q
) aSig
-= bSig
;
2658 while ( 0 < expDiff
) {
2659 q
= estimateDiv128To64( aSig
, 0, bSig
);
2660 q
= ( 2 < q
) ? q
- 2 : 0;
2661 aSig
= - ( ( bSig
>>2 ) * q
);
2665 if ( 0 < expDiff
) {
2666 q
= estimateDiv128To64( aSig
, 0, bSig
);
2667 q
= ( 2 < q
) ? q
- 2 : 0;
2670 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2677 alternateASig
= aSig
;
2680 } while ( 0 <= (sbits64
) aSig
);
2681 sigMean
= aSig
+ alternateASig
;
2682 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2683 aSig
= alternateASig
;
2685 zSign
= ( (sbits64
) aSig
< 0 );
2686 if ( zSign
) aSig
= - aSig
;
2687 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
2692 -------------------------------------------------------------------------------
2693 Returns the square root of the double-precision floating-point value `a'.
2694 The operation is performed according to the IEC/IEEE Standard for Binary
2695 Floating-point Arithmetic.
2696 -------------------------------------------------------------------------------
2698 float64
float64_sqrt( float64 a
)
2703 bits64 rem0
, rem1
, term0
, term1
; //, shiftedRem;
2706 aSig
= extractFloat64Frac( a
);
2707 aExp
= extractFloat64Exp( a
);
2708 aSign
= extractFloat64Sign( a
);
2709 if ( aExp
== 0x7FF ) {
2710 if ( aSig
) return propagateFloat64NaN( a
, a
);
2711 if ( ! aSign
) return a
;
2712 float_raise( float_flag_invalid
);
2713 return float64_default_nan
;
2716 if ( ( aExp
| aSig
) == 0 ) return a
;
2717 float_raise( float_flag_invalid
);
2718 return float64_default_nan
;
2721 if ( aSig
== 0 ) return 0;
2722 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2724 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2725 aSig
|= LIT64( 0x0010000000000000 );
2726 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2728 aSig
<<= 9 - ( aExp
& 1 );
2729 zSig
= estimateDiv128To64( aSig
, 0, zSig
) + zSig
+ 2;
2730 if ( ( zSig
& 0x3FF ) <= 5 ) {
2732 zSig
= LIT64( 0xFFFFFFFFFFFFFFFF );
2736 mul64To128( zSig
, zSig
, &term0
, &term1
);
2737 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2738 while ( (sbits64
) rem0
< 0 ) {
2740 shortShift128Left( 0, zSig
, 1, &term0
, &term1
);
2742 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
2744 zSig
|= ( ( rem0
| rem1
) != 0 );
2747 shift64RightJamming( zSig
, 1, &zSig
);
2748 return roundAndPackFloat64( 0, zExp
, zSig
);
2753 -------------------------------------------------------------------------------
2754 Returns 1 if the double-precision floating-point value `a' is equal to the
2755 corresponding value `b', and 0 otherwise. The comparison is performed
2756 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2757 -------------------------------------------------------------------------------
2759 flag
float64_eq( float64 a
, float64 b
)
2762 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2763 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2765 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2766 float_raise( float_flag_invalid
);
2770 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2775 -------------------------------------------------------------------------------
2776 Returns 1 if the double-precision floating-point value `a' is less than or
2777 equal to the corresponding value `b', and 0 otherwise. The comparison is
2778 performed according to the IEC/IEEE Standard for Binary Floating-point
2780 -------------------------------------------------------------------------------
2782 flag
float64_le( float64 a
, float64 b
)
2786 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2787 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2789 float_raise( float_flag_invalid
);
2792 aSign
= extractFloat64Sign( a
);
2793 bSign
= extractFloat64Sign( b
);
2794 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2795 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2800 -------------------------------------------------------------------------------
2801 Returns 1 if the double-precision floating-point value `a' is less than
2802 the corresponding value `b', and 0 otherwise. The comparison is performed
2803 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2804 -------------------------------------------------------------------------------
2806 flag
float64_lt( float64 a
, float64 b
)
2810 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2811 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2813 float_raise( float_flag_invalid
);
2816 aSign
= extractFloat64Sign( a
);
2817 bSign
= extractFloat64Sign( b
);
2818 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2819 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2824 -------------------------------------------------------------------------------
2825 Returns 1 if the double-precision floating-point value `a' is equal to the
2826 corresponding value `b', and 0 otherwise. The invalid exception is raised
2827 if either operand is a NaN. Otherwise, the comparison is performed
2828 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2829 -------------------------------------------------------------------------------
2831 flag
float64_eq_signaling( float64 a
, float64 b
)
2834 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2835 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2837 float_raise( float_flag_invalid
);
2840 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2845 -------------------------------------------------------------------------------
2846 Returns 1 if the double-precision floating-point value `a' is less than or
2847 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2848 cause an exception. Otherwise, the comparison is performed according to the
2849 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2850 -------------------------------------------------------------------------------
2852 flag
float64_le_quiet( float64 a
, float64 b
)
2857 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2858 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2860 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2861 float_raise( float_flag_invalid
);
2865 aSign
= extractFloat64Sign( a
);
2866 bSign
= extractFloat64Sign( b
);
2867 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2868 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2873 -------------------------------------------------------------------------------
2874 Returns 1 if the double-precision floating-point value `a' is less than
2875 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2876 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2877 Standard for Binary Floating-point Arithmetic.
2878 -------------------------------------------------------------------------------
2880 flag
float64_lt_quiet( float64 a
, float64 b
)
2884 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2885 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2887 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2888 float_raise( float_flag_invalid
);
2892 aSign
= extractFloat64Sign( a
);
2893 bSign
= extractFloat64Sign( b
);
2894 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2895 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2902 -------------------------------------------------------------------------------
2903 Returns the result of converting the extended double-precision floating-
2904 point value `a' to the 32-bit two's complement integer format. The
2905 conversion is performed according to the IEC/IEEE Standard for Binary
2906 Floating-point Arithmetic---which means in particular that the conversion
2907 is rounded according to the current rounding mode. If `a' is a NaN, the
2908 largest positive integer is returned. Otherwise, if the conversion
2909 overflows, the largest integer with the same sign as `a' is returned.
2910 -------------------------------------------------------------------------------
2912 int32
floatx80_to_int32( floatx80 a
)
2915 int32 aExp
, shiftCount
;
2918 aSig
= extractFloatx80Frac( a
);
2919 aExp
= extractFloatx80Exp( a
);
2920 aSign
= extractFloatx80Sign( a
);
2921 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2922 shiftCount
= 0x4037 - aExp
;
2923 if ( shiftCount
<= 0 ) shiftCount
= 1;
2924 shift64RightJamming( aSig
, shiftCount
, &aSig
);
2925 return roundAndPackInt32( aSign
, aSig
);
2930 -------------------------------------------------------------------------------
2931 Returns the result of converting the extended double-precision floating-
2932 point value `a' to the 32-bit two's complement integer format. The
2933 conversion is performed according to the IEC/IEEE Standard for Binary
2934 Floating-point Arithmetic, except that the conversion is always rounded
2935 toward zero. If `a' is a NaN, the largest positive integer is returned.
2936 Otherwise, if the conversion overflows, the largest integer with the same
2937 sign as `a' is returned.
2938 -------------------------------------------------------------------------------
2940 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
2943 int32 aExp
, shiftCount
;
2944 bits64 aSig
, savedASig
;
2947 aSig
= extractFloatx80Frac( a
);
2948 aExp
= extractFloatx80Exp( a
);
2949 aSign
= extractFloatx80Sign( a
);
2950 shiftCount
= 0x403E - aExp
;
2951 if ( shiftCount
< 32 ) {
2952 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2955 else if ( 63 < shiftCount
) {
2956 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2960 aSig
>>= shiftCount
;
2962 if ( aSign
) z
= - z
;
2963 if ( ( z
< 0 ) ^ aSign
) {
2965 float_exception_flags
|= float_flag_invalid
;
2966 return aSign
? 0x80000000 : 0x7FFFFFFF;
2968 if ( ( aSig
<<shiftCount
) != savedASig
) {
2969 float_exception_flags
|= float_flag_inexact
;
2976 -------------------------------------------------------------------------------
2977 Returns the result of converting the extended double-precision floating-
2978 point value `a' to the single-precision floating-point format. The
2979 conversion is performed according to the IEC/IEEE Standard for Binary
2980 Floating-point Arithmetic.
2981 -------------------------------------------------------------------------------
2983 float32
floatx80_to_float32( floatx80 a
)
2989 aSig
= extractFloatx80Frac( a
);
2990 aExp
= extractFloatx80Exp( a
);
2991 aSign
= extractFloatx80Sign( a
);
2992 if ( aExp
== 0x7FFF ) {
2993 if ( (bits64
) ( aSig
<<1 ) ) {
2994 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
2996 return packFloat32( aSign
, 0xFF, 0 );
2998 shift64RightJamming( aSig
, 33, &aSig
);
2999 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3000 return roundAndPackFloat32( aSign
, aExp
, aSig
);
3005 -------------------------------------------------------------------------------
3006 Returns the result of converting the extended double-precision floating-
3007 point value `a' to the double-precision floating-point format. The
3008 conversion is performed according to the IEC/IEEE Standard for Binary
3009 Floating-point Arithmetic.
3010 -------------------------------------------------------------------------------
3012 float64
floatx80_to_float64( floatx80 a
)
3018 aSig
= extractFloatx80Frac( a
);
3019 aExp
= extractFloatx80Exp( a
);
3020 aSign
= extractFloatx80Sign( a
);
3021 if ( aExp
== 0x7FFF ) {
3022 if ( (bits64
) ( aSig
<<1 ) ) {
3023 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
3025 return packFloat64( aSign
, 0x7FF, 0 );
3027 shift64RightJamming( aSig
, 1, &zSig
);
3028 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3029 return roundAndPackFloat64( aSign
, aExp
, zSig
);
3036 -------------------------------------------------------------------------------
3037 Returns the result of converting the extended double-precision floating-
3038 point value `a' to the quadruple-precision floating-point format. The
3039 conversion is performed according to the IEC/IEEE Standard for Binary
3040 Floating-point Arithmetic.
3041 -------------------------------------------------------------------------------
3043 float128
floatx80_to_float128( floatx80 a
)
3047 bits64 aSig
, zSig0
, zSig1
;
3049 aSig
= extractFloatx80Frac( a
);
3050 aExp
= extractFloatx80Exp( a
);
3051 aSign
= extractFloatx80Sign( a
);
3052 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3053 return commonNaNToFloat128( floatx80ToCommonNaN( a
) );
3055 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3056 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3063 -------------------------------------------------------------------------------
3064 Rounds the extended double-precision floating-point value `a' to an integer,
3065 and returns the result as an extended quadruple-precision floating-point
3066 value. The operation is performed according to the IEC/IEEE Standard for
3067 Binary Floating-point Arithmetic.
3068 -------------------------------------------------------------------------------
3070 floatx80
floatx80_round_to_int( floatx80 a
)
3074 bits64 lastBitMask
, roundBitsMask
;
3078 aExp
= extractFloatx80Exp( a
);
3079 if ( 0x403E <= aExp
) {
3080 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3081 return propagateFloatx80NaN( a
, a
);
3085 if ( aExp
<= 0x3FFE ) {
3087 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3090 float_exception_flags
|= float_flag_inexact
;
3091 aSign
= extractFloatx80Sign( a
);
3092 switch ( float_rounding_mode
) {
3093 case float_round_nearest_even
:
3094 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3097 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3100 case float_round_down
:
3103 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3104 : packFloatx80( 0, 0, 0 );
3105 case float_round_up
:
3107 aSign
? packFloatx80( 1, 0, 0 )
3108 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3110 return packFloatx80( aSign
, 0, 0 );
3113 lastBitMask
<<= 0x403E - aExp
;
3114 roundBitsMask
= lastBitMask
- 1;
3116 roundingMode
= float_rounding_mode
;
3117 if ( roundingMode
== float_round_nearest_even
) {
3118 z
.low
+= lastBitMask
>>1;
3119 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3121 else if ( roundingMode
!= float_round_to_zero
) {
3122 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3123 z
.low
+= roundBitsMask
;
3126 z
.low
&= ~ roundBitsMask
;
3129 z
.low
= LIT64( 0x8000000000000000 );
3131 if ( z
.low
!= a
.low
) float_exception_flags
|= float_flag_inexact
;
3137 -------------------------------------------------------------------------------
3138 Returns the result of adding the absolute values of the extended double-
3139 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
3140 negated before being returned. `zSign' is ignored if the result is a NaN.
3141 The addition is performed according to the IEC/IEEE Standard for Binary
3142 Floating-point Arithmetic.
3143 -------------------------------------------------------------------------------
3145 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3147 int32 aExp
, bExp
, zExp
;
3148 bits64 aSig
, bSig
, zSig0
, zSig1
;
3151 aSig
= extractFloatx80Frac( a
);
3152 aExp
= extractFloatx80Exp( a
);
3153 bSig
= extractFloatx80Frac( b
);
3154 bExp
= extractFloatx80Exp( b
);
3155 expDiff
= aExp
- bExp
;
3156 if ( 0 < expDiff
) {
3157 if ( aExp
== 0x7FFF ) {
3158 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3161 if ( bExp
== 0 ) --expDiff
;
3162 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3165 else if ( expDiff
< 0 ) {
3166 if ( bExp
== 0x7FFF ) {
3167 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3168 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3170 if ( aExp
== 0 ) ++expDiff
;
3171 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3175 if ( aExp
== 0x7FFF ) {
3176 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3177 return propagateFloatx80NaN( a
, b
);
3182 zSig0
= aSig
+ bSig
;
3184 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3191 zSig0
= aSig
+ bSig
;
3193 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3195 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3196 zSig0
|= LIT64( 0x8000000000000000 );
3200 roundAndPackFloatx80(
3201 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3206 -------------------------------------------------------------------------------
3207 Returns the result of subtracting the absolute values of the extended
3208 double-precision floating-point values `a' and `b'. If `zSign' is true,
3209 the difference is negated before being returned. `zSign' is ignored if the
3210 result is a NaN. The subtraction is performed according to the IEC/IEEE
3211 Standard for Binary Floating-point Arithmetic.
3212 -------------------------------------------------------------------------------
3214 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3216 int32 aExp
, bExp
, zExp
;
3217 bits64 aSig
, bSig
, zSig0
, zSig1
;
3221 aSig
= extractFloatx80Frac( a
);
3222 aExp
= extractFloatx80Exp( a
);
3223 bSig
= extractFloatx80Frac( b
);
3224 bExp
= extractFloatx80Exp( b
);
3225 expDiff
= aExp
- bExp
;
3226 if ( 0 < expDiff
) goto aExpBigger
;
3227 if ( expDiff
< 0 ) goto bExpBigger
;
3228 if ( aExp
== 0x7FFF ) {
3229 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3230 return propagateFloatx80NaN( a
, b
);
3232 float_raise( float_flag_invalid
);
3233 z
.low
= floatx80_default_nan_low
;
3234 z
.high
= floatx80_default_nan_high
;
3242 if ( bSig
< aSig
) goto aBigger
;
3243 if ( aSig
< bSig
) goto bBigger
;
3244 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
3246 if ( bExp
== 0x7FFF ) {
3247 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3248 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3250 if ( aExp
== 0 ) ++expDiff
;
3251 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3253 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3256 goto normalizeRoundAndPack
;
3258 if ( aExp
== 0x7FFF ) {
3259 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3262 if ( bExp
== 0 ) --expDiff
;
3263 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3265 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3267 normalizeRoundAndPack
:
3269 normalizeRoundAndPackFloatx80(
3270 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3275 -------------------------------------------------------------------------------
3276 Returns the result of adding the extended double-precision floating-point
3277 values `a' and `b'. The operation is performed according to the IEC/IEEE
3278 Standard for Binary Floating-point Arithmetic.
3279 -------------------------------------------------------------------------------
3281 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
3285 aSign
= extractFloatx80Sign( a
);
3286 bSign
= extractFloatx80Sign( b
);
3287 if ( aSign
== bSign
) {
3288 return addFloatx80Sigs( a
, b
, aSign
);
3291 return subFloatx80Sigs( a
, b
, aSign
);
3297 -------------------------------------------------------------------------------
3298 Returns the result of subtracting the extended double-precision floating-
3299 point values `a' and `b'. The operation is performed according to the
3300 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3301 -------------------------------------------------------------------------------
3303 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
3307 aSign
= extractFloatx80Sign( a
);
3308 bSign
= extractFloatx80Sign( b
);
3309 if ( aSign
== bSign
) {
3310 return subFloatx80Sigs( a
, b
, aSign
);
3313 return addFloatx80Sigs( a
, b
, aSign
);
3319 -------------------------------------------------------------------------------
3320 Returns the result of multiplying the extended double-precision floating-
3321 point values `a' and `b'. The operation is performed according to the
3322 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3323 -------------------------------------------------------------------------------
3325 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
3327 flag aSign
, bSign
, zSign
;
3328 int32 aExp
, bExp
, zExp
;
3329 bits64 aSig
, bSig
, zSig0
, zSig1
;
3332 aSig
= extractFloatx80Frac( a
);
3333 aExp
= extractFloatx80Exp( a
);
3334 aSign
= extractFloatx80Sign( a
);
3335 bSig
= extractFloatx80Frac( b
);
3336 bExp
= extractFloatx80Exp( b
);
3337 bSign
= extractFloatx80Sign( b
);
3338 zSign
= aSign
^ bSign
;
3339 if ( aExp
== 0x7FFF ) {
3340 if ( (bits64
) ( aSig
<<1 )
3341 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3342 return propagateFloatx80NaN( a
, b
);
3344 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3345 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3347 if ( bExp
== 0x7FFF ) {
3348 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3349 if ( ( aExp
| aSig
) == 0 ) {
3351 float_raise( float_flag_invalid
);
3352 z
.low
= floatx80_default_nan_low
;
3353 z
.high
= floatx80_default_nan_high
;
3356 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3359 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3360 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3363 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3364 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3366 zExp
= aExp
+ bExp
- 0x3FFE;
3367 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3368 if ( 0 < (sbits64
) zSig0
) {
3369 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3373 roundAndPackFloatx80(
3374 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3379 -------------------------------------------------------------------------------
3380 Returns the result of dividing the extended double-precision floating-point
3381 value `a' by the corresponding value `b'. The operation is performed
3382 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3383 -------------------------------------------------------------------------------
3385 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
3387 flag aSign
, bSign
, zSign
;
3388 int32 aExp
, bExp
, zExp
;
3389 bits64 aSig
, bSig
, zSig0
, zSig1
;
3390 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3393 aSig
= extractFloatx80Frac( a
);
3394 aExp
= extractFloatx80Exp( a
);
3395 aSign
= extractFloatx80Sign( a
);
3396 bSig
= extractFloatx80Frac( b
);
3397 bExp
= extractFloatx80Exp( b
);
3398 bSign
= extractFloatx80Sign( b
);
3399 zSign
= aSign
^ bSign
;
3400 if ( aExp
== 0x7FFF ) {
3401 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3402 if ( bExp
== 0x7FFF ) {
3403 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3406 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3408 if ( bExp
== 0x7FFF ) {
3409 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3410 return packFloatx80( zSign
, 0, 0 );
3414 if ( ( aExp
| aSig
) == 0 ) {
3416 float_raise( float_flag_invalid
);
3417 z
.low
= floatx80_default_nan_low
;
3418 z
.high
= floatx80_default_nan_high
;
3421 float_raise( float_flag_divbyzero
);
3422 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3424 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3427 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3428 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3430 zExp
= aExp
- bExp
+ 0x3FFE;
3432 if ( bSig
<= aSig
) {
3433 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3436 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3437 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3438 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3439 while ( (sbits64
) rem0
< 0 ) {
3441 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3443 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3444 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3445 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3446 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3447 while ( (sbits64
) rem1
< 0 ) {
3449 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3451 zSig1
|= ( ( rem1
| rem2
) != 0 );
3454 roundAndPackFloatx80(
3455 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3460 -------------------------------------------------------------------------------
3461 Returns the remainder of the extended double-precision floating-point value
3462 `a' with respect to the corresponding value `b'. The operation is performed
3463 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3464 -------------------------------------------------------------------------------
3466 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3468 flag aSign
, bSign
, zSign
;
3469 int32 aExp
, bExp
, expDiff
;
3470 bits64 aSig0
, aSig1
, bSig
;
3471 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3474 aSig0
= extractFloatx80Frac( a
);
3475 aExp
= extractFloatx80Exp( a
);
3476 aSign
= extractFloatx80Sign( a
);
3477 bSig
= extractFloatx80Frac( b
);
3478 bExp
= extractFloatx80Exp( b
);
3479 bSign
= extractFloatx80Sign( b
);
3480 if ( aExp
== 0x7FFF ) {
3481 if ( (bits64
) ( aSig0
<<1 )
3482 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3483 return propagateFloatx80NaN( a
, b
);
3487 if ( bExp
== 0x7FFF ) {
3488 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3494 float_raise( float_flag_invalid
);
3495 z
.low
= floatx80_default_nan_low
;
3496 z
.high
= floatx80_default_nan_high
;
3499 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3502 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3503 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3505 bSig
|= LIT64( 0x8000000000000000 );
3507 expDiff
= aExp
- bExp
;
3509 if ( expDiff
< 0 ) {
3510 if ( expDiff
< -1 ) return a
;
3511 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3514 q
= ( bSig
<= aSig0
);
3515 if ( q
) aSig0
-= bSig
;
3517 while ( 0 < expDiff
) {
3518 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3519 q
= ( 2 < q
) ? q
- 2 : 0;
3520 mul64To128( bSig
, q
, &term0
, &term1
);
3521 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3522 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3526 if ( 0 < expDiff
) {
3527 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3528 q
= ( 2 < q
) ? q
- 2 : 0;
3530 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3531 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3532 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3533 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3535 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3542 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3543 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3544 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3547 aSig0
= alternateASig0
;
3548 aSig1
= alternateASig1
;
3552 normalizeRoundAndPackFloatx80(
3553 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3558 -------------------------------------------------------------------------------
3559 Returns the square root of the extended double-precision floating-point
3560 value `a'. The operation is performed according to the IEC/IEEE Standard
3561 for Binary Floating-point Arithmetic.
3562 -------------------------------------------------------------------------------
3564 floatx80
floatx80_sqrt( floatx80 a
)
3568 bits64 aSig0
, aSig1
, zSig0
, zSig1
;
3569 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3570 bits64 shiftedRem0
, shiftedRem1
;
3573 aSig0
= extractFloatx80Frac( a
);
3574 aExp
= extractFloatx80Exp( a
);
3575 aSign
= extractFloatx80Sign( a
);
3576 if ( aExp
== 0x7FFF ) {
3577 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
3578 if ( ! aSign
) return a
;
3582 if ( ( aExp
| aSig0
) == 0 ) return a
;
3584 float_raise( float_flag_invalid
);
3585 z
.low
= floatx80_default_nan_low
;
3586 z
.high
= floatx80_default_nan_high
;
3590 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3591 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3593 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3594 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3597 shift128Right( aSig0
, 0, ( aExp
& 1 ) + 2, &aSig0
, &aSig1
);
3598 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
) + zSig0
+ 4;
3599 if ( 0 <= (sbits64
) zSig0
) zSig0
= LIT64( 0xFFFFFFFFFFFFFFFF );
3600 shortShift128Left( aSig0
, aSig1
, 2, &aSig0
, &aSig1
);
3601 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3602 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3603 while ( (sbits64
) rem0
< 0 ) {
3605 shortShift128Left( 0, zSig0
, 1, &term0
, &term1
);
3607 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
3609 shortShift128Left( rem0
, rem1
, 63, &shiftedRem0
, &shiftedRem1
);
3610 zSig1
= estimateDiv128To64( shiftedRem0
, shiftedRem1
, zSig0
);
3611 if ( (bits64
) ( zSig1
<<1 ) <= 10 ) {
3612 if ( zSig1
== 0 ) zSig1
= 1;
3613 mul64To128( zSig0
, zSig1
, &term1
, &term2
);
3614 shortShift128Left( term1
, term2
, 1, &term1
, &term2
);
3615 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3616 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3617 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3618 while ( (sbits64
) rem1
< 0 ) {
3620 shortShift192Left( 0, zSig0
, zSig1
, 1, &term1
, &term2
, &term3
);
3623 rem1
, rem2
, rem3
, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
3625 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3628 roundAndPackFloatx80(
3629 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
3634 -------------------------------------------------------------------------------
3635 Returns 1 if the extended double-precision floating-point value `a' is
3636 equal to the corresponding value `b', and 0 otherwise. The comparison is
3637 performed according to the IEC/IEEE Standard for Binary Floating-point
3639 -------------------------------------------------------------------------------
3641 flag
floatx80_eq( floatx80 a
, floatx80 b
)
3644 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3645 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3646 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3647 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3649 if ( floatx80_is_signaling_nan( a
)
3650 || floatx80_is_signaling_nan( b
) ) {
3651 float_raise( float_flag_invalid
);
3657 && ( ( a
.high
== b
.high
)
3659 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3665 -------------------------------------------------------------------------------
3666 Returns 1 if the extended double-precision floating-point value `a' is
3667 less than or equal to the corresponding value `b', and 0 otherwise. The
3668 comparison is performed according to the IEC/IEEE Standard for Binary
3669 Floating-point Arithmetic.
3670 -------------------------------------------------------------------------------
3672 flag
floatx80_le( floatx80 a
, floatx80 b
)
3676 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3677 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3678 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3679 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3681 float_raise( float_flag_invalid
);
3684 aSign
= extractFloatx80Sign( a
);
3685 bSign
= extractFloatx80Sign( b
);
3686 if ( aSign
!= bSign
) {
3689 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3693 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3694 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3699 -------------------------------------------------------------------------------
3700 Returns 1 if the extended double-precision floating-point value `a' is
3701 less than the corresponding value `b', and 0 otherwise. The comparison
3702 is performed according to the IEC/IEEE Standard for Binary Floating-point
3704 -------------------------------------------------------------------------------
3706 flag
floatx80_lt( floatx80 a
, floatx80 b
)
3710 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3711 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3712 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3713 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3715 float_raise( float_flag_invalid
);
3718 aSign
= extractFloatx80Sign( a
);
3719 bSign
= extractFloatx80Sign( b
);
3720 if ( aSign
!= bSign
) {
3723 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3727 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3728 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
3733 -------------------------------------------------------------------------------
3734 Returns 1 if the extended double-precision floating-point value `a' is equal
3735 to the corresponding value `b', and 0 otherwise. The invalid exception is
3736 raised if either operand is a NaN. Otherwise, the comparison is performed
3737 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3738 -------------------------------------------------------------------------------
3740 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
3743 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3744 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3745 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3746 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3748 float_raise( float_flag_invalid
);
3753 && ( ( a
.high
== b
.high
)
3755 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3761 -------------------------------------------------------------------------------
3762 Returns 1 if the extended double-precision floating-point value `a' is less
3763 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3764 do not cause an exception. Otherwise, the comparison is performed according
3765 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3766 -------------------------------------------------------------------------------
3768 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
3772 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3773 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3774 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3775 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3777 if ( floatx80_is_signaling_nan( a
)
3778 || floatx80_is_signaling_nan( b
) ) {
3779 float_raise( float_flag_invalid
);
3783 aSign
= extractFloatx80Sign( a
);
3784 bSign
= extractFloatx80Sign( b
);
3785 if ( aSign
!= bSign
) {
3788 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3792 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3793 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3798 -------------------------------------------------------------------------------
3799 Returns 1 if the extended double-precision floating-point value `a' is less
3800 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3801 an exception. Otherwise, the comparison is performed according to the
3802 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3803 -------------------------------------------------------------------------------
3805 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
3809 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3810 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3811 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3812 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3814 if ( floatx80_is_signaling_nan( a
)
3815 || floatx80_is_signaling_nan( b
) ) {
3816 float_raise( float_flag_invalid
);
3820 aSign
= extractFloatx80Sign( a
);
3821 bSign
= extractFloatx80Sign( b
);
3822 if ( aSign
!= bSign
) {
3825 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3829 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3830 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
3839 -------------------------------------------------------------------------------
3840 Returns the result of converting the quadruple-precision floating-point
3841 value `a' to the 32-bit two's complement integer format. The conversion
3842 is performed according to the IEC/IEEE Standard for Binary Floating-point
3843 Arithmetic---which means in particular that the conversion is rounded
3844 according to the current rounding mode. If `a' is a NaN, the largest
3845 positive integer is returned. Otherwise, if the conversion overflows, the
3846 largest integer with the same sign as `a' is returned.
3847 -------------------------------------------------------------------------------
3849 int32
float128_to_int32( float128 a
)
3852 int32 aExp
, shiftCount
;
3853 bits64 aSig0
, aSig1
;
3855 aSig1
= extractFloat128Frac1( a
);
3856 aSig0
= extractFloat128Frac0( a
);
3857 aExp
= extractFloat128Exp( a
);
3858 aSign
= extractFloat128Sign( a
);
3859 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
3860 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
3861 aSig0
|= ( aSig1
!= 0 );
3862 shiftCount
= 0x4028 - aExp
;
3863 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
3864 return roundAndPackInt32( aSign
, aSig0
);
3869 -------------------------------------------------------------------------------
3870 Returns the result of converting the quadruple-precision floating-point
3871 value `a' to the 32-bit two's complement integer format. The conversion
3872 is performed according to the IEC/IEEE Standard for Binary Floating-point
3873 Arithmetic, except that the conversion is always rounded toward zero. If
3874 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
3875 conversion overflows, the largest integer with the same sign as `a' is
3877 -------------------------------------------------------------------------------
3879 int32
float128_to_int32_round_to_zero( float128 a
)
3882 int32 aExp
, shiftCount
;
3883 bits64 aSig0
, aSig1
, savedASig
;
3886 aSig1
= extractFloat128Frac1( a
);
3887 aSig0
= extractFloat128Frac0( a
);
3888 aExp
= extractFloat128Exp( a
);
3889 aSign
= extractFloat128Sign( a
);
3890 aSig0
|= ( aSig1
!= 0 );
3891 shiftCount
= 0x402F - aExp
;
3892 if ( shiftCount
< 17 ) {
3893 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
3896 else if ( 48 < shiftCount
) {
3897 if ( aExp
|| aSig0
) float_exception_flags
|= float_flag_inexact
;
3900 aSig0
|= LIT64( 0x0001000000000000 );
3902 aSig0
>>= shiftCount
;
3904 if ( aSign
) z
= - z
;
3905 if ( ( z
< 0 ) ^ aSign
) {
3907 float_exception_flags
|= float_flag_invalid
;
3908 return aSign
? 0x80000000 : 0x7FFFFFFF;
3910 if ( ( aSig0
<<shiftCount
) != savedASig
) {
3911 float_exception_flags
|= float_flag_inexact
;
3918 -------------------------------------------------------------------------------
3919 Returns the result of converting the quadruple-precision floating-point
3920 value `a' to the single-precision floating-point format. The conversion
3921 is performed according to the IEC/IEEE Standard for Binary Floating-point
3923 -------------------------------------------------------------------------------
3925 float32
float128_to_float32( float128 a
)
3929 bits64 aSig0
, aSig1
;
3932 aSig1
= extractFloat128Frac1( a
);
3933 aSig0
= extractFloat128Frac0( a
);
3934 aExp
= extractFloat128Exp( a
);
3935 aSign
= extractFloat128Sign( a
);
3936 if ( aExp
== 0x7FFF ) {
3937 if ( aSig0
| aSig1
) {
3938 return commonNaNToFloat32( float128ToCommonNaN( a
) );
3940 return packFloat32( aSign
, 0xFF, 0 );
3942 aSig0
|= ( aSig1
!= 0 );
3943 shift64RightJamming( aSig0
, 18, &aSig0
);
3945 if ( aExp
|| zSig
) {
3949 return roundAndPackFloat32( aSign
, aExp
, zSig
);
3954 -------------------------------------------------------------------------------
3955 Returns the result of converting the quadruple-precision floating-point
3956 value `a' to the double-precision floating-point format. The conversion
3957 is performed according to the IEC/IEEE Standard for Binary Floating-point
3959 -------------------------------------------------------------------------------
3961 float64
float128_to_float64( float128 a
)
3965 bits64 aSig0
, aSig1
;
3967 aSig1
= extractFloat128Frac1( a
);
3968 aSig0
= extractFloat128Frac0( a
);
3969 aExp
= extractFloat128Exp( a
);
3970 aSign
= extractFloat128Sign( a
);
3971 if ( aExp
== 0x7FFF ) {
3972 if ( aSig0
| aSig1
) {
3973 return commonNaNToFloat64( float128ToCommonNaN( a
) );
3975 return packFloat64( aSign
, 0x7FF, 0 );
3977 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
3978 aSig0
|= ( aSig1
!= 0 );
3979 if ( aExp
|| aSig0
) {
3980 aSig0
|= LIT64( 0x4000000000000000 );
3983 return roundAndPackFloat64( aSign
, aExp
, aSig0
);
3990 -------------------------------------------------------------------------------
3991 Returns the result of converting the quadruple-precision floating-point
3992 value `a' to the extended double-precision floating-point format. The
3993 conversion is performed according to the IEC/IEEE Standard for Binary
3994 Floating-point Arithmetic.
3995 -------------------------------------------------------------------------------
3997 floatx80
float128_to_floatx80( float128 a
)
4001 bits64 aSig0
, aSig1
;
4003 aSig1
= extractFloat128Frac1( a
);
4004 aSig0
= extractFloat128Frac0( a
);
4005 aExp
= extractFloat128Exp( a
);
4006 aSign
= extractFloat128Sign( a
);
4007 if ( aExp
== 0x7FFF ) {
4008 if ( aSig0
| aSig1
) {
4009 return commonNaNToFloatx80( float128ToCommonNaN( a
) );
4011 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4014 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4015 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4018 aSig0
|= LIT64( 0x0001000000000000 );
4020 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4021 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1
);
4028 -------------------------------------------------------------------------------
4029 Rounds the quadruple-precision floating-point value `a' to an integer, and
4030 returns the result as a quadruple-precision floating-point value. The
4031 operation is performed according to the IEC/IEEE Standard for Binary
4032 Floating-point Arithmetic.
4033 -------------------------------------------------------------------------------
4035 float128
float128_round_to_int( float128 a
)
4039 bits64 lastBitMask
, roundBitsMask
;
4043 aExp
= extractFloat128Exp( a
);
4044 if ( 0x402F <= aExp
) {
4045 if ( 0x406F <= aExp
) {
4046 if ( ( aExp
== 0x7FFF )
4047 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4049 return propagateFloat128NaN( a
, a
);
4054 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4055 roundBitsMask
= lastBitMask
- 1;
4057 roundingMode
= float_rounding_mode
;
4058 if ( roundingMode
== float_round_nearest_even
) {
4059 if ( lastBitMask
) {
4060 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4061 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4064 if ( (sbits64
) z
.low
< 0 ) {
4066 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4070 else if ( roundingMode
!= float_round_to_zero
) {
4071 if ( extractFloat128Sign( z
)
4072 ^ ( roundingMode
== float_round_up
) ) {
4073 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4076 z
.low
&= ~ roundBitsMask
;
4079 if ( aExp
<= 0x3FFE ) {
4080 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4081 float_exception_flags
|= float_flag_inexact
;
4082 aSign
= extractFloat128Sign( a
);
4083 switch ( float_rounding_mode
) {
4084 case float_round_nearest_even
:
4085 if ( ( aExp
== 0x3FFE )
4086 && ( extractFloat128Frac0( a
)
4087 | extractFloat128Frac1( a
) )
4089 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4092 case float_round_down
:
4094 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4095 : packFloat128( 0, 0, 0, 0 );
4096 case float_round_up
:
4098 aSign
? packFloat128( 1, 0, 0, 0 )
4099 : packFloat128( 0, 0x3FFF, 0, 0 );
4101 return packFloat128( aSign
, 0, 0, 0 );
4104 lastBitMask
<<= 0x402F - aExp
;
4105 roundBitsMask
= lastBitMask
- 1;
4108 roundingMode
= float_rounding_mode
;
4109 if ( roundingMode
== float_round_nearest_even
) {
4110 z
.high
+= lastBitMask
>>1;
4111 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4112 z
.high
&= ~ lastBitMask
;
4115 else if ( roundingMode
!= float_round_to_zero
) {
4116 if ( extractFloat128Sign( z
)
4117 ^ ( roundingMode
== float_round_up
) ) {
4118 z
.high
|= ( a
.low
!= 0 );
4119 z
.high
+= roundBitsMask
;
4122 z
.high
&= ~ roundBitsMask
;
4124 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4125 float_exception_flags
|= float_flag_inexact
;
4132 -------------------------------------------------------------------------------
4133 Returns the result of adding the absolute values of the quadruple-precision
4134 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
4135 before being returned. `zSign' is ignored if the result is a NaN. The
4136 addition is performed according to the IEC/IEEE Standard for Binary
4137 Floating-point Arithmetic.
4138 -------------------------------------------------------------------------------
4140 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4142 int32 aExp
, bExp
, zExp
;
4143 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4146 aSig1
= extractFloat128Frac1( a
);
4147 aSig0
= extractFloat128Frac0( a
);
4148 aExp
= extractFloat128Exp( a
);
4149 bSig1
= extractFloat128Frac1( b
);
4150 bSig0
= extractFloat128Frac0( b
);
4151 bExp
= extractFloat128Exp( b
);
4152 expDiff
= aExp
- bExp
;
4153 if ( 0 < expDiff
) {
4154 if ( aExp
== 0x7FFF ) {
4155 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4162 bSig0
|= LIT64( 0x0001000000000000 );
4164 shift128ExtraRightJamming(
4165 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4168 else if ( expDiff
< 0 ) {
4169 if ( bExp
== 0x7FFF ) {
4170 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4171 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4177 aSig0
|= LIT64( 0x0001000000000000 );
4179 shift128ExtraRightJamming(
4180 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4184 if ( aExp
== 0x7FFF ) {
4185 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4186 return propagateFloat128NaN( a
, b
);
4190 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4191 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4193 zSig0
|= LIT64( 0x0002000000000000 );
4197 aSig0
|= LIT64( 0x0001000000000000 );
4198 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4200 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4203 shift128ExtraRightJamming(
4204 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4206 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4211 -------------------------------------------------------------------------------
4212 Returns the result of subtracting the absolute values of the quadruple-
4213 precision floating-point values `a' and `b'. If `zSign' is true, the
4214 difference is negated before being returned. `zSign' is ignored if the
4215 result is a NaN. The subtraction is performed according to the IEC/IEEE
4216 Standard for Binary Floating-point Arithmetic.
4217 -------------------------------------------------------------------------------
4219 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4221 int32 aExp
, bExp
, zExp
;
4222 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4226 aSig1
= extractFloat128Frac1( a
);
4227 aSig0
= extractFloat128Frac0( a
);
4228 aExp
= extractFloat128Exp( a
);
4229 bSig1
= extractFloat128Frac1( b
);
4230 bSig0
= extractFloat128Frac0( b
);
4231 bExp
= extractFloat128Exp( b
);
4232 expDiff
= aExp
- bExp
;
4233 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4234 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4235 if ( 0 < expDiff
) goto aExpBigger
;
4236 if ( expDiff
< 0 ) goto bExpBigger
;
4237 if ( aExp
== 0x7FFF ) {
4238 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4239 return propagateFloat128NaN( a
, b
);
4241 float_raise( float_flag_invalid
);
4242 z
.low
= float128_default_nan_low
;
4243 z
.high
= float128_default_nan_high
;
4250 if ( bSig0
< aSig0
) goto aBigger
;
4251 if ( aSig0
< bSig0
) goto bBigger
;
4252 if ( bSig1
< aSig1
) goto aBigger
;
4253 if ( aSig1
< bSig1
) goto bBigger
;
4254 return packFloat128( float_rounding_mode
== float_round_down
, 0, 0, 0 );
4256 if ( bExp
== 0x7FFF ) {
4257 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4258 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4264 aSig0
|= LIT64( 0x4000000000000000 );
4266 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4267 bSig0
|= LIT64( 0x4000000000000000 );
4269 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4272 goto normalizeRoundAndPack
;
4274 if ( aExp
== 0x7FFF ) {
4275 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4282 bSig0
|= LIT64( 0x4000000000000000 );
4284 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4285 aSig0
|= LIT64( 0x4000000000000000 );
4287 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4289 normalizeRoundAndPack
:
4291 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1
);
4296 -------------------------------------------------------------------------------
4297 Returns the result of adding the quadruple-precision floating-point values
4298 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4299 for Binary Floating-point Arithmetic.
4300 -------------------------------------------------------------------------------
4302 float128
float128_add( float128 a
, float128 b
)
4306 aSign
= extractFloat128Sign( a
);
4307 bSign
= extractFloat128Sign( b
);
4308 if ( aSign
== bSign
) {
4309 return addFloat128Sigs( a
, b
, aSign
);
4312 return subFloat128Sigs( a
, b
, aSign
);
4318 -------------------------------------------------------------------------------
4319 Returns the result of subtracting the quadruple-precision floating-point
4320 values `a' and `b'. The operation is performed according to the IEC/IEEE
4321 Standard for Binary Floating-point Arithmetic.
4322 -------------------------------------------------------------------------------
4324 float128
float128_sub( float128 a
, float128 b
)
4328 aSign
= extractFloat128Sign( a
);
4329 bSign
= extractFloat128Sign( b
);
4330 if ( aSign
== bSign
) {
4331 return subFloat128Sigs( a
, b
, aSign
);
4334 return addFloat128Sigs( a
, b
, aSign
);
4340 -------------------------------------------------------------------------------
4341 Returns the result of multiplying the quadruple-precision floating-point
4342 values `a' and `b'. The operation is performed according to the IEC/IEEE
4343 Standard for Binary Floating-point Arithmetic.
4344 -------------------------------------------------------------------------------
4346 float128
float128_mul( float128 a
, float128 b
)
4348 flag aSign
, bSign
, zSign
;
4349 int32 aExp
, bExp
, zExp
;
4350 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4353 aSig1
= extractFloat128Frac1( a
);
4354 aSig0
= extractFloat128Frac0( a
);
4355 aExp
= extractFloat128Exp( a
);
4356 aSign
= extractFloat128Sign( a
);
4357 bSig1
= extractFloat128Frac1( b
);
4358 bSig0
= extractFloat128Frac0( b
);
4359 bExp
= extractFloat128Exp( b
);
4360 bSign
= extractFloat128Sign( b
);
4361 zSign
= aSign
^ bSign
;
4362 if ( aExp
== 0x7FFF ) {
4363 if ( ( aSig0
| aSig1
)
4364 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4365 return propagateFloat128NaN( a
, b
);
4367 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4368 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4370 if ( bExp
== 0x7FFF ) {
4371 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4372 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4374 float_raise( float_flag_invalid
);
4375 z
.low
= float128_default_nan_low
;
4376 z
.high
= float128_default_nan_high
;
4379 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4382 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4383 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4386 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4387 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4389 zExp
= aExp
+ bExp
- 0x4000;
4390 aSig0
|= LIT64( 0x0001000000000000 );
4391 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
4392 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
4393 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4394 zSig2
|= ( zSig3
!= 0 );
4395 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
4396 shift128ExtraRightJamming(
4397 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4400 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4405 -------------------------------------------------------------------------------
4406 Returns the result of dividing the quadruple-precision floating-point value
4407 `a' by the corresponding value `b'. The operation is performed according to
4408 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4409 -------------------------------------------------------------------------------
4411 float128
float128_div( float128 a
, float128 b
)
4413 flag aSign
, bSign
, zSign
;
4414 int32 aExp
, bExp
, zExp
;
4415 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4416 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4419 aSig1
= extractFloat128Frac1( a
);
4420 aSig0
= extractFloat128Frac0( a
);
4421 aExp
= extractFloat128Exp( a
);
4422 aSign
= extractFloat128Sign( a
);
4423 bSig1
= extractFloat128Frac1( b
);
4424 bSig0
= extractFloat128Frac0( b
);
4425 bExp
= extractFloat128Exp( b
);
4426 bSign
= extractFloat128Sign( b
);
4427 zSign
= aSign
^ bSign
;
4428 if ( aExp
== 0x7FFF ) {
4429 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4430 if ( bExp
== 0x7FFF ) {
4431 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4434 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4436 if ( bExp
== 0x7FFF ) {
4437 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4438 return packFloat128( zSign
, 0, 0, 0 );
4441 if ( ( bSig0
| bSig1
) == 0 ) {
4442 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4444 float_raise( float_flag_invalid
);
4445 z
.low
= float128_default_nan_low
;
4446 z
.high
= float128_default_nan_high
;
4449 float_raise( float_flag_divbyzero
);
4450 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4452 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4455 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4456 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4458 zExp
= aExp
- bExp
+ 0x3FFD;
4460 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
4462 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4463 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
4464 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
4467 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4468 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
4469 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
4470 while ( (sbits64
) rem0
< 0 ) {
4472 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
4474 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
4475 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
4476 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
4477 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
4478 while ( (sbits64
) rem1
< 0 ) {
4480 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
4482 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4484 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
4485 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4490 -------------------------------------------------------------------------------
4491 Returns the remainder of the quadruple-precision floating-point value `a'
4492 with respect to the corresponding value `b'. The operation is performed
4493 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4494 -------------------------------------------------------------------------------
4496 float128
float128_rem( float128 a
, float128 b
)
4498 flag aSign
, bSign
, zSign
;
4499 int32 aExp
, bExp
, expDiff
;
4500 bits64 aSig0
, aSig1
, bSig0
, bSig1
;
4501 bits64 q
, term0
, term1
, term2
, allZero
, alternateASig0
, alternateASig1
;
4506 aSig1
= extractFloat128Frac1( a
);
4507 aSig0
= extractFloat128Frac0( a
);
4508 aExp
= extractFloat128Exp( a
);
4509 aSign
= extractFloat128Sign( a
);
4510 bSig1
= extractFloat128Frac1( b
);
4511 bSig0
= extractFloat128Frac0( b
);
4512 bExp
= extractFloat128Exp( b
);
4513 bSign
= extractFloat128Sign( b
);
4514 if ( aExp
== 0x7FFF ) {
4515 if ( ( aSig0
| aSig1
)
4516 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4517 return propagateFloat128NaN( a
, b
);
4521 if ( bExp
== 0x7FFF ) {
4522 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4526 if ( ( bSig0
| bSig1
) == 0 ) {
4528 float_raise( float_flag_invalid
);
4529 z
.low
= float128_default_nan_low
;
4530 z
.high
= float128_default_nan_high
;
4533 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4536 if ( ( aSig0
| aSig1
) == 0 ) return a
;
4537 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4539 expDiff
= aExp
- bExp
;
4540 if ( expDiff
< -1 ) return a
;
4542 aSig0
| LIT64( 0x0001000000000000 ),
4544 15 - ( expDiff
< 0 ),
4549 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4550 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
4551 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4553 while ( 0 < expDiff
) {
4554 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4555 q
= ( 4 < q
) ? q
- 4 : 0;
4556 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4557 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
4558 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
4559 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
4562 if ( -64 < expDiff
) {
4563 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4564 q
= ( 4 < q
) ? q
- 4 : 0;
4566 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4568 if ( expDiff
< 0 ) {
4569 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4572 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
4574 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4575 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
4578 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
4579 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4582 alternateASig0
= aSig0
;
4583 alternateASig1
= aSig1
;
4585 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4586 } while ( 0 <= (sbits64
) aSig0
);
4588 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
4589 if ( ( sigMean0
< 0 )
4590 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
4591 aSig0
= alternateASig0
;
4592 aSig1
= alternateASig1
;
4594 zSign
= ( (sbits64
) aSig0
< 0 );
4595 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
4597 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
4602 -------------------------------------------------------------------------------
4603 Returns the square root of the quadruple-precision floating-point value `a'.
4604 The operation is performed according to the IEC/IEEE Standard for Binary
4605 Floating-point Arithmetic.
4606 -------------------------------------------------------------------------------
4608 float128
float128_sqrt( float128 a
)
4612 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
;
4613 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4614 bits64 shiftedRem0
, shiftedRem1
;
4617 aSig1
= extractFloat128Frac1( a
);
4618 aSig0
= extractFloat128Frac0( a
);
4619 aExp
= extractFloat128Exp( a
);
4620 aSign
= extractFloat128Sign( a
);
4621 if ( aExp
== 0x7FFF ) {
4622 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a
);
4623 if ( ! aSign
) return a
;
4627 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
4629 float_raise( float_flag_invalid
);
4630 z
.low
= float128_default_nan_low
;
4631 z
.high
= float128_default_nan_high
;
4635 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
4636 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4638 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
4639 aSig0
|= LIT64( 0x0001000000000000 );
4640 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
4642 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
4643 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
) + zSig0
+ 4;
4644 if ( 0 <= (sbits64
) zSig0
) zSig0
= LIT64( 0xFFFFFFFFFFFFFFFF );
4645 shortShift128Left( aSig0
, aSig1
, 2, &aSig0
, &aSig1
);
4646 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4647 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4648 while ( (sbits64
) rem0
< 0 ) {
4650 shortShift128Left( 0, zSig0
, 1, &term0
, &term1
);
4652 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
4654 shortShift128Left( rem0
, rem1
, 63, &shiftedRem0
, &shiftedRem1
);
4655 zSig1
= estimateDiv128To64( shiftedRem0
, shiftedRem1
, zSig0
);
4656 if ( ( zSig1
& 0x3FFF ) <= 5 ) {
4657 if ( zSig1
== 0 ) zSig1
= 1;
4658 mul64To128( zSig0
, zSig1
, &term1
, &term2
);
4659 shortShift128Left( term1
, term2
, 1, &term1
, &term2
);
4660 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4661 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4662 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4663 while ( (sbits64
) rem1
< 0 ) {
4665 shortShift192Left( 0, zSig0
, zSig1
, 1, &term1
, &term2
, &term3
);
4668 rem1
, rem2
, rem3
, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
4670 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4672 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
4673 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2
);
4678 -------------------------------------------------------------------------------
4679 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4680 the corresponding value `b', and 0 otherwise. The comparison is performed
4681 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4682 -------------------------------------------------------------------------------
4684 flag
float128_eq( float128 a
, float128 b
)
4687 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
4688 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
4689 || ( ( extractFloat128Exp( b
) == 0x7FFF )
4690 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
4692 if ( float128_is_signaling_nan( a
)
4693 || float128_is_signaling_nan( b
) ) {
4694 float_raise( float_flag_invalid
);
4700 && ( ( a
.high
== b
.high
)
4702 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4708 -------------------------------------------------------------------------------
4709 Returns 1 if the quadruple-precision floating-point value `a' is less than
4710 or equal to the corresponding value `b', and 0 otherwise. The comparison
4711 is performed according to the IEC/IEEE Standard for Binary Floating-point
4713 -------------------------------------------------------------------------------
4715 flag
float128_le( float128 a
, float128 b
)
4719 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
4720 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
4721 || ( ( extractFloat128Exp( b
) == 0x7FFF )
4722 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
4724 float_raise( float_flag_invalid
);
4727 aSign
= extractFloat128Sign( a
);
4728 bSign
= extractFloat128Sign( b
);
4729 if ( aSign
!= bSign
) {
4732 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4736 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4737 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4742 -------------------------------------------------------------------------------
4743 Returns 1 if the quadruple-precision floating-point value `a' is less than
4744 the corresponding value `b', and 0 otherwise. The comparison is performed
4745 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4746 -------------------------------------------------------------------------------
4748 flag
float128_lt( float128 a
, float128 b
)
4752 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
4753 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
4754 || ( ( extractFloat128Exp( b
) == 0x7FFF )
4755 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
4757 float_raise( float_flag_invalid
);
4760 aSign
= extractFloat128Sign( a
);
4761 bSign
= extractFloat128Sign( b
);
4762 if ( aSign
!= bSign
) {
4765 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4769 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4770 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4775 -------------------------------------------------------------------------------
4776 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4777 the corresponding value `b', and 0 otherwise. The invalid exception is
4778 raised if either operand is a NaN. Otherwise, the comparison is performed
4779 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4780 -------------------------------------------------------------------------------
4782 flag
float128_eq_signaling( float128 a
, float128 b
)
4785 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
4786 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
4787 || ( ( extractFloat128Exp( b
) == 0x7FFF )
4788 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
4790 float_raise( float_flag_invalid
);
4795 && ( ( a
.high
== b
.high
)
4797 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4803 -------------------------------------------------------------------------------
4804 Returns 1 if the quadruple-precision floating-point value `a' is less than
4805 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4806 cause an exception. Otherwise, the comparison is performed according to the
4807 IEC/IEEE Standard for Binary Floating-point Arithmetic.
4808 -------------------------------------------------------------------------------
4810 flag
float128_le_quiet( float128 a
, float128 b
)
4814 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
4815 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
4816 || ( ( extractFloat128Exp( b
) == 0x7FFF )
4817 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
4819 if ( float128_is_signaling_nan( a
)
4820 || float128_is_signaling_nan( b
) ) {
4821 float_raise( float_flag_invalid
);
4825 aSign
= extractFloat128Sign( a
);
4826 bSign
= extractFloat128Sign( b
);
4827 if ( aSign
!= bSign
) {
4830 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4834 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4835 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4840 -------------------------------------------------------------------------------
4841 Returns 1 if the quadruple-precision floating-point value `a' is less than
4842 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4843 exception. Otherwise, the comparison is performed according to the IEC/IEEE
4844 Standard for Binary Floating-point Arithmetic.
4845 -------------------------------------------------------------------------------
4847 flag
float128_lt_quiet( float128 a
, float128 b
)
4851 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
4852 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
4853 || ( ( extractFloat128Exp( b
) == 0x7FFF )
4854 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
4856 if ( float128_is_signaling_nan( a
)
4857 || float128_is_signaling_nan( b
) ) {
4858 float_raise( float_flag_invalid
);
4862 aSign
= extractFloat128Sign( a
);
4863 bSign
= extractFloat128Sign( b
);
4864 if ( aSign
!= bSign
) {
4867 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4871 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4872 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);