2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Floating-point rounding mode, extended double-precision rounding precision,
41 -------------------------------------------------------------------------------
43 int8 float_rounding_mode
= float_round_nearest_even
;
44 int8 floatx80_rounding_precision
= 80;
45 int8 float_exception_flags
;
48 -------------------------------------------------------------------------------
49 Primitive arithmetic functions, including multi-word arithmetic, and
50 division and square root approximations. (Can be specialized to target if
52 -------------------------------------------------------------------------------
54 #include "softfloat-macros"
57 -------------------------------------------------------------------------------
58 Functions and definitions to determine: (1) whether tininess for underflow
59 is detected before or after rounding by default, (2) what (if anything)
60 happens when exceptions are raised, (3) how signaling NaNs are distinguished
61 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
62 are propagated from function inputs to output. These details are target-
64 -------------------------------------------------------------------------------
66 #include "softfloat-specialize"
69 -------------------------------------------------------------------------------
70 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71 and 7, and returns the properly rounded 32-bit integer corresponding to the
72 input. If `zSign' is nonzero, the input is negated before being converted
73 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
74 input is simply rounded to an integer, with the inexact exception raised if
75 the input cannot be represented exactly as an integer. If the fixed-point
76 input is too large, however, the invalid exception is raised and the largest
77 positive or negative integer is returned.
78 -------------------------------------------------------------------------------
80 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
83 flag roundNearestEven
;
84 int8 roundIncrement
, roundBits
;
87 roundingMode
= float_rounding_mode
;
88 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
89 roundIncrement
= 0x40;
90 if ( ! roundNearestEven
) {
91 if ( roundingMode
== float_round_to_zero
) {
95 roundIncrement
= 0x7F;
97 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
100 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
104 roundBits
= absZ
& 0x7F;
105 absZ
= ( absZ
+ roundIncrement
)>>7;
106 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
108 if ( zSign
) z
= - z
;
109 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
110 float_exception_flags
|= float_flag_invalid
;
111 return zSign
? 0x80000000 : 0x7FFFFFFF;
113 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
119 -------------------------------------------------------------------------------
120 Returns the fraction bits of the single-precision floating-point value `a'.
121 -------------------------------------------------------------------------------
123 INLINE bits32
extractFloat32Frac( float32 a
)
126 return a
& 0x007FFFFF;
131 -------------------------------------------------------------------------------
132 Returns the exponent bits of the single-precision floating-point value `a'.
133 -------------------------------------------------------------------------------
135 INLINE int16
extractFloat32Exp( float32 a
)
138 return ( a
>>23 ) & 0xFF;
143 -------------------------------------------------------------------------------
144 Returns the sign bit of the single-precision floating-point value `a'.
145 -------------------------------------------------------------------------------
147 #if 0 /* in softfloat.h */
148 INLINE flag
extractFloat32Sign( float32 a
)
157 -------------------------------------------------------------------------------
158 Normalizes the subnormal single-precision floating-point value represented
159 by the denormalized significand `aSig'. The normalized exponent and
160 significand are stored at the locations pointed to by `zExpPtr' and
161 `zSigPtr', respectively.
162 -------------------------------------------------------------------------------
165 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
169 shiftCount
= countLeadingZeros32( aSig
) - 8;
170 *zSigPtr
= aSig
<<shiftCount
;
171 *zExpPtr
= 1 - shiftCount
;
176 -------------------------------------------------------------------------------
177 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
178 single-precision floating-point value, returning the result. After being
179 shifted into the proper positions, the three fields are simply added
180 together to form the result. This means that any integer portion of `zSig'
181 will be added into the exponent. Since a properly normalized significand
182 will have an integer portion equal to 1, the `zExp' input should be 1 less
183 than the desired result exponent whenever `zSig' is a complete, normalized
185 -------------------------------------------------------------------------------
187 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
191 __asm__("@ packFloat32 \n\
192 mov %0, %1, asl #31 \n\
193 orr %0, %2, asl #23 \n\
196 : "g" (f
), "g" (zSign
), "g" (zExp
), "g" (zSig
)
200 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
205 -------------------------------------------------------------------------------
206 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
207 and significand `zSig', and returns the proper single-precision floating-
208 point value corresponding to the abstract input. Ordinarily, the abstract
209 value is simply rounded and packed into the single-precision format, with
210 the inexact exception raised if the abstract input cannot be represented
211 exactly. If the abstract value is too large, however, the overflow and
212 inexact exceptions are raised and an infinity or maximal finite value is
213 returned. If the abstract value is too small, the input value is rounded to
214 a subnormal number, and the underflow and inexact exceptions are raised if
215 the abstract input cannot be represented exactly as a subnormal single-
216 precision floating-point number.
217 The input significand `zSig' has its binary point between bits 30
218 and 29, which is 7 bits to the left of the usual location. This shifted
219 significand must be normalized or smaller. If `zSig' is not normalized,
220 `zExp' must be 0; in that case, the result returned is a subnormal number,
221 and it must not require rounding. In the usual case that `zSig' is
222 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
223 The handling of underflow and overflow follows the IEC/IEEE Standard for
224 Binary Floating-point Arithmetic.
225 -------------------------------------------------------------------------------
227 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
230 flag roundNearestEven
;
231 int8 roundIncrement
, roundBits
;
234 roundingMode
= float_rounding_mode
;
235 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
236 roundIncrement
= 0x40;
237 if ( ! roundNearestEven
) {
238 if ( roundingMode
== float_round_to_zero
) {
242 roundIncrement
= 0x7F;
244 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
247 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
251 roundBits
= zSig
& 0x7F;
252 if ( 0xFD <= (bits16
) zExp
) {
254 || ( ( zExp
== 0xFD )
255 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
257 float_raise( float_flag_overflow
| float_flag_inexact
);
258 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
262 ( float_detect_tininess
== float_tininess_before_rounding
)
264 || ( zSig
+ roundIncrement
< 0x80000000 );
265 shift32RightJamming( zSig
, - zExp
, &zSig
);
267 roundBits
= zSig
& 0x7F;
268 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
271 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
272 zSig
= ( zSig
+ roundIncrement
)>>7;
273 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
274 if ( zSig
== 0 ) zExp
= 0;
275 return packFloat32( zSign
, zExp
, zSig
);
280 -------------------------------------------------------------------------------
281 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
282 and significand `zSig', and returns the proper single-precision floating-
283 point value corresponding to the abstract input. This routine is just like
284 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
285 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
287 -------------------------------------------------------------------------------
290 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
294 shiftCount
= countLeadingZeros32( zSig
) - 1;
295 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
300 -------------------------------------------------------------------------------
301 Returns the fraction bits of the double-precision floating-point value `a'.
302 -------------------------------------------------------------------------------
304 INLINE bits64
extractFloat64Frac( float64 a
)
307 return a
& LIT64( 0x000FFFFFFFFFFFFF );
312 -------------------------------------------------------------------------------
313 Returns the exponent bits of the double-precision floating-point value `a'.
314 -------------------------------------------------------------------------------
316 INLINE int16
extractFloat64Exp( float64 a
)
319 return ( a
>>52 ) & 0x7FF;
324 -------------------------------------------------------------------------------
325 Returns the sign bit of the double-precision floating-point value `a'.
326 -------------------------------------------------------------------------------
328 #if 0 /* in softfloat.h */
329 INLINE flag
extractFloat64Sign( float64 a
)
338 -------------------------------------------------------------------------------
339 Normalizes the subnormal double-precision floating-point value represented
340 by the denormalized significand `aSig'. The normalized exponent and
341 significand are stored at the locations pointed to by `zExpPtr' and
342 `zSigPtr', respectively.
343 -------------------------------------------------------------------------------
346 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
350 shiftCount
= countLeadingZeros64( aSig
) - 11;
351 *zSigPtr
= aSig
<<shiftCount
;
352 *zExpPtr
= 1 - shiftCount
;
357 -------------------------------------------------------------------------------
358 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
359 double-precision floating-point value, returning the result. After being
360 shifted into the proper positions, the three fields are simply added
361 together to form the result. This means that any integer portion of `zSig'
362 will be added into the exponent. Since a properly normalized significand
363 will have an integer portion equal to 1, the `zExp' input should be 1 less
364 than the desired result exponent whenever `zSig' is a complete, normalized
366 -------------------------------------------------------------------------------
368 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
371 return ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
;
376 -------------------------------------------------------------------------------
377 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
378 and significand `zSig', and returns the proper double-precision floating-
379 point value corresponding to the abstract input. Ordinarily, the abstract
380 value is simply rounded and packed into the double-precision format, with
381 the inexact exception raised if the abstract input cannot be represented
382 exactly. If the abstract value is too large, however, the overflow and
383 inexact exceptions are raised and an infinity or maximal finite value is
384 returned. If the abstract value is too small, the input value is rounded to
385 a subnormal number, and the underflow and inexact exceptions are raised if
386 the abstract input cannot be represented exactly as a subnormal double-
387 precision floating-point number.
388 The input significand `zSig' has its binary point between bits 62
389 and 61, which is 10 bits to the left of the usual location. This shifted
390 significand must be normalized or smaller. If `zSig' is not normalized,
391 `zExp' must be 0; in that case, the result returned is a subnormal number,
392 and it must not require rounding. In the usual case that `zSig' is
393 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
394 The handling of underflow and overflow follows the IEC/IEEE Standard for
395 Binary Floating-point Arithmetic.
396 -------------------------------------------------------------------------------
398 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
401 flag roundNearestEven
;
402 int16 roundIncrement
, roundBits
;
405 roundingMode
= float_rounding_mode
;
406 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
407 roundIncrement
= 0x200;
408 if ( ! roundNearestEven
) {
409 if ( roundingMode
== float_round_to_zero
) {
413 roundIncrement
= 0x3FF;
415 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
418 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
422 roundBits
= zSig
& 0x3FF;
423 if ( 0x7FD <= (bits16
) zExp
) {
424 if ( ( 0x7FD < zExp
)
425 || ( ( zExp
== 0x7FD )
426 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
428 //register int lr = __builtin_return_address(0);
429 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
430 float_raise( float_flag_overflow
| float_flag_inexact
);
431 return packFloat64( zSign
, 0x7FF, 0 ) - ( roundIncrement
== 0 );
435 ( float_detect_tininess
== float_tininess_before_rounding
)
437 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
438 shift64RightJamming( zSig
, - zExp
, &zSig
);
440 roundBits
= zSig
& 0x3FF;
441 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
444 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
445 zSig
= ( zSig
+ roundIncrement
)>>10;
446 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
447 if ( zSig
== 0 ) zExp
= 0;
448 return packFloat64( zSign
, zExp
, zSig
);
453 -------------------------------------------------------------------------------
454 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
455 and significand `zSig', and returns the proper double-precision floating-
456 point value corresponding to the abstract input. This routine is just like
457 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
458 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
460 -------------------------------------------------------------------------------
463 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
467 shiftCount
= countLeadingZeros64( zSig
) - 1;
468 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
475 -------------------------------------------------------------------------------
476 Returns the fraction bits of the extended double-precision floating-point
478 -------------------------------------------------------------------------------
480 INLINE bits64
extractFloatx80Frac( floatx80 a
)
488 -------------------------------------------------------------------------------
489 Returns the exponent bits of the extended double-precision floating-point
491 -------------------------------------------------------------------------------
493 INLINE int32
extractFloatx80Exp( floatx80 a
)
496 return a
.high
& 0x7FFF;
501 -------------------------------------------------------------------------------
502 Returns the sign bit of the extended double-precision floating-point value
504 -------------------------------------------------------------------------------
506 INLINE flag
extractFloatx80Sign( floatx80 a
)
514 -------------------------------------------------------------------------------
515 Normalizes the subnormal extended double-precision floating-point value
516 represented by the denormalized significand `aSig'. The normalized exponent
517 and significand are stored at the locations pointed to by `zExpPtr' and
518 `zSigPtr', respectively.
519 -------------------------------------------------------------------------------
522 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
526 shiftCount
= countLeadingZeros64( aSig
);
527 *zSigPtr
= aSig
<<shiftCount
;
528 *zExpPtr
= 1 - shiftCount
;
533 -------------------------------------------------------------------------------
534 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
535 extended double-precision floating-point value, returning the result.
536 -------------------------------------------------------------------------------
538 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
543 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
549 -------------------------------------------------------------------------------
550 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
551 and extended significand formed by the concatenation of `zSig0' and `zSig1',
552 and returns the proper extended double-precision floating-point value
553 corresponding to the abstract input. Ordinarily, the abstract value is
554 rounded and packed into the extended double-precision format, with the
555 inexact exception raised if the abstract input cannot be represented
556 exactly. If the abstract value is too large, however, the overflow and
557 inexact exceptions are raised and an infinity or maximal finite value is
558 returned. If the abstract value is too small, the input value is rounded to
559 a subnormal number, and the underflow and inexact exceptions are raised if
560 the abstract input cannot be represented exactly as a subnormal extended
561 double-precision floating-point number.
562 If `roundingPrecision' is 32 or 64, the result is rounded to the same
563 number of bits as single or double precision, respectively. Otherwise, the
564 result is rounded to the full precision of the extended double-precision
566 The input significand must be normalized or smaller. If the input
567 significand is not normalized, `zExp' must be 0; in that case, the result
568 returned is a subnormal number, and it must not require rounding. The
569 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
570 Floating-point Arithmetic.
571 -------------------------------------------------------------------------------
574 roundAndPackFloatx80(
575 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
579 flag roundNearestEven
, increment
, isTiny
;
580 int64 roundIncrement
, roundMask
, roundBits
;
582 roundingMode
= float_rounding_mode
;
583 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
584 if ( roundingPrecision
== 80 ) goto precision80
;
585 if ( roundingPrecision
== 64 ) {
586 roundIncrement
= LIT64( 0x0000000000000400 );
587 roundMask
= LIT64( 0x00000000000007FF );
589 else if ( roundingPrecision
== 32 ) {
590 roundIncrement
= LIT64( 0x0000008000000000 );
591 roundMask
= LIT64( 0x000000FFFFFFFFFF );
596 zSig0
|= ( zSig1
!= 0 );
597 if ( ! roundNearestEven
) {
598 if ( roundingMode
== float_round_to_zero
) {
602 roundIncrement
= roundMask
;
604 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
607 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
611 roundBits
= zSig0
& roundMask
;
612 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
613 if ( ( 0x7FFE < zExp
)
614 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
620 ( float_detect_tininess
== float_tininess_before_rounding
)
622 || ( zSig0
<= zSig0
+ roundIncrement
);
623 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
625 roundBits
= zSig0
& roundMask
;
626 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
627 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
628 zSig0
+= roundIncrement
;
629 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
630 roundIncrement
= roundMask
+ 1;
631 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
632 roundMask
|= roundIncrement
;
634 zSig0
&= ~ roundMask
;
635 return packFloatx80( zSign
, zExp
, zSig0
);
638 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
639 zSig0
+= roundIncrement
;
640 if ( zSig0
< roundIncrement
) {
642 zSig0
= LIT64( 0x8000000000000000 );
644 roundIncrement
= roundMask
+ 1;
645 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
646 roundMask
|= roundIncrement
;
648 zSig0
&= ~ roundMask
;
649 if ( zSig0
== 0 ) zExp
= 0;
650 return packFloatx80( zSign
, zExp
, zSig0
);
652 increment
= ( (sbits64
) zSig1
< 0 );
653 if ( ! roundNearestEven
) {
654 if ( roundingMode
== float_round_to_zero
) {
659 increment
= ( roundingMode
== float_round_down
) && zSig1
;
662 increment
= ( roundingMode
== float_round_up
) && zSig1
;
666 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
667 if ( ( 0x7FFE < zExp
)
668 || ( ( zExp
== 0x7FFE )
669 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
675 float_raise( float_flag_overflow
| float_flag_inexact
);
676 if ( ( roundingMode
== float_round_to_zero
)
677 || ( zSign
&& ( roundingMode
== float_round_up
) )
678 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
680 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
682 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
686 ( float_detect_tininess
== float_tininess_before_rounding
)
689 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
690 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
692 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
693 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
694 if ( roundNearestEven
) {
695 increment
= ( (sbits64
) zSig1
< 0 );
699 increment
= ( roundingMode
== float_round_down
) && zSig1
;
702 increment
= ( roundingMode
== float_round_up
) && zSig1
;
707 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
708 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
710 return packFloatx80( zSign
, zExp
, zSig0
);
713 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
718 zSig0
= LIT64( 0x8000000000000000 );
721 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
725 if ( zSig0
== 0 ) zExp
= 0;
728 return packFloatx80( zSign
, zExp
, zSig0
);
732 -------------------------------------------------------------------------------
733 Takes an abstract floating-point value having sign `zSign', exponent
734 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
735 and returns the proper extended double-precision floating-point value
736 corresponding to the abstract input. This routine is just like
737 `roundAndPackFloatx80' except that the input significand does not have to be
739 -------------------------------------------------------------------------------
742 normalizeRoundAndPackFloatx80(
743 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
753 shiftCount
= countLeadingZeros64( zSig0
);
754 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
757 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
764 -------------------------------------------------------------------------------
765 Returns the result of converting the 32-bit two's complement integer `a' to
766 the single-precision floating-point format. The conversion is performed
767 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
768 -------------------------------------------------------------------------------
770 float32
int32_to_float32( int32 a
)
774 if ( a
== 0 ) return 0;
775 if ( a
== 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
777 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
782 -------------------------------------------------------------------------------
783 Returns the result of converting the 32-bit two's complement integer `a' to
784 the double-precision floating-point format. The conversion is performed
785 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
786 -------------------------------------------------------------------------------
788 float64
int32_to_float64( int32 a
)
795 if ( a
== 0 ) return 0;
797 absA
= aSign
? - a
: a
;
798 shiftCount
= countLeadingZeros32( absA
) + 21;
800 return packFloat64( aSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
807 -------------------------------------------------------------------------------
808 Returns the result of converting the 32-bit two's complement integer `a'
809 to the extended double-precision floating-point format. The conversion
810 is performed according to the IEC/IEEE Standard for Binary Floating-point
812 -------------------------------------------------------------------------------
814 floatx80
int32_to_floatx80( int32 a
)
821 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
823 absA
= zSign
? - a
: a
;
824 shiftCount
= countLeadingZeros32( absA
) + 32;
826 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
833 -------------------------------------------------------------------------------
834 Returns the result of converting the single-precision floating-point value
835 `a' to the 32-bit two's complement integer format. The conversion is
836 performed according to the IEC/IEEE Standard for Binary Floating-point
837 Arithmetic---which means in particular that the conversion is rounded
838 according to the current rounding mode. If `a' is a NaN, the largest
839 positive integer is returned. Otherwise, if the conversion overflows, the
840 largest integer with the same sign as `a' is returned.
841 -------------------------------------------------------------------------------
843 int32
float32_to_int32( float32 a
)
846 int16 aExp
, shiftCount
;
850 aSig
= extractFloat32Frac( a
);
851 aExp
= extractFloat32Exp( a
);
852 aSign
= extractFloat32Sign( a
);
853 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
854 if ( aExp
) aSig
|= 0x00800000;
855 shiftCount
= 0xAF - aExp
;
858 if ( 0 < shiftCount
) shift64RightJamming( zSig
, shiftCount
, &zSig
);
859 return roundAndPackInt32( aSign
, zSig
);
864 -------------------------------------------------------------------------------
865 Returns the result of converting the single-precision floating-point value
866 `a' to the 32-bit two's complement integer format. The conversion is
867 performed according to the IEC/IEEE Standard for Binary Floating-point
868 Arithmetic, except that the conversion is always rounded toward zero. If
869 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
870 conversion overflows, the largest integer with the same sign as `a' is
872 -------------------------------------------------------------------------------
874 int32
float32_to_int32_round_to_zero( float32 a
)
877 int16 aExp
, shiftCount
;
881 aSig
= extractFloat32Frac( a
);
882 aExp
= extractFloat32Exp( a
);
883 aSign
= extractFloat32Sign( a
);
884 shiftCount
= aExp
- 0x9E;
885 if ( 0 <= shiftCount
) {
886 if ( a
== 0xCF000000 ) return 0x80000000;
887 float_raise( float_flag_invalid
);
888 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
891 else if ( aExp
<= 0x7E ) {
892 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
895 aSig
= ( aSig
| 0x00800000 )<<8;
896 z
= aSig
>>( - shiftCount
);
897 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
898 float_exception_flags
|= float_flag_inexact
;
900 return aSign
? - z
: z
;
905 -------------------------------------------------------------------------------
906 Returns the result of converting the single-precision floating-point value
907 `a' to the double-precision floating-point format. The conversion is
908 performed according to the IEC/IEEE Standard for Binary Floating-point
910 -------------------------------------------------------------------------------
912 float64
float32_to_float64( float32 a
)
918 aSig
= extractFloat32Frac( a
);
919 aExp
= extractFloat32Exp( a
);
920 aSign
= extractFloat32Sign( a
);
921 if ( aExp
== 0xFF ) {
922 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
923 return packFloat64( aSign
, 0x7FF, 0 );
926 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
927 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
930 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
937 -------------------------------------------------------------------------------
938 Returns the result of converting the single-precision floating-point value
939 `a' to the extended double-precision floating-point format. The conversion
940 is performed according to the IEC/IEEE Standard for Binary Floating-point
942 -------------------------------------------------------------------------------
944 floatx80
float32_to_floatx80( float32 a
)
950 aSig
= extractFloat32Frac( a
);
951 aExp
= extractFloat32Exp( a
);
952 aSign
= extractFloat32Sign( a
);
953 if ( aExp
== 0xFF ) {
954 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
955 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
958 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
959 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
962 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
969 -------------------------------------------------------------------------------
970 Rounds the single-precision floating-point value `a' to an integer, and
971 returns the result as a single-precision floating-point value. The
972 operation is performed according to the IEC/IEEE Standard for Binary
973 Floating-point Arithmetic.
974 -------------------------------------------------------------------------------
976 float32
float32_round_to_int( float32 a
)
980 bits32 lastBitMask
, roundBitsMask
;
984 aExp
= extractFloat32Exp( a
);
985 if ( 0x96 <= aExp
) {
986 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
987 return propagateFloat32NaN( a
, a
);
991 if ( aExp
<= 0x7E ) {
992 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
993 float_exception_flags
|= float_flag_inexact
;
994 aSign
= extractFloat32Sign( a
);
995 switch ( float_rounding_mode
) {
996 case float_round_nearest_even
:
997 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
998 return packFloat32( aSign
, 0x7F, 0 );
1001 case float_round_down
:
1002 return aSign
? 0xBF800000 : 0;
1003 case float_round_up
:
1004 return aSign
? 0x80000000 : 0x3F800000;
1006 return packFloat32( aSign
, 0, 0 );
1009 lastBitMask
<<= 0x96 - aExp
;
1010 roundBitsMask
= lastBitMask
- 1;
1012 roundingMode
= float_rounding_mode
;
1013 if ( roundingMode
== float_round_nearest_even
) {
1014 z
+= lastBitMask
>>1;
1015 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1017 else if ( roundingMode
!= float_round_to_zero
) {
1018 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1022 z
&= ~ roundBitsMask
;
1023 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1029 -------------------------------------------------------------------------------
1030 Returns the result of adding the absolute values of the single-precision
1031 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1032 before being returned. `zSign' is ignored if the result is a NaN. The
1033 addition is performed according to the IEC/IEEE Standard for Binary
1034 Floating-point Arithmetic.
1035 -------------------------------------------------------------------------------
1037 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1039 int16 aExp
, bExp
, zExp
;
1040 bits32 aSig
, bSig
, zSig
;
1043 aSig
= extractFloat32Frac( a
);
1044 aExp
= extractFloat32Exp( a
);
1045 bSig
= extractFloat32Frac( b
);
1046 bExp
= extractFloat32Exp( b
);
1047 expDiff
= aExp
- bExp
;
1050 if ( 0 < expDiff
) {
1051 if ( aExp
== 0xFF ) {
1052 if ( aSig
) return propagateFloat32NaN( a
, b
);
1061 shift32RightJamming( bSig
, expDiff
, &bSig
);
1064 else if ( expDiff
< 0 ) {
1065 if ( bExp
== 0xFF ) {
1066 if ( bSig
) return propagateFloat32NaN( a
, b
);
1067 return packFloat32( zSign
, 0xFF, 0 );
1075 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1079 if ( aExp
== 0xFF ) {
1080 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1083 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1084 zSig
= 0x40000000 + aSig
+ bSig
;
1089 zSig
= ( aSig
+ bSig
)<<1;
1091 if ( (sbits32
) zSig
< 0 ) {
1096 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1101 -------------------------------------------------------------------------------
1102 Returns the result of subtracting the absolute values of the single-
1103 precision floating-point values `a' and `b'. If `zSign' is true, the
1104 difference is negated before being returned. `zSign' is ignored if the
1105 result is a NaN. The subtraction is performed according to the IEC/IEEE
1106 Standard for Binary Floating-point Arithmetic.
1107 -------------------------------------------------------------------------------
1109 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1111 int16 aExp
, bExp
, zExp
;
1112 bits32 aSig
, bSig
, zSig
;
1115 aSig
= extractFloat32Frac( a
);
1116 aExp
= extractFloat32Exp( a
);
1117 bSig
= extractFloat32Frac( b
);
1118 bExp
= extractFloat32Exp( b
);
1119 expDiff
= aExp
- bExp
;
1122 if ( 0 < expDiff
) goto aExpBigger
;
1123 if ( expDiff
< 0 ) goto bExpBigger
;
1124 if ( aExp
== 0xFF ) {
1125 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1126 float_raise( float_flag_invalid
);
1127 return float32_default_nan
;
1133 if ( bSig
< aSig
) goto aBigger
;
1134 if ( aSig
< bSig
) goto bBigger
;
1135 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
1137 if ( bExp
== 0xFF ) {
1138 if ( bSig
) return propagateFloat32NaN( a
, b
);
1139 return packFloat32( zSign
^ 1, 0xFF, 0 );
1147 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1153 goto normalizeRoundAndPack
;
1155 if ( aExp
== 0xFF ) {
1156 if ( aSig
) return propagateFloat32NaN( a
, b
);
1165 shift32RightJamming( bSig
, expDiff
, &bSig
);
1170 normalizeRoundAndPack
:
1172 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1177 -------------------------------------------------------------------------------
1178 Returns the result of adding the single-precision floating-point values `a'
1179 and `b'. The operation is performed according to the IEC/IEEE Standard for
1180 Binary Floating-point Arithmetic.
1181 -------------------------------------------------------------------------------
1183 float32
float32_add( float32 a
, float32 b
)
1187 aSign
= extractFloat32Sign( a
);
1188 bSign
= extractFloat32Sign( b
);
1189 if ( aSign
== bSign
) {
1190 return addFloat32Sigs( a
, b
, aSign
);
1193 return subFloat32Sigs( a
, b
, aSign
);
1199 -------------------------------------------------------------------------------
1200 Returns the result of subtracting the single-precision floating-point values
1201 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1202 for Binary Floating-point Arithmetic.
1203 -------------------------------------------------------------------------------
1205 float32
float32_sub( float32 a
, float32 b
)
1209 aSign
= extractFloat32Sign( a
);
1210 bSign
= extractFloat32Sign( b
);
1211 if ( aSign
== bSign
) {
1212 return subFloat32Sigs( a
, b
, aSign
);
1215 return addFloat32Sigs( a
, b
, aSign
);
1221 -------------------------------------------------------------------------------
1222 Returns the result of multiplying the single-precision floating-point values
1223 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1224 for Binary Floating-point Arithmetic.
1225 -------------------------------------------------------------------------------
1227 float32
float32_mul( float32 a
, float32 b
)
1229 flag aSign
, bSign
, zSign
;
1230 int16 aExp
, bExp
, zExp
;
1235 aSig
= extractFloat32Frac( a
);
1236 aExp
= extractFloat32Exp( a
);
1237 aSign
= extractFloat32Sign( a
);
1238 bSig
= extractFloat32Frac( b
);
1239 bExp
= extractFloat32Exp( b
);
1240 bSign
= extractFloat32Sign( b
);
1241 zSign
= aSign
^ bSign
;
1242 if ( aExp
== 0xFF ) {
1243 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1244 return propagateFloat32NaN( a
, b
);
1246 if ( ( bExp
| bSig
) == 0 ) {
1247 float_raise( float_flag_invalid
);
1248 return float32_default_nan
;
1250 return packFloat32( zSign
, 0xFF, 0 );
1252 if ( bExp
== 0xFF ) {
1253 if ( bSig
) return propagateFloat32NaN( a
, b
);
1254 if ( ( aExp
| aSig
) == 0 ) {
1255 float_raise( float_flag_invalid
);
1256 return float32_default_nan
;
1258 return packFloat32( zSign
, 0xFF, 0 );
1261 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1262 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1265 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1266 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1268 zExp
= aExp
+ bExp
- 0x7F;
1269 aSig
= ( aSig
| 0x00800000 )<<7;
1270 bSig
= ( bSig
| 0x00800000 )<<8;
1271 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1273 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1277 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1282 -------------------------------------------------------------------------------
1283 Returns the result of dividing the single-precision floating-point value `a'
1284 by the corresponding value `b'. The operation is performed according to the
1285 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1286 -------------------------------------------------------------------------------
1288 float32
float32_div( float32 a
, float32 b
)
1290 flag aSign
, bSign
, zSign
;
1291 int16 aExp
, bExp
, zExp
;
1292 bits32 aSig
, bSig
, zSig
;
1294 aSig
= extractFloat32Frac( a
);
1295 aExp
= extractFloat32Exp( a
);
1296 aSign
= extractFloat32Sign( a
);
1297 bSig
= extractFloat32Frac( b
);
1298 bExp
= extractFloat32Exp( b
);
1299 bSign
= extractFloat32Sign( b
);
1300 zSign
= aSign
^ bSign
;
1301 if ( aExp
== 0xFF ) {
1302 if ( aSig
) return propagateFloat32NaN( a
, b
);
1303 if ( bExp
== 0xFF ) {
1304 if ( bSig
) return propagateFloat32NaN( a
, b
);
1305 float_raise( float_flag_invalid
);
1306 return float32_default_nan
;
1308 return packFloat32( zSign
, 0xFF, 0 );
1310 if ( bExp
== 0xFF ) {
1311 if ( bSig
) return propagateFloat32NaN( a
, b
);
1312 return packFloat32( zSign
, 0, 0 );
1316 if ( ( aExp
| aSig
) == 0 ) {
1317 float_raise( float_flag_invalid
);
1318 return float32_default_nan
;
1320 float_raise( float_flag_divbyzero
);
1321 return packFloat32( zSign
, 0xFF, 0 );
1323 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1326 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1327 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1329 zExp
= aExp
- bExp
+ 0x7D;
1330 aSig
= ( aSig
| 0x00800000 )<<7;
1331 bSig
= ( bSig
| 0x00800000 )<<8;
1332 if ( bSig
<= ( aSig
+ aSig
) ) {
1337 bits64 tmp
= ( (bits64
) aSig
)<<32;
1338 do_div( tmp
, bSig
);
1341 if ( ( zSig
& 0x3F ) == 0 ) {
1342 zSig
|= ( ( (bits64
) bSig
) * zSig
!= ( (bits64
) aSig
)<<32 );
1344 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1349 -------------------------------------------------------------------------------
1350 Returns the remainder of the single-precision floating-point value `a'
1351 with respect to the corresponding value `b'. The operation is performed
1352 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1353 -------------------------------------------------------------------------------
1355 float32
float32_rem( float32 a
, float32 b
)
1357 flag aSign
, bSign
, zSign
;
1358 int16 aExp
, bExp
, expDiff
;
1361 bits64 aSig64
, bSig64
, q64
;
1362 bits32 alternateASig
;
1365 aSig
= extractFloat32Frac( a
);
1366 aExp
= extractFloat32Exp( a
);
1367 aSign
= extractFloat32Sign( a
);
1368 bSig
= extractFloat32Frac( b
);
1369 bExp
= extractFloat32Exp( b
);
1370 bSign
= extractFloat32Sign( b
);
1371 if ( aExp
== 0xFF ) {
1372 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1373 return propagateFloat32NaN( a
, b
);
1375 float_raise( float_flag_invalid
);
1376 return float32_default_nan
;
1378 if ( bExp
== 0xFF ) {
1379 if ( bSig
) return propagateFloat32NaN( a
, b
);
1384 float_raise( float_flag_invalid
);
1385 return float32_default_nan
;
1387 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1390 if ( aSig
== 0 ) return a
;
1391 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1393 expDiff
= aExp
- bExp
;
1396 if ( expDiff
< 32 ) {
1399 if ( expDiff
< 0 ) {
1400 if ( expDiff
< -1 ) return a
;
1403 q
= ( bSig
<= aSig
);
1404 if ( q
) aSig
-= bSig
;
1405 if ( 0 < expDiff
) {
1406 bits64 tmp
= ( (bits64
) aSig
)<<32;
1407 do_div( tmp
, bSig
);
1411 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1419 if ( bSig
<= aSig
) aSig
-= bSig
;
1420 aSig64
= ( (bits64
) aSig
)<<40;
1421 bSig64
= ( (bits64
) bSig
)<<40;
1423 while ( 0 < expDiff
) {
1424 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1425 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1426 aSig64
= - ( ( bSig
* q64
)<<38 );
1430 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1431 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1432 q
= q64
>>( 64 - expDiff
);
1434 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1437 alternateASig
= aSig
;
1440 } while ( 0 <= (sbits32
) aSig
);
1441 sigMean
= aSig
+ alternateASig
;
1442 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1443 aSig
= alternateASig
;
1445 zSign
= ( (sbits32
) aSig
< 0 );
1446 if ( zSign
) aSig
= - aSig
;
1447 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
1452 -------------------------------------------------------------------------------
1453 Returns the square root of the single-precision floating-point value `a'.
1454 The operation is performed according to the IEC/IEEE Standard for Binary
1455 Floating-point Arithmetic.
1456 -------------------------------------------------------------------------------
1458 float32
float32_sqrt( float32 a
)
1465 aSig
= extractFloat32Frac( a
);
1466 aExp
= extractFloat32Exp( a
);
1467 aSign
= extractFloat32Sign( a
);
1468 if ( aExp
== 0xFF ) {
1469 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1470 if ( ! aSign
) return a
;
1471 float_raise( float_flag_invalid
);
1472 return float32_default_nan
;
1475 if ( ( aExp
| aSig
) == 0 ) return a
;
1476 float_raise( float_flag_invalid
);
1477 return float32_default_nan
;
1480 if ( aSig
== 0 ) return 0;
1481 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1483 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1484 aSig
= ( aSig
| 0x00800000 )<<8;
1485 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1486 if ( ( zSig
& 0x7F ) <= 5 ) {
1492 term
= ( (bits64
) zSig
) * zSig
;
1493 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
1494 while ( (sbits64
) rem
< 0 ) {
1496 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
1498 zSig
|= ( rem
!= 0 );
1501 shift32RightJamming( zSig
, 1, &zSig
);
1502 return roundAndPackFloat32( 0, zExp
, zSig
);
1507 -------------------------------------------------------------------------------
1508 Returns 1 if the single-precision floating-point value `a' is equal to the
1509 corresponding value `b', and 0 otherwise. The comparison is performed
1510 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1511 -------------------------------------------------------------------------------
1513 flag
float32_eq( float32 a
, float32 b
)
1516 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1517 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1519 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1520 float_raise( float_flag_invalid
);
1524 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1529 -------------------------------------------------------------------------------
1530 Returns 1 if the single-precision floating-point value `a' is less than or
1531 equal to the corresponding value `b', and 0 otherwise. The comparison is
1532 performed according to the IEC/IEEE Standard for Binary Floating-point
1534 -------------------------------------------------------------------------------
1536 flag
float32_le( float32 a
, float32 b
)
1540 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1541 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1543 float_raise( float_flag_invalid
);
1546 aSign
= extractFloat32Sign( a
);
1547 bSign
= extractFloat32Sign( b
);
1548 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1549 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1554 -------------------------------------------------------------------------------
1555 Returns 1 if the single-precision floating-point value `a' is less than
1556 the corresponding value `b', and 0 otherwise. The comparison is performed
1557 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1558 -------------------------------------------------------------------------------
1560 flag
float32_lt( float32 a
, float32 b
)
1564 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1565 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1567 float_raise( float_flag_invalid
);
1570 aSign
= extractFloat32Sign( a
);
1571 bSign
= extractFloat32Sign( b
);
1572 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1573 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1578 -------------------------------------------------------------------------------
1579 Returns 1 if the single-precision floating-point value `a' is equal to the
1580 corresponding value `b', and 0 otherwise. The invalid exception is raised
1581 if either operand is a NaN. Otherwise, the comparison is performed
1582 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1583 -------------------------------------------------------------------------------
1585 flag
float32_eq_signaling( float32 a
, float32 b
)
1588 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1589 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1591 float_raise( float_flag_invalid
);
1594 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1599 -------------------------------------------------------------------------------
1600 Returns 1 if the single-precision floating-point value `a' is less than or
1601 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1602 cause an exception. Otherwise, the comparison is performed according to the
1603 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1604 -------------------------------------------------------------------------------
1606 flag
float32_le_quiet( float32 a
, float32 b
)
1611 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1612 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1614 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1615 float_raise( float_flag_invalid
);
1619 aSign
= extractFloat32Sign( a
);
1620 bSign
= extractFloat32Sign( b
);
1621 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1622 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1627 -------------------------------------------------------------------------------
1628 Returns 1 if the single-precision floating-point value `a' is less than
1629 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1630 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1631 Standard for Binary Floating-point Arithmetic.
1632 -------------------------------------------------------------------------------
1634 flag
float32_lt_quiet( float32 a
, float32 b
)
1638 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1639 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1641 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1642 float_raise( float_flag_invalid
);
1646 aSign
= extractFloat32Sign( a
);
1647 bSign
= extractFloat32Sign( b
);
1648 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1649 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1654 -------------------------------------------------------------------------------
1655 Returns the result of converting the double-precision floating-point value
1656 `a' to the 32-bit two's complement integer format. The conversion is
1657 performed according to the IEC/IEEE Standard for Binary Floating-point
1658 Arithmetic---which means in particular that the conversion is rounded
1659 according to the current rounding mode. If `a' is a NaN, the largest
1660 positive integer is returned. Otherwise, if the conversion overflows, the
1661 largest integer with the same sign as `a' is returned.
1662 -------------------------------------------------------------------------------
1664 int32
float64_to_int32( float64 a
)
1667 int16 aExp
, shiftCount
;
1670 aSig
= extractFloat64Frac( a
);
1671 aExp
= extractFloat64Exp( a
);
1672 aSign
= extractFloat64Sign( a
);
1673 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1674 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1675 shiftCount
= 0x42C - aExp
;
1676 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1677 return roundAndPackInt32( aSign
, aSig
);
1682 -------------------------------------------------------------------------------
1683 Returns the result of converting the double-precision floating-point value
1684 `a' to the 32-bit two's complement integer format. The conversion is
1685 performed according to the IEC/IEEE Standard for Binary Floating-point
1686 Arithmetic, except that the conversion is always rounded toward zero. If
1687 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1688 conversion overflows, the largest integer with the same sign as `a' is
1690 -------------------------------------------------------------------------------
1692 int32
float64_to_int32_round_to_zero( float64 a
)
1695 int16 aExp
, shiftCount
;
1696 bits64 aSig
, savedASig
;
1699 aSig
= extractFloat64Frac( a
);
1700 aExp
= extractFloat64Exp( a
);
1701 aSign
= extractFloat64Sign( a
);
1702 shiftCount
= 0x433 - aExp
;
1703 if ( shiftCount
< 21 ) {
1704 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1707 else if ( 52 < shiftCount
) {
1708 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
1711 aSig
|= LIT64( 0x0010000000000000 );
1713 aSig
>>= shiftCount
;
1715 if ( aSign
) z
= - z
;
1716 if ( ( z
< 0 ) ^ aSign
) {
1718 float_exception_flags
|= float_flag_invalid
;
1719 return aSign
? 0x80000000 : 0x7FFFFFFF;
1721 if ( ( aSig
<<shiftCount
) != savedASig
) {
1722 float_exception_flags
|= float_flag_inexact
;
1729 -------------------------------------------------------------------------------
1730 Returns the result of converting the double-precision floating-point value
1731 `a' to the 32-bit two's complement unsigned integer format. The conversion
1732 is performed according to the IEC/IEEE Standard for Binary Floating-point
1733 Arithmetic---which means in particular that the conversion is rounded
1734 according to the current rounding mode. If `a' is a NaN, the largest
1735 positive integer is returned. Otherwise, if the conversion overflows, the
1736 largest positive integer is returned.
1737 -------------------------------------------------------------------------------
1739 int32
float64_to_uint32( float64 a
)
1742 int16 aExp
, shiftCount
;
1745 aSig
= extractFloat64Frac( a
);
1746 aExp
= extractFloat64Exp( a
);
1747 aSign
= 0; //extractFloat64Sign( a );
1748 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1749 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1750 shiftCount
= 0x42C - aExp
;
1751 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1752 return roundAndPackInt32( aSign
, aSig
);
1756 -------------------------------------------------------------------------------
1757 Returns the result of converting the double-precision floating-point value
1758 `a' to the 32-bit two's complement integer format. The conversion is
1759 performed according to the IEC/IEEE Standard for Binary Floating-point
1760 Arithmetic, except that the conversion is always rounded toward zero. If
1761 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1762 conversion overflows, the largest positive integer is returned.
1763 -------------------------------------------------------------------------------
1765 int32
float64_to_uint32_round_to_zero( float64 a
)
1768 int16 aExp
, shiftCount
;
1769 bits64 aSig
, savedASig
;
1772 aSig
= extractFloat64Frac( a
);
1773 aExp
= extractFloat64Exp( a
);
1774 aSign
= extractFloat64Sign( a
);
1775 shiftCount
= 0x433 - aExp
;
1776 if ( shiftCount
< 21 ) {
1777 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1780 else if ( 52 < shiftCount
) {
1781 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
1784 aSig
|= LIT64( 0x0010000000000000 );
1786 aSig
>>= shiftCount
;
1788 if ( aSign
) z
= - z
;
1789 if ( ( z
< 0 ) ^ aSign
) {
1791 float_exception_flags
|= float_flag_invalid
;
1792 return aSign
? 0x80000000 : 0x7FFFFFFF;
1794 if ( ( aSig
<<shiftCount
) != savedASig
) {
1795 float_exception_flags
|= float_flag_inexact
;
1801 -------------------------------------------------------------------------------
1802 Returns the result of converting the double-precision floating-point value
1803 `a' to the single-precision floating-point format. The conversion is
1804 performed according to the IEC/IEEE Standard for Binary Floating-point
1806 -------------------------------------------------------------------------------
1808 float32
float64_to_float32( float64 a
)
1815 aSig
= extractFloat64Frac( a
);
1816 aExp
= extractFloat64Exp( a
);
1817 aSign
= extractFloat64Sign( a
);
1818 if ( aExp
== 0x7FF ) {
1819 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
1820 return packFloat32( aSign
, 0xFF, 0 );
1822 shift64RightJamming( aSig
, 22, &aSig
);
1824 if ( aExp
|| zSig
) {
1828 return roundAndPackFloat32( aSign
, aExp
, zSig
);
1835 -------------------------------------------------------------------------------
1836 Returns the result of converting the double-precision floating-point value
1837 `a' to the extended double-precision floating-point format. The conversion
1838 is performed according to the IEC/IEEE Standard for Binary Floating-point
1840 -------------------------------------------------------------------------------
1842 floatx80
float64_to_floatx80( float64 a
)
1848 aSig
= extractFloat64Frac( a
);
1849 aExp
= extractFloat64Exp( a
);
1850 aSign
= extractFloat64Sign( a
);
1851 if ( aExp
== 0x7FF ) {
1852 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
1853 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1856 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1857 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
1861 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
1868 -------------------------------------------------------------------------------
1869 Rounds the double-precision floating-point value `a' to an integer, and
1870 returns the result as a double-precision floating-point value. The
1871 operation is performed according to the IEC/IEEE Standard for Binary
1872 Floating-point Arithmetic.
1873 -------------------------------------------------------------------------------
1875 float64
float64_round_to_int( float64 a
)
1879 bits64 lastBitMask
, roundBitsMask
;
1883 aExp
= extractFloat64Exp( a
);
1884 if ( 0x433 <= aExp
) {
1885 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
1886 return propagateFloat64NaN( a
, a
);
1890 if ( aExp
<= 0x3FE ) {
1891 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
1892 float_exception_flags
|= float_flag_inexact
;
1893 aSign
= extractFloat64Sign( a
);
1894 switch ( float_rounding_mode
) {
1895 case float_round_nearest_even
:
1896 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
1897 return packFloat64( aSign
, 0x3FF, 0 );
1900 case float_round_down
:
1901 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
1902 case float_round_up
:
1904 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1906 return packFloat64( aSign
, 0, 0 );
1909 lastBitMask
<<= 0x433 - aExp
;
1910 roundBitsMask
= lastBitMask
- 1;
1912 roundingMode
= float_rounding_mode
;
1913 if ( roundingMode
== float_round_nearest_even
) {
1914 z
+= lastBitMask
>>1;
1915 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1917 else if ( roundingMode
!= float_round_to_zero
) {
1918 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1922 z
&= ~ roundBitsMask
;
1923 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1929 -------------------------------------------------------------------------------
1930 Returns the result of adding the absolute values of the double-precision
1931 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1932 before being returned. `zSign' is ignored if the result is a NaN. The
1933 addition is performed according to the IEC/IEEE Standard for Binary
1934 Floating-point Arithmetic.
1935 -------------------------------------------------------------------------------
1937 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1939 int16 aExp
, bExp
, zExp
;
1940 bits64 aSig
, bSig
, zSig
;
1943 aSig
= extractFloat64Frac( a
);
1944 aExp
= extractFloat64Exp( a
);
1945 bSig
= extractFloat64Frac( b
);
1946 bExp
= extractFloat64Exp( b
);
1947 expDiff
= aExp
- bExp
;
1950 if ( 0 < expDiff
) {
1951 if ( aExp
== 0x7FF ) {
1952 if ( aSig
) return propagateFloat64NaN( a
, b
);
1959 bSig
|= LIT64( 0x2000000000000000 );
1961 shift64RightJamming( bSig
, expDiff
, &bSig
);
1964 else if ( expDiff
< 0 ) {
1965 if ( bExp
== 0x7FF ) {
1966 if ( bSig
) return propagateFloat64NaN( a
, b
);
1967 return packFloat64( zSign
, 0x7FF, 0 );
1973 aSig
|= LIT64( 0x2000000000000000 );
1975 shift64RightJamming( aSig
, - expDiff
, &aSig
);
1979 if ( aExp
== 0x7FF ) {
1980 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
1983 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
1984 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
1988 aSig
|= LIT64( 0x2000000000000000 );
1989 zSig
= ( aSig
+ bSig
)<<1;
1991 if ( (sbits64
) zSig
< 0 ) {
1996 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2001 -------------------------------------------------------------------------------
2002 Returns the result of subtracting the absolute values of the double-
2003 precision floating-point values `a' and `b'. If `zSign' is true, the
2004 difference is negated before being returned. `zSign' is ignored if the
2005 result is a NaN. The subtraction is performed according to the IEC/IEEE
2006 Standard for Binary Floating-point Arithmetic.
2007 -------------------------------------------------------------------------------
2009 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2011 int16 aExp
, bExp
, zExp
;
2012 bits64 aSig
, bSig
, zSig
;
2015 aSig
= extractFloat64Frac( a
);
2016 aExp
= extractFloat64Exp( a
);
2017 bSig
= extractFloat64Frac( b
);
2018 bExp
= extractFloat64Exp( b
);
2019 expDiff
= aExp
- bExp
;
2022 if ( 0 < expDiff
) goto aExpBigger
;
2023 if ( expDiff
< 0 ) goto bExpBigger
;
2024 if ( aExp
== 0x7FF ) {
2025 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2026 float_raise( float_flag_invalid
);
2027 return float64_default_nan
;
2033 if ( bSig
< aSig
) goto aBigger
;
2034 if ( aSig
< bSig
) goto bBigger
;
2035 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0 );
2037 if ( bExp
== 0x7FF ) {
2038 if ( bSig
) return propagateFloat64NaN( a
, b
);
2039 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2045 aSig
|= LIT64( 0x4000000000000000 );
2047 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2048 bSig
|= LIT64( 0x4000000000000000 );
2053 goto normalizeRoundAndPack
;
2055 if ( aExp
== 0x7FF ) {
2056 if ( aSig
) return propagateFloat64NaN( a
, b
);
2063 bSig
|= LIT64( 0x4000000000000000 );
2065 shift64RightJamming( bSig
, expDiff
, &bSig
);
2066 aSig
|= LIT64( 0x4000000000000000 );
2070 normalizeRoundAndPack
:
2072 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2077 -------------------------------------------------------------------------------
2078 Returns the result of adding the double-precision floating-point values `a'
2079 and `b'. The operation is performed according to the IEC/IEEE Standard for
2080 Binary Floating-point Arithmetic.
2081 -------------------------------------------------------------------------------
2083 float64
float64_add( float64 a
, float64 b
)
2087 aSign
= extractFloat64Sign( a
);
2088 bSign
= extractFloat64Sign( b
);
2089 if ( aSign
== bSign
) {
2090 return addFloat64Sigs( a
, b
, aSign
);
2093 return subFloat64Sigs( a
, b
, aSign
);
2099 -------------------------------------------------------------------------------
2100 Returns the result of subtracting the double-precision floating-point values
2101 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2102 for Binary Floating-point Arithmetic.
2103 -------------------------------------------------------------------------------
2105 float64
float64_sub( float64 a
, float64 b
)
2109 aSign
= extractFloat64Sign( a
);
2110 bSign
= extractFloat64Sign( b
);
2111 if ( aSign
== bSign
) {
2112 return subFloat64Sigs( a
, b
, aSign
);
2115 return addFloat64Sigs( a
, b
, aSign
);
2121 -------------------------------------------------------------------------------
2122 Returns the result of multiplying the double-precision floating-point values
2123 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2124 for Binary Floating-point Arithmetic.
2125 -------------------------------------------------------------------------------
2127 float64
float64_mul( float64 a
, float64 b
)
2129 flag aSign
, bSign
, zSign
;
2130 int16 aExp
, bExp
, zExp
;
2131 bits64 aSig
, bSig
, zSig0
, zSig1
;
2133 aSig
= extractFloat64Frac( a
);
2134 aExp
= extractFloat64Exp( a
);
2135 aSign
= extractFloat64Sign( a
);
2136 bSig
= extractFloat64Frac( b
);
2137 bExp
= extractFloat64Exp( b
);
2138 bSign
= extractFloat64Sign( b
);
2139 zSign
= aSign
^ bSign
;
2140 if ( aExp
== 0x7FF ) {
2141 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2142 return propagateFloat64NaN( a
, b
);
2144 if ( ( bExp
| bSig
) == 0 ) {
2145 float_raise( float_flag_invalid
);
2146 return float64_default_nan
;
2148 return packFloat64( zSign
, 0x7FF, 0 );
2150 if ( bExp
== 0x7FF ) {
2151 if ( bSig
) return propagateFloat64NaN( a
, b
);
2152 if ( ( aExp
| aSig
) == 0 ) {
2153 float_raise( float_flag_invalid
);
2154 return float64_default_nan
;
2156 return packFloat64( zSign
, 0x7FF, 0 );
2159 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2160 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2163 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2164 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2166 zExp
= aExp
+ bExp
- 0x3FF;
2167 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2168 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2169 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2170 zSig0
|= ( zSig1
!= 0 );
2171 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2175 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2180 -------------------------------------------------------------------------------
2181 Returns the result of dividing the double-precision floating-point value `a'
2182 by the corresponding value `b'. The operation is performed according to
2183 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2184 -------------------------------------------------------------------------------
2186 float64
float64_div( float64 a
, float64 b
)
2188 flag aSign
, bSign
, zSign
;
2189 int16 aExp
, bExp
, zExp
;
2190 bits64 aSig
, bSig
, zSig
;
2192 bits64 term0
, term1
;
2194 aSig
= extractFloat64Frac( a
);
2195 aExp
= extractFloat64Exp( a
);
2196 aSign
= extractFloat64Sign( a
);
2197 bSig
= extractFloat64Frac( b
);
2198 bExp
= extractFloat64Exp( b
);
2199 bSign
= extractFloat64Sign( b
);
2200 zSign
= aSign
^ bSign
;
2201 if ( aExp
== 0x7FF ) {
2202 if ( aSig
) return propagateFloat64NaN( a
, b
);
2203 if ( bExp
== 0x7FF ) {
2204 if ( bSig
) return propagateFloat64NaN( a
, b
);
2205 float_raise( float_flag_invalid
);
2206 return float64_default_nan
;
2208 return packFloat64( zSign
, 0x7FF, 0 );
2210 if ( bExp
== 0x7FF ) {
2211 if ( bSig
) return propagateFloat64NaN( a
, b
);
2212 return packFloat64( zSign
, 0, 0 );
2216 if ( ( aExp
| aSig
) == 0 ) {
2217 float_raise( float_flag_invalid
);
2218 return float64_default_nan
;
2220 float_raise( float_flag_divbyzero
);
2221 return packFloat64( zSign
, 0x7FF, 0 );
2223 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2226 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2227 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2229 zExp
= aExp
- bExp
+ 0x3FD;
2230 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2231 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2232 if ( bSig
<= ( aSig
+ aSig
) ) {
2236 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2237 if ( ( zSig
& 0x1FF ) <= 2 ) {
2238 mul64To128( bSig
, zSig
, &term0
, &term1
);
2239 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2240 while ( (sbits64
) rem0
< 0 ) {
2242 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2244 zSig
|= ( rem1
!= 0 );
2246 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2251 -------------------------------------------------------------------------------
2252 Returns the remainder of the double-precision floating-point value `a'
2253 with respect to the corresponding value `b'. The operation is performed
2254 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2255 -------------------------------------------------------------------------------
2257 float64
float64_rem( float64 a
, float64 b
)
2259 flag aSign
, bSign
, zSign
;
2260 int16 aExp
, bExp
, expDiff
;
2262 bits64 q
, alternateASig
;
2265 aSig
= extractFloat64Frac( a
);
2266 aExp
= extractFloat64Exp( a
);
2267 aSign
= extractFloat64Sign( a
);
2268 bSig
= extractFloat64Frac( b
);
2269 bExp
= extractFloat64Exp( b
);
2270 bSign
= extractFloat64Sign( b
);
2271 if ( aExp
== 0x7FF ) {
2272 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2273 return propagateFloat64NaN( a
, b
);
2275 float_raise( float_flag_invalid
);
2276 return float64_default_nan
;
2278 if ( bExp
== 0x7FF ) {
2279 if ( bSig
) return propagateFloat64NaN( a
, b
);
2284 float_raise( float_flag_invalid
);
2285 return float64_default_nan
;
2287 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2290 if ( aSig
== 0 ) return a
;
2291 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2293 expDiff
= aExp
- bExp
;
2294 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2295 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2296 if ( expDiff
< 0 ) {
2297 if ( expDiff
< -1 ) return a
;
2300 q
= ( bSig
<= aSig
);
2301 if ( q
) aSig
-= bSig
;
2303 while ( 0 < expDiff
) {
2304 q
= estimateDiv128To64( aSig
, 0, bSig
);
2305 q
= ( 2 < q
) ? q
- 2 : 0;
2306 aSig
= - ( ( bSig
>>2 ) * q
);
2310 if ( 0 < expDiff
) {
2311 q
= estimateDiv128To64( aSig
, 0, bSig
);
2312 q
= ( 2 < q
) ? q
- 2 : 0;
2315 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2322 alternateASig
= aSig
;
2325 } while ( 0 <= (sbits64
) aSig
);
2326 sigMean
= aSig
+ alternateASig
;
2327 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2328 aSig
= alternateASig
;
2330 zSign
= ( (sbits64
) aSig
< 0 );
2331 if ( zSign
) aSig
= - aSig
;
2332 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
2337 -------------------------------------------------------------------------------
2338 Returns the square root of the double-precision floating-point value `a'.
2339 The operation is performed according to the IEC/IEEE Standard for Binary
2340 Floating-point Arithmetic.
2341 -------------------------------------------------------------------------------
2343 float64
float64_sqrt( float64 a
)
2348 bits64 rem0
, rem1
, term0
, term1
; //, shiftedRem;
2351 aSig
= extractFloat64Frac( a
);
2352 aExp
= extractFloat64Exp( a
);
2353 aSign
= extractFloat64Sign( a
);
2354 if ( aExp
== 0x7FF ) {
2355 if ( aSig
) return propagateFloat64NaN( a
, a
);
2356 if ( ! aSign
) return a
;
2357 float_raise( float_flag_invalid
);
2358 return float64_default_nan
;
2361 if ( ( aExp
| aSig
) == 0 ) return a
;
2362 float_raise( float_flag_invalid
);
2363 return float64_default_nan
;
2366 if ( aSig
== 0 ) return 0;
2367 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2369 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2370 aSig
|= LIT64( 0x0010000000000000 );
2371 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2373 aSig
<<= 9 - ( aExp
& 1 );
2374 zSig
= estimateDiv128To64( aSig
, 0, zSig
) + zSig
+ 2;
2375 if ( ( zSig
& 0x3FF ) <= 5 ) {
2377 zSig
= LIT64( 0xFFFFFFFFFFFFFFFF );
2381 mul64To128( zSig
, zSig
, &term0
, &term1
);
2382 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2383 while ( (sbits64
) rem0
< 0 ) {
2385 shortShift128Left( 0, zSig
, 1, &term0
, &term1
);
2387 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
2389 zSig
|= ( ( rem0
| rem1
) != 0 );
2392 shift64RightJamming( zSig
, 1, &zSig
);
2393 return roundAndPackFloat64( 0, zExp
, zSig
);
2398 -------------------------------------------------------------------------------
2399 Returns 1 if the double-precision floating-point value `a' is equal to the
2400 corresponding value `b', and 0 otherwise. The comparison is performed
2401 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2402 -------------------------------------------------------------------------------
2404 flag
float64_eq( float64 a
, float64 b
)
2407 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2408 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2410 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2411 float_raise( float_flag_invalid
);
2415 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2420 -------------------------------------------------------------------------------
2421 Returns 1 if the double-precision floating-point value `a' is less than or
2422 equal to the corresponding value `b', and 0 otherwise. The comparison is
2423 performed according to the IEC/IEEE Standard for Binary Floating-point
2425 -------------------------------------------------------------------------------
2427 flag
float64_le( float64 a
, float64 b
)
2431 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2432 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2434 float_raise( float_flag_invalid
);
2437 aSign
= extractFloat64Sign( a
);
2438 bSign
= extractFloat64Sign( b
);
2439 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2440 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2445 -------------------------------------------------------------------------------
2446 Returns 1 if the double-precision floating-point value `a' is less than
2447 the corresponding value `b', and 0 otherwise. The comparison is performed
2448 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2449 -------------------------------------------------------------------------------
2451 flag
float64_lt( float64 a
, float64 b
)
2455 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2456 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2458 float_raise( float_flag_invalid
);
2461 aSign
= extractFloat64Sign( a
);
2462 bSign
= extractFloat64Sign( b
);
2463 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2464 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2469 -------------------------------------------------------------------------------
2470 Returns 1 if the double-precision floating-point value `a' is equal to the
2471 corresponding value `b', and 0 otherwise. The invalid exception is raised
2472 if either operand is a NaN. Otherwise, the comparison is performed
2473 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2474 -------------------------------------------------------------------------------
2476 flag
float64_eq_signaling( float64 a
, float64 b
)
2479 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2480 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2482 float_raise( float_flag_invalid
);
2485 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2490 -------------------------------------------------------------------------------
2491 Returns 1 if the double-precision floating-point value `a' is less than or
2492 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2493 cause an exception. Otherwise, the comparison is performed according to the
2494 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2495 -------------------------------------------------------------------------------
2497 flag
float64_le_quiet( float64 a
, float64 b
)
2502 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2503 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2505 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2506 float_raise( float_flag_invalid
);
2510 aSign
= extractFloat64Sign( a
);
2511 bSign
= extractFloat64Sign( b
);
2512 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2513 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2518 -------------------------------------------------------------------------------
2519 Returns 1 if the double-precision floating-point value `a' is less than
2520 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2521 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2522 Standard for Binary Floating-point Arithmetic.
2523 -------------------------------------------------------------------------------
2525 flag
float64_lt_quiet( float64 a
, float64 b
)
2529 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2530 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2532 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2533 float_raise( float_flag_invalid
);
2537 aSign
= extractFloat64Sign( a
);
2538 bSign
= extractFloat64Sign( b
);
2539 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2540 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2547 -------------------------------------------------------------------------------
2548 Returns the result of converting the extended double-precision floating-
2549 point value `a' to the 32-bit two's complement integer format. The
2550 conversion is performed according to the IEC/IEEE Standard for Binary
2551 Floating-point Arithmetic---which means in particular that the conversion
2552 is rounded according to the current rounding mode. If `a' is a NaN, the
2553 largest positive integer is returned. Otherwise, if the conversion
2554 overflows, the largest integer with the same sign as `a' is returned.
2555 -------------------------------------------------------------------------------
2557 int32
floatx80_to_int32( floatx80 a
)
2560 int32 aExp
, shiftCount
;
2563 aSig
= extractFloatx80Frac( a
);
2564 aExp
= extractFloatx80Exp( a
);
2565 aSign
= extractFloatx80Sign( a
);
2566 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2567 shiftCount
= 0x4037 - aExp
;
2568 if ( shiftCount
<= 0 ) shiftCount
= 1;
2569 shift64RightJamming( aSig
, shiftCount
, &aSig
);
2570 return roundAndPackInt32( aSign
, aSig
);
2575 -------------------------------------------------------------------------------
2576 Returns the result of converting the extended double-precision floating-
2577 point value `a' to the 32-bit two's complement integer format. The
2578 conversion is performed according to the IEC/IEEE Standard for Binary
2579 Floating-point Arithmetic, except that the conversion is always rounded
2580 toward zero. If `a' is a NaN, the largest positive integer is returned.
2581 Otherwise, if the conversion overflows, the largest integer with the same
2582 sign as `a' is returned.
2583 -------------------------------------------------------------------------------
2585 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
2588 int32 aExp
, shiftCount
;
2589 bits64 aSig
, savedASig
;
2592 aSig
= extractFloatx80Frac( a
);
2593 aExp
= extractFloatx80Exp( a
);
2594 aSign
= extractFloatx80Sign( a
);
2595 shiftCount
= 0x403E - aExp
;
2596 if ( shiftCount
< 32 ) {
2597 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2600 else if ( 63 < shiftCount
) {
2601 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2605 aSig
>>= shiftCount
;
2607 if ( aSign
) z
= - z
;
2608 if ( ( z
< 0 ) ^ aSign
) {
2610 float_exception_flags
|= float_flag_invalid
;
2611 return aSign
? 0x80000000 : 0x7FFFFFFF;
2613 if ( ( aSig
<<shiftCount
) != savedASig
) {
2614 float_exception_flags
|= float_flag_inexact
;
2621 -------------------------------------------------------------------------------
2622 Returns the result of converting the extended double-precision floating-
2623 point value `a' to the single-precision floating-point format. The
2624 conversion is performed according to the IEC/IEEE Standard for Binary
2625 Floating-point Arithmetic.
2626 -------------------------------------------------------------------------------
2628 float32
floatx80_to_float32( floatx80 a
)
2634 aSig
= extractFloatx80Frac( a
);
2635 aExp
= extractFloatx80Exp( a
);
2636 aSign
= extractFloatx80Sign( a
);
2637 if ( aExp
== 0x7FFF ) {
2638 if ( (bits64
) ( aSig
<<1 ) ) {
2639 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
2641 return packFloat32( aSign
, 0xFF, 0 );
2643 shift64RightJamming( aSig
, 33, &aSig
);
2644 if ( aExp
|| aSig
) aExp
-= 0x3F81;
2645 return roundAndPackFloat32( aSign
, aExp
, aSig
);
2650 -------------------------------------------------------------------------------
2651 Returns the result of converting the extended double-precision floating-
2652 point value `a' to the double-precision floating-point format. The
2653 conversion is performed according to the IEC/IEEE Standard for Binary
2654 Floating-point Arithmetic.
2655 -------------------------------------------------------------------------------
2657 float64
floatx80_to_float64( floatx80 a
)
2663 aSig
= extractFloatx80Frac( a
);
2664 aExp
= extractFloatx80Exp( a
);
2665 aSign
= extractFloatx80Sign( a
);
2666 if ( aExp
== 0x7FFF ) {
2667 if ( (bits64
) ( aSig
<<1 ) ) {
2668 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
2670 return packFloat64( aSign
, 0x7FF, 0 );
2672 shift64RightJamming( aSig
, 1, &zSig
);
2673 if ( aExp
|| aSig
) aExp
-= 0x3C01;
2674 return roundAndPackFloat64( aSign
, aExp
, zSig
);
2679 -------------------------------------------------------------------------------
2680 Rounds the extended double-precision floating-point value `a' to an integer,
2681 and returns the result as an extended quadruple-precision floating-point
2682 value. The operation is performed according to the IEC/IEEE Standard for
2683 Binary Floating-point Arithmetic.
2684 -------------------------------------------------------------------------------
2686 floatx80
floatx80_round_to_int( floatx80 a
)
2690 bits64 lastBitMask
, roundBitsMask
;
2694 aExp
= extractFloatx80Exp( a
);
2695 if ( 0x403E <= aExp
) {
2696 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
2697 return propagateFloatx80NaN( a
, a
);
2701 if ( aExp
<= 0x3FFE ) {
2703 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
2706 float_exception_flags
|= float_flag_inexact
;
2707 aSign
= extractFloatx80Sign( a
);
2708 switch ( float_rounding_mode
) {
2709 case float_round_nearest_even
:
2710 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
2713 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
2716 case float_round_down
:
2719 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2720 : packFloatx80( 0, 0, 0 );
2721 case float_round_up
:
2723 aSign
? packFloatx80( 1, 0, 0 )
2724 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2726 return packFloatx80( aSign
, 0, 0 );
2729 lastBitMask
<<= 0x403E - aExp
;
2730 roundBitsMask
= lastBitMask
- 1;
2732 roundingMode
= float_rounding_mode
;
2733 if ( roundingMode
== float_round_nearest_even
) {
2734 z
.low
+= lastBitMask
>>1;
2735 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
2737 else if ( roundingMode
!= float_round_to_zero
) {
2738 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2739 z
.low
+= roundBitsMask
;
2742 z
.low
&= ~ roundBitsMask
;
2745 z
.low
= LIT64( 0x8000000000000000 );
2747 if ( z
.low
!= a
.low
) float_exception_flags
|= float_flag_inexact
;
2753 -------------------------------------------------------------------------------
2754 Returns the result of adding the absolute values of the extended double-
2755 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2756 negated before being returned. `zSign' is ignored if the result is a NaN.
2757 The addition is performed according to the IEC/IEEE Standard for Binary
2758 Floating-point Arithmetic.
2759 -------------------------------------------------------------------------------
2761 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
2763 int32 aExp
, bExp
, zExp
;
2764 bits64 aSig
, bSig
, zSig0
, zSig1
;
2767 aSig
= extractFloatx80Frac( a
);
2768 aExp
= extractFloatx80Exp( a
);
2769 bSig
= extractFloatx80Frac( b
);
2770 bExp
= extractFloatx80Exp( b
);
2771 expDiff
= aExp
- bExp
;
2772 if ( 0 < expDiff
) {
2773 if ( aExp
== 0x7FFF ) {
2774 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2777 if ( bExp
== 0 ) --expDiff
;
2778 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2781 else if ( expDiff
< 0 ) {
2782 if ( bExp
== 0x7FFF ) {
2783 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2784 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2786 if ( aExp
== 0 ) ++expDiff
;
2787 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2791 if ( aExp
== 0x7FFF ) {
2792 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2793 return propagateFloatx80NaN( a
, b
);
2798 zSig0
= aSig
+ bSig
;
2800 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
2807 zSig0
= aSig
+ bSig
;
2809 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
2811 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2812 zSig0
|= LIT64( 0x8000000000000000 );
2816 roundAndPackFloatx80(
2817 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
2822 -------------------------------------------------------------------------------
2823 Returns the result of subtracting the absolute values of the extended
2824 double-precision floating-point values `a' and `b'. If `zSign' is true,
2825 the difference is negated before being returned. `zSign' is ignored if the
2826 result is a NaN. The subtraction is performed according to the IEC/IEEE
2827 Standard for Binary Floating-point Arithmetic.
2828 -------------------------------------------------------------------------------
2830 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
2832 int32 aExp
, bExp
, zExp
;
2833 bits64 aSig
, bSig
, zSig0
, zSig1
;
2837 aSig
= extractFloatx80Frac( a
);
2838 aExp
= extractFloatx80Exp( a
);
2839 bSig
= extractFloatx80Frac( b
);
2840 bExp
= extractFloatx80Exp( b
);
2841 expDiff
= aExp
- bExp
;
2842 if ( 0 < expDiff
) goto aExpBigger
;
2843 if ( expDiff
< 0 ) goto bExpBigger
;
2844 if ( aExp
== 0x7FFF ) {
2845 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2846 return propagateFloatx80NaN( a
, b
);
2848 float_raise( float_flag_invalid
);
2849 z
.low
= floatx80_default_nan_low
;
2850 z
.high
= floatx80_default_nan_high
;
2858 if ( bSig
< aSig
) goto aBigger
;
2859 if ( aSig
< bSig
) goto bBigger
;
2860 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
2862 if ( bExp
== 0x7FFF ) {
2863 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2864 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2866 if ( aExp
== 0 ) ++expDiff
;
2867 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2869 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
2872 goto normalizeRoundAndPack
;
2874 if ( aExp
== 0x7FFF ) {
2875 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2878 if ( bExp
== 0 ) --expDiff
;
2879 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2881 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
2883 normalizeRoundAndPack
:
2885 normalizeRoundAndPackFloatx80(
2886 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
2891 -------------------------------------------------------------------------------
2892 Returns the result of adding the extended double-precision floating-point
2893 values `a' and `b'. The operation is performed according to the IEC/IEEE
2894 Standard for Binary Floating-point Arithmetic.
2895 -------------------------------------------------------------------------------
2897 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
2901 aSign
= extractFloatx80Sign( a
);
2902 bSign
= extractFloatx80Sign( b
);
2903 if ( aSign
== bSign
) {
2904 return addFloatx80Sigs( a
, b
, aSign
);
2907 return subFloatx80Sigs( a
, b
, aSign
);
2913 -------------------------------------------------------------------------------
2914 Returns the result of subtracting the extended double-precision floating-
2915 point values `a' and `b'. The operation is performed according to the
2916 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2917 -------------------------------------------------------------------------------
2919 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
2923 aSign
= extractFloatx80Sign( a
);
2924 bSign
= extractFloatx80Sign( b
);
2925 if ( aSign
== bSign
) {
2926 return subFloatx80Sigs( a
, b
, aSign
);
2929 return addFloatx80Sigs( a
, b
, aSign
);
2935 -------------------------------------------------------------------------------
2936 Returns the result of multiplying the extended double-precision floating-
2937 point values `a' and `b'. The operation is performed according to the
2938 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2939 -------------------------------------------------------------------------------
2941 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
2943 flag aSign
, bSign
, zSign
;
2944 int32 aExp
, bExp
, zExp
;
2945 bits64 aSig
, bSig
, zSig0
, zSig1
;
2948 aSig
= extractFloatx80Frac( a
);
2949 aExp
= extractFloatx80Exp( a
);
2950 aSign
= extractFloatx80Sign( a
);
2951 bSig
= extractFloatx80Frac( b
);
2952 bExp
= extractFloatx80Exp( b
);
2953 bSign
= extractFloatx80Sign( b
);
2954 zSign
= aSign
^ bSign
;
2955 if ( aExp
== 0x7FFF ) {
2956 if ( (bits64
) ( aSig
<<1 )
2957 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
2958 return propagateFloatx80NaN( a
, b
);
2960 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
2961 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2963 if ( bExp
== 0x7FFF ) {
2964 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2965 if ( ( aExp
| aSig
) == 0 ) {
2967 float_raise( float_flag_invalid
);
2968 z
.low
= floatx80_default_nan_low
;
2969 z
.high
= floatx80_default_nan_high
;
2972 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2975 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2976 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
2979 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2980 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
2982 zExp
= aExp
+ bExp
- 0x3FFE;
2983 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2984 if ( 0 < (sbits64
) zSig0
) {
2985 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2989 roundAndPackFloatx80(
2990 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
2995 -------------------------------------------------------------------------------
2996 Returns the result of dividing the extended double-precision floating-point
2997 value `a' by the corresponding value `b'. The operation is performed
2998 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2999 -------------------------------------------------------------------------------
3001 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
3003 flag aSign
, bSign
, zSign
;
3004 int32 aExp
, bExp
, zExp
;
3005 bits64 aSig
, bSig
, zSig0
, zSig1
;
3006 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3009 aSig
= extractFloatx80Frac( a
);
3010 aExp
= extractFloatx80Exp( a
);
3011 aSign
= extractFloatx80Sign( a
);
3012 bSig
= extractFloatx80Frac( b
);
3013 bExp
= extractFloatx80Exp( b
);
3014 bSign
= extractFloatx80Sign( b
);
3015 zSign
= aSign
^ bSign
;
3016 if ( aExp
== 0x7FFF ) {
3017 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3018 if ( bExp
== 0x7FFF ) {
3019 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3022 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3024 if ( bExp
== 0x7FFF ) {
3025 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3026 return packFloatx80( zSign
, 0, 0 );
3030 if ( ( aExp
| aSig
) == 0 ) {
3032 float_raise( float_flag_invalid
);
3033 z
.low
= floatx80_default_nan_low
;
3034 z
.high
= floatx80_default_nan_high
;
3037 float_raise( float_flag_divbyzero
);
3038 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3040 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3043 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3044 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3046 zExp
= aExp
- bExp
+ 0x3FFE;
3048 if ( bSig
<= aSig
) {
3049 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3052 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3053 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3054 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3055 while ( (sbits64
) rem0
< 0 ) {
3057 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3059 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3060 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3061 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3062 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3063 while ( (sbits64
) rem1
< 0 ) {
3065 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3067 zSig1
|= ( ( rem1
| rem2
) != 0 );
3070 roundAndPackFloatx80(
3071 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3076 -------------------------------------------------------------------------------
3077 Returns the remainder of the extended double-precision floating-point value
3078 `a' with respect to the corresponding value `b'. The operation is performed
3079 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3080 -------------------------------------------------------------------------------
3082 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3084 flag aSign
, bSign
, zSign
;
3085 int32 aExp
, bExp
, expDiff
;
3086 bits64 aSig0
, aSig1
, bSig
;
3087 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3090 aSig0
= extractFloatx80Frac( a
);
3091 aExp
= extractFloatx80Exp( a
);
3092 aSign
= extractFloatx80Sign( a
);
3093 bSig
= extractFloatx80Frac( b
);
3094 bExp
= extractFloatx80Exp( b
);
3095 bSign
= extractFloatx80Sign( b
);
3096 if ( aExp
== 0x7FFF ) {
3097 if ( (bits64
) ( aSig0
<<1 )
3098 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3099 return propagateFloatx80NaN( a
, b
);
3103 if ( bExp
== 0x7FFF ) {
3104 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3110 float_raise( float_flag_invalid
);
3111 z
.low
= floatx80_default_nan_low
;
3112 z
.high
= floatx80_default_nan_high
;
3115 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3118 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3119 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3121 bSig
|= LIT64( 0x8000000000000000 );
3123 expDiff
= aExp
- bExp
;
3125 if ( expDiff
< 0 ) {
3126 if ( expDiff
< -1 ) return a
;
3127 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3130 q
= ( bSig
<= aSig0
);
3131 if ( q
) aSig0
-= bSig
;
3133 while ( 0 < expDiff
) {
3134 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3135 q
= ( 2 < q
) ? q
- 2 : 0;
3136 mul64To128( bSig
, q
, &term0
, &term1
);
3137 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3138 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3142 if ( 0 < expDiff
) {
3143 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3144 q
= ( 2 < q
) ? q
- 2 : 0;
3146 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3147 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3148 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3149 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3151 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3158 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3159 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3160 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3163 aSig0
= alternateASig0
;
3164 aSig1
= alternateASig1
;
3168 normalizeRoundAndPackFloatx80(
3169 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3174 -------------------------------------------------------------------------------
3175 Returns the square root of the extended double-precision floating-point
3176 value `a'. The operation is performed according to the IEC/IEEE Standard
3177 for Binary Floating-point Arithmetic.
3178 -------------------------------------------------------------------------------
3180 floatx80
floatx80_sqrt( floatx80 a
)
3184 bits64 aSig0
, aSig1
, zSig0
, zSig1
;
3185 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3186 bits64 shiftedRem0
, shiftedRem1
;
3189 aSig0
= extractFloatx80Frac( a
);
3190 aExp
= extractFloatx80Exp( a
);
3191 aSign
= extractFloatx80Sign( a
);
3192 if ( aExp
== 0x7FFF ) {
3193 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
3194 if ( ! aSign
) return a
;
3198 if ( ( aExp
| aSig0
) == 0 ) return a
;
3200 float_raise( float_flag_invalid
);
3201 z
.low
= floatx80_default_nan_low
;
3202 z
.high
= floatx80_default_nan_high
;
3206 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3207 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3209 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3210 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3213 shift128Right( aSig0
, 0, ( aExp
& 1 ) + 2, &aSig0
, &aSig1
);
3214 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
) + zSig0
+ 4;
3215 if ( 0 <= (sbits64
) zSig0
) zSig0
= LIT64( 0xFFFFFFFFFFFFFFFF );
3216 shortShift128Left( aSig0
, aSig1
, 2, &aSig0
, &aSig1
);
3217 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3218 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3219 while ( (sbits64
) rem0
< 0 ) {
3221 shortShift128Left( 0, zSig0
, 1, &term0
, &term1
);
3223 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
3225 shortShift128Left( rem0
, rem1
, 63, &shiftedRem0
, &shiftedRem1
);
3226 zSig1
= estimateDiv128To64( shiftedRem0
, shiftedRem1
, zSig0
);
3227 if ( (bits64
) ( zSig1
<<1 ) <= 10 ) {
3228 if ( zSig1
== 0 ) zSig1
= 1;
3229 mul64To128( zSig0
, zSig1
, &term1
, &term2
);
3230 shortShift128Left( term1
, term2
, 1, &term1
, &term2
);
3231 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3232 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3233 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3234 while ( (sbits64
) rem1
< 0 ) {
3236 shortShift192Left( 0, zSig0
, zSig1
, 1, &term1
, &term2
, &term3
);
3239 rem1
, rem2
, rem3
, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
3241 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3244 roundAndPackFloatx80(
3245 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
3250 -------------------------------------------------------------------------------
3251 Returns 1 if the extended double-precision floating-point value `a' is
3252 equal to the corresponding value `b', and 0 otherwise. The comparison is
3253 performed according to the IEC/IEEE Standard for Binary Floating-point
3255 -------------------------------------------------------------------------------
3257 flag
floatx80_eq( floatx80 a
, floatx80 b
)
3260 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3261 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3262 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3263 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3265 if ( floatx80_is_signaling_nan( a
)
3266 || floatx80_is_signaling_nan( b
) ) {
3267 float_raise( float_flag_invalid
);
3273 && ( ( a
.high
== b
.high
)
3275 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3281 -------------------------------------------------------------------------------
3282 Returns 1 if the extended double-precision floating-point value `a' is
3283 less than or equal to the corresponding value `b', and 0 otherwise. The
3284 comparison is performed according to the IEC/IEEE Standard for Binary
3285 Floating-point Arithmetic.
3286 -------------------------------------------------------------------------------
3288 flag
floatx80_le( floatx80 a
, floatx80 b
)
3292 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3293 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3294 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3295 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3297 float_raise( float_flag_invalid
);
3300 aSign
= extractFloatx80Sign( a
);
3301 bSign
= extractFloatx80Sign( b
);
3302 if ( aSign
!= bSign
) {
3305 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3309 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3310 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3315 -------------------------------------------------------------------------------
3316 Returns 1 if the extended double-precision floating-point value `a' is
3317 less than the corresponding value `b', and 0 otherwise. The comparison
3318 is performed according to the IEC/IEEE Standard for Binary Floating-point
3320 -------------------------------------------------------------------------------
3322 flag
floatx80_lt( floatx80 a
, floatx80 b
)
3326 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3327 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3328 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3329 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3331 float_raise( float_flag_invalid
);
3334 aSign
= extractFloatx80Sign( a
);
3335 bSign
= extractFloatx80Sign( b
);
3336 if ( aSign
!= bSign
) {
3339 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3343 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3344 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
3349 -------------------------------------------------------------------------------
3350 Returns 1 if the extended double-precision floating-point value `a' is equal
3351 to the corresponding value `b', and 0 otherwise. The invalid exception is
3352 raised if either operand is a NaN. Otherwise, the comparison is performed
3353 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3354 -------------------------------------------------------------------------------
3356 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
3359 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3360 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3361 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3362 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3364 float_raise( float_flag_invalid
);
3369 && ( ( a
.high
== b
.high
)
3371 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3377 -------------------------------------------------------------------------------
3378 Returns 1 if the extended double-precision floating-point value `a' is less
3379 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3380 do not cause an exception. Otherwise, the comparison is performed according
3381 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3382 -------------------------------------------------------------------------------
3384 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
3388 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3389 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3390 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3391 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3393 if ( floatx80_is_signaling_nan( a
)
3394 || floatx80_is_signaling_nan( b
) ) {
3395 float_raise( float_flag_invalid
);
3399 aSign
= extractFloatx80Sign( a
);
3400 bSign
= extractFloatx80Sign( b
);
3401 if ( aSign
!= bSign
) {
3404 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3408 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3409 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3414 -------------------------------------------------------------------------------
3415 Returns 1 if the extended double-precision floating-point value `a' is less
3416 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3417 an exception. Otherwise, the comparison is performed according to the
3418 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3419 -------------------------------------------------------------------------------
3421 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
3425 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3426 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3427 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3428 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3430 if ( floatx80_is_signaling_nan( a
)
3431 || floatx80_is_signaling_nan( b
) ) {
3432 float_raise( float_flag_invalid
);
3436 aSign
= extractFloatx80Sign( a
);
3437 bSign
= extractFloatx80Sign( b
);
3438 if ( aSign
!= bSign
) {
3441 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3445 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3446 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);