2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
33 //#include "softfloat.h"
36 -------------------------------------------------------------------------------
37 Floating-point rounding mode, extended double-precision rounding precision,
39 -------------------------------------------------------------------------------
41 int8 float_rounding_mode
= float_round_nearest_even
;
42 int8 floatx80_rounding_precision
= 80;
43 int8 float_exception_flags
;
46 -------------------------------------------------------------------------------
47 Primitive arithmetic functions, including multi-word arithmetic, and
48 division and square root approximations. (Can be specialized to target if
50 -------------------------------------------------------------------------------
52 #include "softfloat-macros"
55 -------------------------------------------------------------------------------
56 Functions and definitions to determine: (1) whether tininess for underflow
57 is detected before or after rounding by default, (2) what (if anything)
58 happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 are propagated from function inputs to output. These details are target-
62 -------------------------------------------------------------------------------
64 #include "softfloat-specialize"
67 -------------------------------------------------------------------------------
68 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69 and 7, and returns the properly rounded 32-bit integer corresponding to the
70 input. If `zSign' is nonzero, the input is negated before being converted
71 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
72 input is simply rounded to an integer, with the inexact exception raised if
73 the input cannot be represented exactly as an integer. If the fixed-point
74 input is too large, however, the invalid exception is raised and the largest
75 positive or negative integer is returned.
76 -------------------------------------------------------------------------------
78 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
81 flag roundNearestEven
;
82 int8 roundIncrement
, roundBits
;
85 roundingMode
= float_rounding_mode
;
86 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
87 roundIncrement
= 0x40;
88 if ( ! roundNearestEven
) {
89 if ( roundingMode
== float_round_to_zero
) {
93 roundIncrement
= 0x7F;
95 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
98 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
102 roundBits
= absZ
& 0x7F;
103 absZ
= ( absZ
+ roundIncrement
)>>7;
104 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
106 if ( zSign
) z
= - z
;
107 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
108 float_exception_flags
|= float_flag_invalid
;
109 return zSign
? 0x80000000 : 0x7FFFFFFF;
111 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
117 -------------------------------------------------------------------------------
118 Returns the fraction bits of the single-precision floating-point value `a'.
119 -------------------------------------------------------------------------------
121 INLINE bits32
extractFloat32Frac( float32 a
)
124 return a
& 0x007FFFFF;
129 -------------------------------------------------------------------------------
130 Returns the exponent bits of the single-precision floating-point value `a'.
131 -------------------------------------------------------------------------------
133 INLINE int16
extractFloat32Exp( float32 a
)
136 return ( a
>>23 ) & 0xFF;
141 -------------------------------------------------------------------------------
142 Returns the sign bit of the single-precision floating-point value `a'.
143 -------------------------------------------------------------------------------
145 #if 0 /* in softfloat.h */
146 INLINE flag
extractFloat32Sign( float32 a
)
155 -------------------------------------------------------------------------------
156 Normalizes the subnormal single-precision floating-point value represented
157 by the denormalized significand `aSig'. The normalized exponent and
158 significand are stored at the locations pointed to by `zExpPtr' and
159 `zSigPtr', respectively.
160 -------------------------------------------------------------------------------
163 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
167 shiftCount
= countLeadingZeros32( aSig
) - 8;
168 *zSigPtr
= aSig
<<shiftCount
;
169 *zExpPtr
= 1 - shiftCount
;
174 -------------------------------------------------------------------------------
175 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
176 single-precision floating-point value, returning the result. After being
177 shifted into the proper positions, the three fields are simply added
178 together to form the result. This means that any integer portion of `zSig'
179 will be added into the exponent. Since a properly normalized significand
180 will have an integer portion equal to 1, the `zExp' input should be 1 less
181 than the desired result exponent whenever `zSig' is a complete, normalized
183 -------------------------------------------------------------------------------
185 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
189 __asm__("@ packFloat32 \n\
190 mov %0, %1, asl #31 \n\
191 orr %0, %2, asl #23 \n\
194 : "g" (f
), "g" (zSign
), "g" (zExp
), "g" (zSig
)
198 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
203 -------------------------------------------------------------------------------
204 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
205 and significand `zSig', and returns the proper single-precision floating-
206 point value corresponding to the abstract input. Ordinarily, the abstract
207 value is simply rounded and packed into the single-precision format, with
208 the inexact exception raised if the abstract input cannot be represented
209 exactly. If the abstract value is too large, however, the overflow and
210 inexact exceptions are raised and an infinity or maximal finite value is
211 returned. If the abstract value is too small, the input value is rounded to
212 a subnormal number, and the underflow and inexact exceptions are raised if
213 the abstract input cannot be represented exactly as a subnormal single-
214 precision floating-point number.
215 The input significand `zSig' has its binary point between bits 30
216 and 29, which is 7 bits to the left of the usual location. This shifted
217 significand must be normalized or smaller. If `zSig' is not normalized,
218 `zExp' must be 0; in that case, the result returned is a subnormal number,
219 and it must not require rounding. In the usual case that `zSig' is
220 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
221 The handling of underflow and overflow follows the IEC/IEEE Standard for
222 Binary Floating-point Arithmetic.
223 -------------------------------------------------------------------------------
225 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
228 flag roundNearestEven
;
229 int8 roundIncrement
, roundBits
;
232 roundingMode
= float_rounding_mode
;
233 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
234 roundIncrement
= 0x40;
235 if ( ! roundNearestEven
) {
236 if ( roundingMode
== float_round_to_zero
) {
240 roundIncrement
= 0x7F;
242 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
245 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
249 roundBits
= zSig
& 0x7F;
250 if ( 0xFD <= (bits16
) zExp
) {
252 || ( ( zExp
== 0xFD )
253 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
255 float_raise( float_flag_overflow
| float_flag_inexact
);
256 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
260 ( float_detect_tininess
== float_tininess_before_rounding
)
262 || ( zSig
+ roundIncrement
< 0x80000000 );
263 shift32RightJamming( zSig
, - zExp
, &zSig
);
265 roundBits
= zSig
& 0x7F;
266 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
269 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
270 zSig
= ( zSig
+ roundIncrement
)>>7;
271 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
272 if ( zSig
== 0 ) zExp
= 0;
273 return packFloat32( zSign
, zExp
, zSig
);
278 -------------------------------------------------------------------------------
279 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
280 and significand `zSig', and returns the proper single-precision floating-
281 point value corresponding to the abstract input. This routine is just like
282 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
283 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
285 -------------------------------------------------------------------------------
288 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
292 shiftCount
= countLeadingZeros32( zSig
) - 1;
293 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
298 -------------------------------------------------------------------------------
299 Returns the fraction bits of the double-precision floating-point value `a'.
300 -------------------------------------------------------------------------------
302 INLINE bits64
extractFloat64Frac( float64 a
)
305 return a
& LIT64( 0x000FFFFFFFFFFFFF );
310 -------------------------------------------------------------------------------
311 Returns the exponent bits of the double-precision floating-point value `a'.
312 -------------------------------------------------------------------------------
314 INLINE int16
extractFloat64Exp( float64 a
)
317 return ( a
>>52 ) & 0x7FF;
322 -------------------------------------------------------------------------------
323 Returns the sign bit of the double-precision floating-point value `a'.
324 -------------------------------------------------------------------------------
326 #if 0 /* in softfloat.h */
327 INLINE flag
extractFloat64Sign( float64 a
)
336 -------------------------------------------------------------------------------
337 Normalizes the subnormal double-precision floating-point value represented
338 by the denormalized significand `aSig'. The normalized exponent and
339 significand are stored at the locations pointed to by `zExpPtr' and
340 `zSigPtr', respectively.
341 -------------------------------------------------------------------------------
344 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
348 shiftCount
= countLeadingZeros64( aSig
) - 11;
349 *zSigPtr
= aSig
<<shiftCount
;
350 *zExpPtr
= 1 - shiftCount
;
355 -------------------------------------------------------------------------------
356 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
357 double-precision floating-point value, returning the result. After being
358 shifted into the proper positions, the three fields are simply added
359 together to form the result. This means that any integer portion of `zSig'
360 will be added into the exponent. Since a properly normalized significand
361 will have an integer portion equal to 1, the `zExp' input should be 1 less
362 than the desired result exponent whenever `zSig' is a complete, normalized
364 -------------------------------------------------------------------------------
366 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
369 return ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
;
374 -------------------------------------------------------------------------------
375 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
376 and significand `zSig', and returns the proper double-precision floating-
377 point value corresponding to the abstract input. Ordinarily, the abstract
378 value is simply rounded and packed into the double-precision format, with
379 the inexact exception raised if the abstract input cannot be represented
380 exactly. If the abstract value is too large, however, the overflow and
381 inexact exceptions are raised and an infinity or maximal finite value is
382 returned. If the abstract value is too small, the input value is rounded to
383 a subnormal number, and the underflow and inexact exceptions are raised if
384 the abstract input cannot be represented exactly as a subnormal double-
385 precision floating-point number.
386 The input significand `zSig' has its binary point between bits 62
387 and 61, which is 10 bits to the left of the usual location. This shifted
388 significand must be normalized or smaller. If `zSig' is not normalized,
389 `zExp' must be 0; in that case, the result returned is a subnormal number,
390 and it must not require rounding. In the usual case that `zSig' is
391 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
392 The handling of underflow and overflow follows the IEC/IEEE Standard for
393 Binary Floating-point Arithmetic.
394 -------------------------------------------------------------------------------
396 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
399 flag roundNearestEven
;
400 int16 roundIncrement
, roundBits
;
403 roundingMode
= float_rounding_mode
;
404 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
405 roundIncrement
= 0x200;
406 if ( ! roundNearestEven
) {
407 if ( roundingMode
== float_round_to_zero
) {
411 roundIncrement
= 0x3FF;
413 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
416 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
420 roundBits
= zSig
& 0x3FF;
421 if ( 0x7FD <= (bits16
) zExp
) {
422 if ( ( 0x7FD < zExp
)
423 || ( ( zExp
== 0x7FD )
424 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
426 //register int lr = __builtin_return_address(0);
427 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
428 float_raise( float_flag_overflow
| float_flag_inexact
);
429 return packFloat64( zSign
, 0x7FF, 0 ) - ( roundIncrement
== 0 );
433 ( float_detect_tininess
== float_tininess_before_rounding
)
435 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
436 shift64RightJamming( zSig
, - zExp
, &zSig
);
438 roundBits
= zSig
& 0x3FF;
439 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
442 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
443 zSig
= ( zSig
+ roundIncrement
)>>10;
444 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
445 if ( zSig
== 0 ) zExp
= 0;
446 return packFloat64( zSign
, zExp
, zSig
);
451 -------------------------------------------------------------------------------
452 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
453 and significand `zSig', and returns the proper double-precision floating-
454 point value corresponding to the abstract input. This routine is just like
455 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
456 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
458 -------------------------------------------------------------------------------
461 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
465 shiftCount
= countLeadingZeros64( zSig
) - 1;
466 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
473 -------------------------------------------------------------------------------
474 Returns the fraction bits of the extended double-precision floating-point
476 -------------------------------------------------------------------------------
478 INLINE bits64
extractFloatx80Frac( floatx80 a
)
486 -------------------------------------------------------------------------------
487 Returns the exponent bits of the extended double-precision floating-point
489 -------------------------------------------------------------------------------
491 INLINE int32
extractFloatx80Exp( floatx80 a
)
494 return a
.high
& 0x7FFF;
499 -------------------------------------------------------------------------------
500 Returns the sign bit of the extended double-precision floating-point value
502 -------------------------------------------------------------------------------
504 INLINE flag
extractFloatx80Sign( floatx80 a
)
512 -------------------------------------------------------------------------------
513 Normalizes the subnormal extended double-precision floating-point value
514 represented by the denormalized significand `aSig'. The normalized exponent
515 and significand are stored at the locations pointed to by `zExpPtr' and
516 `zSigPtr', respectively.
517 -------------------------------------------------------------------------------
520 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
524 shiftCount
= countLeadingZeros64( aSig
);
525 *zSigPtr
= aSig
<<shiftCount
;
526 *zExpPtr
= 1 - shiftCount
;
531 -------------------------------------------------------------------------------
532 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
533 extended double-precision floating-point value, returning the result.
534 -------------------------------------------------------------------------------
536 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
541 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
547 -------------------------------------------------------------------------------
548 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
549 and extended significand formed by the concatenation of `zSig0' and `zSig1',
550 and returns the proper extended double-precision floating-point value
551 corresponding to the abstract input. Ordinarily, the abstract value is
552 rounded and packed into the extended double-precision format, with the
553 inexact exception raised if the abstract input cannot be represented
554 exactly. If the abstract value is too large, however, the overflow and
555 inexact exceptions are raised and an infinity or maximal finite value is
556 returned. If the abstract value is too small, the input value is rounded to
557 a subnormal number, and the underflow and inexact exceptions are raised if
558 the abstract input cannot be represented exactly as a subnormal extended
559 double-precision floating-point number.
560 If `roundingPrecision' is 32 or 64, the result is rounded to the same
561 number of bits as single or double precision, respectively. Otherwise, the
562 result is rounded to the full precision of the extended double-precision
564 The input significand must be normalized or smaller. If the input
565 significand is not normalized, `zExp' must be 0; in that case, the result
566 returned is a subnormal number, and it must not require rounding. The
567 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
568 Floating-point Arithmetic.
569 -------------------------------------------------------------------------------
572 roundAndPackFloatx80(
573 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
577 flag roundNearestEven
, increment
, isTiny
;
578 int64 roundIncrement
, roundMask
, roundBits
;
580 roundingMode
= float_rounding_mode
;
581 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
582 if ( roundingPrecision
== 80 ) goto precision80
;
583 if ( roundingPrecision
== 64 ) {
584 roundIncrement
= LIT64( 0x0000000000000400 );
585 roundMask
= LIT64( 0x00000000000007FF );
587 else if ( roundingPrecision
== 32 ) {
588 roundIncrement
= LIT64( 0x0000008000000000 );
589 roundMask
= LIT64( 0x000000FFFFFFFFFF );
594 zSig0
|= ( zSig1
!= 0 );
595 if ( ! roundNearestEven
) {
596 if ( roundingMode
== float_round_to_zero
) {
600 roundIncrement
= roundMask
;
602 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
605 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
609 roundBits
= zSig0
& roundMask
;
610 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
611 if ( ( 0x7FFE < zExp
)
612 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
618 ( float_detect_tininess
== float_tininess_before_rounding
)
620 || ( zSig0
<= zSig0
+ roundIncrement
);
621 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
623 roundBits
= zSig0
& roundMask
;
624 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
625 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
626 zSig0
+= roundIncrement
;
627 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
628 roundIncrement
= roundMask
+ 1;
629 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
630 roundMask
|= roundIncrement
;
632 zSig0
&= ~ roundMask
;
633 return packFloatx80( zSign
, zExp
, zSig0
);
636 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
637 zSig0
+= roundIncrement
;
638 if ( zSig0
< roundIncrement
) {
640 zSig0
= LIT64( 0x8000000000000000 );
642 roundIncrement
= roundMask
+ 1;
643 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
644 roundMask
|= roundIncrement
;
646 zSig0
&= ~ roundMask
;
647 if ( zSig0
== 0 ) zExp
= 0;
648 return packFloatx80( zSign
, zExp
, zSig0
);
650 increment
= ( (sbits64
) zSig1
< 0 );
651 if ( ! roundNearestEven
) {
652 if ( roundingMode
== float_round_to_zero
) {
657 increment
= ( roundingMode
== float_round_down
) && zSig1
;
660 increment
= ( roundingMode
== float_round_up
) && zSig1
;
664 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
665 if ( ( 0x7FFE < zExp
)
666 || ( ( zExp
== 0x7FFE )
667 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
673 float_raise( float_flag_overflow
| float_flag_inexact
);
674 if ( ( roundingMode
== float_round_to_zero
)
675 || ( zSign
&& ( roundingMode
== float_round_up
) )
676 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
678 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
680 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
684 ( float_detect_tininess
== float_tininess_before_rounding
)
687 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
688 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
690 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
691 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
692 if ( roundNearestEven
) {
693 increment
= ( (sbits64
) zSig1
< 0 );
697 increment
= ( roundingMode
== float_round_down
) && zSig1
;
700 increment
= ( roundingMode
== float_round_up
) && zSig1
;
705 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
706 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
708 return packFloatx80( zSign
, zExp
, zSig0
);
711 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
716 zSig0
= LIT64( 0x8000000000000000 );
719 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
723 if ( zSig0
== 0 ) zExp
= 0;
726 return packFloatx80( zSign
, zExp
, zSig0
);
730 -------------------------------------------------------------------------------
731 Takes an abstract floating-point value having sign `zSign', exponent
732 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
733 and returns the proper extended double-precision floating-point value
734 corresponding to the abstract input. This routine is just like
735 `roundAndPackFloatx80' except that the input significand does not have to be
737 -------------------------------------------------------------------------------
740 normalizeRoundAndPackFloatx80(
741 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
751 shiftCount
= countLeadingZeros64( zSig0
);
752 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
755 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
762 -------------------------------------------------------------------------------
763 Returns the result of converting the 32-bit two's complement integer `a' to
764 the single-precision floating-point format. The conversion is performed
765 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
766 -------------------------------------------------------------------------------
768 float32
int32_to_float32( int32 a
)
772 if ( a
== 0 ) return 0;
773 if ( a
== 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
775 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
780 -------------------------------------------------------------------------------
781 Returns the result of converting the 32-bit two's complement integer `a' to
782 the double-precision floating-point format. The conversion is performed
783 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
784 -------------------------------------------------------------------------------
786 float64
int32_to_float64( int32 a
)
793 if ( a
== 0 ) return 0;
795 absA
= aSign
? - a
: a
;
796 shiftCount
= countLeadingZeros32( absA
) + 21;
798 return packFloat64( aSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
805 -------------------------------------------------------------------------------
806 Returns the result of converting the 32-bit two's complement integer `a'
807 to the extended double-precision floating-point format. The conversion
808 is performed according to the IEC/IEEE Standard for Binary Floating-point
810 -------------------------------------------------------------------------------
812 floatx80
int32_to_floatx80( int32 a
)
819 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
821 absA
= zSign
? - a
: a
;
822 shiftCount
= countLeadingZeros32( absA
) + 32;
824 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
831 -------------------------------------------------------------------------------
832 Returns the result of converting the single-precision floating-point value
833 `a' to the 32-bit two's complement integer format. The conversion is
834 performed according to the IEC/IEEE Standard for Binary Floating-point
835 Arithmetic---which means in particular that the conversion is rounded
836 according to the current rounding mode. If `a' is a NaN, the largest
837 positive integer is returned. Otherwise, if the conversion overflows, the
838 largest integer with the same sign as `a' is returned.
839 -------------------------------------------------------------------------------
841 int32
float32_to_int32( float32 a
)
844 int16 aExp
, shiftCount
;
848 aSig
= extractFloat32Frac( a
);
849 aExp
= extractFloat32Exp( a
);
850 aSign
= extractFloat32Sign( a
);
851 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
852 if ( aExp
) aSig
|= 0x00800000;
853 shiftCount
= 0xAF - aExp
;
856 if ( 0 < shiftCount
) shift64RightJamming( zSig
, shiftCount
, &zSig
);
857 return roundAndPackInt32( aSign
, zSig
);
862 -------------------------------------------------------------------------------
863 Returns the result of converting the single-precision floating-point value
864 `a' to the 32-bit two's complement integer format. The conversion is
865 performed according to the IEC/IEEE Standard for Binary Floating-point
866 Arithmetic, except that the conversion is always rounded toward zero. If
867 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
868 conversion overflows, the largest integer with the same sign as `a' is
870 -------------------------------------------------------------------------------
872 int32
float32_to_int32_round_to_zero( float32 a
)
875 int16 aExp
, shiftCount
;
879 aSig
= extractFloat32Frac( a
);
880 aExp
= extractFloat32Exp( a
);
881 aSign
= extractFloat32Sign( a
);
882 shiftCount
= aExp
- 0x9E;
883 if ( 0 <= shiftCount
) {
884 if ( a
== 0xCF000000 ) return 0x80000000;
885 float_raise( float_flag_invalid
);
886 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
889 else if ( aExp
<= 0x7E ) {
890 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
893 aSig
= ( aSig
| 0x00800000 )<<8;
894 z
= aSig
>>( - shiftCount
);
895 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
896 float_exception_flags
|= float_flag_inexact
;
898 return aSign
? - z
: z
;
903 -------------------------------------------------------------------------------
904 Returns the result of converting the single-precision floating-point value
905 `a' to the double-precision floating-point format. The conversion is
906 performed according to the IEC/IEEE Standard for Binary Floating-point
908 -------------------------------------------------------------------------------
910 float64
float32_to_float64( float32 a
)
916 aSig
= extractFloat32Frac( a
);
917 aExp
= extractFloat32Exp( a
);
918 aSign
= extractFloat32Sign( a
);
919 if ( aExp
== 0xFF ) {
920 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
921 return packFloat64( aSign
, 0x7FF, 0 );
924 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
925 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
928 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
935 -------------------------------------------------------------------------------
936 Returns the result of converting the single-precision floating-point value
937 `a' to the extended double-precision floating-point format. The conversion
938 is performed according to the IEC/IEEE Standard for Binary Floating-point
940 -------------------------------------------------------------------------------
942 floatx80
float32_to_floatx80( float32 a
)
948 aSig
= extractFloat32Frac( a
);
949 aExp
= extractFloat32Exp( a
);
950 aSign
= extractFloat32Sign( a
);
951 if ( aExp
== 0xFF ) {
952 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
953 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
956 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
957 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
960 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
967 -------------------------------------------------------------------------------
968 Rounds the single-precision floating-point value `a' to an integer, and
969 returns the result as a single-precision floating-point value. The
970 operation is performed according to the IEC/IEEE Standard for Binary
971 Floating-point Arithmetic.
972 -------------------------------------------------------------------------------
974 float32
float32_round_to_int( float32 a
)
978 bits32 lastBitMask
, roundBitsMask
;
982 aExp
= extractFloat32Exp( a
);
983 if ( 0x96 <= aExp
) {
984 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
985 return propagateFloat32NaN( a
, a
);
989 if ( aExp
<= 0x7E ) {
990 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
991 float_exception_flags
|= float_flag_inexact
;
992 aSign
= extractFloat32Sign( a
);
993 switch ( float_rounding_mode
) {
994 case float_round_nearest_even
:
995 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
996 return packFloat32( aSign
, 0x7F, 0 );
999 case float_round_down
:
1000 return aSign
? 0xBF800000 : 0;
1001 case float_round_up
:
1002 return aSign
? 0x80000000 : 0x3F800000;
1004 return packFloat32( aSign
, 0, 0 );
1007 lastBitMask
<<= 0x96 - aExp
;
1008 roundBitsMask
= lastBitMask
- 1;
1010 roundingMode
= float_rounding_mode
;
1011 if ( roundingMode
== float_round_nearest_even
) {
1012 z
+= lastBitMask
>>1;
1013 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1015 else if ( roundingMode
!= float_round_to_zero
) {
1016 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1020 z
&= ~ roundBitsMask
;
1021 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1027 -------------------------------------------------------------------------------
1028 Returns the result of adding the absolute values of the single-precision
1029 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1030 before being returned. `zSign' is ignored if the result is a NaN. The
1031 addition is performed according to the IEC/IEEE Standard for Binary
1032 Floating-point Arithmetic.
1033 -------------------------------------------------------------------------------
1035 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1037 int16 aExp
, bExp
, zExp
;
1038 bits32 aSig
, bSig
, zSig
;
1041 aSig
= extractFloat32Frac( a
);
1042 aExp
= extractFloat32Exp( a
);
1043 bSig
= extractFloat32Frac( b
);
1044 bExp
= extractFloat32Exp( b
);
1045 expDiff
= aExp
- bExp
;
1048 if ( 0 < expDiff
) {
1049 if ( aExp
== 0xFF ) {
1050 if ( aSig
) return propagateFloat32NaN( a
, b
);
1059 shift32RightJamming( bSig
, expDiff
, &bSig
);
1062 else if ( expDiff
< 0 ) {
1063 if ( bExp
== 0xFF ) {
1064 if ( bSig
) return propagateFloat32NaN( a
, b
);
1065 return packFloat32( zSign
, 0xFF, 0 );
1073 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1077 if ( aExp
== 0xFF ) {
1078 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1081 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1082 zSig
= 0x40000000 + aSig
+ bSig
;
1087 zSig
= ( aSig
+ bSig
)<<1;
1089 if ( (sbits32
) zSig
< 0 ) {
1094 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1099 -------------------------------------------------------------------------------
1100 Returns the result of subtracting the absolute values of the single-
1101 precision floating-point values `a' and `b'. If `zSign' is true, the
1102 difference is negated before being returned. `zSign' is ignored if the
1103 result is a NaN. The subtraction is performed according to the IEC/IEEE
1104 Standard for Binary Floating-point Arithmetic.
1105 -------------------------------------------------------------------------------
1107 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1109 int16 aExp
, bExp
, zExp
;
1110 bits32 aSig
, bSig
, zSig
;
1113 aSig
= extractFloat32Frac( a
);
1114 aExp
= extractFloat32Exp( a
);
1115 bSig
= extractFloat32Frac( b
);
1116 bExp
= extractFloat32Exp( b
);
1117 expDiff
= aExp
- bExp
;
1120 if ( 0 < expDiff
) goto aExpBigger
;
1121 if ( expDiff
< 0 ) goto bExpBigger
;
1122 if ( aExp
== 0xFF ) {
1123 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1124 float_raise( float_flag_invalid
);
1125 return float32_default_nan
;
1131 if ( bSig
< aSig
) goto aBigger
;
1132 if ( aSig
< bSig
) goto bBigger
;
1133 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
1135 if ( bExp
== 0xFF ) {
1136 if ( bSig
) return propagateFloat32NaN( a
, b
);
1137 return packFloat32( zSign
^ 1, 0xFF, 0 );
1145 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1151 goto normalizeRoundAndPack
;
1153 if ( aExp
== 0xFF ) {
1154 if ( aSig
) return propagateFloat32NaN( a
, b
);
1163 shift32RightJamming( bSig
, expDiff
, &bSig
);
1168 normalizeRoundAndPack
:
1170 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1175 -------------------------------------------------------------------------------
1176 Returns the result of adding the single-precision floating-point values `a'
1177 and `b'. The operation is performed according to the IEC/IEEE Standard for
1178 Binary Floating-point Arithmetic.
1179 -------------------------------------------------------------------------------
1181 float32
float32_add( float32 a
, float32 b
)
1185 aSign
= extractFloat32Sign( a
);
1186 bSign
= extractFloat32Sign( b
);
1187 if ( aSign
== bSign
) {
1188 return addFloat32Sigs( a
, b
, aSign
);
1191 return subFloat32Sigs( a
, b
, aSign
);
1197 -------------------------------------------------------------------------------
1198 Returns the result of subtracting the single-precision floating-point values
1199 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1200 for Binary Floating-point Arithmetic.
1201 -------------------------------------------------------------------------------
1203 float32
float32_sub( float32 a
, float32 b
)
1207 aSign
= extractFloat32Sign( a
);
1208 bSign
= extractFloat32Sign( b
);
1209 if ( aSign
== bSign
) {
1210 return subFloat32Sigs( a
, b
, aSign
);
1213 return addFloat32Sigs( a
, b
, aSign
);
1219 -------------------------------------------------------------------------------
1220 Returns the result of multiplying the single-precision floating-point values
1221 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1222 for Binary Floating-point Arithmetic.
1223 -------------------------------------------------------------------------------
1225 float32
float32_mul( float32 a
, float32 b
)
1227 flag aSign
, bSign
, zSign
;
1228 int16 aExp
, bExp
, zExp
;
1233 aSig
= extractFloat32Frac( a
);
1234 aExp
= extractFloat32Exp( a
);
1235 aSign
= extractFloat32Sign( a
);
1236 bSig
= extractFloat32Frac( b
);
1237 bExp
= extractFloat32Exp( b
);
1238 bSign
= extractFloat32Sign( b
);
1239 zSign
= aSign
^ bSign
;
1240 if ( aExp
== 0xFF ) {
1241 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1242 return propagateFloat32NaN( a
, b
);
1244 if ( ( bExp
| bSig
) == 0 ) {
1245 float_raise( float_flag_invalid
);
1246 return float32_default_nan
;
1248 return packFloat32( zSign
, 0xFF, 0 );
1250 if ( bExp
== 0xFF ) {
1251 if ( bSig
) return propagateFloat32NaN( a
, b
);
1252 if ( ( aExp
| aSig
) == 0 ) {
1253 float_raise( float_flag_invalid
);
1254 return float32_default_nan
;
1256 return packFloat32( zSign
, 0xFF, 0 );
1259 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1260 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1263 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1264 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1266 zExp
= aExp
+ bExp
- 0x7F;
1267 aSig
= ( aSig
| 0x00800000 )<<7;
1268 bSig
= ( bSig
| 0x00800000 )<<8;
1269 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1271 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1275 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1280 -------------------------------------------------------------------------------
1281 Returns the result of dividing the single-precision floating-point value `a'
1282 by the corresponding value `b'. The operation is performed according to the
1283 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1284 -------------------------------------------------------------------------------
1286 float32
float32_div( float32 a
, float32 b
)
1288 flag aSign
, bSign
, zSign
;
1289 int16 aExp
, bExp
, zExp
;
1290 bits32 aSig
, bSig
, zSig
;
1292 aSig
= extractFloat32Frac( a
);
1293 aExp
= extractFloat32Exp( a
);
1294 aSign
= extractFloat32Sign( a
);
1295 bSig
= extractFloat32Frac( b
);
1296 bExp
= extractFloat32Exp( b
);
1297 bSign
= extractFloat32Sign( b
);
1298 zSign
= aSign
^ bSign
;
1299 if ( aExp
== 0xFF ) {
1300 if ( aSig
) return propagateFloat32NaN( a
, b
);
1301 if ( bExp
== 0xFF ) {
1302 if ( bSig
) return propagateFloat32NaN( a
, b
);
1303 float_raise( float_flag_invalid
);
1304 return float32_default_nan
;
1306 return packFloat32( zSign
, 0xFF, 0 );
1308 if ( bExp
== 0xFF ) {
1309 if ( bSig
) return propagateFloat32NaN( a
, b
);
1310 return packFloat32( zSign
, 0, 0 );
1314 if ( ( aExp
| aSig
) == 0 ) {
1315 float_raise( float_flag_invalid
);
1316 return float32_default_nan
;
1318 float_raise( float_flag_divbyzero
);
1319 return packFloat32( zSign
, 0xFF, 0 );
1321 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1324 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1325 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1327 zExp
= aExp
- bExp
+ 0x7D;
1328 aSig
= ( aSig
| 0x00800000 )<<7;
1329 bSig
= ( bSig
| 0x00800000 )<<8;
1330 if ( bSig
<= ( aSig
+ aSig
) ) {
1334 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1335 if ( ( zSig
& 0x3F ) == 0 ) {
1336 zSig
|= ( ( (bits64
) bSig
) * zSig
!= ( (bits64
) aSig
)<<32 );
1338 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1343 -------------------------------------------------------------------------------
1344 Returns the remainder of the single-precision floating-point value `a'
1345 with respect to the corresponding value `b'. The operation is performed
1346 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1347 -------------------------------------------------------------------------------
1349 float32
float32_rem( float32 a
, float32 b
)
1351 flag aSign
, bSign
, zSign
;
1352 int16 aExp
, bExp
, expDiff
;
1355 bits64 aSig64
, bSig64
, q64
;
1356 bits32 alternateASig
;
1359 aSig
= extractFloat32Frac( a
);
1360 aExp
= extractFloat32Exp( a
);
1361 aSign
= extractFloat32Sign( a
);
1362 bSig
= extractFloat32Frac( b
);
1363 bExp
= extractFloat32Exp( b
);
1364 bSign
= extractFloat32Sign( b
);
1365 if ( aExp
== 0xFF ) {
1366 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1367 return propagateFloat32NaN( a
, b
);
1369 float_raise( float_flag_invalid
);
1370 return float32_default_nan
;
1372 if ( bExp
== 0xFF ) {
1373 if ( bSig
) return propagateFloat32NaN( a
, b
);
1378 float_raise( float_flag_invalid
);
1379 return float32_default_nan
;
1381 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1384 if ( aSig
== 0 ) return a
;
1385 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1387 expDiff
= aExp
- bExp
;
1390 if ( expDiff
< 32 ) {
1393 if ( expDiff
< 0 ) {
1394 if ( expDiff
< -1 ) return a
;
1397 q
= ( bSig
<= aSig
);
1398 if ( q
) aSig
-= bSig
;
1399 if ( 0 < expDiff
) {
1400 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1403 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1411 if ( bSig
<= aSig
) aSig
-= bSig
;
1412 aSig64
= ( (bits64
) aSig
)<<40;
1413 bSig64
= ( (bits64
) bSig
)<<40;
1415 while ( 0 < expDiff
) {
1416 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1417 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1418 aSig64
= - ( ( bSig
* q64
)<<38 );
1422 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1423 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1424 q
= q64
>>( 64 - expDiff
);
1426 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1429 alternateASig
= aSig
;
1432 } while ( 0 <= (sbits32
) aSig
);
1433 sigMean
= aSig
+ alternateASig
;
1434 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1435 aSig
= alternateASig
;
1437 zSign
= ( (sbits32
) aSig
< 0 );
1438 if ( zSign
) aSig
= - aSig
;
1439 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
1444 -------------------------------------------------------------------------------
1445 Returns the square root of the single-precision floating-point value `a'.
1446 The operation is performed according to the IEC/IEEE Standard for Binary
1447 Floating-point Arithmetic.
1448 -------------------------------------------------------------------------------
1450 float32
float32_sqrt( float32 a
)
1457 aSig
= extractFloat32Frac( a
);
1458 aExp
= extractFloat32Exp( a
);
1459 aSign
= extractFloat32Sign( a
);
1460 if ( aExp
== 0xFF ) {
1461 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1462 if ( ! aSign
) return a
;
1463 float_raise( float_flag_invalid
);
1464 return float32_default_nan
;
1467 if ( ( aExp
| aSig
) == 0 ) return a
;
1468 float_raise( float_flag_invalid
);
1469 return float32_default_nan
;
1472 if ( aSig
== 0 ) return 0;
1473 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1475 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1476 aSig
= ( aSig
| 0x00800000 )<<8;
1477 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1478 if ( ( zSig
& 0x7F ) <= 5 ) {
1484 term
= ( (bits64
) zSig
) * zSig
;
1485 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
1486 while ( (sbits64
) rem
< 0 ) {
1488 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
1490 zSig
|= ( rem
!= 0 );
1493 shift32RightJamming( zSig
, 1, &zSig
);
1494 return roundAndPackFloat32( 0, zExp
, zSig
);
1499 -------------------------------------------------------------------------------
1500 Returns 1 if the single-precision floating-point value `a' is equal to the
1501 corresponding value `b', and 0 otherwise. The comparison is performed
1502 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503 -------------------------------------------------------------------------------
1505 flag
float32_eq( float32 a
, float32 b
)
1508 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1509 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1511 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1512 float_raise( float_flag_invalid
);
1516 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1521 -------------------------------------------------------------------------------
1522 Returns 1 if the single-precision floating-point value `a' is less than or
1523 equal to the corresponding value `b', and 0 otherwise. The comparison is
1524 performed according to the IEC/IEEE Standard for Binary Floating-point
1526 -------------------------------------------------------------------------------
1528 flag
float32_le( float32 a
, float32 b
)
1532 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1533 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1535 float_raise( float_flag_invalid
);
1538 aSign
= extractFloat32Sign( a
);
1539 bSign
= extractFloat32Sign( b
);
1540 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1541 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1546 -------------------------------------------------------------------------------
1547 Returns 1 if the single-precision floating-point value `a' is less than
1548 the corresponding value `b', and 0 otherwise. The comparison is performed
1549 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550 -------------------------------------------------------------------------------
1552 flag
float32_lt( float32 a
, float32 b
)
1556 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1557 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1559 float_raise( float_flag_invalid
);
1562 aSign
= extractFloat32Sign( a
);
1563 bSign
= extractFloat32Sign( b
);
1564 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1565 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1570 -------------------------------------------------------------------------------
1571 Returns 1 if the single-precision floating-point value `a' is equal to the
1572 corresponding value `b', and 0 otherwise. The invalid exception is raised
1573 if either operand is a NaN. Otherwise, the comparison is performed
1574 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575 -------------------------------------------------------------------------------
1577 flag
float32_eq_signaling( float32 a
, float32 b
)
1580 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1581 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1583 float_raise( float_flag_invalid
);
1586 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1591 -------------------------------------------------------------------------------
1592 Returns 1 if the single-precision floating-point value `a' is less than or
1593 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1594 cause an exception. Otherwise, the comparison is performed according to the
1595 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596 -------------------------------------------------------------------------------
1598 flag
float32_le_quiet( float32 a
, float32 b
)
1603 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1604 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1606 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1607 float_raise( float_flag_invalid
);
1611 aSign
= extractFloat32Sign( a
);
1612 bSign
= extractFloat32Sign( b
);
1613 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1614 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1619 -------------------------------------------------------------------------------
1620 Returns 1 if the single-precision floating-point value `a' is less than
1621 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1622 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1623 Standard for Binary Floating-point Arithmetic.
1624 -------------------------------------------------------------------------------
1626 flag
float32_lt_quiet( float32 a
, float32 b
)
1630 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1631 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1633 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1634 float_raise( float_flag_invalid
);
1638 aSign
= extractFloat32Sign( a
);
1639 bSign
= extractFloat32Sign( b
);
1640 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1641 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1646 -------------------------------------------------------------------------------
1647 Returns the result of converting the double-precision floating-point value
1648 `a' to the 32-bit two's complement integer format. The conversion is
1649 performed according to the IEC/IEEE Standard for Binary Floating-point
1650 Arithmetic---which means in particular that the conversion is rounded
1651 according to the current rounding mode. If `a' is a NaN, the largest
1652 positive integer is returned. Otherwise, if the conversion overflows, the
1653 largest integer with the same sign as `a' is returned.
1654 -------------------------------------------------------------------------------
1656 int32
float64_to_int32( float64 a
)
1659 int16 aExp
, shiftCount
;
1662 aSig
= extractFloat64Frac( a
);
1663 aExp
= extractFloat64Exp( a
);
1664 aSign
= extractFloat64Sign( a
);
1665 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1666 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1667 shiftCount
= 0x42C - aExp
;
1668 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1669 return roundAndPackInt32( aSign
, aSig
);
1674 -------------------------------------------------------------------------------
1675 Returns the result of converting the double-precision floating-point value
1676 `a' to the 32-bit two's complement integer format. The conversion is
1677 performed according to the IEC/IEEE Standard for Binary Floating-point
1678 Arithmetic, except that the conversion is always rounded toward zero. If
1679 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1680 conversion overflows, the largest integer with the same sign as `a' is
1682 -------------------------------------------------------------------------------
1684 int32
float64_to_int32_round_to_zero( float64 a
)
1687 int16 aExp
, shiftCount
;
1688 bits64 aSig
, savedASig
;
1691 aSig
= extractFloat64Frac( a
);
1692 aExp
= extractFloat64Exp( a
);
1693 aSign
= extractFloat64Sign( a
);
1694 shiftCount
= 0x433 - aExp
;
1695 if ( shiftCount
< 21 ) {
1696 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1699 else if ( 52 < shiftCount
) {
1700 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
1703 aSig
|= LIT64( 0x0010000000000000 );
1705 aSig
>>= shiftCount
;
1707 if ( aSign
) z
= - z
;
1708 if ( ( z
< 0 ) ^ aSign
) {
1710 float_exception_flags
|= float_flag_invalid
;
1711 return aSign
? 0x80000000 : 0x7FFFFFFF;
1713 if ( ( aSig
<<shiftCount
) != savedASig
) {
1714 float_exception_flags
|= float_flag_inexact
;
1721 -------------------------------------------------------------------------------
1722 Returns the result of converting the double-precision floating-point value
1723 `a' to the 32-bit two's complement unsigned integer format. The conversion
1724 is performed according to the IEC/IEEE Standard for Binary Floating-point
1725 Arithmetic---which means in particular that the conversion is rounded
1726 according to the current rounding mode. If `a' is a NaN, the largest
1727 positive integer is returned. Otherwise, if the conversion overflows, the
1728 largest positive integer is returned.
1729 -------------------------------------------------------------------------------
1731 int32
float64_to_uint32( float64 a
)
1734 int16 aExp
, shiftCount
;
1737 aSig
= extractFloat64Frac( a
);
1738 aExp
= extractFloat64Exp( a
);
1739 aSign
= 0; //extractFloat64Sign( a );
1740 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1741 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1742 shiftCount
= 0x42C - aExp
;
1743 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1744 return roundAndPackInt32( aSign
, aSig
);
1748 -------------------------------------------------------------------------------
1749 Returns the result of converting the double-precision floating-point value
1750 `a' to the 32-bit two's complement integer format. The conversion is
1751 performed according to the IEC/IEEE Standard for Binary Floating-point
1752 Arithmetic, except that the conversion is always rounded toward zero. If
1753 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1754 conversion overflows, the largest positive integer is returned.
1755 -------------------------------------------------------------------------------
1757 int32
float64_to_uint32_round_to_zero( float64 a
)
1760 int16 aExp
, shiftCount
;
1761 bits64 aSig
, savedASig
;
1764 aSig
= extractFloat64Frac( a
);
1765 aExp
= extractFloat64Exp( a
);
1766 aSign
= extractFloat64Sign( a
);
1767 shiftCount
= 0x433 - aExp
;
1768 if ( shiftCount
< 21 ) {
1769 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1772 else if ( 52 < shiftCount
) {
1773 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
1776 aSig
|= LIT64( 0x0010000000000000 );
1778 aSig
>>= shiftCount
;
1780 if ( aSign
) z
= - z
;
1781 if ( ( z
< 0 ) ^ aSign
) {
1783 float_exception_flags
|= float_flag_invalid
;
1784 return aSign
? 0x80000000 : 0x7FFFFFFF;
1786 if ( ( aSig
<<shiftCount
) != savedASig
) {
1787 float_exception_flags
|= float_flag_inexact
;
1793 -------------------------------------------------------------------------------
1794 Returns the result of converting the double-precision floating-point value
1795 `a' to the single-precision floating-point format. The conversion is
1796 performed according to the IEC/IEEE Standard for Binary Floating-point
1798 -------------------------------------------------------------------------------
1800 float32
float64_to_float32( float64 a
)
1807 aSig
= extractFloat64Frac( a
);
1808 aExp
= extractFloat64Exp( a
);
1809 aSign
= extractFloat64Sign( a
);
1810 if ( aExp
== 0x7FF ) {
1811 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
1812 return packFloat32( aSign
, 0xFF, 0 );
1814 shift64RightJamming( aSig
, 22, &aSig
);
1816 if ( aExp
|| zSig
) {
1820 return roundAndPackFloat32( aSign
, aExp
, zSig
);
1827 -------------------------------------------------------------------------------
1828 Returns the result of converting the double-precision floating-point value
1829 `a' to the extended double-precision floating-point format. The conversion
1830 is performed according to the IEC/IEEE Standard for Binary Floating-point
1832 -------------------------------------------------------------------------------
1834 floatx80
float64_to_floatx80( float64 a
)
1840 aSig
= extractFloat64Frac( a
);
1841 aExp
= extractFloat64Exp( a
);
1842 aSign
= extractFloat64Sign( a
);
1843 if ( aExp
== 0x7FF ) {
1844 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
1845 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1848 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1849 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
1853 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
1860 -------------------------------------------------------------------------------
1861 Rounds the double-precision floating-point value `a' to an integer, and
1862 returns the result as a double-precision floating-point value. The
1863 operation is performed according to the IEC/IEEE Standard for Binary
1864 Floating-point Arithmetic.
1865 -------------------------------------------------------------------------------
1867 float64
float64_round_to_int( float64 a
)
1871 bits64 lastBitMask
, roundBitsMask
;
1875 aExp
= extractFloat64Exp( a
);
1876 if ( 0x433 <= aExp
) {
1877 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
1878 return propagateFloat64NaN( a
, a
);
1882 if ( aExp
<= 0x3FE ) {
1883 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
1884 float_exception_flags
|= float_flag_inexact
;
1885 aSign
= extractFloat64Sign( a
);
1886 switch ( float_rounding_mode
) {
1887 case float_round_nearest_even
:
1888 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
1889 return packFloat64( aSign
, 0x3FF, 0 );
1892 case float_round_down
:
1893 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
1894 case float_round_up
:
1896 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1898 return packFloat64( aSign
, 0, 0 );
1901 lastBitMask
<<= 0x433 - aExp
;
1902 roundBitsMask
= lastBitMask
- 1;
1904 roundingMode
= float_rounding_mode
;
1905 if ( roundingMode
== float_round_nearest_even
) {
1906 z
+= lastBitMask
>>1;
1907 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1909 else if ( roundingMode
!= float_round_to_zero
) {
1910 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1914 z
&= ~ roundBitsMask
;
1915 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1921 -------------------------------------------------------------------------------
1922 Returns the result of adding the absolute values of the double-precision
1923 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1924 before being returned. `zSign' is ignored if the result is a NaN. The
1925 addition is performed according to the IEC/IEEE Standard for Binary
1926 Floating-point Arithmetic.
1927 -------------------------------------------------------------------------------
1929 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1931 int16 aExp
, bExp
, zExp
;
1932 bits64 aSig
, bSig
, zSig
;
1935 aSig
= extractFloat64Frac( a
);
1936 aExp
= extractFloat64Exp( a
);
1937 bSig
= extractFloat64Frac( b
);
1938 bExp
= extractFloat64Exp( b
);
1939 expDiff
= aExp
- bExp
;
1942 if ( 0 < expDiff
) {
1943 if ( aExp
== 0x7FF ) {
1944 if ( aSig
) return propagateFloat64NaN( a
, b
);
1951 bSig
|= LIT64( 0x2000000000000000 );
1953 shift64RightJamming( bSig
, expDiff
, &bSig
);
1956 else if ( expDiff
< 0 ) {
1957 if ( bExp
== 0x7FF ) {
1958 if ( bSig
) return propagateFloat64NaN( a
, b
);
1959 return packFloat64( zSign
, 0x7FF, 0 );
1965 aSig
|= LIT64( 0x2000000000000000 );
1967 shift64RightJamming( aSig
, - expDiff
, &aSig
);
1971 if ( aExp
== 0x7FF ) {
1972 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
1975 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
1976 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
1980 aSig
|= LIT64( 0x2000000000000000 );
1981 zSig
= ( aSig
+ bSig
)<<1;
1983 if ( (sbits64
) zSig
< 0 ) {
1988 return roundAndPackFloat64( zSign
, zExp
, zSig
);
1993 -------------------------------------------------------------------------------
1994 Returns the result of subtracting the absolute values of the double-
1995 precision floating-point values `a' and `b'. If `zSign' is true, the
1996 difference is negated before being returned. `zSign' is ignored if the
1997 result is a NaN. The subtraction is performed according to the IEC/IEEE
1998 Standard for Binary Floating-point Arithmetic.
1999 -------------------------------------------------------------------------------
2001 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2003 int16 aExp
, bExp
, zExp
;
2004 bits64 aSig
, bSig
, zSig
;
2007 aSig
= extractFloat64Frac( a
);
2008 aExp
= extractFloat64Exp( a
);
2009 bSig
= extractFloat64Frac( b
);
2010 bExp
= extractFloat64Exp( b
);
2011 expDiff
= aExp
- bExp
;
2014 if ( 0 < expDiff
) goto aExpBigger
;
2015 if ( expDiff
< 0 ) goto bExpBigger
;
2016 if ( aExp
== 0x7FF ) {
2017 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2018 float_raise( float_flag_invalid
);
2019 return float64_default_nan
;
2025 if ( bSig
< aSig
) goto aBigger
;
2026 if ( aSig
< bSig
) goto bBigger
;
2027 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0 );
2029 if ( bExp
== 0x7FF ) {
2030 if ( bSig
) return propagateFloat64NaN( a
, b
);
2031 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2037 aSig
|= LIT64( 0x4000000000000000 );
2039 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2040 bSig
|= LIT64( 0x4000000000000000 );
2045 goto normalizeRoundAndPack
;
2047 if ( aExp
== 0x7FF ) {
2048 if ( aSig
) return propagateFloat64NaN( a
, b
);
2055 bSig
|= LIT64( 0x4000000000000000 );
2057 shift64RightJamming( bSig
, expDiff
, &bSig
);
2058 aSig
|= LIT64( 0x4000000000000000 );
2062 normalizeRoundAndPack
:
2064 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2069 -------------------------------------------------------------------------------
2070 Returns the result of adding the double-precision floating-point values `a'
2071 and `b'. The operation is performed according to the IEC/IEEE Standard for
2072 Binary Floating-point Arithmetic.
2073 -------------------------------------------------------------------------------
2075 float64
float64_add( float64 a
, float64 b
)
2079 aSign
= extractFloat64Sign( a
);
2080 bSign
= extractFloat64Sign( b
);
2081 if ( aSign
== bSign
) {
2082 return addFloat64Sigs( a
, b
, aSign
);
2085 return subFloat64Sigs( a
, b
, aSign
);
2091 -------------------------------------------------------------------------------
2092 Returns the result of subtracting the double-precision floating-point values
2093 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2094 for Binary Floating-point Arithmetic.
2095 -------------------------------------------------------------------------------
2097 float64
float64_sub( float64 a
, float64 b
)
2101 aSign
= extractFloat64Sign( a
);
2102 bSign
= extractFloat64Sign( b
);
2103 if ( aSign
== bSign
) {
2104 return subFloat64Sigs( a
, b
, aSign
);
2107 return addFloat64Sigs( a
, b
, aSign
);
2113 -------------------------------------------------------------------------------
2114 Returns the result of multiplying the double-precision floating-point values
2115 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2116 for Binary Floating-point Arithmetic.
2117 -------------------------------------------------------------------------------
2119 float64
float64_mul( float64 a
, float64 b
)
2121 flag aSign
, bSign
, zSign
;
2122 int16 aExp
, bExp
, zExp
;
2123 bits64 aSig
, bSig
, zSig0
, zSig1
;
2125 aSig
= extractFloat64Frac( a
);
2126 aExp
= extractFloat64Exp( a
);
2127 aSign
= extractFloat64Sign( a
);
2128 bSig
= extractFloat64Frac( b
);
2129 bExp
= extractFloat64Exp( b
);
2130 bSign
= extractFloat64Sign( b
);
2131 zSign
= aSign
^ bSign
;
2132 if ( aExp
== 0x7FF ) {
2133 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2134 return propagateFloat64NaN( a
, b
);
2136 if ( ( bExp
| bSig
) == 0 ) {
2137 float_raise( float_flag_invalid
);
2138 return float64_default_nan
;
2140 return packFloat64( zSign
, 0x7FF, 0 );
2142 if ( bExp
== 0x7FF ) {
2143 if ( bSig
) return propagateFloat64NaN( a
, b
);
2144 if ( ( aExp
| aSig
) == 0 ) {
2145 float_raise( float_flag_invalid
);
2146 return float64_default_nan
;
2148 return packFloat64( zSign
, 0x7FF, 0 );
2151 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2152 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2155 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2156 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2158 zExp
= aExp
+ bExp
- 0x3FF;
2159 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2160 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2161 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2162 zSig0
|= ( zSig1
!= 0 );
2163 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2167 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2172 -------------------------------------------------------------------------------
2173 Returns the result of dividing the double-precision floating-point value `a'
2174 by the corresponding value `b'. The operation is performed according to
2175 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2176 -------------------------------------------------------------------------------
2178 float64
float64_div( float64 a
, float64 b
)
2180 flag aSign
, bSign
, zSign
;
2181 int16 aExp
, bExp
, zExp
;
2182 bits64 aSig
, bSig
, zSig
;
2184 bits64 term0
, term1
;
2186 aSig
= extractFloat64Frac( a
);
2187 aExp
= extractFloat64Exp( a
);
2188 aSign
= extractFloat64Sign( a
);
2189 bSig
= extractFloat64Frac( b
);
2190 bExp
= extractFloat64Exp( b
);
2191 bSign
= extractFloat64Sign( b
);
2192 zSign
= aSign
^ bSign
;
2193 if ( aExp
== 0x7FF ) {
2194 if ( aSig
) return propagateFloat64NaN( a
, b
);
2195 if ( bExp
== 0x7FF ) {
2196 if ( bSig
) return propagateFloat64NaN( a
, b
);
2197 float_raise( float_flag_invalid
);
2198 return float64_default_nan
;
2200 return packFloat64( zSign
, 0x7FF, 0 );
2202 if ( bExp
== 0x7FF ) {
2203 if ( bSig
) return propagateFloat64NaN( a
, b
);
2204 return packFloat64( zSign
, 0, 0 );
2208 if ( ( aExp
| aSig
) == 0 ) {
2209 float_raise( float_flag_invalid
);
2210 return float64_default_nan
;
2212 float_raise( float_flag_divbyzero
);
2213 return packFloat64( zSign
, 0x7FF, 0 );
2215 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2218 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2219 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2221 zExp
= aExp
- bExp
+ 0x3FD;
2222 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2223 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2224 if ( bSig
<= ( aSig
+ aSig
) ) {
2228 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2229 if ( ( zSig
& 0x1FF ) <= 2 ) {
2230 mul64To128( bSig
, zSig
, &term0
, &term1
);
2231 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2232 while ( (sbits64
) rem0
< 0 ) {
2234 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2236 zSig
|= ( rem1
!= 0 );
2238 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2243 -------------------------------------------------------------------------------
2244 Returns the remainder of the double-precision floating-point value `a'
2245 with respect to the corresponding value `b'. The operation is performed
2246 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2247 -------------------------------------------------------------------------------
2249 float64
float64_rem( float64 a
, float64 b
)
2251 flag aSign
, bSign
, zSign
;
2252 int16 aExp
, bExp
, expDiff
;
2254 bits64 q
, alternateASig
;
2257 aSig
= extractFloat64Frac( a
);
2258 aExp
= extractFloat64Exp( a
);
2259 aSign
= extractFloat64Sign( a
);
2260 bSig
= extractFloat64Frac( b
);
2261 bExp
= extractFloat64Exp( b
);
2262 bSign
= extractFloat64Sign( b
);
2263 if ( aExp
== 0x7FF ) {
2264 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2265 return propagateFloat64NaN( a
, b
);
2267 float_raise( float_flag_invalid
);
2268 return float64_default_nan
;
2270 if ( bExp
== 0x7FF ) {
2271 if ( bSig
) return propagateFloat64NaN( a
, b
);
2276 float_raise( float_flag_invalid
);
2277 return float64_default_nan
;
2279 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2282 if ( aSig
== 0 ) return a
;
2283 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2285 expDiff
= aExp
- bExp
;
2286 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2287 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2288 if ( expDiff
< 0 ) {
2289 if ( expDiff
< -1 ) return a
;
2292 q
= ( bSig
<= aSig
);
2293 if ( q
) aSig
-= bSig
;
2295 while ( 0 < expDiff
) {
2296 q
= estimateDiv128To64( aSig
, 0, bSig
);
2297 q
= ( 2 < q
) ? q
- 2 : 0;
2298 aSig
= - ( ( bSig
>>2 ) * q
);
2302 if ( 0 < expDiff
) {
2303 q
= estimateDiv128To64( aSig
, 0, bSig
);
2304 q
= ( 2 < q
) ? q
- 2 : 0;
2307 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2314 alternateASig
= aSig
;
2317 } while ( 0 <= (sbits64
) aSig
);
2318 sigMean
= aSig
+ alternateASig
;
2319 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2320 aSig
= alternateASig
;
2322 zSign
= ( (sbits64
) aSig
< 0 );
2323 if ( zSign
) aSig
= - aSig
;
2324 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
2329 -------------------------------------------------------------------------------
2330 Returns the square root of the double-precision floating-point value `a'.
2331 The operation is performed according to the IEC/IEEE Standard for Binary
2332 Floating-point Arithmetic.
2333 -------------------------------------------------------------------------------
2335 float64
float64_sqrt( float64 a
)
2340 bits64 rem0
, rem1
, term0
, term1
; //, shiftedRem;
2343 aSig
= extractFloat64Frac( a
);
2344 aExp
= extractFloat64Exp( a
);
2345 aSign
= extractFloat64Sign( a
);
2346 if ( aExp
== 0x7FF ) {
2347 if ( aSig
) return propagateFloat64NaN( a
, a
);
2348 if ( ! aSign
) return a
;
2349 float_raise( float_flag_invalid
);
2350 return float64_default_nan
;
2353 if ( ( aExp
| aSig
) == 0 ) return a
;
2354 float_raise( float_flag_invalid
);
2355 return float64_default_nan
;
2358 if ( aSig
== 0 ) return 0;
2359 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2361 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2362 aSig
|= LIT64( 0x0010000000000000 );
2363 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2365 aSig
<<= 9 - ( aExp
& 1 );
2366 zSig
= estimateDiv128To64( aSig
, 0, zSig
) + zSig
+ 2;
2367 if ( ( zSig
& 0x3FF ) <= 5 ) {
2369 zSig
= LIT64( 0xFFFFFFFFFFFFFFFF );
2373 mul64To128( zSig
, zSig
, &term0
, &term1
);
2374 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2375 while ( (sbits64
) rem0
< 0 ) {
2377 shortShift128Left( 0, zSig
, 1, &term0
, &term1
);
2379 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
2381 zSig
|= ( ( rem0
| rem1
) != 0 );
2384 shift64RightJamming( zSig
, 1, &zSig
);
2385 return roundAndPackFloat64( 0, zExp
, zSig
);
2390 -------------------------------------------------------------------------------
2391 Returns 1 if the double-precision floating-point value `a' is equal to the
2392 corresponding value `b', and 0 otherwise. The comparison is performed
2393 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2394 -------------------------------------------------------------------------------
2396 flag
float64_eq( float64 a
, float64 b
)
2399 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2400 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2402 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2403 float_raise( float_flag_invalid
);
2407 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2412 -------------------------------------------------------------------------------
2413 Returns 1 if the double-precision floating-point value `a' is less than or
2414 equal to the corresponding value `b', and 0 otherwise. The comparison is
2415 performed according to the IEC/IEEE Standard for Binary Floating-point
2417 -------------------------------------------------------------------------------
2419 flag
float64_le( float64 a
, float64 b
)
2423 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2424 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2426 float_raise( float_flag_invalid
);
2429 aSign
= extractFloat64Sign( a
);
2430 bSign
= extractFloat64Sign( b
);
2431 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2432 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2437 -------------------------------------------------------------------------------
2438 Returns 1 if the double-precision floating-point value `a' is less than
2439 the corresponding value `b', and 0 otherwise. The comparison is performed
2440 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2441 -------------------------------------------------------------------------------
2443 flag
float64_lt( float64 a
, float64 b
)
2447 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2448 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2450 float_raise( float_flag_invalid
);
2453 aSign
= extractFloat64Sign( a
);
2454 bSign
= extractFloat64Sign( b
);
2455 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2456 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2461 -------------------------------------------------------------------------------
2462 Returns 1 if the double-precision floating-point value `a' is equal to the
2463 corresponding value `b', and 0 otherwise. The invalid exception is raised
2464 if either operand is a NaN. Otherwise, the comparison is performed
2465 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2466 -------------------------------------------------------------------------------
2468 flag
float64_eq_signaling( float64 a
, float64 b
)
2471 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2472 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2474 float_raise( float_flag_invalid
);
2477 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2482 -------------------------------------------------------------------------------
2483 Returns 1 if the double-precision floating-point value `a' is less than or
2484 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2485 cause an exception. Otherwise, the comparison is performed according to the
2486 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2487 -------------------------------------------------------------------------------
2489 flag
float64_le_quiet( float64 a
, float64 b
)
2494 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2495 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2497 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2498 float_raise( float_flag_invalid
);
2502 aSign
= extractFloat64Sign( a
);
2503 bSign
= extractFloat64Sign( b
);
2504 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2505 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2510 -------------------------------------------------------------------------------
2511 Returns 1 if the double-precision floating-point value `a' is less than
2512 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2513 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2514 Standard for Binary Floating-point Arithmetic.
2515 -------------------------------------------------------------------------------
2517 flag
float64_lt_quiet( float64 a
, float64 b
)
2521 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2522 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2524 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2525 float_raise( float_flag_invalid
);
2529 aSign
= extractFloat64Sign( a
);
2530 bSign
= extractFloat64Sign( b
);
2531 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2532 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2539 -------------------------------------------------------------------------------
2540 Returns the result of converting the extended double-precision floating-
2541 point value `a' to the 32-bit two's complement integer format. The
2542 conversion is performed according to the IEC/IEEE Standard for Binary
2543 Floating-point Arithmetic---which means in particular that the conversion
2544 is rounded according to the current rounding mode. If `a' is a NaN, the
2545 largest positive integer is returned. Otherwise, if the conversion
2546 overflows, the largest integer with the same sign as `a' is returned.
2547 -------------------------------------------------------------------------------
2549 int32
floatx80_to_int32( floatx80 a
)
2552 int32 aExp
, shiftCount
;
2555 aSig
= extractFloatx80Frac( a
);
2556 aExp
= extractFloatx80Exp( a
);
2557 aSign
= extractFloatx80Sign( a
);
2558 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2559 shiftCount
= 0x4037 - aExp
;
2560 if ( shiftCount
<= 0 ) shiftCount
= 1;
2561 shift64RightJamming( aSig
, shiftCount
, &aSig
);
2562 return roundAndPackInt32( aSign
, aSig
);
2567 -------------------------------------------------------------------------------
2568 Returns the result of converting the extended double-precision floating-
2569 point value `a' to the 32-bit two's complement integer format. The
2570 conversion is performed according to the IEC/IEEE Standard for Binary
2571 Floating-point Arithmetic, except that the conversion is always rounded
2572 toward zero. If `a' is a NaN, the largest positive integer is returned.
2573 Otherwise, if the conversion overflows, the largest integer with the same
2574 sign as `a' is returned.
2575 -------------------------------------------------------------------------------
2577 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
2580 int32 aExp
, shiftCount
;
2581 bits64 aSig
, savedASig
;
2584 aSig
= extractFloatx80Frac( a
);
2585 aExp
= extractFloatx80Exp( a
);
2586 aSign
= extractFloatx80Sign( a
);
2587 shiftCount
= 0x403E - aExp
;
2588 if ( shiftCount
< 32 ) {
2589 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2592 else if ( 63 < shiftCount
) {
2593 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2597 aSig
>>= shiftCount
;
2599 if ( aSign
) z
= - z
;
2600 if ( ( z
< 0 ) ^ aSign
) {
2602 float_exception_flags
|= float_flag_invalid
;
2603 return aSign
? 0x80000000 : 0x7FFFFFFF;
2605 if ( ( aSig
<<shiftCount
) != savedASig
) {
2606 float_exception_flags
|= float_flag_inexact
;
2613 -------------------------------------------------------------------------------
2614 Returns the result of converting the extended double-precision floating-
2615 point value `a' to the single-precision floating-point format. The
2616 conversion is performed according to the IEC/IEEE Standard for Binary
2617 Floating-point Arithmetic.
2618 -------------------------------------------------------------------------------
2620 float32
floatx80_to_float32( floatx80 a
)
2626 aSig
= extractFloatx80Frac( a
);
2627 aExp
= extractFloatx80Exp( a
);
2628 aSign
= extractFloatx80Sign( a
);
2629 if ( aExp
== 0x7FFF ) {
2630 if ( (bits64
) ( aSig
<<1 ) ) {
2631 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
2633 return packFloat32( aSign
, 0xFF, 0 );
2635 shift64RightJamming( aSig
, 33, &aSig
);
2636 if ( aExp
|| aSig
) aExp
-= 0x3F81;
2637 return roundAndPackFloat32( aSign
, aExp
, aSig
);
2642 -------------------------------------------------------------------------------
2643 Returns the result of converting the extended double-precision floating-
2644 point value `a' to the double-precision floating-point format. The
2645 conversion is performed according to the IEC/IEEE Standard for Binary
2646 Floating-point Arithmetic.
2647 -------------------------------------------------------------------------------
2649 float64
floatx80_to_float64( floatx80 a
)
2655 aSig
= extractFloatx80Frac( a
);
2656 aExp
= extractFloatx80Exp( a
);
2657 aSign
= extractFloatx80Sign( a
);
2658 if ( aExp
== 0x7FFF ) {
2659 if ( (bits64
) ( aSig
<<1 ) ) {
2660 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
2662 return packFloat64( aSign
, 0x7FF, 0 );
2664 shift64RightJamming( aSig
, 1, &zSig
);
2665 if ( aExp
|| aSig
) aExp
-= 0x3C01;
2666 return roundAndPackFloat64( aSign
, aExp
, zSig
);
2671 -------------------------------------------------------------------------------
2672 Rounds the extended double-precision floating-point value `a' to an integer,
2673 and returns the result as an extended quadruple-precision floating-point
2674 value. The operation is performed according to the IEC/IEEE Standard for
2675 Binary Floating-point Arithmetic.
2676 -------------------------------------------------------------------------------
2678 floatx80
floatx80_round_to_int( floatx80 a
)
2682 bits64 lastBitMask
, roundBitsMask
;
2686 aExp
= extractFloatx80Exp( a
);
2687 if ( 0x403E <= aExp
) {
2688 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
2689 return propagateFloatx80NaN( a
, a
);
2693 if ( aExp
<= 0x3FFE ) {
2695 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
2698 float_exception_flags
|= float_flag_inexact
;
2699 aSign
= extractFloatx80Sign( a
);
2700 switch ( float_rounding_mode
) {
2701 case float_round_nearest_even
:
2702 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
2705 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
2708 case float_round_down
:
2711 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2712 : packFloatx80( 0, 0, 0 );
2713 case float_round_up
:
2715 aSign
? packFloatx80( 1, 0, 0 )
2716 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2718 return packFloatx80( aSign
, 0, 0 );
2721 lastBitMask
<<= 0x403E - aExp
;
2722 roundBitsMask
= lastBitMask
- 1;
2724 roundingMode
= float_rounding_mode
;
2725 if ( roundingMode
== float_round_nearest_even
) {
2726 z
.low
+= lastBitMask
>>1;
2727 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
2729 else if ( roundingMode
!= float_round_to_zero
) {
2730 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2731 z
.low
+= roundBitsMask
;
2734 z
.low
&= ~ roundBitsMask
;
2737 z
.low
= LIT64( 0x8000000000000000 );
2739 if ( z
.low
!= a
.low
) float_exception_flags
|= float_flag_inexact
;
2745 -------------------------------------------------------------------------------
2746 Returns the result of adding the absolute values of the extended double-
2747 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2748 negated before being returned. `zSign' is ignored if the result is a NaN.
2749 The addition is performed according to the IEC/IEEE Standard for Binary
2750 Floating-point Arithmetic.
2751 -------------------------------------------------------------------------------
2753 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
2755 int32 aExp
, bExp
, zExp
;
2756 bits64 aSig
, bSig
, zSig0
, zSig1
;
2759 aSig
= extractFloatx80Frac( a
);
2760 aExp
= extractFloatx80Exp( a
);
2761 bSig
= extractFloatx80Frac( b
);
2762 bExp
= extractFloatx80Exp( b
);
2763 expDiff
= aExp
- bExp
;
2764 if ( 0 < expDiff
) {
2765 if ( aExp
== 0x7FFF ) {
2766 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2769 if ( bExp
== 0 ) --expDiff
;
2770 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2773 else if ( expDiff
< 0 ) {
2774 if ( bExp
== 0x7FFF ) {
2775 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2776 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2778 if ( aExp
== 0 ) ++expDiff
;
2779 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2783 if ( aExp
== 0x7FFF ) {
2784 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2785 return propagateFloatx80NaN( a
, b
);
2790 zSig0
= aSig
+ bSig
;
2792 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
2799 zSig0
= aSig
+ bSig
;
2801 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
2803 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2804 zSig0
|= LIT64( 0x8000000000000000 );
2808 roundAndPackFloatx80(
2809 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
2814 -------------------------------------------------------------------------------
2815 Returns the result of subtracting the absolute values of the extended
2816 double-precision floating-point values `a' and `b'. If `zSign' is true,
2817 the difference is negated before being returned. `zSign' is ignored if the
2818 result is a NaN. The subtraction is performed according to the IEC/IEEE
2819 Standard for Binary Floating-point Arithmetic.
2820 -------------------------------------------------------------------------------
2822 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
2824 int32 aExp
, bExp
, zExp
;
2825 bits64 aSig
, bSig
, zSig0
, zSig1
;
2829 aSig
= extractFloatx80Frac( a
);
2830 aExp
= extractFloatx80Exp( a
);
2831 bSig
= extractFloatx80Frac( b
);
2832 bExp
= extractFloatx80Exp( b
);
2833 expDiff
= aExp
- bExp
;
2834 if ( 0 < expDiff
) goto aExpBigger
;
2835 if ( expDiff
< 0 ) goto bExpBigger
;
2836 if ( aExp
== 0x7FFF ) {
2837 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2838 return propagateFloatx80NaN( a
, b
);
2840 float_raise( float_flag_invalid
);
2841 z
.low
= floatx80_default_nan_low
;
2842 z
.high
= floatx80_default_nan_high
;
2850 if ( bSig
< aSig
) goto aBigger
;
2851 if ( aSig
< bSig
) goto bBigger
;
2852 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
2854 if ( bExp
== 0x7FFF ) {
2855 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2856 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2858 if ( aExp
== 0 ) ++expDiff
;
2859 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2861 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
2864 goto normalizeRoundAndPack
;
2866 if ( aExp
== 0x7FFF ) {
2867 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2870 if ( bExp
== 0 ) --expDiff
;
2871 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2873 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
2875 normalizeRoundAndPack
:
2877 normalizeRoundAndPackFloatx80(
2878 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
2883 -------------------------------------------------------------------------------
2884 Returns the result of adding the extended double-precision floating-point
2885 values `a' and `b'. The operation is performed according to the IEC/IEEE
2886 Standard for Binary Floating-point Arithmetic.
2887 -------------------------------------------------------------------------------
2889 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
2893 aSign
= extractFloatx80Sign( a
);
2894 bSign
= extractFloatx80Sign( b
);
2895 if ( aSign
== bSign
) {
2896 return addFloatx80Sigs( a
, b
, aSign
);
2899 return subFloatx80Sigs( a
, b
, aSign
);
2905 -------------------------------------------------------------------------------
2906 Returns the result of subtracting the extended double-precision floating-
2907 point values `a' and `b'. The operation is performed according to the
2908 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2909 -------------------------------------------------------------------------------
2911 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
2915 aSign
= extractFloatx80Sign( a
);
2916 bSign
= extractFloatx80Sign( b
);
2917 if ( aSign
== bSign
) {
2918 return subFloatx80Sigs( a
, b
, aSign
);
2921 return addFloatx80Sigs( a
, b
, aSign
);
2927 -------------------------------------------------------------------------------
2928 Returns the result of multiplying the extended double-precision floating-
2929 point values `a' and `b'. The operation is performed according to the
2930 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2931 -------------------------------------------------------------------------------
2933 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
2935 flag aSign
, bSign
, zSign
;
2936 int32 aExp
, bExp
, zExp
;
2937 bits64 aSig
, bSig
, zSig0
, zSig1
;
2940 aSig
= extractFloatx80Frac( a
);
2941 aExp
= extractFloatx80Exp( a
);
2942 aSign
= extractFloatx80Sign( a
);
2943 bSig
= extractFloatx80Frac( b
);
2944 bExp
= extractFloatx80Exp( b
);
2945 bSign
= extractFloatx80Sign( b
);
2946 zSign
= aSign
^ bSign
;
2947 if ( aExp
== 0x7FFF ) {
2948 if ( (bits64
) ( aSig
<<1 )
2949 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
2950 return propagateFloatx80NaN( a
, b
);
2952 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
2953 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2955 if ( bExp
== 0x7FFF ) {
2956 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2957 if ( ( aExp
| aSig
) == 0 ) {
2959 float_raise( float_flag_invalid
);
2960 z
.low
= floatx80_default_nan_low
;
2961 z
.high
= floatx80_default_nan_high
;
2964 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2967 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2968 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
2971 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2972 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
2974 zExp
= aExp
+ bExp
- 0x3FFE;
2975 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2976 if ( 0 < (sbits64
) zSig0
) {
2977 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2981 roundAndPackFloatx80(
2982 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
2987 -------------------------------------------------------------------------------
2988 Returns the result of dividing the extended double-precision floating-point
2989 value `a' by the corresponding value `b'. The operation is performed
2990 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2991 -------------------------------------------------------------------------------
2993 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
2995 flag aSign
, bSign
, zSign
;
2996 int32 aExp
, bExp
, zExp
;
2997 bits64 aSig
, bSig
, zSig0
, zSig1
;
2998 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3001 aSig
= extractFloatx80Frac( a
);
3002 aExp
= extractFloatx80Exp( a
);
3003 aSign
= extractFloatx80Sign( a
);
3004 bSig
= extractFloatx80Frac( b
);
3005 bExp
= extractFloatx80Exp( b
);
3006 bSign
= extractFloatx80Sign( b
);
3007 zSign
= aSign
^ bSign
;
3008 if ( aExp
== 0x7FFF ) {
3009 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3010 if ( bExp
== 0x7FFF ) {
3011 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3014 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3016 if ( bExp
== 0x7FFF ) {
3017 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3018 return packFloatx80( zSign
, 0, 0 );
3022 if ( ( aExp
| aSig
) == 0 ) {
3024 float_raise( float_flag_invalid
);
3025 z
.low
= floatx80_default_nan_low
;
3026 z
.high
= floatx80_default_nan_high
;
3029 float_raise( float_flag_divbyzero
);
3030 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3032 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3035 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3036 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3038 zExp
= aExp
- bExp
+ 0x3FFE;
3040 if ( bSig
<= aSig
) {
3041 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3044 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3045 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3046 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3047 while ( (sbits64
) rem0
< 0 ) {
3049 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3051 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3052 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3053 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3054 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3055 while ( (sbits64
) rem1
< 0 ) {
3057 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3059 zSig1
|= ( ( rem1
| rem2
) != 0 );
3062 roundAndPackFloatx80(
3063 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3068 -------------------------------------------------------------------------------
3069 Returns the remainder of the extended double-precision floating-point value
3070 `a' with respect to the corresponding value `b'. The operation is performed
3071 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3072 -------------------------------------------------------------------------------
3074 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3076 flag aSign
, bSign
, zSign
;
3077 int32 aExp
, bExp
, expDiff
;
3078 bits64 aSig0
, aSig1
, bSig
;
3079 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3082 aSig0
= extractFloatx80Frac( a
);
3083 aExp
= extractFloatx80Exp( a
);
3084 aSign
= extractFloatx80Sign( a
);
3085 bSig
= extractFloatx80Frac( b
);
3086 bExp
= extractFloatx80Exp( b
);
3087 bSign
= extractFloatx80Sign( b
);
3088 if ( aExp
== 0x7FFF ) {
3089 if ( (bits64
) ( aSig0
<<1 )
3090 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3091 return propagateFloatx80NaN( a
, b
);
3095 if ( bExp
== 0x7FFF ) {
3096 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3102 float_raise( float_flag_invalid
);
3103 z
.low
= floatx80_default_nan_low
;
3104 z
.high
= floatx80_default_nan_high
;
3107 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3110 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3111 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3113 bSig
|= LIT64( 0x8000000000000000 );
3115 expDiff
= aExp
- bExp
;
3117 if ( expDiff
< 0 ) {
3118 if ( expDiff
< -1 ) return a
;
3119 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3122 q
= ( bSig
<= aSig0
);
3123 if ( q
) aSig0
-= bSig
;
3125 while ( 0 < expDiff
) {
3126 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3127 q
= ( 2 < q
) ? q
- 2 : 0;
3128 mul64To128( bSig
, q
, &term0
, &term1
);
3129 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3130 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3134 if ( 0 < expDiff
) {
3135 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3136 q
= ( 2 < q
) ? q
- 2 : 0;
3138 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3139 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3140 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3141 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3143 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3150 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3151 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3152 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3155 aSig0
= alternateASig0
;
3156 aSig1
= alternateASig1
;
3160 normalizeRoundAndPackFloatx80(
3161 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3166 -------------------------------------------------------------------------------
3167 Returns the square root of the extended double-precision floating-point
3168 value `a'. The operation is performed according to the IEC/IEEE Standard
3169 for Binary Floating-point Arithmetic.
3170 -------------------------------------------------------------------------------
3172 floatx80
floatx80_sqrt( floatx80 a
)
3176 bits64 aSig0
, aSig1
, zSig0
, zSig1
;
3177 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3178 bits64 shiftedRem0
, shiftedRem1
;
3181 aSig0
= extractFloatx80Frac( a
);
3182 aExp
= extractFloatx80Exp( a
);
3183 aSign
= extractFloatx80Sign( a
);
3184 if ( aExp
== 0x7FFF ) {
3185 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
3186 if ( ! aSign
) return a
;
3190 if ( ( aExp
| aSig0
) == 0 ) return a
;
3192 float_raise( float_flag_invalid
);
3193 z
.low
= floatx80_default_nan_low
;
3194 z
.high
= floatx80_default_nan_high
;
3198 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3199 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3201 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3202 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3205 shift128Right( aSig0
, 0, ( aExp
& 1 ) + 2, &aSig0
, &aSig1
);
3206 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
) + zSig0
+ 4;
3207 if ( 0 <= (sbits64
) zSig0
) zSig0
= LIT64( 0xFFFFFFFFFFFFFFFF );
3208 shortShift128Left( aSig0
, aSig1
, 2, &aSig0
, &aSig1
);
3209 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3210 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3211 while ( (sbits64
) rem0
< 0 ) {
3213 shortShift128Left( 0, zSig0
, 1, &term0
, &term1
);
3215 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
3217 shortShift128Left( rem0
, rem1
, 63, &shiftedRem0
, &shiftedRem1
);
3218 zSig1
= estimateDiv128To64( shiftedRem0
, shiftedRem1
, zSig0
);
3219 if ( (bits64
) ( zSig1
<<1 ) <= 10 ) {
3220 if ( zSig1
== 0 ) zSig1
= 1;
3221 mul64To128( zSig0
, zSig1
, &term1
, &term2
);
3222 shortShift128Left( term1
, term2
, 1, &term1
, &term2
);
3223 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3224 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3225 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3226 while ( (sbits64
) rem1
< 0 ) {
3228 shortShift192Left( 0, zSig0
, zSig1
, 1, &term1
, &term2
, &term3
);
3231 rem1
, rem2
, rem3
, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
3233 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3236 roundAndPackFloatx80(
3237 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
3242 -------------------------------------------------------------------------------
3243 Returns 1 if the extended double-precision floating-point value `a' is
3244 equal to the corresponding value `b', and 0 otherwise. The comparison is
3245 performed according to the IEC/IEEE Standard for Binary Floating-point
3247 -------------------------------------------------------------------------------
3249 flag
floatx80_eq( floatx80 a
, floatx80 b
)
3252 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3253 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3254 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3255 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3257 if ( floatx80_is_signaling_nan( a
)
3258 || floatx80_is_signaling_nan( b
) ) {
3259 float_raise( float_flag_invalid
);
3265 && ( ( a
.high
== b
.high
)
3267 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3273 -------------------------------------------------------------------------------
3274 Returns 1 if the extended double-precision floating-point value `a' is
3275 less than or equal to the corresponding value `b', and 0 otherwise. The
3276 comparison is performed according to the IEC/IEEE Standard for Binary
3277 Floating-point Arithmetic.
3278 -------------------------------------------------------------------------------
3280 flag
floatx80_le( floatx80 a
, floatx80 b
)
3284 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3285 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3286 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3287 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3289 float_raise( float_flag_invalid
);
3292 aSign
= extractFloatx80Sign( a
);
3293 bSign
= extractFloatx80Sign( b
);
3294 if ( aSign
!= bSign
) {
3297 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3301 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3302 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3307 -------------------------------------------------------------------------------
3308 Returns 1 if the extended double-precision floating-point value `a' is
3309 less than the corresponding value `b', and 0 otherwise. The comparison
3310 is performed according to the IEC/IEEE Standard for Binary Floating-point
3312 -------------------------------------------------------------------------------
3314 flag
floatx80_lt( floatx80 a
, floatx80 b
)
3318 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3319 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3320 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3321 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3323 float_raise( float_flag_invalid
);
3326 aSign
= extractFloatx80Sign( a
);
3327 bSign
= extractFloatx80Sign( b
);
3328 if ( aSign
!= bSign
) {
3331 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3335 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3336 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
3341 -------------------------------------------------------------------------------
3342 Returns 1 if the extended double-precision floating-point value `a' is equal
3343 to the corresponding value `b', and 0 otherwise. The invalid exception is
3344 raised if either operand is a NaN. Otherwise, the comparison is performed
3345 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3346 -------------------------------------------------------------------------------
3348 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
3351 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3352 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3353 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3354 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3356 float_raise( float_flag_invalid
);
3361 && ( ( a
.high
== b
.high
)
3363 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3369 -------------------------------------------------------------------------------
3370 Returns 1 if the extended double-precision floating-point value `a' is less
3371 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3372 do not cause an exception. Otherwise, the comparison is performed according
3373 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3374 -------------------------------------------------------------------------------
3376 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
3380 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3381 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3382 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3383 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3385 if ( floatx80_is_signaling_nan( a
)
3386 || floatx80_is_signaling_nan( b
) ) {
3387 float_raise( float_flag_invalid
);
3391 aSign
= extractFloatx80Sign( a
);
3392 bSign
= extractFloatx80Sign( b
);
3393 if ( aSign
!= bSign
) {
3396 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3400 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3401 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3406 -------------------------------------------------------------------------------
3407 Returns 1 if the extended double-precision floating-point value `a' is less
3408 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3409 an exception. Otherwise, the comparison is performed according to the
3410 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3411 -------------------------------------------------------------------------------
3413 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
3417 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3418 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3419 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3420 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3422 if ( floatx80_is_signaling_nan( a
)
3423 || floatx80_is_signaling_nan( b
) ) {
3424 float_raise( float_flag_invalid
);
3428 aSign
= extractFloatx80Sign( a
);
3429 bSign
= extractFloatx80Sign( b
);
3430 if ( aSign
!= bSign
) {
3433 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3437 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3438 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);