2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations. (Can be specialized to target if
42 -------------------------------------------------------------------------------
44 #include "softfloat-macros"
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine: (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output. These details are target-
54 -------------------------------------------------------------------------------
56 #include "softfloat-specialize"
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
70 static int32
roundAndPackInt32( struct roundingData
*roundData
, flag zSign
, bits64 absZ
)
73 flag roundNearestEven
;
74 int8 roundIncrement
, roundBits
;
77 roundingMode
= roundData
->mode
;
78 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
79 roundIncrement
= 0x40;
80 if ( ! roundNearestEven
) {
81 if ( roundingMode
== float_round_to_zero
) {
85 roundIncrement
= 0x7F;
87 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
90 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
94 roundBits
= absZ
& 0x7F;
95 absZ
= ( absZ
+ roundIncrement
)>>7;
96 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
99 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
100 roundData
->exception
|= float_flag_invalid
;
101 return zSign
? 0x80000000 : 0x7FFFFFFF;
103 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
113 INLINE bits32
extractFloat32Frac( float32 a
)
116 return a
& 0x007FFFFF;
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
125 INLINE int16
extractFloat32Exp( float32 a
)
128 return ( a
>>23 ) & 0xFF;
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
137 #if 0 /* in softfloat.h */
138 INLINE flag
extractFloat32Sign( float32 a
)
147 -------------------------------------------------------------------------------
148 Normalizes the subnormal single-precision floating-point value represented
149 by the denormalized significand `aSig'. The normalized exponent and
150 significand are stored at the locations pointed to by `zExpPtr' and
151 `zSigPtr', respectively.
152 -------------------------------------------------------------------------------
155 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
159 shiftCount
= countLeadingZeros32( aSig
) - 8;
160 *zSigPtr
= aSig
<<shiftCount
;
161 *zExpPtr
= 1 - shiftCount
;
166 -------------------------------------------------------------------------------
167 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
168 single-precision floating-point value, returning the result. After being
169 shifted into the proper positions, the three fields are simply added
170 together to form the result. This means that any integer portion of `zSig'
171 will be added into the exponent. Since a properly normalized significand
172 will have an integer portion equal to 1, the `zExp' input should be 1 less
173 than the desired result exponent whenever `zSig' is a complete, normalized
175 -------------------------------------------------------------------------------
177 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
181 __asm__("@ packFloat32 \n\
182 mov %0, %1, asl #31 \n\
183 orr %0, %2, asl #23 \n\
186 : "g" (f
), "g" (zSign
), "g" (zExp
), "g" (zSig
)
190 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
195 -------------------------------------------------------------------------------
196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
197 and significand `zSig', and returns the proper single-precision floating-
198 point value corresponding to the abstract input. Ordinarily, the abstract
199 value is simply rounded and packed into the single-precision format, with
200 the inexact exception raised if the abstract input cannot be represented
201 exactly. If the abstract value is too large, however, the overflow and
202 inexact exceptions are raised and an infinity or maximal finite value is
203 returned. If the abstract value is too small, the input value is rounded to
204 a subnormal number, and the underflow and inexact exceptions are raised if
205 the abstract input cannot be represented exactly as a subnormal single-
206 precision floating-point number.
207 The input significand `zSig' has its binary point between bits 30
208 and 29, which is 7 bits to the left of the usual location. This shifted
209 significand must be normalized or smaller. If `zSig' is not normalized,
210 `zExp' must be 0; in that case, the result returned is a subnormal number,
211 and it must not require rounding. In the usual case that `zSig' is
212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
213 The handling of underflow and overflow follows the IEC/IEEE Standard for
214 Binary Floating-point Arithmetic.
215 -------------------------------------------------------------------------------
217 static float32
roundAndPackFloat32( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits32 zSig
)
220 flag roundNearestEven
;
221 int8 roundIncrement
, roundBits
;
224 roundingMode
= roundData
->mode
;
225 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
226 roundIncrement
= 0x40;
227 if ( ! roundNearestEven
) {
228 if ( roundingMode
== float_round_to_zero
) {
232 roundIncrement
= 0x7F;
234 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
237 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
241 roundBits
= zSig
& 0x7F;
242 if ( 0xFD <= (bits16
) zExp
) {
244 || ( ( zExp
== 0xFD )
245 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
247 roundData
->exception
|= float_flag_overflow
| float_flag_inexact
;
248 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
252 ( float_detect_tininess
== float_tininess_before_rounding
)
254 || ( zSig
+ roundIncrement
< 0x80000000 );
255 shift32RightJamming( zSig
, - zExp
, &zSig
);
257 roundBits
= zSig
& 0x7F;
258 if ( isTiny
&& roundBits
) roundData
->exception
|= float_flag_underflow
;
261 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
262 zSig
= ( zSig
+ roundIncrement
)>>7;
263 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
264 if ( zSig
== 0 ) zExp
= 0;
265 return packFloat32( zSign
, zExp
, zSig
);
270 -------------------------------------------------------------------------------
271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
272 and significand `zSig', and returns the proper single-precision floating-
273 point value corresponding to the abstract input. This routine is just like
274 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
275 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
277 -------------------------------------------------------------------------------
280 normalizeRoundAndPackFloat32( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits32 zSig
)
284 shiftCount
= countLeadingZeros32( zSig
) - 1;
285 return roundAndPackFloat32( roundData
, zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
290 -------------------------------------------------------------------------------
291 Returns the fraction bits of the double-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
294 INLINE bits64
extractFloat64Frac( float64 a
)
297 return a
& LIT64( 0x000FFFFFFFFFFFFF );
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
306 INLINE int16
extractFloat64Exp( float64 a
)
309 return ( a
>>52 ) & 0x7FF;
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
318 #if 0 /* in softfloat.h */
319 INLINE flag
extractFloat64Sign( float64 a
)
328 -------------------------------------------------------------------------------
329 Normalizes the subnormal double-precision floating-point value represented
330 by the denormalized significand `aSig'. The normalized exponent and
331 significand are stored at the locations pointed to by `zExpPtr' and
332 `zSigPtr', respectively.
333 -------------------------------------------------------------------------------
336 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
340 shiftCount
= countLeadingZeros64( aSig
) - 11;
341 *zSigPtr
= aSig
<<shiftCount
;
342 *zExpPtr
= 1 - shiftCount
;
347 -------------------------------------------------------------------------------
348 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
349 double-precision floating-point value, returning the result. After being
350 shifted into the proper positions, the three fields are simply added
351 together to form the result. This means that any integer portion of `zSig'
352 will be added into the exponent. Since a properly normalized significand
353 will have an integer portion equal to 1, the `zExp' input should be 1 less
354 than the desired result exponent whenever `zSig' is a complete, normalized
356 -------------------------------------------------------------------------------
358 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
361 return ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
;
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper double-precision floating-
369 point value corresponding to the abstract input. Ordinarily, the abstract
370 value is simply rounded and packed into the double-precision format, with
371 the inexact exception raised if the abstract input cannot be represented
372 exactly. If the abstract value is too large, however, the overflow and
373 inexact exceptions are raised and an infinity or maximal finite value is
374 returned. If the abstract value is too small, the input value is rounded to
375 a subnormal number, and the underflow and inexact exceptions are raised if
376 the abstract input cannot be represented exactly as a subnormal double-
377 precision floating-point number.
378 The input significand `zSig' has its binary point between bits 62
379 and 61, which is 10 bits to the left of the usual location. This shifted
380 significand must be normalized or smaller. If `zSig' is not normalized,
381 `zExp' must be 0; in that case, the result returned is a subnormal number,
382 and it must not require rounding. In the usual case that `zSig' is
383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
384 The handling of underflow and overflow follows the IEC/IEEE Standard for
385 Binary Floating-point Arithmetic.
386 -------------------------------------------------------------------------------
388 static float64
roundAndPackFloat64( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits64 zSig
)
391 flag roundNearestEven
;
392 int16 roundIncrement
, roundBits
;
395 roundingMode
= roundData
->mode
;
396 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
397 roundIncrement
= 0x200;
398 if ( ! roundNearestEven
) {
399 if ( roundingMode
== float_round_to_zero
) {
403 roundIncrement
= 0x3FF;
405 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
408 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
412 roundBits
= zSig
& 0x3FF;
413 if ( 0x7FD <= (bits16
) zExp
) {
414 if ( ( 0x7FD < zExp
)
415 || ( ( zExp
== 0x7FD )
416 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
418 //register int lr = __builtin_return_address(0);
419 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
420 roundData
->exception
|= float_flag_overflow
| float_flag_inexact
;
421 return packFloat64( zSign
, 0x7FF, 0 ) - ( roundIncrement
== 0 );
425 ( float_detect_tininess
== float_tininess_before_rounding
)
427 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
428 shift64RightJamming( zSig
, - zExp
, &zSig
);
430 roundBits
= zSig
& 0x3FF;
431 if ( isTiny
&& roundBits
) roundData
->exception
|= float_flag_underflow
;
434 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
435 zSig
= ( zSig
+ roundIncrement
)>>10;
436 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
437 if ( zSig
== 0 ) zExp
= 0;
438 return packFloat64( zSign
, zExp
, zSig
);
443 -------------------------------------------------------------------------------
444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
445 and significand `zSig', and returns the proper double-precision floating-
446 point value corresponding to the abstract input. This routine is just like
447 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
448 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
450 -------------------------------------------------------------------------------
453 normalizeRoundAndPackFloat64( struct roundingData
*roundData
, flag zSign
, int16 zExp
, bits64 zSig
)
457 shiftCount
= countLeadingZeros64( zSig
) - 1;
458 return roundAndPackFloat64( roundData
, zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
465 -------------------------------------------------------------------------------
466 Returns the fraction bits of the extended double-precision floating-point
468 -------------------------------------------------------------------------------
470 INLINE bits64
extractFloatx80Frac( floatx80 a
)
478 -------------------------------------------------------------------------------
479 Returns the exponent bits of the extended double-precision floating-point
481 -------------------------------------------------------------------------------
483 INLINE int32
extractFloatx80Exp( floatx80 a
)
486 return a
.high
& 0x7FFF;
491 -------------------------------------------------------------------------------
492 Returns the sign bit of the extended double-precision floating-point value
494 -------------------------------------------------------------------------------
496 INLINE flag
extractFloatx80Sign( floatx80 a
)
504 -------------------------------------------------------------------------------
505 Normalizes the subnormal extended double-precision floating-point value
506 represented by the denormalized significand `aSig'. The normalized exponent
507 and significand are stored at the locations pointed to by `zExpPtr' and
508 `zSigPtr', respectively.
509 -------------------------------------------------------------------------------
512 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
516 shiftCount
= countLeadingZeros64( aSig
);
517 *zSigPtr
= aSig
<<shiftCount
;
518 *zExpPtr
= 1 - shiftCount
;
523 -------------------------------------------------------------------------------
524 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
525 extended double-precision floating-point value, returning the result.
526 -------------------------------------------------------------------------------
528 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
533 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
539 -------------------------------------------------------------------------------
540 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
541 and extended significand formed by the concatenation of `zSig0' and `zSig1',
542 and returns the proper extended double-precision floating-point value
543 corresponding to the abstract input. Ordinarily, the abstract value is
544 rounded and packed into the extended double-precision format, with the
545 inexact exception raised if the abstract input cannot be represented
546 exactly. If the abstract value is too large, however, the overflow and
547 inexact exceptions are raised and an infinity or maximal finite value is
548 returned. If the abstract value is too small, the input value is rounded to
549 a subnormal number, and the underflow and inexact exceptions are raised if
550 the abstract input cannot be represented exactly as a subnormal extended
551 double-precision floating-point number.
552 If `roundingPrecision' is 32 or 64, the result is rounded to the same
553 number of bits as single or double precision, respectively. Otherwise, the
554 result is rounded to the full precision of the extended double-precision
556 The input significand must be normalized or smaller. If the input
557 significand is not normalized, `zExp' must be 0; in that case, the result
558 returned is a subnormal number, and it must not require rounding. The
559 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
560 Floating-point Arithmetic.
561 -------------------------------------------------------------------------------
564 roundAndPackFloatx80(
565 struct roundingData
*roundData
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
568 int8 roundingMode
, roundingPrecision
;
569 flag roundNearestEven
, increment
, isTiny
;
570 int64 roundIncrement
, roundMask
, roundBits
;
572 roundingMode
= roundData
->mode
;
573 roundingPrecision
= roundData
->precision
;
574 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
575 if ( roundingPrecision
== 80 ) goto precision80
;
576 if ( roundingPrecision
== 64 ) {
577 roundIncrement
= LIT64( 0x0000000000000400 );
578 roundMask
= LIT64( 0x00000000000007FF );
580 else if ( roundingPrecision
== 32 ) {
581 roundIncrement
= LIT64( 0x0000008000000000 );
582 roundMask
= LIT64( 0x000000FFFFFFFFFF );
587 zSig0
|= ( zSig1
!= 0 );
588 if ( ! roundNearestEven
) {
589 if ( roundingMode
== float_round_to_zero
) {
593 roundIncrement
= roundMask
;
595 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
598 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
602 roundBits
= zSig0
& roundMask
;
603 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
604 if ( ( 0x7FFE < zExp
)
605 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
611 ( float_detect_tininess
== float_tininess_before_rounding
)
613 || ( zSig0
<= zSig0
+ roundIncrement
);
614 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
616 roundBits
= zSig0
& roundMask
;
617 if ( isTiny
&& roundBits
) roundData
->exception
|= float_flag_underflow
;
618 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
619 zSig0
+= roundIncrement
;
620 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
621 roundIncrement
= roundMask
+ 1;
622 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
623 roundMask
|= roundIncrement
;
625 zSig0
&= ~ roundMask
;
626 return packFloatx80( zSign
, zExp
, zSig0
);
629 if ( roundBits
) roundData
->exception
|= float_flag_inexact
;
630 zSig0
+= roundIncrement
;
631 if ( zSig0
< roundIncrement
) {
633 zSig0
= LIT64( 0x8000000000000000 );
635 roundIncrement
= roundMask
+ 1;
636 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
637 roundMask
|= roundIncrement
;
639 zSig0
&= ~ roundMask
;
640 if ( zSig0
== 0 ) zExp
= 0;
641 return packFloatx80( zSign
, zExp
, zSig0
);
643 increment
= ( (sbits64
) zSig1
< 0 );
644 if ( ! roundNearestEven
) {
645 if ( roundingMode
== float_round_to_zero
) {
650 increment
= ( roundingMode
== float_round_down
) && zSig1
;
653 increment
= ( roundingMode
== float_round_up
) && zSig1
;
657 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
658 if ( ( 0x7FFE < zExp
)
659 || ( ( zExp
== 0x7FFE )
660 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
666 roundData
->exception
|= float_flag_overflow
| float_flag_inexact
;
667 if ( ( roundingMode
== float_round_to_zero
)
668 || ( zSign
&& ( roundingMode
== float_round_up
) )
669 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
671 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
673 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
677 ( float_detect_tininess
== float_tininess_before_rounding
)
680 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
681 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
683 if ( isTiny
&& zSig1
) roundData
->exception
|= float_flag_underflow
;
684 if ( zSig1
) roundData
->exception
|= float_flag_inexact
;
685 if ( roundNearestEven
) {
686 increment
= ( (sbits64
) zSig1
< 0 );
690 increment
= ( roundingMode
== float_round_down
) && zSig1
;
693 increment
= ( roundingMode
== float_round_up
) && zSig1
;
698 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
699 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
701 return packFloatx80( zSign
, zExp
, zSig0
);
704 if ( zSig1
) roundData
->exception
|= float_flag_inexact
;
709 zSig0
= LIT64( 0x8000000000000000 );
712 zSig0
&= ~ ( ( zSig1
+ zSig1
== 0 ) & roundNearestEven
);
716 if ( zSig0
== 0 ) zExp
= 0;
719 return packFloatx80( zSign
, zExp
, zSig0
);
723 -------------------------------------------------------------------------------
724 Takes an abstract floating-point value having sign `zSign', exponent
725 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
726 and returns the proper extended double-precision floating-point value
727 corresponding to the abstract input. This routine is just like
728 `roundAndPackFloatx80' except that the input significand does not have to be
730 -------------------------------------------------------------------------------
733 normalizeRoundAndPackFloatx80(
734 struct roundingData
*roundData
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
744 shiftCount
= countLeadingZeros64( zSig0
);
745 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
748 roundAndPackFloatx80( roundData
, zSign
, zExp
, zSig0
, zSig1
);
755 -------------------------------------------------------------------------------
756 Returns the result of converting the 32-bit two's complement integer `a' to
757 the single-precision floating-point format. The conversion is performed
758 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
759 -------------------------------------------------------------------------------
761 float32
int32_to_float32(struct roundingData
*roundData
, int32 a
)
765 if ( a
== 0 ) return 0;
766 if ( a
== 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
768 return normalizeRoundAndPackFloat32( roundData
, zSign
, 0x9C, zSign
? - a
: a
);
773 -------------------------------------------------------------------------------
774 Returns the result of converting the 32-bit two's complement integer `a' to
775 the double-precision floating-point format. The conversion is performed
776 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
777 -------------------------------------------------------------------------------
779 float64
int32_to_float64( int32 a
)
786 if ( a
== 0 ) return 0;
788 absA
= aSign
? - a
: a
;
789 shiftCount
= countLeadingZeros32( absA
) + 21;
791 return packFloat64( aSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
798 -------------------------------------------------------------------------------
799 Returns the result of converting the 32-bit two's complement integer `a'
800 to the extended double-precision floating-point format. The conversion
801 is performed according to the IEC/IEEE Standard for Binary Floating-point
803 -------------------------------------------------------------------------------
805 floatx80
int32_to_floatx80( int32 a
)
812 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
814 absA
= zSign
? - a
: a
;
815 shiftCount
= countLeadingZeros32( absA
) + 32;
817 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
824 -------------------------------------------------------------------------------
825 Returns the result of converting the single-precision floating-point value
826 `a' to the 32-bit two's complement integer format. The conversion is
827 performed according to the IEC/IEEE Standard for Binary Floating-point
828 Arithmetic---which means in particular that the conversion is rounded
829 according to the current rounding mode. If `a' is a NaN, the largest
830 positive integer is returned. Otherwise, if the conversion overflows, the
831 largest integer with the same sign as `a' is returned.
832 -------------------------------------------------------------------------------
834 int32
float32_to_int32( struct roundingData
*roundData
, float32 a
)
837 int16 aExp
, shiftCount
;
841 aSig
= extractFloat32Frac( a
);
842 aExp
= extractFloat32Exp( a
);
843 aSign
= extractFloat32Sign( a
);
844 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
845 if ( aExp
) aSig
|= 0x00800000;
846 shiftCount
= 0xAF - aExp
;
849 if ( 0 < shiftCount
) shift64RightJamming( zSig
, shiftCount
, &zSig
);
850 return roundAndPackInt32( roundData
, aSign
, zSig
);
855 -------------------------------------------------------------------------------
856 Returns the result of converting the single-precision floating-point value
857 `a' to the 32-bit two's complement integer format. The conversion is
858 performed according to the IEC/IEEE Standard for Binary Floating-point
859 Arithmetic, except that the conversion is always rounded toward zero. If
860 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
861 conversion overflows, the largest integer with the same sign as `a' is
863 -------------------------------------------------------------------------------
865 int32
float32_to_int32_round_to_zero( float32 a
)
868 int16 aExp
, shiftCount
;
872 aSig
= extractFloat32Frac( a
);
873 aExp
= extractFloat32Exp( a
);
874 aSign
= extractFloat32Sign( a
);
875 shiftCount
= aExp
- 0x9E;
876 if ( 0 <= shiftCount
) {
877 if ( a
== 0xCF000000 ) return 0x80000000;
878 float_raise( float_flag_invalid
);
879 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
882 else if ( aExp
<= 0x7E ) {
883 if ( aExp
| aSig
) float_raise( float_flag_inexact
);
886 aSig
= ( aSig
| 0x00800000 )<<8;
887 z
= aSig
>>( - shiftCount
);
888 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
889 float_raise( float_flag_inexact
);
891 return aSign
? - z
: z
;
896 -------------------------------------------------------------------------------
897 Returns the result of converting the single-precision floating-point value
898 `a' to the double-precision floating-point format. The conversion is
899 performed according to the IEC/IEEE Standard for Binary Floating-point
901 -------------------------------------------------------------------------------
903 float64
float32_to_float64( float32 a
)
909 aSig
= extractFloat32Frac( a
);
910 aExp
= extractFloat32Exp( a
);
911 aSign
= extractFloat32Sign( a
);
912 if ( aExp
== 0xFF ) {
913 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
914 return packFloat64( aSign
, 0x7FF, 0 );
917 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
918 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
921 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
928 -------------------------------------------------------------------------------
929 Returns the result of converting the single-precision floating-point value
930 `a' to the extended double-precision floating-point format. The conversion
931 is performed according to the IEC/IEEE Standard for Binary Floating-point
933 -------------------------------------------------------------------------------
935 floatx80
float32_to_floatx80( float32 a
)
941 aSig
= extractFloat32Frac( a
);
942 aExp
= extractFloat32Exp( a
);
943 aSign
= extractFloat32Sign( a
);
944 if ( aExp
== 0xFF ) {
945 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
946 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
949 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
950 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
953 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
960 -------------------------------------------------------------------------------
961 Rounds the single-precision floating-point value `a' to an integer, and
962 returns the result as a single-precision floating-point value. The
963 operation is performed according to the IEC/IEEE Standard for Binary
964 Floating-point Arithmetic.
965 -------------------------------------------------------------------------------
967 float32
float32_round_to_int( struct roundingData
*roundData
, float32 a
)
971 bits32 lastBitMask
, roundBitsMask
;
975 aExp
= extractFloat32Exp( a
);
976 if ( 0x96 <= aExp
) {
977 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
978 return propagateFloat32NaN( a
, a
);
982 roundingMode
= roundData
->mode
;
983 if ( aExp
<= 0x7E ) {
984 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
985 roundData
->exception
|= float_flag_inexact
;
986 aSign
= extractFloat32Sign( a
);
987 switch ( roundingMode
) {
988 case float_round_nearest_even
:
989 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
990 return packFloat32( aSign
, 0x7F, 0 );
993 case float_round_down
:
994 return aSign
? 0xBF800000 : 0;
996 return aSign
? 0x80000000 : 0x3F800000;
998 return packFloat32( aSign
, 0, 0 );
1001 lastBitMask
<<= 0x96 - aExp
;
1002 roundBitsMask
= lastBitMask
- 1;
1004 if ( roundingMode
== float_round_nearest_even
) {
1005 z
+= lastBitMask
>>1;
1006 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1008 else if ( roundingMode
!= float_round_to_zero
) {
1009 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1013 z
&= ~ roundBitsMask
;
1014 if ( z
!= a
) roundData
->exception
|= float_flag_inexact
;
1020 -------------------------------------------------------------------------------
1021 Returns the result of adding the absolute values of the single-precision
1022 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1023 before being returned. `zSign' is ignored if the result is a NaN. The
1024 addition is performed according to the IEC/IEEE Standard for Binary
1025 Floating-point Arithmetic.
1026 -------------------------------------------------------------------------------
1028 static float32
addFloat32Sigs( struct roundingData
*roundData
, float32 a
, float32 b
, flag zSign
)
1030 int16 aExp
, bExp
, zExp
;
1031 bits32 aSig
, bSig
, zSig
;
1034 aSig
= extractFloat32Frac( a
);
1035 aExp
= extractFloat32Exp( a
);
1036 bSig
= extractFloat32Frac( b
);
1037 bExp
= extractFloat32Exp( b
);
1038 expDiff
= aExp
- bExp
;
1041 if ( 0 < expDiff
) {
1042 if ( aExp
== 0xFF ) {
1043 if ( aSig
) return propagateFloat32NaN( a
, b
);
1052 shift32RightJamming( bSig
, expDiff
, &bSig
);
1055 else if ( expDiff
< 0 ) {
1056 if ( bExp
== 0xFF ) {
1057 if ( bSig
) return propagateFloat32NaN( a
, b
);
1058 return packFloat32( zSign
, 0xFF, 0 );
1066 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1070 if ( aExp
== 0xFF ) {
1071 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1074 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1075 zSig
= 0x40000000 + aSig
+ bSig
;
1080 zSig
= ( aSig
+ bSig
)<<1;
1082 if ( (sbits32
) zSig
< 0 ) {
1087 return roundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1092 -------------------------------------------------------------------------------
1093 Returns the result of subtracting the absolute values of the single-
1094 precision floating-point values `a' and `b'. If `zSign' is true, the
1095 difference is negated before being returned. `zSign' is ignored if the
1096 result is a NaN. The subtraction is performed according to the IEC/IEEE
1097 Standard for Binary Floating-point Arithmetic.
1098 -------------------------------------------------------------------------------
1100 static float32
subFloat32Sigs( struct roundingData
*roundData
, float32 a
, float32 b
, flag zSign
)
1102 int16 aExp
, bExp
, zExp
;
1103 bits32 aSig
, bSig
, zSig
;
1106 aSig
= extractFloat32Frac( a
);
1107 aExp
= extractFloat32Exp( a
);
1108 bSig
= extractFloat32Frac( b
);
1109 bExp
= extractFloat32Exp( b
);
1110 expDiff
= aExp
- bExp
;
1113 if ( 0 < expDiff
) goto aExpBigger
;
1114 if ( expDiff
< 0 ) goto bExpBigger
;
1115 if ( aExp
== 0xFF ) {
1116 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1117 roundData
->exception
|= float_flag_invalid
;
1118 return float32_default_nan
;
1124 if ( bSig
< aSig
) goto aBigger
;
1125 if ( aSig
< bSig
) goto bBigger
;
1126 return packFloat32( roundData
->mode
== float_round_down
, 0, 0 );
1128 if ( bExp
== 0xFF ) {
1129 if ( bSig
) return propagateFloat32NaN( a
, b
);
1130 return packFloat32( zSign
^ 1, 0xFF, 0 );
1138 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1144 goto normalizeRoundAndPack
;
1146 if ( aExp
== 0xFF ) {
1147 if ( aSig
) return propagateFloat32NaN( a
, b
);
1156 shift32RightJamming( bSig
, expDiff
, &bSig
);
1161 normalizeRoundAndPack
:
1163 return normalizeRoundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1168 -------------------------------------------------------------------------------
1169 Returns the result of adding the single-precision floating-point values `a'
1170 and `b'. The operation is performed according to the IEC/IEEE Standard for
1171 Binary Floating-point Arithmetic.
1172 -------------------------------------------------------------------------------
1174 float32
float32_add( struct roundingData
*roundData
, float32 a
, float32 b
)
1178 aSign
= extractFloat32Sign( a
);
1179 bSign
= extractFloat32Sign( b
);
1180 if ( aSign
== bSign
) {
1181 return addFloat32Sigs( roundData
, a
, b
, aSign
);
1184 return subFloat32Sigs( roundData
, a
, b
, aSign
);
1190 -------------------------------------------------------------------------------
1191 Returns the result of subtracting the single-precision floating-point values
1192 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1193 for Binary Floating-point Arithmetic.
1194 -------------------------------------------------------------------------------
1196 float32
float32_sub( struct roundingData
*roundData
, float32 a
, float32 b
)
1200 aSign
= extractFloat32Sign( a
);
1201 bSign
= extractFloat32Sign( b
);
1202 if ( aSign
== bSign
) {
1203 return subFloat32Sigs( roundData
, a
, b
, aSign
);
1206 return addFloat32Sigs( roundData
, a
, b
, aSign
);
1212 -------------------------------------------------------------------------------
1213 Returns the result of multiplying the single-precision floating-point values
1214 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1215 for Binary Floating-point Arithmetic.
1216 -------------------------------------------------------------------------------
1218 float32
float32_mul( struct roundingData
*roundData
, float32 a
, float32 b
)
1220 flag aSign
, bSign
, zSign
;
1221 int16 aExp
, bExp
, zExp
;
1226 aSig
= extractFloat32Frac( a
);
1227 aExp
= extractFloat32Exp( a
);
1228 aSign
= extractFloat32Sign( a
);
1229 bSig
= extractFloat32Frac( b
);
1230 bExp
= extractFloat32Exp( b
);
1231 bSign
= extractFloat32Sign( b
);
1232 zSign
= aSign
^ bSign
;
1233 if ( aExp
== 0xFF ) {
1234 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1235 return propagateFloat32NaN( a
, b
);
1237 if ( ( bExp
| bSig
) == 0 ) {
1238 roundData
->exception
|= float_flag_invalid
;
1239 return float32_default_nan
;
1241 return packFloat32( zSign
, 0xFF, 0 );
1243 if ( bExp
== 0xFF ) {
1244 if ( bSig
) return propagateFloat32NaN( a
, b
);
1245 if ( ( aExp
| aSig
) == 0 ) {
1246 roundData
->exception
|= float_flag_invalid
;
1247 return float32_default_nan
;
1249 return packFloat32( zSign
, 0xFF, 0 );
1252 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1253 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1256 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1257 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1259 zExp
= aExp
+ bExp
- 0x7F;
1260 aSig
= ( aSig
| 0x00800000 )<<7;
1261 bSig
= ( bSig
| 0x00800000 )<<8;
1262 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1264 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1268 return roundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1273 -------------------------------------------------------------------------------
1274 Returns the result of dividing the single-precision floating-point value `a'
1275 by the corresponding value `b'. The operation is performed according to the
1276 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1277 -------------------------------------------------------------------------------
1279 float32
float32_div( struct roundingData
*roundData
, float32 a
, float32 b
)
1281 flag aSign
, bSign
, zSign
;
1282 int16 aExp
, bExp
, zExp
;
1283 bits32 aSig
, bSig
, zSig
;
1285 aSig
= extractFloat32Frac( a
);
1286 aExp
= extractFloat32Exp( a
);
1287 aSign
= extractFloat32Sign( a
);
1288 bSig
= extractFloat32Frac( b
);
1289 bExp
= extractFloat32Exp( b
);
1290 bSign
= extractFloat32Sign( b
);
1291 zSign
= aSign
^ bSign
;
1292 if ( aExp
== 0xFF ) {
1293 if ( aSig
) return propagateFloat32NaN( a
, b
);
1294 if ( bExp
== 0xFF ) {
1295 if ( bSig
) return propagateFloat32NaN( a
, b
);
1296 roundData
->exception
|= float_flag_invalid
;
1297 return float32_default_nan
;
1299 return packFloat32( zSign
, 0xFF, 0 );
1301 if ( bExp
== 0xFF ) {
1302 if ( bSig
) return propagateFloat32NaN( a
, b
);
1303 return packFloat32( zSign
, 0, 0 );
1307 if ( ( aExp
| aSig
) == 0 ) {
1308 roundData
->exception
|= float_flag_invalid
;
1309 return float32_default_nan
;
1311 roundData
->exception
|= float_flag_divbyzero
;
1312 return packFloat32( zSign
, 0xFF, 0 );
1314 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1317 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1318 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1320 zExp
= aExp
- bExp
+ 0x7D;
1321 aSig
= ( aSig
| 0x00800000 )<<7;
1322 bSig
= ( bSig
| 0x00800000 )<<8;
1323 if ( bSig
<= ( aSig
+ aSig
) ) {
1328 bits64 tmp
= ( (bits64
) aSig
)<<32;
1329 do_div( tmp
, bSig
);
1332 if ( ( zSig
& 0x3F ) == 0 ) {
1333 zSig
|= ( ( (bits64
) bSig
) * zSig
!= ( (bits64
) aSig
)<<32 );
1335 return roundAndPackFloat32( roundData
, zSign
, zExp
, zSig
);
1340 -------------------------------------------------------------------------------
1341 Returns the remainder of the single-precision floating-point value `a'
1342 with respect to the corresponding value `b'. The operation is performed
1343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1344 -------------------------------------------------------------------------------
1346 float32
float32_rem( struct roundingData
*roundData
, float32 a
, float32 b
)
1348 flag aSign
, bSign
, zSign
;
1349 int16 aExp
, bExp
, expDiff
;
1352 bits64 aSig64
, bSig64
, q64
;
1353 bits32 alternateASig
;
1356 aSig
= extractFloat32Frac( a
);
1357 aExp
= extractFloat32Exp( a
);
1358 aSign
= extractFloat32Sign( a
);
1359 bSig
= extractFloat32Frac( b
);
1360 bExp
= extractFloat32Exp( b
);
1361 bSign
= extractFloat32Sign( b
);
1362 if ( aExp
== 0xFF ) {
1363 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1364 return propagateFloat32NaN( a
, b
);
1366 roundData
->exception
|= float_flag_invalid
;
1367 return float32_default_nan
;
1369 if ( bExp
== 0xFF ) {
1370 if ( bSig
) return propagateFloat32NaN( a
, b
);
1375 roundData
->exception
|= float_flag_invalid
;
1376 return float32_default_nan
;
1378 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1381 if ( aSig
== 0 ) return a
;
1382 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1384 expDiff
= aExp
- bExp
;
1387 if ( expDiff
< 32 ) {
1390 if ( expDiff
< 0 ) {
1391 if ( expDiff
< -1 ) return a
;
1394 q
= ( bSig
<= aSig
);
1395 if ( q
) aSig
-= bSig
;
1396 if ( 0 < expDiff
) {
1397 bits64 tmp
= ( (bits64
) aSig
)<<32;
1398 do_div( tmp
, bSig
);
1402 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1410 if ( bSig
<= aSig
) aSig
-= bSig
;
1411 aSig64
= ( (bits64
) aSig
)<<40;
1412 bSig64
= ( (bits64
) bSig
)<<40;
1414 while ( 0 < expDiff
) {
1415 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1416 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1417 aSig64
= - ( ( bSig
* q64
)<<38 );
1421 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1422 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1423 q
= q64
>>( 64 - expDiff
);
1425 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1428 alternateASig
= aSig
;
1431 } while ( 0 <= (sbits32
) aSig
);
1432 sigMean
= aSig
+ alternateASig
;
1433 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1434 aSig
= alternateASig
;
1436 zSign
= ( (sbits32
) aSig
< 0 );
1437 if ( zSign
) aSig
= - aSig
;
1438 return normalizeRoundAndPackFloat32( roundData
, aSign
^ zSign
, bExp
, aSig
);
1443 -------------------------------------------------------------------------------
1444 Returns the square root of the single-precision floating-point value `a'.
1445 The operation is performed according to the IEC/IEEE Standard for Binary
1446 Floating-point Arithmetic.
1447 -------------------------------------------------------------------------------
1449 float32
float32_sqrt( struct roundingData
*roundData
, float32 a
)
1456 aSig
= extractFloat32Frac( a
);
1457 aExp
= extractFloat32Exp( a
);
1458 aSign
= extractFloat32Sign( a
);
1459 if ( aExp
== 0xFF ) {
1460 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1461 if ( ! aSign
) return a
;
1462 roundData
->exception
|= float_flag_invalid
;
1463 return float32_default_nan
;
1466 if ( ( aExp
| aSig
) == 0 ) return a
;
1467 roundData
->exception
|= float_flag_invalid
;
1468 return float32_default_nan
;
1471 if ( aSig
== 0 ) return 0;
1472 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1474 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1475 aSig
= ( aSig
| 0x00800000 )<<8;
1476 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1477 if ( ( zSig
& 0x7F ) <= 5 ) {
1483 term
= ( (bits64
) zSig
) * zSig
;
1484 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
1485 while ( (sbits64
) rem
< 0 ) {
1487 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
1489 zSig
|= ( rem
!= 0 );
1492 shift32RightJamming( zSig
, 1, &zSig
);
1493 return roundAndPackFloat32( roundData
, 0, zExp
, zSig
);
1498 -------------------------------------------------------------------------------
1499 Returns 1 if the single-precision floating-point value `a' is equal to the
1500 corresponding value `b', and 0 otherwise. The comparison is performed
1501 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1502 -------------------------------------------------------------------------------
1504 flag
float32_eq( float32 a
, float32 b
)
1507 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1508 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1510 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1511 float_raise( float_flag_invalid
);
1515 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1520 -------------------------------------------------------------------------------
1521 Returns 1 if the single-precision floating-point value `a' is less than or
1522 equal to the corresponding value `b', and 0 otherwise. The comparison is
1523 performed according to the IEC/IEEE Standard for Binary Floating-point
1525 -------------------------------------------------------------------------------
1527 flag
float32_le( float32 a
, float32 b
)
1531 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1532 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1534 float_raise( float_flag_invalid
);
1537 aSign
= extractFloat32Sign( a
);
1538 bSign
= extractFloat32Sign( b
);
1539 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1540 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1545 -------------------------------------------------------------------------------
1546 Returns 1 if the single-precision floating-point value `a' is less than
1547 the corresponding value `b', and 0 otherwise. The comparison is performed
1548 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1549 -------------------------------------------------------------------------------
1551 flag
float32_lt( float32 a
, float32 b
)
1555 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1556 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1558 float_raise( float_flag_invalid
);
1561 aSign
= extractFloat32Sign( a
);
1562 bSign
= extractFloat32Sign( b
);
1563 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1564 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1569 -------------------------------------------------------------------------------
1570 Returns 1 if the single-precision floating-point value `a' is equal to the
1571 corresponding value `b', and 0 otherwise. The invalid exception is raised
1572 if either operand is a NaN. Otherwise, the comparison is performed
1573 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1574 -------------------------------------------------------------------------------
1576 flag
float32_eq_signaling( float32 a
, float32 b
)
1579 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1580 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1582 float_raise( float_flag_invalid
);
1585 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1590 -------------------------------------------------------------------------------
1591 Returns 1 if the single-precision floating-point value `a' is less than or
1592 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1593 cause an exception. Otherwise, the comparison is performed according to the
1594 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1595 -------------------------------------------------------------------------------
1597 flag
float32_le_quiet( float32 a
, float32 b
)
1602 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1603 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1605 /* Do nothing, even if NaN as we're quiet */
1608 aSign
= extractFloat32Sign( a
);
1609 bSign
= extractFloat32Sign( b
);
1610 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1611 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1616 -------------------------------------------------------------------------------
1617 Returns 1 if the single-precision floating-point value `a' is less than
1618 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1619 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1620 Standard for Binary Floating-point Arithmetic.
1621 -------------------------------------------------------------------------------
1623 flag
float32_lt_quiet( float32 a
, float32 b
)
1627 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1628 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1630 /* Do nothing, even if NaN as we're quiet */
1633 aSign
= extractFloat32Sign( a
);
1634 bSign
= extractFloat32Sign( b
);
1635 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1636 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1641 -------------------------------------------------------------------------------
1642 Returns the result of converting the double-precision floating-point value
1643 `a' to the 32-bit two's complement integer format. The conversion is
1644 performed according to the IEC/IEEE Standard for Binary Floating-point
1645 Arithmetic---which means in particular that the conversion is rounded
1646 according to the current rounding mode. If `a' is a NaN, the largest
1647 positive integer is returned. Otherwise, if the conversion overflows, the
1648 largest integer with the same sign as `a' is returned.
1649 -------------------------------------------------------------------------------
1651 int32
float64_to_int32( struct roundingData
*roundData
, float64 a
)
1654 int16 aExp
, shiftCount
;
1657 aSig
= extractFloat64Frac( a
);
1658 aExp
= extractFloat64Exp( a
);
1659 aSign
= extractFloat64Sign( a
);
1660 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1661 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1662 shiftCount
= 0x42C - aExp
;
1663 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1664 return roundAndPackInt32( roundData
, aSign
, aSig
);
1669 -------------------------------------------------------------------------------
1670 Returns the result of converting the double-precision floating-point value
1671 `a' to the 32-bit two's complement integer format. The conversion is
1672 performed according to the IEC/IEEE Standard for Binary Floating-point
1673 Arithmetic, except that the conversion is always rounded toward zero. If
1674 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1675 conversion overflows, the largest integer with the same sign as `a' is
1677 -------------------------------------------------------------------------------
1679 int32
float64_to_int32_round_to_zero( float64 a
)
1682 int16 aExp
, shiftCount
;
1683 bits64 aSig
, savedASig
;
1686 aSig
= extractFloat64Frac( a
);
1687 aExp
= extractFloat64Exp( a
);
1688 aSign
= extractFloat64Sign( a
);
1689 shiftCount
= 0x433 - aExp
;
1690 if ( shiftCount
< 21 ) {
1691 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1694 else if ( 52 < shiftCount
) {
1695 if ( aExp
|| aSig
) float_raise( float_flag_inexact
);
1698 aSig
|= LIT64( 0x0010000000000000 );
1700 aSig
>>= shiftCount
;
1702 if ( aSign
) z
= - z
;
1703 if ( ( z
< 0 ) ^ aSign
) {
1705 float_raise( float_flag_invalid
);
1706 return aSign
? 0x80000000 : 0x7FFFFFFF;
1708 if ( ( aSig
<<shiftCount
) != savedASig
) {
1709 float_raise( float_flag_inexact
);
1716 -------------------------------------------------------------------------------
1717 Returns the result of converting the double-precision floating-point value
1718 `a' to the 32-bit two's complement unsigned integer format. The conversion
1719 is performed according to the IEC/IEEE Standard for Binary Floating-point
1720 Arithmetic---which means in particular that the conversion is rounded
1721 according to the current rounding mode. If `a' is a NaN, the largest
1722 positive integer is returned. Otherwise, if the conversion overflows, the
1723 largest positive integer is returned.
1724 -------------------------------------------------------------------------------
1726 int32
float64_to_uint32( struct roundingData
*roundData
, float64 a
)
1729 int16 aExp
, shiftCount
;
1732 aSig
= extractFloat64Frac( a
);
1733 aExp
= extractFloat64Exp( a
);
1734 aSign
= 0; //extractFloat64Sign( a );
1735 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1736 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
1737 shiftCount
= 0x42C - aExp
;
1738 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
1739 return roundAndPackInt32( roundData
, aSign
, aSig
);
1743 -------------------------------------------------------------------------------
1744 Returns the result of converting the double-precision floating-point value
1745 `a' to the 32-bit two's complement integer format. The conversion is
1746 performed according to the IEC/IEEE Standard for Binary Floating-point
1747 Arithmetic, except that the conversion is always rounded toward zero. If
1748 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1749 conversion overflows, the largest positive integer is returned.
1750 -------------------------------------------------------------------------------
1752 int32
float64_to_uint32_round_to_zero( float64 a
)
1755 int16 aExp
, shiftCount
;
1756 bits64 aSig
, savedASig
;
1759 aSig
= extractFloat64Frac( a
);
1760 aExp
= extractFloat64Exp( a
);
1761 aSign
= extractFloat64Sign( a
);
1762 shiftCount
= 0x433 - aExp
;
1763 if ( shiftCount
< 21 ) {
1764 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
1767 else if ( 52 < shiftCount
) {
1768 if ( aExp
|| aSig
) float_raise( float_flag_inexact
);
1771 aSig
|= LIT64( 0x0010000000000000 );
1773 aSig
>>= shiftCount
;
1775 if ( aSign
) z
= - z
;
1776 if ( ( z
< 0 ) ^ aSign
) {
1778 float_raise( float_flag_invalid
);
1779 return aSign
? 0x80000000 : 0x7FFFFFFF;
1781 if ( ( aSig
<<shiftCount
) != savedASig
) {
1782 float_raise( float_flag_inexact
);
1788 -------------------------------------------------------------------------------
1789 Returns the result of converting the double-precision floating-point value
1790 `a' to the single-precision floating-point format. The conversion is
1791 performed according to the IEC/IEEE Standard for Binary Floating-point
1793 -------------------------------------------------------------------------------
1795 float32
float64_to_float32( struct roundingData
*roundData
, float64 a
)
1802 aSig
= extractFloat64Frac( a
);
1803 aExp
= extractFloat64Exp( a
);
1804 aSign
= extractFloat64Sign( a
);
1805 if ( aExp
== 0x7FF ) {
1806 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
1807 return packFloat32( aSign
, 0xFF, 0 );
1809 shift64RightJamming( aSig
, 22, &aSig
);
1811 if ( aExp
|| zSig
) {
1815 return roundAndPackFloat32( roundData
, aSign
, aExp
, zSig
);
1822 -------------------------------------------------------------------------------
1823 Returns the result of converting the double-precision floating-point value
1824 `a' to the extended double-precision floating-point format. The conversion
1825 is performed according to the IEC/IEEE Standard for Binary Floating-point
1827 -------------------------------------------------------------------------------
1829 floatx80
float64_to_floatx80( float64 a
)
1835 aSig
= extractFloat64Frac( a
);
1836 aExp
= extractFloat64Exp( a
);
1837 aSign
= extractFloat64Sign( a
);
1838 if ( aExp
== 0x7FF ) {
1839 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
1840 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1843 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1844 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
1848 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
1855 -------------------------------------------------------------------------------
1856 Rounds the double-precision floating-point value `a' to an integer, and
1857 returns the result as a double-precision floating-point value. The
1858 operation is performed according to the IEC/IEEE Standard for Binary
1859 Floating-point Arithmetic.
1860 -------------------------------------------------------------------------------
1862 float64
float64_round_to_int( struct roundingData
*roundData
, float64 a
)
1866 bits64 lastBitMask
, roundBitsMask
;
1870 aExp
= extractFloat64Exp( a
);
1871 if ( 0x433 <= aExp
) {
1872 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
1873 return propagateFloat64NaN( a
, a
);
1877 if ( aExp
<= 0x3FE ) {
1878 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
1879 roundData
->exception
|= float_flag_inexact
;
1880 aSign
= extractFloat64Sign( a
);
1881 switch ( roundData
->mode
) {
1882 case float_round_nearest_even
:
1883 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
1884 return packFloat64( aSign
, 0x3FF, 0 );
1887 case float_round_down
:
1888 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
1889 case float_round_up
:
1891 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1893 return packFloat64( aSign
, 0, 0 );
1896 lastBitMask
<<= 0x433 - aExp
;
1897 roundBitsMask
= lastBitMask
- 1;
1899 roundingMode
= roundData
->mode
;
1900 if ( roundingMode
== float_round_nearest_even
) {
1901 z
+= lastBitMask
>>1;
1902 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1904 else if ( roundingMode
!= float_round_to_zero
) {
1905 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1909 z
&= ~ roundBitsMask
;
1910 if ( z
!= a
) roundData
->exception
|= float_flag_inexact
;
1916 -------------------------------------------------------------------------------
1917 Returns the result of adding the absolute values of the double-precision
1918 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1919 before being returned. `zSign' is ignored if the result is a NaN. The
1920 addition is performed according to the IEC/IEEE Standard for Binary
1921 Floating-point Arithmetic.
1922 -------------------------------------------------------------------------------
1924 static float64
addFloat64Sigs( struct roundingData
*roundData
, float64 a
, float64 b
, flag zSign
)
1926 int16 aExp
, bExp
, zExp
;
1927 bits64 aSig
, bSig
, zSig
;
1930 aSig
= extractFloat64Frac( a
);
1931 aExp
= extractFloat64Exp( a
);
1932 bSig
= extractFloat64Frac( b
);
1933 bExp
= extractFloat64Exp( b
);
1934 expDiff
= aExp
- bExp
;
1937 if ( 0 < expDiff
) {
1938 if ( aExp
== 0x7FF ) {
1939 if ( aSig
) return propagateFloat64NaN( a
, b
);
1946 bSig
|= LIT64( 0x2000000000000000 );
1948 shift64RightJamming( bSig
, expDiff
, &bSig
);
1951 else if ( expDiff
< 0 ) {
1952 if ( bExp
== 0x7FF ) {
1953 if ( bSig
) return propagateFloat64NaN( a
, b
);
1954 return packFloat64( zSign
, 0x7FF, 0 );
1960 aSig
|= LIT64( 0x2000000000000000 );
1962 shift64RightJamming( aSig
, - expDiff
, &aSig
);
1966 if ( aExp
== 0x7FF ) {
1967 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
1970 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
1971 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
1975 aSig
|= LIT64( 0x2000000000000000 );
1976 zSig
= ( aSig
+ bSig
)<<1;
1978 if ( (sbits64
) zSig
< 0 ) {
1983 return roundAndPackFloat64( roundData
, zSign
, zExp
, zSig
);
1988 -------------------------------------------------------------------------------
1989 Returns the result of subtracting the absolute values of the double-
1990 precision floating-point values `a' and `b'. If `zSign' is true, the
1991 difference is negated before being returned. `zSign' is ignored if the
1992 result is a NaN. The subtraction is performed according to the IEC/IEEE
1993 Standard for Binary Floating-point Arithmetic.
1994 -------------------------------------------------------------------------------
1996 static float64
subFloat64Sigs( struct roundingData
*roundData
, float64 a
, float64 b
, flag zSign
)
1998 int16 aExp
, bExp
, zExp
;
1999 bits64 aSig
, bSig
, zSig
;
2002 aSig
= extractFloat64Frac( a
);
2003 aExp
= extractFloat64Exp( a
);
2004 bSig
= extractFloat64Frac( b
);
2005 bExp
= extractFloat64Exp( b
);
2006 expDiff
= aExp
- bExp
;
2009 if ( 0 < expDiff
) goto aExpBigger
;
2010 if ( expDiff
< 0 ) goto bExpBigger
;
2011 if ( aExp
== 0x7FF ) {
2012 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2013 roundData
->exception
|= float_flag_invalid
;
2014 return float64_default_nan
;
2020 if ( bSig
< aSig
) goto aBigger
;
2021 if ( aSig
< bSig
) goto bBigger
;
2022 return packFloat64( roundData
->mode
== float_round_down
, 0, 0 );
2024 if ( bExp
== 0x7FF ) {
2025 if ( bSig
) return propagateFloat64NaN( a
, b
);
2026 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2032 aSig
|= LIT64( 0x4000000000000000 );
2034 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2035 bSig
|= LIT64( 0x4000000000000000 );
2040 goto normalizeRoundAndPack
;
2042 if ( aExp
== 0x7FF ) {
2043 if ( aSig
) return propagateFloat64NaN( a
, b
);
2050 bSig
|= LIT64( 0x4000000000000000 );
2052 shift64RightJamming( bSig
, expDiff
, &bSig
);
2053 aSig
|= LIT64( 0x4000000000000000 );
2057 normalizeRoundAndPack
:
2059 return normalizeRoundAndPackFloat64( roundData
, zSign
, zExp
, zSig
);
2064 -------------------------------------------------------------------------------
2065 Returns the result of adding the double-precision floating-point values `a'
2066 and `b'. The operation is performed according to the IEC/IEEE Standard for
2067 Binary Floating-point Arithmetic.
2068 -------------------------------------------------------------------------------
2070 float64
float64_add( struct roundingData
*roundData
, float64 a
, float64 b
)
2074 aSign
= extractFloat64Sign( a
);
2075 bSign
= extractFloat64Sign( b
);
2076 if ( aSign
== bSign
) {
2077 return addFloat64Sigs( roundData
, a
, b
, aSign
);
2080 return subFloat64Sigs( roundData
, a
, b
, aSign
);
2086 -------------------------------------------------------------------------------
2087 Returns the result of subtracting the double-precision floating-point values
2088 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2089 for Binary Floating-point Arithmetic.
2090 -------------------------------------------------------------------------------
2092 float64
float64_sub( struct roundingData
*roundData
, float64 a
, float64 b
)
2096 aSign
= extractFloat64Sign( a
);
2097 bSign
= extractFloat64Sign( b
);
2098 if ( aSign
== bSign
) {
2099 return subFloat64Sigs( roundData
, a
, b
, aSign
);
2102 return addFloat64Sigs( roundData
, a
, b
, aSign
);
2108 -------------------------------------------------------------------------------
2109 Returns the result of multiplying the double-precision floating-point values
2110 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2111 for Binary Floating-point Arithmetic.
2112 -------------------------------------------------------------------------------
2114 float64
float64_mul( struct roundingData
*roundData
, float64 a
, float64 b
)
2116 flag aSign
, bSign
, zSign
;
2117 int16 aExp
, bExp
, zExp
;
2118 bits64 aSig
, bSig
, zSig0
, zSig1
;
2120 aSig
= extractFloat64Frac( a
);
2121 aExp
= extractFloat64Exp( a
);
2122 aSign
= extractFloat64Sign( a
);
2123 bSig
= extractFloat64Frac( b
);
2124 bExp
= extractFloat64Exp( b
);
2125 bSign
= extractFloat64Sign( b
);
2126 zSign
= aSign
^ bSign
;
2127 if ( aExp
== 0x7FF ) {
2128 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2129 return propagateFloat64NaN( a
, b
);
2131 if ( ( bExp
| bSig
) == 0 ) {
2132 roundData
->exception
|= float_flag_invalid
;
2133 return float64_default_nan
;
2135 return packFloat64( zSign
, 0x7FF, 0 );
2137 if ( bExp
== 0x7FF ) {
2138 if ( bSig
) return propagateFloat64NaN( a
, b
);
2139 if ( ( aExp
| aSig
) == 0 ) {
2140 roundData
->exception
|= float_flag_invalid
;
2141 return float64_default_nan
;
2143 return packFloat64( zSign
, 0x7FF, 0 );
2146 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2147 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2150 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2151 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2153 zExp
= aExp
+ bExp
- 0x3FF;
2154 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2155 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2156 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2157 zSig0
|= ( zSig1
!= 0 );
2158 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2162 return roundAndPackFloat64( roundData
, zSign
, zExp
, zSig0
);
2167 -------------------------------------------------------------------------------
2168 Returns the result of dividing the double-precision floating-point value `a'
2169 by the corresponding value `b'. The operation is performed according to
2170 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2171 -------------------------------------------------------------------------------
2173 float64
float64_div( struct roundingData
*roundData
, float64 a
, float64 b
)
2175 flag aSign
, bSign
, zSign
;
2176 int16 aExp
, bExp
, zExp
;
2177 bits64 aSig
, bSig
, zSig
;
2179 bits64 term0
, term1
;
2181 aSig
= extractFloat64Frac( a
);
2182 aExp
= extractFloat64Exp( a
);
2183 aSign
= extractFloat64Sign( a
);
2184 bSig
= extractFloat64Frac( b
);
2185 bExp
= extractFloat64Exp( b
);
2186 bSign
= extractFloat64Sign( b
);
2187 zSign
= aSign
^ bSign
;
2188 if ( aExp
== 0x7FF ) {
2189 if ( aSig
) return propagateFloat64NaN( a
, b
);
2190 if ( bExp
== 0x7FF ) {
2191 if ( bSig
) return propagateFloat64NaN( a
, b
);
2192 roundData
->exception
|= float_flag_invalid
;
2193 return float64_default_nan
;
2195 return packFloat64( zSign
, 0x7FF, 0 );
2197 if ( bExp
== 0x7FF ) {
2198 if ( bSig
) return propagateFloat64NaN( a
, b
);
2199 return packFloat64( zSign
, 0, 0 );
2203 if ( ( aExp
| aSig
) == 0 ) {
2204 roundData
->exception
|= float_flag_invalid
;
2205 return float64_default_nan
;
2207 roundData
->exception
|= float_flag_divbyzero
;
2208 return packFloat64( zSign
, 0x7FF, 0 );
2210 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2213 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2214 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2216 zExp
= aExp
- bExp
+ 0x3FD;
2217 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2218 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2219 if ( bSig
<= ( aSig
+ aSig
) ) {
2223 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2224 if ( ( zSig
& 0x1FF ) <= 2 ) {
2225 mul64To128( bSig
, zSig
, &term0
, &term1
);
2226 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2227 while ( (sbits64
) rem0
< 0 ) {
2229 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2231 zSig
|= ( rem1
!= 0 );
2233 return roundAndPackFloat64( roundData
, zSign
, zExp
, zSig
);
2238 -------------------------------------------------------------------------------
2239 Returns the remainder of the double-precision floating-point value `a'
2240 with respect to the corresponding value `b'. The operation is performed
2241 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2242 -------------------------------------------------------------------------------
2244 float64
float64_rem( struct roundingData
*roundData
, float64 a
, float64 b
)
2246 flag aSign
, bSign
, zSign
;
2247 int16 aExp
, bExp
, expDiff
;
2249 bits64 q
, alternateASig
;
2252 aSig
= extractFloat64Frac( a
);
2253 aExp
= extractFloat64Exp( a
);
2254 aSign
= extractFloat64Sign( a
);
2255 bSig
= extractFloat64Frac( b
);
2256 bExp
= extractFloat64Exp( b
);
2257 bSign
= extractFloat64Sign( b
);
2258 if ( aExp
== 0x7FF ) {
2259 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2260 return propagateFloat64NaN( a
, b
);
2262 roundData
->exception
|= float_flag_invalid
;
2263 return float64_default_nan
;
2265 if ( bExp
== 0x7FF ) {
2266 if ( bSig
) return propagateFloat64NaN( a
, b
);
2271 roundData
->exception
|= float_flag_invalid
;
2272 return float64_default_nan
;
2274 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2277 if ( aSig
== 0 ) return a
;
2278 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2280 expDiff
= aExp
- bExp
;
2281 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2282 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2283 if ( expDiff
< 0 ) {
2284 if ( expDiff
< -1 ) return a
;
2287 q
= ( bSig
<= aSig
);
2288 if ( q
) aSig
-= bSig
;
2290 while ( 0 < expDiff
) {
2291 q
= estimateDiv128To64( aSig
, 0, bSig
);
2292 q
= ( 2 < q
) ? q
- 2 : 0;
2293 aSig
= - ( ( bSig
>>2 ) * q
);
2297 if ( 0 < expDiff
) {
2298 q
= estimateDiv128To64( aSig
, 0, bSig
);
2299 q
= ( 2 < q
) ? q
- 2 : 0;
2302 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2309 alternateASig
= aSig
;
2312 } while ( 0 <= (sbits64
) aSig
);
2313 sigMean
= aSig
+ alternateASig
;
2314 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2315 aSig
= alternateASig
;
2317 zSign
= ( (sbits64
) aSig
< 0 );
2318 if ( zSign
) aSig
= - aSig
;
2319 return normalizeRoundAndPackFloat64( roundData
, aSign
^ zSign
, bExp
, aSig
);
2324 -------------------------------------------------------------------------------
2325 Returns the square root of the double-precision floating-point value `a'.
2326 The operation is performed according to the IEC/IEEE Standard for Binary
2327 Floating-point Arithmetic.
2328 -------------------------------------------------------------------------------
2330 float64
float64_sqrt( struct roundingData
*roundData
, float64 a
)
2335 bits64 rem0
, rem1
, term0
, term1
; //, shiftedRem;
2338 aSig
= extractFloat64Frac( a
);
2339 aExp
= extractFloat64Exp( a
);
2340 aSign
= extractFloat64Sign( a
);
2341 if ( aExp
== 0x7FF ) {
2342 if ( aSig
) return propagateFloat64NaN( a
, a
);
2343 if ( ! aSign
) return a
;
2344 roundData
->exception
|= float_flag_invalid
;
2345 return float64_default_nan
;
2348 if ( ( aExp
| aSig
) == 0 ) return a
;
2349 roundData
->exception
|= float_flag_invalid
;
2350 return float64_default_nan
;
2353 if ( aSig
== 0 ) return 0;
2354 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2356 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2357 aSig
|= LIT64( 0x0010000000000000 );
2358 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2360 aSig
<<= 9 - ( aExp
& 1 );
2361 zSig
= estimateDiv128To64( aSig
, 0, zSig
) + zSig
+ 2;
2362 if ( ( zSig
& 0x3FF ) <= 5 ) {
2364 zSig
= LIT64( 0xFFFFFFFFFFFFFFFF );
2368 mul64To128( zSig
, zSig
, &term0
, &term1
);
2369 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2370 while ( (sbits64
) rem0
< 0 ) {
2372 shortShift128Left( 0, zSig
, 1, &term0
, &term1
);
2374 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
2376 zSig
|= ( ( rem0
| rem1
) != 0 );
2379 shift64RightJamming( zSig
, 1, &zSig
);
2380 return roundAndPackFloat64( roundData
, 0, zExp
, zSig
);
2385 -------------------------------------------------------------------------------
2386 Returns 1 if the double-precision floating-point value `a' is equal to the
2387 corresponding value `b', and 0 otherwise. The comparison is performed
2388 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2389 -------------------------------------------------------------------------------
2391 flag
float64_eq( float64 a
, float64 b
)
2394 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2395 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2397 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2398 float_raise( float_flag_invalid
);
2402 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2407 -------------------------------------------------------------------------------
2408 Returns 1 if the double-precision floating-point value `a' is less than or
2409 equal to the corresponding value `b', and 0 otherwise. The comparison is
2410 performed according to the IEC/IEEE Standard for Binary Floating-point
2412 -------------------------------------------------------------------------------
2414 flag
float64_le( float64 a
, float64 b
)
2418 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2419 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2421 float_raise( float_flag_invalid
);
2424 aSign
= extractFloat64Sign( a
);
2425 bSign
= extractFloat64Sign( b
);
2426 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2427 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2432 -------------------------------------------------------------------------------
2433 Returns 1 if the double-precision floating-point value `a' is less than
2434 the corresponding value `b', and 0 otherwise. The comparison is performed
2435 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2436 -------------------------------------------------------------------------------
2438 flag
float64_lt( float64 a
, float64 b
)
2442 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2443 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2445 float_raise( float_flag_invalid
);
2448 aSign
= extractFloat64Sign( a
);
2449 bSign
= extractFloat64Sign( b
);
2450 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2451 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2456 -------------------------------------------------------------------------------
2457 Returns 1 if the double-precision floating-point value `a' is equal to the
2458 corresponding value `b', and 0 otherwise. The invalid exception is raised
2459 if either operand is a NaN. Otherwise, the comparison is performed
2460 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2461 -------------------------------------------------------------------------------
2463 flag
float64_eq_signaling( float64 a
, float64 b
)
2466 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2467 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2469 float_raise( float_flag_invalid
);
2472 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2477 -------------------------------------------------------------------------------
2478 Returns 1 if the double-precision floating-point value `a' is less than or
2479 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2480 cause an exception. Otherwise, the comparison is performed according to the
2481 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2482 -------------------------------------------------------------------------------
2484 flag
float64_le_quiet( float64 a
, float64 b
)
2489 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2490 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2492 /* Do nothing, even if NaN as we're quiet */
2495 aSign
= extractFloat64Sign( a
);
2496 bSign
= extractFloat64Sign( b
);
2497 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2498 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2503 -------------------------------------------------------------------------------
2504 Returns 1 if the double-precision floating-point value `a' is less than
2505 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2506 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2507 Standard for Binary Floating-point Arithmetic.
2508 -------------------------------------------------------------------------------
2510 flag
float64_lt_quiet( float64 a
, float64 b
)
2514 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2515 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2517 /* Do nothing, even if NaN as we're quiet */
2520 aSign
= extractFloat64Sign( a
);
2521 bSign
= extractFloat64Sign( b
);
2522 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2523 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2530 -------------------------------------------------------------------------------
2531 Returns the result of converting the extended double-precision floating-
2532 point value `a' to the 32-bit two's complement integer format. The
2533 conversion is performed according to the IEC/IEEE Standard for Binary
2534 Floating-point Arithmetic---which means in particular that the conversion
2535 is rounded according to the current rounding mode. If `a' is a NaN, the
2536 largest positive integer is returned. Otherwise, if the conversion
2537 overflows, the largest integer with the same sign as `a' is returned.
2538 -------------------------------------------------------------------------------
2540 int32
floatx80_to_int32( struct roundingData
*roundData
, floatx80 a
)
2543 int32 aExp
, shiftCount
;
2546 aSig
= extractFloatx80Frac( a
);
2547 aExp
= extractFloatx80Exp( a
);
2548 aSign
= extractFloatx80Sign( a
);
2549 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2550 shiftCount
= 0x4037 - aExp
;
2551 if ( shiftCount
<= 0 ) shiftCount
= 1;
2552 shift64RightJamming( aSig
, shiftCount
, &aSig
);
2553 return roundAndPackInt32( roundData
, aSign
, aSig
);
2558 -------------------------------------------------------------------------------
2559 Returns the result of converting the extended double-precision floating-
2560 point value `a' to the 32-bit two's complement integer format. The
2561 conversion is performed according to the IEC/IEEE Standard for Binary
2562 Floating-point Arithmetic, except that the conversion is always rounded
2563 toward zero. If `a' is a NaN, the largest positive integer is returned.
2564 Otherwise, if the conversion overflows, the largest integer with the same
2565 sign as `a' is returned.
2566 -------------------------------------------------------------------------------
2568 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
2571 int32 aExp
, shiftCount
;
2572 bits64 aSig
, savedASig
;
2575 aSig
= extractFloatx80Frac( a
);
2576 aExp
= extractFloatx80Exp( a
);
2577 aSign
= extractFloatx80Sign( a
);
2578 shiftCount
= 0x403E - aExp
;
2579 if ( shiftCount
< 32 ) {
2580 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
2583 else if ( 63 < shiftCount
) {
2584 if ( aExp
|| aSig
) float_raise( float_flag_inexact
);
2588 aSig
>>= shiftCount
;
2590 if ( aSign
) z
= - z
;
2591 if ( ( z
< 0 ) ^ aSign
) {
2593 float_raise( float_flag_invalid
);
2594 return aSign
? 0x80000000 : 0x7FFFFFFF;
2596 if ( ( aSig
<<shiftCount
) != savedASig
) {
2597 float_raise( float_flag_inexact
);
2604 -------------------------------------------------------------------------------
2605 Returns the result of converting the extended double-precision floating-
2606 point value `a' to the single-precision floating-point format. The
2607 conversion is performed according to the IEC/IEEE Standard for Binary
2608 Floating-point Arithmetic.
2609 -------------------------------------------------------------------------------
2611 float32
floatx80_to_float32( struct roundingData
*roundData
, floatx80 a
)
2617 aSig
= extractFloatx80Frac( a
);
2618 aExp
= extractFloatx80Exp( a
);
2619 aSign
= extractFloatx80Sign( a
);
2620 if ( aExp
== 0x7FFF ) {
2621 if ( (bits64
) ( aSig
<<1 ) ) {
2622 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
2624 return packFloat32( aSign
, 0xFF, 0 );
2626 shift64RightJamming( aSig
, 33, &aSig
);
2627 if ( aExp
|| aSig
) aExp
-= 0x3F81;
2628 return roundAndPackFloat32( roundData
, aSign
, aExp
, aSig
);
2633 -------------------------------------------------------------------------------
2634 Returns the result of converting the extended double-precision floating-
2635 point value `a' to the double-precision floating-point format. The
2636 conversion is performed according to the IEC/IEEE Standard for Binary
2637 Floating-point Arithmetic.
2638 -------------------------------------------------------------------------------
2640 float64
floatx80_to_float64( struct roundingData
*roundData
, floatx80 a
)
2646 aSig
= extractFloatx80Frac( a
);
2647 aExp
= extractFloatx80Exp( a
);
2648 aSign
= extractFloatx80Sign( a
);
2649 if ( aExp
== 0x7FFF ) {
2650 if ( (bits64
) ( aSig
<<1 ) ) {
2651 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
2653 return packFloat64( aSign
, 0x7FF, 0 );
2655 shift64RightJamming( aSig
, 1, &zSig
);
2656 if ( aExp
|| aSig
) aExp
-= 0x3C01;
2657 return roundAndPackFloat64( roundData
, aSign
, aExp
, zSig
);
2662 -------------------------------------------------------------------------------
2663 Rounds the extended double-precision floating-point value `a' to an integer,
2664 and returns the result as an extended quadruple-precision floating-point
2665 value. The operation is performed according to the IEC/IEEE Standard for
2666 Binary Floating-point Arithmetic.
2667 -------------------------------------------------------------------------------
2669 floatx80
floatx80_round_to_int( struct roundingData
*roundData
, floatx80 a
)
2673 bits64 lastBitMask
, roundBitsMask
;
2677 aExp
= extractFloatx80Exp( a
);
2678 if ( 0x403E <= aExp
) {
2679 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
2680 return propagateFloatx80NaN( a
, a
);
2684 if ( aExp
<= 0x3FFE ) {
2686 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
2689 roundData
->exception
|= float_flag_inexact
;
2690 aSign
= extractFloatx80Sign( a
);
2691 switch ( roundData
->mode
) {
2692 case float_round_nearest_even
:
2693 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
2696 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
2699 case float_round_down
:
2702 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2703 : packFloatx80( 0, 0, 0 );
2704 case float_round_up
:
2706 aSign
? packFloatx80( 1, 0, 0 )
2707 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2709 return packFloatx80( aSign
, 0, 0 );
2712 lastBitMask
<<= 0x403E - aExp
;
2713 roundBitsMask
= lastBitMask
- 1;
2715 roundingMode
= roundData
->mode
;
2716 if ( roundingMode
== float_round_nearest_even
) {
2717 z
.low
+= lastBitMask
>>1;
2718 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
2720 else if ( roundingMode
!= float_round_to_zero
) {
2721 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2722 z
.low
+= roundBitsMask
;
2725 z
.low
&= ~ roundBitsMask
;
2728 z
.low
= LIT64( 0x8000000000000000 );
2730 if ( z
.low
!= a
.low
) roundData
->exception
|= float_flag_inexact
;
2736 -------------------------------------------------------------------------------
2737 Returns the result of adding the absolute values of the extended double-
2738 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2739 negated before being returned. `zSign' is ignored if the result is a NaN.
2740 The addition is performed according to the IEC/IEEE Standard for Binary
2741 Floating-point Arithmetic.
2742 -------------------------------------------------------------------------------
2744 static floatx80
addFloatx80Sigs( struct roundingData
*roundData
, floatx80 a
, floatx80 b
, flag zSign
)
2746 int32 aExp
, bExp
, zExp
;
2747 bits64 aSig
, bSig
, zSig0
, zSig1
;
2750 aSig
= extractFloatx80Frac( a
);
2751 aExp
= extractFloatx80Exp( a
);
2752 bSig
= extractFloatx80Frac( b
);
2753 bExp
= extractFloatx80Exp( b
);
2754 expDiff
= aExp
- bExp
;
2755 if ( 0 < expDiff
) {
2756 if ( aExp
== 0x7FFF ) {
2757 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2760 if ( bExp
== 0 ) --expDiff
;
2761 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2764 else if ( expDiff
< 0 ) {
2765 if ( bExp
== 0x7FFF ) {
2766 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2767 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2769 if ( aExp
== 0 ) ++expDiff
;
2770 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2774 if ( aExp
== 0x7FFF ) {
2775 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2776 return propagateFloatx80NaN( a
, b
);
2781 zSig0
= aSig
+ bSig
;
2783 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
2790 zSig0
= aSig
+ bSig
;
2792 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
2794 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2795 zSig0
|= LIT64( 0x8000000000000000 );
2799 roundAndPackFloatx80(
2800 roundData
, zSign
, zExp
, zSig0
, zSig1
);
2805 -------------------------------------------------------------------------------
2806 Returns the result of subtracting the absolute values of the extended
2807 double-precision floating-point values `a' and `b'. If `zSign' is true,
2808 the difference is negated before being returned. `zSign' is ignored if the
2809 result is a NaN. The subtraction is performed according to the IEC/IEEE
2810 Standard for Binary Floating-point Arithmetic.
2811 -------------------------------------------------------------------------------
2813 static floatx80
subFloatx80Sigs( struct roundingData
*roundData
, floatx80 a
, floatx80 b
, flag zSign
)
2815 int32 aExp
, bExp
, zExp
;
2816 bits64 aSig
, bSig
, zSig0
, zSig1
;
2820 aSig
= extractFloatx80Frac( a
);
2821 aExp
= extractFloatx80Exp( a
);
2822 bSig
= extractFloatx80Frac( b
);
2823 bExp
= extractFloatx80Exp( b
);
2824 expDiff
= aExp
- bExp
;
2825 if ( 0 < expDiff
) goto aExpBigger
;
2826 if ( expDiff
< 0 ) goto bExpBigger
;
2827 if ( aExp
== 0x7FFF ) {
2828 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
2829 return propagateFloatx80NaN( a
, b
);
2831 roundData
->exception
|= float_flag_invalid
;
2832 z
.low
= floatx80_default_nan_low
;
2833 z
.high
= floatx80_default_nan_high
;
2841 if ( bSig
< aSig
) goto aBigger
;
2842 if ( aSig
< bSig
) goto bBigger
;
2843 return packFloatx80( roundData
->mode
== float_round_down
, 0, 0 );
2845 if ( bExp
== 0x7FFF ) {
2846 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2847 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2849 if ( aExp
== 0 ) ++expDiff
;
2850 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
2852 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
2855 goto normalizeRoundAndPack
;
2857 if ( aExp
== 0x7FFF ) {
2858 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2861 if ( bExp
== 0 ) --expDiff
;
2862 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
2864 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
2866 normalizeRoundAndPack
:
2868 normalizeRoundAndPackFloatx80(
2869 roundData
, zSign
, zExp
, zSig0
, zSig1
);
2874 -------------------------------------------------------------------------------
2875 Returns the result of adding the extended double-precision floating-point
2876 values `a' and `b'. The operation is performed according to the IEC/IEEE
2877 Standard for Binary Floating-point Arithmetic.
2878 -------------------------------------------------------------------------------
2880 floatx80
floatx80_add( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2884 aSign
= extractFloatx80Sign( a
);
2885 bSign
= extractFloatx80Sign( b
);
2886 if ( aSign
== bSign
) {
2887 return addFloatx80Sigs( roundData
, a
, b
, aSign
);
2890 return subFloatx80Sigs( roundData
, a
, b
, aSign
);
2896 -------------------------------------------------------------------------------
2897 Returns the result of subtracting the extended double-precision floating-
2898 point values `a' and `b'. The operation is performed according to the
2899 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2900 -------------------------------------------------------------------------------
2902 floatx80
floatx80_sub( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2906 aSign
= extractFloatx80Sign( a
);
2907 bSign
= extractFloatx80Sign( b
);
2908 if ( aSign
== bSign
) {
2909 return subFloatx80Sigs( roundData
, a
, b
, aSign
);
2912 return addFloatx80Sigs( roundData
, a
, b
, aSign
);
2918 -------------------------------------------------------------------------------
2919 Returns the result of multiplying the extended double-precision floating-
2920 point values `a' and `b'. The operation is performed according to the
2921 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2922 -------------------------------------------------------------------------------
2924 floatx80
floatx80_mul( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2926 flag aSign
, bSign
, zSign
;
2927 int32 aExp
, bExp
, zExp
;
2928 bits64 aSig
, bSig
, zSig0
, zSig1
;
2931 aSig
= extractFloatx80Frac( a
);
2932 aExp
= extractFloatx80Exp( a
);
2933 aSign
= extractFloatx80Sign( a
);
2934 bSig
= extractFloatx80Frac( b
);
2935 bExp
= extractFloatx80Exp( b
);
2936 bSign
= extractFloatx80Sign( b
);
2937 zSign
= aSign
^ bSign
;
2938 if ( aExp
== 0x7FFF ) {
2939 if ( (bits64
) ( aSig
<<1 )
2940 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
2941 return propagateFloatx80NaN( a
, b
);
2943 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
2944 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2946 if ( bExp
== 0x7FFF ) {
2947 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
2948 if ( ( aExp
| aSig
) == 0 ) {
2950 roundData
->exception
|= float_flag_invalid
;
2951 z
.low
= floatx80_default_nan_low
;
2952 z
.high
= floatx80_default_nan_high
;
2955 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2958 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2959 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
2962 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
2963 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
2965 zExp
= aExp
+ bExp
- 0x3FFE;
2966 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2967 if ( 0 < (sbits64
) zSig0
) {
2968 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
2972 roundAndPackFloatx80(
2973 roundData
, zSign
, zExp
, zSig0
, zSig1
);
2978 -------------------------------------------------------------------------------
2979 Returns the result of dividing the extended double-precision floating-point
2980 value `a' by the corresponding value `b'. The operation is performed
2981 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2982 -------------------------------------------------------------------------------
2984 floatx80
floatx80_div( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
2986 flag aSign
, bSign
, zSign
;
2987 int32 aExp
, bExp
, zExp
;
2988 bits64 aSig
, bSig
, zSig0
, zSig1
;
2989 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
2992 aSig
= extractFloatx80Frac( a
);
2993 aExp
= extractFloatx80Exp( a
);
2994 aSign
= extractFloatx80Sign( a
);
2995 bSig
= extractFloatx80Frac( b
);
2996 bExp
= extractFloatx80Exp( b
);
2997 bSign
= extractFloatx80Sign( b
);
2998 zSign
= aSign
^ bSign
;
2999 if ( aExp
== 0x7FFF ) {
3000 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3001 if ( bExp
== 0x7FFF ) {
3002 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3005 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3007 if ( bExp
== 0x7FFF ) {
3008 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3009 return packFloatx80( zSign
, 0, 0 );
3013 if ( ( aExp
| aSig
) == 0 ) {
3015 roundData
->exception
|= float_flag_invalid
;
3016 z
.low
= floatx80_default_nan_low
;
3017 z
.high
= floatx80_default_nan_high
;
3020 roundData
->exception
|= float_flag_divbyzero
;
3021 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3023 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3026 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3027 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3029 zExp
= aExp
- bExp
+ 0x3FFE;
3031 if ( bSig
<= aSig
) {
3032 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3035 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3036 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3037 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3038 while ( (sbits64
) rem0
< 0 ) {
3040 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3042 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3043 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3044 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3045 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3046 while ( (sbits64
) rem1
< 0 ) {
3048 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3050 zSig1
|= ( ( rem1
| rem2
) != 0 );
3053 roundAndPackFloatx80(
3054 roundData
, zSign
, zExp
, zSig0
, zSig1
);
3059 -------------------------------------------------------------------------------
3060 Returns the remainder of the extended double-precision floating-point value
3061 `a' with respect to the corresponding value `b'. The operation is performed
3062 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3063 -------------------------------------------------------------------------------
3065 floatx80
floatx80_rem( struct roundingData
*roundData
, floatx80 a
, floatx80 b
)
3067 flag aSign
, bSign
, zSign
;
3068 int32 aExp
, bExp
, expDiff
;
3069 bits64 aSig0
, aSig1
, bSig
;
3070 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3073 aSig0
= extractFloatx80Frac( a
);
3074 aExp
= extractFloatx80Exp( a
);
3075 aSign
= extractFloatx80Sign( a
);
3076 bSig
= extractFloatx80Frac( b
);
3077 bExp
= extractFloatx80Exp( b
);
3078 bSign
= extractFloatx80Sign( b
);
3079 if ( aExp
== 0x7FFF ) {
3080 if ( (bits64
) ( aSig0
<<1 )
3081 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3082 return propagateFloatx80NaN( a
, b
);
3086 if ( bExp
== 0x7FFF ) {
3087 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3093 roundData
->exception
|= float_flag_invalid
;
3094 z
.low
= floatx80_default_nan_low
;
3095 z
.high
= floatx80_default_nan_high
;
3098 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3101 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3102 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3104 bSig
|= LIT64( 0x8000000000000000 );
3106 expDiff
= aExp
- bExp
;
3108 if ( expDiff
< 0 ) {
3109 if ( expDiff
< -1 ) return a
;
3110 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3113 q
= ( bSig
<= aSig0
);
3114 if ( q
) aSig0
-= bSig
;
3116 while ( 0 < expDiff
) {
3117 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3118 q
= ( 2 < q
) ? q
- 2 : 0;
3119 mul64To128( bSig
, q
, &term0
, &term1
);
3120 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3121 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3125 if ( 0 < expDiff
) {
3126 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3127 q
= ( 2 < q
) ? q
- 2 : 0;
3129 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3130 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3131 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3132 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3134 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3141 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3142 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3143 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3146 aSig0
= alternateASig0
;
3147 aSig1
= alternateASig1
;
3152 normalizeRoundAndPackFloatx80(
3153 roundData
, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3158 -------------------------------------------------------------------------------
3159 Returns the square root of the extended double-precision floating-point
3160 value `a'. The operation is performed according to the IEC/IEEE Standard
3161 for Binary Floating-point Arithmetic.
3162 -------------------------------------------------------------------------------
3164 floatx80
floatx80_sqrt( struct roundingData
*roundData
, floatx80 a
)
3168 bits64 aSig0
, aSig1
, zSig0
, zSig1
;
3169 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3170 bits64 shiftedRem0
, shiftedRem1
;
3173 aSig0
= extractFloatx80Frac( a
);
3174 aExp
= extractFloatx80Exp( a
);
3175 aSign
= extractFloatx80Sign( a
);
3176 if ( aExp
== 0x7FFF ) {
3177 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
3178 if ( ! aSign
) return a
;
3182 if ( ( aExp
| aSig0
) == 0 ) return a
;
3184 roundData
->exception
|= float_flag_invalid
;
3185 z
.low
= floatx80_default_nan_low
;
3186 z
.high
= floatx80_default_nan_high
;
3190 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3191 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3193 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3194 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3197 shift128Right( aSig0
, 0, ( aExp
& 1 ) + 2, &aSig0
, &aSig1
);
3198 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
) + zSig0
+ 4;
3199 if ( 0 <= (sbits64
) zSig0
) zSig0
= LIT64( 0xFFFFFFFFFFFFFFFF );
3200 shortShift128Left( aSig0
, aSig1
, 2, &aSig0
, &aSig1
);
3201 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3202 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3203 while ( (sbits64
) rem0
< 0 ) {
3205 shortShift128Left( 0, zSig0
, 1, &term0
, &term1
);
3207 add128( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
3209 shortShift128Left( rem0
, rem1
, 63, &shiftedRem0
, &shiftedRem1
);
3210 zSig1
= estimateDiv128To64( shiftedRem0
, shiftedRem1
, zSig0
);
3211 if ( (bits64
) ( zSig1
<<1 ) <= 10 ) {
3212 if ( zSig1
== 0 ) zSig1
= 1;
3213 mul64To128( zSig0
, zSig1
, &term1
, &term2
);
3214 shortShift128Left( term1
, term2
, 1, &term1
, &term2
);
3215 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3216 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3217 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3218 while ( (sbits64
) rem1
< 0 ) {
3220 shortShift192Left( 0, zSig0
, zSig1
, 1, &term1
, &term2
, &term3
);
3223 rem1
, rem2
, rem3
, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
3225 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3228 roundAndPackFloatx80(
3229 roundData
, 0, zExp
, zSig0
, zSig1
);
3234 -------------------------------------------------------------------------------
3235 Returns 1 if the extended double-precision floating-point value `a' is
3236 equal to the corresponding value `b', and 0 otherwise. The comparison is
3237 performed according to the IEC/IEEE Standard for Binary Floating-point
3239 -------------------------------------------------------------------------------
3241 flag
floatx80_eq( floatx80 a
, floatx80 b
)
3244 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3245 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3246 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3247 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3249 if ( floatx80_is_signaling_nan( a
)
3250 || floatx80_is_signaling_nan( b
) ) {
3251 float_raise( float_flag_invalid
);
3257 && ( ( a
.high
== b
.high
)
3259 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3265 -------------------------------------------------------------------------------
3266 Returns 1 if the extended double-precision floating-point value `a' is
3267 less than or equal to the corresponding value `b', and 0 otherwise. The
3268 comparison is performed according to the IEC/IEEE Standard for Binary
3269 Floating-point Arithmetic.
3270 -------------------------------------------------------------------------------
3272 flag
floatx80_le( floatx80 a
, floatx80 b
)
3276 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3277 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3278 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3279 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3281 float_raise( float_flag_invalid
);
3284 aSign
= extractFloatx80Sign( a
);
3285 bSign
= extractFloatx80Sign( b
);
3286 if ( aSign
!= bSign
) {
3289 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3293 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3294 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3299 -------------------------------------------------------------------------------
3300 Returns 1 if the extended double-precision floating-point value `a' is
3301 less than the corresponding value `b', and 0 otherwise. The comparison
3302 is performed according to the IEC/IEEE Standard for Binary Floating-point
3304 -------------------------------------------------------------------------------
3306 flag
floatx80_lt( floatx80 a
, floatx80 b
)
3310 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3311 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3312 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3313 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3315 float_raise( float_flag_invalid
);
3318 aSign
= extractFloatx80Sign( a
);
3319 bSign
= extractFloatx80Sign( b
);
3320 if ( aSign
!= bSign
) {
3323 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3327 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3328 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
3333 -------------------------------------------------------------------------------
3334 Returns 1 if the extended double-precision floating-point value `a' is equal
3335 to the corresponding value `b', and 0 otherwise. The invalid exception is
3336 raised if either operand is a NaN. Otherwise, the comparison is performed
3337 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3338 -------------------------------------------------------------------------------
3340 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
3343 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3344 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3345 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3346 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3348 float_raise( float_flag_invalid
);
3353 && ( ( a
.high
== b
.high
)
3355 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3361 -------------------------------------------------------------------------------
3362 Returns 1 if the extended double-precision floating-point value `a' is less
3363 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3364 do not cause an exception. Otherwise, the comparison is performed according
3365 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3366 -------------------------------------------------------------------------------
3368 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
3372 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3373 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3374 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3375 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3377 /* Do nothing, even if NaN as we're quiet */
3380 aSign
= extractFloatx80Sign( a
);
3381 bSign
= extractFloatx80Sign( b
);
3382 if ( aSign
!= bSign
) {
3385 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3389 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3390 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3395 -------------------------------------------------------------------------------
3396 Returns 1 if the extended double-precision floating-point value `a' is less
3397 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3398 an exception. Otherwise, the comparison is performed according to the
3399 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3400 -------------------------------------------------------------------------------
3402 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
3406 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3407 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3408 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3409 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3411 /* Do nothing, even if NaN as we're quiet */
3414 aSign
= extractFloatx80Sign( a
);
3415 bSign
= extractFloatx80Sign( b
);
3416 if ( aSign
!= bSign
) {
3419 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3423 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3424 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);