Linux 2.6.13-rc4
[linux-2.6/next.git] / arch / arm / nwfpe / softfloat.c
blobe038dd3be9b3c63e019a5be3f77de94f25f4c949
1 /*
2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
33 #include "fpa11.h"
34 //#include "milieu.h"
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Floating-point rounding mode, extended double-precision rounding precision,
40 and exception flags.
41 -------------------------------------------------------------------------------
43 int8 float_rounding_mode = float_round_nearest_even;
44 int8 floatx80_rounding_precision = 80;
45 int8 float_exception_flags;
48 -------------------------------------------------------------------------------
49 Primitive arithmetic functions, including multi-word arithmetic, and
50 division and square root approximations. (Can be specialized to target if
51 desired.)
52 -------------------------------------------------------------------------------
54 #include "softfloat-macros"
57 -------------------------------------------------------------------------------
58 Functions and definitions to determine: (1) whether tininess for underflow
59 is detected before or after rounding by default, (2) what (if anything)
60 happens when exceptions are raised, (3) how signaling NaNs are distinguished
61 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
62 are propagated from function inputs to output. These details are target-
63 specific.
64 -------------------------------------------------------------------------------
66 #include "softfloat-specialize"
69 -------------------------------------------------------------------------------
70 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71 and 7, and returns the properly rounded 32-bit integer corresponding to the
72 input. If `zSign' is nonzero, the input is negated before being converted
73 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
74 input is simply rounded to an integer, with the inexact exception raised if
75 the input cannot be represented exactly as an integer. If the fixed-point
76 input is too large, however, the invalid exception is raised and the largest
77 positive or negative integer is returned.
78 -------------------------------------------------------------------------------
80 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
82 int8 roundingMode;
83 flag roundNearestEven;
84 int8 roundIncrement, roundBits;
85 int32 z;
87 roundingMode = float_rounding_mode;
88 roundNearestEven = ( roundingMode == float_round_nearest_even );
89 roundIncrement = 0x40;
90 if ( ! roundNearestEven ) {
91 if ( roundingMode == float_round_to_zero ) {
92 roundIncrement = 0;
94 else {
95 roundIncrement = 0x7F;
96 if ( zSign ) {
97 if ( roundingMode == float_round_up ) roundIncrement = 0;
99 else {
100 if ( roundingMode == float_round_down ) roundIncrement = 0;
104 roundBits = absZ & 0x7F;
105 absZ = ( absZ + roundIncrement )>>7;
106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107 z = absZ;
108 if ( zSign ) z = - z;
109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110 float_exception_flags |= float_flag_invalid;
111 return zSign ? 0x80000000 : 0x7FFFFFFF;
113 if ( roundBits ) float_exception_flags |= float_flag_inexact;
114 return z;
119 -------------------------------------------------------------------------------
120 Returns the fraction bits of the single-precision floating-point value `a'.
121 -------------------------------------------------------------------------------
123 INLINE bits32 extractFloat32Frac( float32 a )
126 return a & 0x007FFFFF;
131 -------------------------------------------------------------------------------
132 Returns the exponent bits of the single-precision floating-point value `a'.
133 -------------------------------------------------------------------------------
135 INLINE int16 extractFloat32Exp( float32 a )
138 return ( a>>23 ) & 0xFF;
143 -------------------------------------------------------------------------------
144 Returns the sign bit of the single-precision floating-point value `a'.
145 -------------------------------------------------------------------------------
147 #if 0 /* in softfloat.h */
148 INLINE flag extractFloat32Sign( float32 a )
151 return a>>31;
154 #endif
157 -------------------------------------------------------------------------------
158 Normalizes the subnormal single-precision floating-point value represented
159 by the denormalized significand `aSig'. The normalized exponent and
160 significand are stored at the locations pointed to by `zExpPtr' and
161 `zSigPtr', respectively.
162 -------------------------------------------------------------------------------
164 static void
165 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
167 int8 shiftCount;
169 shiftCount = countLeadingZeros32( aSig ) - 8;
170 *zSigPtr = aSig<<shiftCount;
171 *zExpPtr = 1 - shiftCount;
176 -------------------------------------------------------------------------------
177 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
178 single-precision floating-point value, returning the result. After being
179 shifted into the proper positions, the three fields are simply added
180 together to form the result. This means that any integer portion of `zSig'
181 will be added into the exponent. Since a properly normalized significand
182 will have an integer portion equal to 1, the `zExp' input should be 1 less
183 than the desired result exponent whenever `zSig' is a complete, normalized
184 significand.
185 -------------------------------------------------------------------------------
187 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
189 #if 0
190 float32 f;
191 __asm__("@ packFloat32 \n\
192 mov %0, %1, asl #31 \n\
193 orr %0, %2, asl #23 \n\
194 orr %0, %3"
195 : /* no outputs */
196 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
197 : "cc");
198 return f;
199 #else
200 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
201 #endif
205 -------------------------------------------------------------------------------
206 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
207 and significand `zSig', and returns the proper single-precision floating-
208 point value corresponding to the abstract input. Ordinarily, the abstract
209 value is simply rounded and packed into the single-precision format, with
210 the inexact exception raised if the abstract input cannot be represented
211 exactly. If the abstract value is too large, however, the overflow and
212 inexact exceptions are raised and an infinity or maximal finite value is
213 returned. If the abstract value is too small, the input value is rounded to
214 a subnormal number, and the underflow and inexact exceptions are raised if
215 the abstract input cannot be represented exactly as a subnormal single-
216 precision floating-point number.
217 The input significand `zSig' has its binary point between bits 30
218 and 29, which is 7 bits to the left of the usual location. This shifted
219 significand must be normalized or smaller. If `zSig' is not normalized,
220 `zExp' must be 0; in that case, the result returned is a subnormal number,
221 and it must not require rounding. In the usual case that `zSig' is
222 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
223 The handling of underflow and overflow follows the IEC/IEEE Standard for
224 Binary Floating-point Arithmetic.
225 -------------------------------------------------------------------------------
227 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
229 int8 roundingMode;
230 flag roundNearestEven;
231 int8 roundIncrement, roundBits;
232 flag isTiny;
234 roundingMode = float_rounding_mode;
235 roundNearestEven = ( roundingMode == float_round_nearest_even );
236 roundIncrement = 0x40;
237 if ( ! roundNearestEven ) {
238 if ( roundingMode == float_round_to_zero ) {
239 roundIncrement = 0;
241 else {
242 roundIncrement = 0x7F;
243 if ( zSign ) {
244 if ( roundingMode == float_round_up ) roundIncrement = 0;
246 else {
247 if ( roundingMode == float_round_down ) roundIncrement = 0;
251 roundBits = zSig & 0x7F;
252 if ( 0xFD <= (bits16) zExp ) {
253 if ( ( 0xFD < zExp )
254 || ( ( zExp == 0xFD )
255 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
257 float_raise( float_flag_overflow | float_flag_inexact );
258 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
260 if ( zExp < 0 ) {
261 isTiny =
262 ( float_detect_tininess == float_tininess_before_rounding )
263 || ( zExp < -1 )
264 || ( zSig + roundIncrement < 0x80000000 );
265 shift32RightJamming( zSig, - zExp, &zSig );
266 zExp = 0;
267 roundBits = zSig & 0x7F;
268 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
271 if ( roundBits ) float_exception_flags |= float_flag_inexact;
272 zSig = ( zSig + roundIncrement )>>7;
273 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
274 if ( zSig == 0 ) zExp = 0;
275 return packFloat32( zSign, zExp, zSig );
280 -------------------------------------------------------------------------------
281 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
282 and significand `zSig', and returns the proper single-precision floating-
283 point value corresponding to the abstract input. This routine is just like
284 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
285 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
286 point exponent.
287 -------------------------------------------------------------------------------
289 static float32
290 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
292 int8 shiftCount;
294 shiftCount = countLeadingZeros32( zSig ) - 1;
295 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
300 -------------------------------------------------------------------------------
301 Returns the fraction bits of the double-precision floating-point value `a'.
302 -------------------------------------------------------------------------------
304 INLINE bits64 extractFloat64Frac( float64 a )
307 return a & LIT64( 0x000FFFFFFFFFFFFF );
312 -------------------------------------------------------------------------------
313 Returns the exponent bits of the double-precision floating-point value `a'.
314 -------------------------------------------------------------------------------
316 INLINE int16 extractFloat64Exp( float64 a )
319 return ( a>>52 ) & 0x7FF;
324 -------------------------------------------------------------------------------
325 Returns the sign bit of the double-precision floating-point value `a'.
326 -------------------------------------------------------------------------------
328 #if 0 /* in softfloat.h */
329 INLINE flag extractFloat64Sign( float64 a )
332 return a>>63;
335 #endif
338 -------------------------------------------------------------------------------
339 Normalizes the subnormal double-precision floating-point value represented
340 by the denormalized significand `aSig'. The normalized exponent and
341 significand are stored at the locations pointed to by `zExpPtr' and
342 `zSigPtr', respectively.
343 -------------------------------------------------------------------------------
345 static void
346 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
348 int8 shiftCount;
350 shiftCount = countLeadingZeros64( aSig ) - 11;
351 *zSigPtr = aSig<<shiftCount;
352 *zExpPtr = 1 - shiftCount;
357 -------------------------------------------------------------------------------
358 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
359 double-precision floating-point value, returning the result. After being
360 shifted into the proper positions, the three fields are simply added
361 together to form the result. This means that any integer portion of `zSig'
362 will be added into the exponent. Since a properly normalized significand
363 will have an integer portion equal to 1, the `zExp' input should be 1 less
364 than the desired result exponent whenever `zSig' is a complete, normalized
365 significand.
366 -------------------------------------------------------------------------------
368 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
371 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
376 -------------------------------------------------------------------------------
377 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
378 and significand `zSig', and returns the proper double-precision floating-
379 point value corresponding to the abstract input. Ordinarily, the abstract
380 value is simply rounded and packed into the double-precision format, with
381 the inexact exception raised if the abstract input cannot be represented
382 exactly. If the abstract value is too large, however, the overflow and
383 inexact exceptions are raised and an infinity or maximal finite value is
384 returned. If the abstract value is too small, the input value is rounded to
385 a subnormal number, and the underflow and inexact exceptions are raised if
386 the abstract input cannot be represented exactly as a subnormal double-
387 precision floating-point number.
388 The input significand `zSig' has its binary point between bits 62
389 and 61, which is 10 bits to the left of the usual location. This shifted
390 significand must be normalized or smaller. If `zSig' is not normalized,
391 `zExp' must be 0; in that case, the result returned is a subnormal number,
392 and it must not require rounding. In the usual case that `zSig' is
393 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
394 The handling of underflow and overflow follows the IEC/IEEE Standard for
395 Binary Floating-point Arithmetic.
396 -------------------------------------------------------------------------------
398 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
400 int8 roundingMode;
401 flag roundNearestEven;
402 int16 roundIncrement, roundBits;
403 flag isTiny;
405 roundingMode = float_rounding_mode;
406 roundNearestEven = ( roundingMode == float_round_nearest_even );
407 roundIncrement = 0x200;
408 if ( ! roundNearestEven ) {
409 if ( roundingMode == float_round_to_zero ) {
410 roundIncrement = 0;
412 else {
413 roundIncrement = 0x3FF;
414 if ( zSign ) {
415 if ( roundingMode == float_round_up ) roundIncrement = 0;
417 else {
418 if ( roundingMode == float_round_down ) roundIncrement = 0;
422 roundBits = zSig & 0x3FF;
423 if ( 0x7FD <= (bits16) zExp ) {
424 if ( ( 0x7FD < zExp )
425 || ( ( zExp == 0x7FD )
426 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
428 //register int lr = __builtin_return_address(0);
429 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
430 float_raise( float_flag_overflow | float_flag_inexact );
431 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
433 if ( zExp < 0 ) {
434 isTiny =
435 ( float_detect_tininess == float_tininess_before_rounding )
436 || ( zExp < -1 )
437 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
438 shift64RightJamming( zSig, - zExp, &zSig );
439 zExp = 0;
440 roundBits = zSig & 0x3FF;
441 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
444 if ( roundBits ) float_exception_flags |= float_flag_inexact;
445 zSig = ( zSig + roundIncrement )>>10;
446 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
447 if ( zSig == 0 ) zExp = 0;
448 return packFloat64( zSign, zExp, zSig );
453 -------------------------------------------------------------------------------
454 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
455 and significand `zSig', and returns the proper double-precision floating-
456 point value corresponding to the abstract input. This routine is just like
457 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
458 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
459 point exponent.
460 -------------------------------------------------------------------------------
462 static float64
463 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
465 int8 shiftCount;
467 shiftCount = countLeadingZeros64( zSig ) - 1;
468 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
472 #ifdef FLOATX80
475 -------------------------------------------------------------------------------
476 Returns the fraction bits of the extended double-precision floating-point
477 value `a'.
478 -------------------------------------------------------------------------------
480 INLINE bits64 extractFloatx80Frac( floatx80 a )
483 return a.low;
488 -------------------------------------------------------------------------------
489 Returns the exponent bits of the extended double-precision floating-point
490 value `a'.
491 -------------------------------------------------------------------------------
493 INLINE int32 extractFloatx80Exp( floatx80 a )
496 return a.high & 0x7FFF;
501 -------------------------------------------------------------------------------
502 Returns the sign bit of the extended double-precision floating-point value
503 `a'.
504 -------------------------------------------------------------------------------
506 INLINE flag extractFloatx80Sign( floatx80 a )
509 return a.high>>15;
514 -------------------------------------------------------------------------------
515 Normalizes the subnormal extended double-precision floating-point value
516 represented by the denormalized significand `aSig'. The normalized exponent
517 and significand are stored at the locations pointed to by `zExpPtr' and
518 `zSigPtr', respectively.
519 -------------------------------------------------------------------------------
521 static void
522 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
524 int8 shiftCount;
526 shiftCount = countLeadingZeros64( aSig );
527 *zSigPtr = aSig<<shiftCount;
528 *zExpPtr = 1 - shiftCount;
533 -------------------------------------------------------------------------------
534 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
535 extended double-precision floating-point value, returning the result.
536 -------------------------------------------------------------------------------
538 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
540 floatx80 z;
542 z.low = zSig;
543 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
544 return z;
549 -------------------------------------------------------------------------------
550 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
551 and extended significand formed by the concatenation of `zSig0' and `zSig1',
552 and returns the proper extended double-precision floating-point value
553 corresponding to the abstract input. Ordinarily, the abstract value is
554 rounded and packed into the extended double-precision format, with the
555 inexact exception raised if the abstract input cannot be represented
556 exactly. If the abstract value is too large, however, the overflow and
557 inexact exceptions are raised and an infinity or maximal finite value is
558 returned. If the abstract value is too small, the input value is rounded to
559 a subnormal number, and the underflow and inexact exceptions are raised if
560 the abstract input cannot be represented exactly as a subnormal extended
561 double-precision floating-point number.
562 If `roundingPrecision' is 32 or 64, the result is rounded to the same
563 number of bits as single or double precision, respectively. Otherwise, the
564 result is rounded to the full precision of the extended double-precision
565 format.
566 The input significand must be normalized or smaller. If the input
567 significand is not normalized, `zExp' must be 0; in that case, the result
568 returned is a subnormal number, and it must not require rounding. The
569 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
570 Floating-point Arithmetic.
571 -------------------------------------------------------------------------------
573 static floatx80
574 roundAndPackFloatx80(
575 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
578 int8 roundingMode;
579 flag roundNearestEven, increment, isTiny;
580 int64 roundIncrement, roundMask, roundBits;
582 roundingMode = float_rounding_mode;
583 roundNearestEven = ( roundingMode == float_round_nearest_even );
584 if ( roundingPrecision == 80 ) goto precision80;
585 if ( roundingPrecision == 64 ) {
586 roundIncrement = LIT64( 0x0000000000000400 );
587 roundMask = LIT64( 0x00000000000007FF );
589 else if ( roundingPrecision == 32 ) {
590 roundIncrement = LIT64( 0x0000008000000000 );
591 roundMask = LIT64( 0x000000FFFFFFFFFF );
593 else {
594 goto precision80;
596 zSig0 |= ( zSig1 != 0 );
597 if ( ! roundNearestEven ) {
598 if ( roundingMode == float_round_to_zero ) {
599 roundIncrement = 0;
601 else {
602 roundIncrement = roundMask;
603 if ( zSign ) {
604 if ( roundingMode == float_round_up ) roundIncrement = 0;
606 else {
607 if ( roundingMode == float_round_down ) roundIncrement = 0;
611 roundBits = zSig0 & roundMask;
612 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
613 if ( ( 0x7FFE < zExp )
614 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
616 goto overflow;
618 if ( zExp <= 0 ) {
619 isTiny =
620 ( float_detect_tininess == float_tininess_before_rounding )
621 || ( zExp < 0 )
622 || ( zSig0 <= zSig0 + roundIncrement );
623 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
624 zExp = 0;
625 roundBits = zSig0 & roundMask;
626 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
627 if ( roundBits ) float_exception_flags |= float_flag_inexact;
628 zSig0 += roundIncrement;
629 if ( (sbits64) zSig0 < 0 ) zExp = 1;
630 roundIncrement = roundMask + 1;
631 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
632 roundMask |= roundIncrement;
634 zSig0 &= ~ roundMask;
635 return packFloatx80( zSign, zExp, zSig0 );
638 if ( roundBits ) float_exception_flags |= float_flag_inexact;
639 zSig0 += roundIncrement;
640 if ( zSig0 < roundIncrement ) {
641 ++zExp;
642 zSig0 = LIT64( 0x8000000000000000 );
644 roundIncrement = roundMask + 1;
645 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
646 roundMask |= roundIncrement;
648 zSig0 &= ~ roundMask;
649 if ( zSig0 == 0 ) zExp = 0;
650 return packFloatx80( zSign, zExp, zSig0 );
651 precision80:
652 increment = ( (sbits64) zSig1 < 0 );
653 if ( ! roundNearestEven ) {
654 if ( roundingMode == float_round_to_zero ) {
655 increment = 0;
657 else {
658 if ( zSign ) {
659 increment = ( roundingMode == float_round_down ) && zSig1;
661 else {
662 increment = ( roundingMode == float_round_up ) && zSig1;
666 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
667 if ( ( 0x7FFE < zExp )
668 || ( ( zExp == 0x7FFE )
669 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
670 && increment
673 roundMask = 0;
674 overflow:
675 float_raise( float_flag_overflow | float_flag_inexact );
676 if ( ( roundingMode == float_round_to_zero )
677 || ( zSign && ( roundingMode == float_round_up ) )
678 || ( ! zSign && ( roundingMode == float_round_down ) )
680 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
682 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
684 if ( zExp <= 0 ) {
685 isTiny =
686 ( float_detect_tininess == float_tininess_before_rounding )
687 || ( zExp < 0 )
688 || ! increment
689 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
690 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
691 zExp = 0;
692 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
693 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
694 if ( roundNearestEven ) {
695 increment = ( (sbits64) zSig1 < 0 );
697 else {
698 if ( zSign ) {
699 increment = ( roundingMode == float_round_down ) && zSig1;
701 else {
702 increment = ( roundingMode == float_round_up ) && zSig1;
705 if ( increment ) {
706 ++zSig0;
707 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
708 if ( (sbits64) zSig0 < 0 ) zExp = 1;
710 return packFloatx80( zSign, zExp, zSig0 );
713 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
714 if ( increment ) {
715 ++zSig0;
716 if ( zSig0 == 0 ) {
717 ++zExp;
718 zSig0 = LIT64( 0x8000000000000000 );
720 else {
721 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
724 else {
725 if ( zSig0 == 0 ) zExp = 0;
728 return packFloatx80( zSign, zExp, zSig0 );
732 -------------------------------------------------------------------------------
733 Takes an abstract floating-point value having sign `zSign', exponent
734 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
735 and returns the proper extended double-precision floating-point value
736 corresponding to the abstract input. This routine is just like
737 `roundAndPackFloatx80' except that the input significand does not have to be
738 normalized.
739 -------------------------------------------------------------------------------
741 static floatx80
742 normalizeRoundAndPackFloatx80(
743 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
746 int8 shiftCount;
748 if ( zSig0 == 0 ) {
749 zSig0 = zSig1;
750 zSig1 = 0;
751 zExp -= 64;
753 shiftCount = countLeadingZeros64( zSig0 );
754 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
755 zExp -= shiftCount;
756 return
757 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
761 #endif
764 -------------------------------------------------------------------------------
765 Returns the result of converting the 32-bit two's complement integer `a' to
766 the single-precision floating-point format. The conversion is performed
767 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
768 -------------------------------------------------------------------------------
770 float32 int32_to_float32( int32 a )
772 flag zSign;
774 if ( a == 0 ) return 0;
775 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
776 zSign = ( a < 0 );
777 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
782 -------------------------------------------------------------------------------
783 Returns the result of converting the 32-bit two's complement integer `a' to
784 the double-precision floating-point format. The conversion is performed
785 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
786 -------------------------------------------------------------------------------
788 float64 int32_to_float64( int32 a )
790 flag aSign;
791 uint32 absA;
792 int8 shiftCount;
793 bits64 zSig;
795 if ( a == 0 ) return 0;
796 aSign = ( a < 0 );
797 absA = aSign ? - a : a;
798 shiftCount = countLeadingZeros32( absA ) + 21;
799 zSig = absA;
800 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
804 #ifdef FLOATX80
807 -------------------------------------------------------------------------------
808 Returns the result of converting the 32-bit two's complement integer `a'
809 to the extended double-precision floating-point format. The conversion
810 is performed according to the IEC/IEEE Standard for Binary Floating-point
811 Arithmetic.
812 -------------------------------------------------------------------------------
814 floatx80 int32_to_floatx80( int32 a )
816 flag zSign;
817 uint32 absA;
818 int8 shiftCount;
819 bits64 zSig;
821 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
822 zSign = ( a < 0 );
823 absA = zSign ? - a : a;
824 shiftCount = countLeadingZeros32( absA ) + 32;
825 zSig = absA;
826 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
830 #endif
833 -------------------------------------------------------------------------------
834 Returns the result of converting the single-precision floating-point value
835 `a' to the 32-bit two's complement integer format. The conversion is
836 performed according to the IEC/IEEE Standard for Binary Floating-point
837 Arithmetic---which means in particular that the conversion is rounded
838 according to the current rounding mode. If `a' is a NaN, the largest
839 positive integer is returned. Otherwise, if the conversion overflows, the
840 largest integer with the same sign as `a' is returned.
841 -------------------------------------------------------------------------------
843 int32 float32_to_int32( float32 a )
845 flag aSign;
846 int16 aExp, shiftCount;
847 bits32 aSig;
848 bits64 zSig;
850 aSig = extractFloat32Frac( a );
851 aExp = extractFloat32Exp( a );
852 aSign = extractFloat32Sign( a );
853 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
854 if ( aExp ) aSig |= 0x00800000;
855 shiftCount = 0xAF - aExp;
856 zSig = aSig;
857 zSig <<= 32;
858 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
859 return roundAndPackInt32( aSign, zSig );
864 -------------------------------------------------------------------------------
865 Returns the result of converting the single-precision floating-point value
866 `a' to the 32-bit two's complement integer format. The conversion is
867 performed according to the IEC/IEEE Standard for Binary Floating-point
868 Arithmetic, except that the conversion is always rounded toward zero. If
869 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
870 conversion overflows, the largest integer with the same sign as `a' is
871 returned.
872 -------------------------------------------------------------------------------
874 int32 float32_to_int32_round_to_zero( float32 a )
876 flag aSign;
877 int16 aExp, shiftCount;
878 bits32 aSig;
879 int32 z;
881 aSig = extractFloat32Frac( a );
882 aExp = extractFloat32Exp( a );
883 aSign = extractFloat32Sign( a );
884 shiftCount = aExp - 0x9E;
885 if ( 0 <= shiftCount ) {
886 if ( a == 0xCF000000 ) return 0x80000000;
887 float_raise( float_flag_invalid );
888 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
889 return 0x80000000;
891 else if ( aExp <= 0x7E ) {
892 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
893 return 0;
895 aSig = ( aSig | 0x00800000 )<<8;
896 z = aSig>>( - shiftCount );
897 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
898 float_exception_flags |= float_flag_inexact;
900 return aSign ? - z : z;
905 -------------------------------------------------------------------------------
906 Returns the result of converting the single-precision floating-point value
907 `a' to the double-precision floating-point format. The conversion is
908 performed according to the IEC/IEEE Standard for Binary Floating-point
909 Arithmetic.
910 -------------------------------------------------------------------------------
912 float64 float32_to_float64( float32 a )
914 flag aSign;
915 int16 aExp;
916 bits32 aSig;
918 aSig = extractFloat32Frac( a );
919 aExp = extractFloat32Exp( a );
920 aSign = extractFloat32Sign( a );
921 if ( aExp == 0xFF ) {
922 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
923 return packFloat64( aSign, 0x7FF, 0 );
925 if ( aExp == 0 ) {
926 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
927 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
928 --aExp;
930 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
934 #ifdef FLOATX80
937 -------------------------------------------------------------------------------
938 Returns the result of converting the single-precision floating-point value
939 `a' to the extended double-precision floating-point format. The conversion
940 is performed according to the IEC/IEEE Standard for Binary Floating-point
941 Arithmetic.
942 -------------------------------------------------------------------------------
944 floatx80 float32_to_floatx80( float32 a )
946 flag aSign;
947 int16 aExp;
948 bits32 aSig;
950 aSig = extractFloat32Frac( a );
951 aExp = extractFloat32Exp( a );
952 aSign = extractFloat32Sign( a );
953 if ( aExp == 0xFF ) {
954 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
955 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
957 if ( aExp == 0 ) {
958 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
959 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
961 aSig |= 0x00800000;
962 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
966 #endif
969 -------------------------------------------------------------------------------
970 Rounds the single-precision floating-point value `a' to an integer, and
971 returns the result as a single-precision floating-point value. The
972 operation is performed according to the IEC/IEEE Standard for Binary
973 Floating-point Arithmetic.
974 -------------------------------------------------------------------------------
976 float32 float32_round_to_int( float32 a )
978 flag aSign;
979 int16 aExp;
980 bits32 lastBitMask, roundBitsMask;
981 int8 roundingMode;
982 float32 z;
984 aExp = extractFloat32Exp( a );
985 if ( 0x96 <= aExp ) {
986 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
987 return propagateFloat32NaN( a, a );
989 return a;
991 if ( aExp <= 0x7E ) {
992 if ( (bits32) ( a<<1 ) == 0 ) return a;
993 float_exception_flags |= float_flag_inexact;
994 aSign = extractFloat32Sign( a );
995 switch ( float_rounding_mode ) {
996 case float_round_nearest_even:
997 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
998 return packFloat32( aSign, 0x7F, 0 );
1000 break;
1001 case float_round_down:
1002 return aSign ? 0xBF800000 : 0;
1003 case float_round_up:
1004 return aSign ? 0x80000000 : 0x3F800000;
1006 return packFloat32( aSign, 0, 0 );
1008 lastBitMask = 1;
1009 lastBitMask <<= 0x96 - aExp;
1010 roundBitsMask = lastBitMask - 1;
1011 z = a;
1012 roundingMode = float_rounding_mode;
1013 if ( roundingMode == float_round_nearest_even ) {
1014 z += lastBitMask>>1;
1015 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1017 else if ( roundingMode != float_round_to_zero ) {
1018 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1019 z += roundBitsMask;
1022 z &= ~ roundBitsMask;
1023 if ( z != a ) float_exception_flags |= float_flag_inexact;
1024 return z;
1029 -------------------------------------------------------------------------------
1030 Returns the result of adding the absolute values of the single-precision
1031 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1032 before being returned. `zSign' is ignored if the result is a NaN. The
1033 addition is performed according to the IEC/IEEE Standard for Binary
1034 Floating-point Arithmetic.
1035 -------------------------------------------------------------------------------
1037 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1039 int16 aExp, bExp, zExp;
1040 bits32 aSig, bSig, zSig;
1041 int16 expDiff;
1043 aSig = extractFloat32Frac( a );
1044 aExp = extractFloat32Exp( a );
1045 bSig = extractFloat32Frac( b );
1046 bExp = extractFloat32Exp( b );
1047 expDiff = aExp - bExp;
1048 aSig <<= 6;
1049 bSig <<= 6;
1050 if ( 0 < expDiff ) {
1051 if ( aExp == 0xFF ) {
1052 if ( aSig ) return propagateFloat32NaN( a, b );
1053 return a;
1055 if ( bExp == 0 ) {
1056 --expDiff;
1058 else {
1059 bSig |= 0x20000000;
1061 shift32RightJamming( bSig, expDiff, &bSig );
1062 zExp = aExp;
1064 else if ( expDiff < 0 ) {
1065 if ( bExp == 0xFF ) {
1066 if ( bSig ) return propagateFloat32NaN( a, b );
1067 return packFloat32( zSign, 0xFF, 0 );
1069 if ( aExp == 0 ) {
1070 ++expDiff;
1072 else {
1073 aSig |= 0x20000000;
1075 shift32RightJamming( aSig, - expDiff, &aSig );
1076 zExp = bExp;
1078 else {
1079 if ( aExp == 0xFF ) {
1080 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1081 return a;
1083 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1084 zSig = 0x40000000 + aSig + bSig;
1085 zExp = aExp;
1086 goto roundAndPack;
1088 aSig |= 0x20000000;
1089 zSig = ( aSig + bSig )<<1;
1090 --zExp;
1091 if ( (sbits32) zSig < 0 ) {
1092 zSig = aSig + bSig;
1093 ++zExp;
1095 roundAndPack:
1096 return roundAndPackFloat32( zSign, zExp, zSig );
1101 -------------------------------------------------------------------------------
1102 Returns the result of subtracting the absolute values of the single-
1103 precision floating-point values `a' and `b'. If `zSign' is true, the
1104 difference is negated before being returned. `zSign' is ignored if the
1105 result is a NaN. The subtraction is performed according to the IEC/IEEE
1106 Standard for Binary Floating-point Arithmetic.
1107 -------------------------------------------------------------------------------
1109 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1111 int16 aExp, bExp, zExp;
1112 bits32 aSig, bSig, zSig;
1113 int16 expDiff;
1115 aSig = extractFloat32Frac( a );
1116 aExp = extractFloat32Exp( a );
1117 bSig = extractFloat32Frac( b );
1118 bExp = extractFloat32Exp( b );
1119 expDiff = aExp - bExp;
1120 aSig <<= 7;
1121 bSig <<= 7;
1122 if ( 0 < expDiff ) goto aExpBigger;
1123 if ( expDiff < 0 ) goto bExpBigger;
1124 if ( aExp == 0xFF ) {
1125 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1126 float_raise( float_flag_invalid );
1127 return float32_default_nan;
1129 if ( aExp == 0 ) {
1130 aExp = 1;
1131 bExp = 1;
1133 if ( bSig < aSig ) goto aBigger;
1134 if ( aSig < bSig ) goto bBigger;
1135 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1136 bExpBigger:
1137 if ( bExp == 0xFF ) {
1138 if ( bSig ) return propagateFloat32NaN( a, b );
1139 return packFloat32( zSign ^ 1, 0xFF, 0 );
1141 if ( aExp == 0 ) {
1142 ++expDiff;
1144 else {
1145 aSig |= 0x40000000;
1147 shift32RightJamming( aSig, - expDiff, &aSig );
1148 bSig |= 0x40000000;
1149 bBigger:
1150 zSig = bSig - aSig;
1151 zExp = bExp;
1152 zSign ^= 1;
1153 goto normalizeRoundAndPack;
1154 aExpBigger:
1155 if ( aExp == 0xFF ) {
1156 if ( aSig ) return propagateFloat32NaN( a, b );
1157 return a;
1159 if ( bExp == 0 ) {
1160 --expDiff;
1162 else {
1163 bSig |= 0x40000000;
1165 shift32RightJamming( bSig, expDiff, &bSig );
1166 aSig |= 0x40000000;
1167 aBigger:
1168 zSig = aSig - bSig;
1169 zExp = aExp;
1170 normalizeRoundAndPack:
1171 --zExp;
1172 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1177 -------------------------------------------------------------------------------
1178 Returns the result of adding the single-precision floating-point values `a'
1179 and `b'. The operation is performed according to the IEC/IEEE Standard for
1180 Binary Floating-point Arithmetic.
1181 -------------------------------------------------------------------------------
1183 float32 float32_add( float32 a, float32 b )
1185 flag aSign, bSign;
1187 aSign = extractFloat32Sign( a );
1188 bSign = extractFloat32Sign( b );
1189 if ( aSign == bSign ) {
1190 return addFloat32Sigs( a, b, aSign );
1192 else {
1193 return subFloat32Sigs( a, b, aSign );
1199 -------------------------------------------------------------------------------
1200 Returns the result of subtracting the single-precision floating-point values
1201 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1202 for Binary Floating-point Arithmetic.
1203 -------------------------------------------------------------------------------
1205 float32 float32_sub( float32 a, float32 b )
1207 flag aSign, bSign;
1209 aSign = extractFloat32Sign( a );
1210 bSign = extractFloat32Sign( b );
1211 if ( aSign == bSign ) {
1212 return subFloat32Sigs( a, b, aSign );
1214 else {
1215 return addFloat32Sigs( a, b, aSign );
1221 -------------------------------------------------------------------------------
1222 Returns the result of multiplying the single-precision floating-point values
1223 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1224 for Binary Floating-point Arithmetic.
1225 -------------------------------------------------------------------------------
1227 float32 float32_mul( float32 a, float32 b )
1229 flag aSign, bSign, zSign;
1230 int16 aExp, bExp, zExp;
1231 bits32 aSig, bSig;
1232 bits64 zSig64;
1233 bits32 zSig;
1235 aSig = extractFloat32Frac( a );
1236 aExp = extractFloat32Exp( a );
1237 aSign = extractFloat32Sign( a );
1238 bSig = extractFloat32Frac( b );
1239 bExp = extractFloat32Exp( b );
1240 bSign = extractFloat32Sign( b );
1241 zSign = aSign ^ bSign;
1242 if ( aExp == 0xFF ) {
1243 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1244 return propagateFloat32NaN( a, b );
1246 if ( ( bExp | bSig ) == 0 ) {
1247 float_raise( float_flag_invalid );
1248 return float32_default_nan;
1250 return packFloat32( zSign, 0xFF, 0 );
1252 if ( bExp == 0xFF ) {
1253 if ( bSig ) return propagateFloat32NaN( a, b );
1254 if ( ( aExp | aSig ) == 0 ) {
1255 float_raise( float_flag_invalid );
1256 return float32_default_nan;
1258 return packFloat32( zSign, 0xFF, 0 );
1260 if ( aExp == 0 ) {
1261 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1262 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1264 if ( bExp == 0 ) {
1265 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1266 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1268 zExp = aExp + bExp - 0x7F;
1269 aSig = ( aSig | 0x00800000 )<<7;
1270 bSig = ( bSig | 0x00800000 )<<8;
1271 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1272 zSig = zSig64;
1273 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1274 zSig <<= 1;
1275 --zExp;
1277 return roundAndPackFloat32( zSign, zExp, zSig );
1282 -------------------------------------------------------------------------------
1283 Returns the result of dividing the single-precision floating-point value `a'
1284 by the corresponding value `b'. The operation is performed according to the
1285 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1286 -------------------------------------------------------------------------------
1288 float32 float32_div( float32 a, float32 b )
1290 flag aSign, bSign, zSign;
1291 int16 aExp, bExp, zExp;
1292 bits32 aSig, bSig, zSig;
1294 aSig = extractFloat32Frac( a );
1295 aExp = extractFloat32Exp( a );
1296 aSign = extractFloat32Sign( a );
1297 bSig = extractFloat32Frac( b );
1298 bExp = extractFloat32Exp( b );
1299 bSign = extractFloat32Sign( b );
1300 zSign = aSign ^ bSign;
1301 if ( aExp == 0xFF ) {
1302 if ( aSig ) return propagateFloat32NaN( a, b );
1303 if ( bExp == 0xFF ) {
1304 if ( bSig ) return propagateFloat32NaN( a, b );
1305 float_raise( float_flag_invalid );
1306 return float32_default_nan;
1308 return packFloat32( zSign, 0xFF, 0 );
1310 if ( bExp == 0xFF ) {
1311 if ( bSig ) return propagateFloat32NaN( a, b );
1312 return packFloat32( zSign, 0, 0 );
1314 if ( bExp == 0 ) {
1315 if ( bSig == 0 ) {
1316 if ( ( aExp | aSig ) == 0 ) {
1317 float_raise( float_flag_invalid );
1318 return float32_default_nan;
1320 float_raise( float_flag_divbyzero );
1321 return packFloat32( zSign, 0xFF, 0 );
1323 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1325 if ( aExp == 0 ) {
1326 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1327 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1329 zExp = aExp - bExp + 0x7D;
1330 aSig = ( aSig | 0x00800000 )<<7;
1331 bSig = ( bSig | 0x00800000 )<<8;
1332 if ( bSig <= ( aSig + aSig ) ) {
1333 aSig >>= 1;
1334 ++zExp;
1337 bits64 tmp = ( (bits64) aSig )<<32;
1338 do_div( tmp, bSig );
1339 zSig = tmp;
1341 if ( ( zSig & 0x3F ) == 0 ) {
1342 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1344 return roundAndPackFloat32( zSign, zExp, zSig );
1349 -------------------------------------------------------------------------------
1350 Returns the remainder of the single-precision floating-point value `a'
1351 with respect to the corresponding value `b'. The operation is performed
1352 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1353 -------------------------------------------------------------------------------
1355 float32 float32_rem( float32 a, float32 b )
1357 flag aSign, bSign, zSign;
1358 int16 aExp, bExp, expDiff;
1359 bits32 aSig, bSig;
1360 bits32 q;
1361 bits64 aSig64, bSig64, q64;
1362 bits32 alternateASig;
1363 sbits32 sigMean;
1365 aSig = extractFloat32Frac( a );
1366 aExp = extractFloat32Exp( a );
1367 aSign = extractFloat32Sign( a );
1368 bSig = extractFloat32Frac( b );
1369 bExp = extractFloat32Exp( b );
1370 bSign = extractFloat32Sign( b );
1371 if ( aExp == 0xFF ) {
1372 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1373 return propagateFloat32NaN( a, b );
1375 float_raise( float_flag_invalid );
1376 return float32_default_nan;
1378 if ( bExp == 0xFF ) {
1379 if ( bSig ) return propagateFloat32NaN( a, b );
1380 return a;
1382 if ( bExp == 0 ) {
1383 if ( bSig == 0 ) {
1384 float_raise( float_flag_invalid );
1385 return float32_default_nan;
1387 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1389 if ( aExp == 0 ) {
1390 if ( aSig == 0 ) return a;
1391 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1393 expDiff = aExp - bExp;
1394 aSig |= 0x00800000;
1395 bSig |= 0x00800000;
1396 if ( expDiff < 32 ) {
1397 aSig <<= 8;
1398 bSig <<= 8;
1399 if ( expDiff < 0 ) {
1400 if ( expDiff < -1 ) return a;
1401 aSig >>= 1;
1403 q = ( bSig <= aSig );
1404 if ( q ) aSig -= bSig;
1405 if ( 0 < expDiff ) {
1406 bits64 tmp = ( (bits64) aSig )<<32;
1407 do_div( tmp, bSig );
1408 q = tmp;
1409 q >>= 32 - expDiff;
1410 bSig >>= 2;
1411 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1413 else {
1414 aSig >>= 2;
1415 bSig >>= 2;
1418 else {
1419 if ( bSig <= aSig ) aSig -= bSig;
1420 aSig64 = ( (bits64) aSig )<<40;
1421 bSig64 = ( (bits64) bSig )<<40;
1422 expDiff -= 64;
1423 while ( 0 < expDiff ) {
1424 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1425 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1426 aSig64 = - ( ( bSig * q64 )<<38 );
1427 expDiff -= 62;
1429 expDiff += 64;
1430 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1431 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1432 q = q64>>( 64 - expDiff );
1433 bSig <<= 6;
1434 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1436 do {
1437 alternateASig = aSig;
1438 ++q;
1439 aSig -= bSig;
1440 } while ( 0 <= (sbits32) aSig );
1441 sigMean = aSig + alternateASig;
1442 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1443 aSig = alternateASig;
1445 zSign = ( (sbits32) aSig < 0 );
1446 if ( zSign ) aSig = - aSig;
1447 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1452 -------------------------------------------------------------------------------
1453 Returns the square root of the single-precision floating-point value `a'.
1454 The operation is performed according to the IEC/IEEE Standard for Binary
1455 Floating-point Arithmetic.
1456 -------------------------------------------------------------------------------
1458 float32 float32_sqrt( float32 a )
1460 flag aSign;
1461 int16 aExp, zExp;
1462 bits32 aSig, zSig;
1463 bits64 rem, term;
1465 aSig = extractFloat32Frac( a );
1466 aExp = extractFloat32Exp( a );
1467 aSign = extractFloat32Sign( a );
1468 if ( aExp == 0xFF ) {
1469 if ( aSig ) return propagateFloat32NaN( a, 0 );
1470 if ( ! aSign ) return a;
1471 float_raise( float_flag_invalid );
1472 return float32_default_nan;
1474 if ( aSign ) {
1475 if ( ( aExp | aSig ) == 0 ) return a;
1476 float_raise( float_flag_invalid );
1477 return float32_default_nan;
1479 if ( aExp == 0 ) {
1480 if ( aSig == 0 ) return 0;
1481 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1483 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1484 aSig = ( aSig | 0x00800000 )<<8;
1485 zSig = estimateSqrt32( aExp, aSig ) + 2;
1486 if ( ( zSig & 0x7F ) <= 5 ) {
1487 if ( zSig < 2 ) {
1488 zSig = 0xFFFFFFFF;
1490 else {
1491 aSig >>= aExp & 1;
1492 term = ( (bits64) zSig ) * zSig;
1493 rem = ( ( (bits64) aSig )<<32 ) - term;
1494 while ( (sbits64) rem < 0 ) {
1495 --zSig;
1496 rem += ( ( (bits64) zSig )<<1 ) | 1;
1498 zSig |= ( rem != 0 );
1501 shift32RightJamming( zSig, 1, &zSig );
1502 return roundAndPackFloat32( 0, zExp, zSig );
1507 -------------------------------------------------------------------------------
1508 Returns 1 if the single-precision floating-point value `a' is equal to the
1509 corresponding value `b', and 0 otherwise. The comparison is performed
1510 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1511 -------------------------------------------------------------------------------
1513 flag float32_eq( float32 a, float32 b )
1516 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1517 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1519 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1520 float_raise( float_flag_invalid );
1522 return 0;
1524 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1529 -------------------------------------------------------------------------------
1530 Returns 1 if the single-precision floating-point value `a' is less than or
1531 equal to the corresponding value `b', and 0 otherwise. The comparison is
1532 performed according to the IEC/IEEE Standard for Binary Floating-point
1533 Arithmetic.
1534 -------------------------------------------------------------------------------
1536 flag float32_le( float32 a, float32 b )
1538 flag aSign, bSign;
1540 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1541 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1543 float_raise( float_flag_invalid );
1544 return 0;
1546 aSign = extractFloat32Sign( a );
1547 bSign = extractFloat32Sign( b );
1548 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1549 return ( a == b ) || ( aSign ^ ( a < b ) );
1554 -------------------------------------------------------------------------------
1555 Returns 1 if the single-precision floating-point value `a' is less than
1556 the corresponding value `b', and 0 otherwise. The comparison is performed
1557 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1558 -------------------------------------------------------------------------------
1560 flag float32_lt( float32 a, float32 b )
1562 flag aSign, bSign;
1564 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1565 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1567 float_raise( float_flag_invalid );
1568 return 0;
1570 aSign = extractFloat32Sign( a );
1571 bSign = extractFloat32Sign( b );
1572 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1573 return ( a != b ) && ( aSign ^ ( a < b ) );
1578 -------------------------------------------------------------------------------
1579 Returns 1 if the single-precision floating-point value `a' is equal to the
1580 corresponding value `b', and 0 otherwise. The invalid exception is raised
1581 if either operand is a NaN. Otherwise, the comparison is performed
1582 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1583 -------------------------------------------------------------------------------
1585 flag float32_eq_signaling( float32 a, float32 b )
1588 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1589 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1591 float_raise( float_flag_invalid );
1592 return 0;
1594 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1599 -------------------------------------------------------------------------------
1600 Returns 1 if the single-precision floating-point value `a' is less than or
1601 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1602 cause an exception. Otherwise, the comparison is performed according to the
1603 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1604 -------------------------------------------------------------------------------
1606 flag float32_le_quiet( float32 a, float32 b )
1608 flag aSign, bSign;
1609 //int16 aExp, bExp;
1611 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1612 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1614 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1615 float_raise( float_flag_invalid );
1617 return 0;
1619 aSign = extractFloat32Sign( a );
1620 bSign = extractFloat32Sign( b );
1621 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1622 return ( a == b ) || ( aSign ^ ( a < b ) );
1627 -------------------------------------------------------------------------------
1628 Returns 1 if the single-precision floating-point value `a' is less than
1629 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1630 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1631 Standard for Binary Floating-point Arithmetic.
1632 -------------------------------------------------------------------------------
1634 flag float32_lt_quiet( float32 a, float32 b )
1636 flag aSign, bSign;
1638 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1639 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1641 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1642 float_raise( float_flag_invalid );
1644 return 0;
1646 aSign = extractFloat32Sign( a );
1647 bSign = extractFloat32Sign( b );
1648 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1649 return ( a != b ) && ( aSign ^ ( a < b ) );
1654 -------------------------------------------------------------------------------
1655 Returns the result of converting the double-precision floating-point value
1656 `a' to the 32-bit two's complement integer format. The conversion is
1657 performed according to the IEC/IEEE Standard for Binary Floating-point
1658 Arithmetic---which means in particular that the conversion is rounded
1659 according to the current rounding mode. If `a' is a NaN, the largest
1660 positive integer is returned. Otherwise, if the conversion overflows, the
1661 largest integer with the same sign as `a' is returned.
1662 -------------------------------------------------------------------------------
1664 int32 float64_to_int32( float64 a )
1666 flag aSign;
1667 int16 aExp, shiftCount;
1668 bits64 aSig;
1670 aSig = extractFloat64Frac( a );
1671 aExp = extractFloat64Exp( a );
1672 aSign = extractFloat64Sign( a );
1673 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1674 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1675 shiftCount = 0x42C - aExp;
1676 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1677 return roundAndPackInt32( aSign, aSig );
1682 -------------------------------------------------------------------------------
1683 Returns the result of converting the double-precision floating-point value
1684 `a' to the 32-bit two's complement integer format. The conversion is
1685 performed according to the IEC/IEEE Standard for Binary Floating-point
1686 Arithmetic, except that the conversion is always rounded toward zero. If
1687 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1688 conversion overflows, the largest integer with the same sign as `a' is
1689 returned.
1690 -------------------------------------------------------------------------------
1692 int32 float64_to_int32_round_to_zero( float64 a )
1694 flag aSign;
1695 int16 aExp, shiftCount;
1696 bits64 aSig, savedASig;
1697 int32 z;
1699 aSig = extractFloat64Frac( a );
1700 aExp = extractFloat64Exp( a );
1701 aSign = extractFloat64Sign( a );
1702 shiftCount = 0x433 - aExp;
1703 if ( shiftCount < 21 ) {
1704 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1705 goto invalid;
1707 else if ( 52 < shiftCount ) {
1708 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1709 return 0;
1711 aSig |= LIT64( 0x0010000000000000 );
1712 savedASig = aSig;
1713 aSig >>= shiftCount;
1714 z = aSig;
1715 if ( aSign ) z = - z;
1716 if ( ( z < 0 ) ^ aSign ) {
1717 invalid:
1718 float_exception_flags |= float_flag_invalid;
1719 return aSign ? 0x80000000 : 0x7FFFFFFF;
1721 if ( ( aSig<<shiftCount ) != savedASig ) {
1722 float_exception_flags |= float_flag_inexact;
1724 return z;
1729 -------------------------------------------------------------------------------
1730 Returns the result of converting the double-precision floating-point value
1731 `a' to the 32-bit two's complement unsigned integer format. The conversion
1732 is performed according to the IEC/IEEE Standard for Binary Floating-point
1733 Arithmetic---which means in particular that the conversion is rounded
1734 according to the current rounding mode. If `a' is a NaN, the largest
1735 positive integer is returned. Otherwise, if the conversion overflows, the
1736 largest positive integer is returned.
1737 -------------------------------------------------------------------------------
1739 int32 float64_to_uint32( float64 a )
1741 flag aSign;
1742 int16 aExp, shiftCount;
1743 bits64 aSig;
1745 aSig = extractFloat64Frac( a );
1746 aExp = extractFloat64Exp( a );
1747 aSign = 0; //extractFloat64Sign( a );
1748 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1749 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1750 shiftCount = 0x42C - aExp;
1751 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1752 return roundAndPackInt32( aSign, aSig );
1756 -------------------------------------------------------------------------------
1757 Returns the result of converting the double-precision floating-point value
1758 `a' to the 32-bit two's complement integer format. The conversion is
1759 performed according to the IEC/IEEE Standard for Binary Floating-point
1760 Arithmetic, except that the conversion is always rounded toward zero. If
1761 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1762 conversion overflows, the largest positive integer is returned.
1763 -------------------------------------------------------------------------------
1765 int32 float64_to_uint32_round_to_zero( float64 a )
1767 flag aSign;
1768 int16 aExp, shiftCount;
1769 bits64 aSig, savedASig;
1770 int32 z;
1772 aSig = extractFloat64Frac( a );
1773 aExp = extractFloat64Exp( a );
1774 aSign = extractFloat64Sign( a );
1775 shiftCount = 0x433 - aExp;
1776 if ( shiftCount < 21 ) {
1777 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1778 goto invalid;
1780 else if ( 52 < shiftCount ) {
1781 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1782 return 0;
1784 aSig |= LIT64( 0x0010000000000000 );
1785 savedASig = aSig;
1786 aSig >>= shiftCount;
1787 z = aSig;
1788 if ( aSign ) z = - z;
1789 if ( ( z < 0 ) ^ aSign ) {
1790 invalid:
1791 float_exception_flags |= float_flag_invalid;
1792 return aSign ? 0x80000000 : 0x7FFFFFFF;
1794 if ( ( aSig<<shiftCount ) != savedASig ) {
1795 float_exception_flags |= float_flag_inexact;
1797 return z;
1801 -------------------------------------------------------------------------------
1802 Returns the result of converting the double-precision floating-point value
1803 `a' to the single-precision floating-point format. The conversion is
1804 performed according to the IEC/IEEE Standard for Binary Floating-point
1805 Arithmetic.
1806 -------------------------------------------------------------------------------
1808 float32 float64_to_float32( float64 a )
1810 flag aSign;
1811 int16 aExp;
1812 bits64 aSig;
1813 bits32 zSig;
1815 aSig = extractFloat64Frac( a );
1816 aExp = extractFloat64Exp( a );
1817 aSign = extractFloat64Sign( a );
1818 if ( aExp == 0x7FF ) {
1819 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1820 return packFloat32( aSign, 0xFF, 0 );
1822 shift64RightJamming( aSig, 22, &aSig );
1823 zSig = aSig;
1824 if ( aExp || zSig ) {
1825 zSig |= 0x40000000;
1826 aExp -= 0x381;
1828 return roundAndPackFloat32( aSign, aExp, zSig );
1832 #ifdef FLOATX80
1835 -------------------------------------------------------------------------------
1836 Returns the result of converting the double-precision floating-point value
1837 `a' to the extended double-precision floating-point format. The conversion
1838 is performed according to the IEC/IEEE Standard for Binary Floating-point
1839 Arithmetic.
1840 -------------------------------------------------------------------------------
1842 floatx80 float64_to_floatx80( float64 a )
1844 flag aSign;
1845 int16 aExp;
1846 bits64 aSig;
1848 aSig = extractFloat64Frac( a );
1849 aExp = extractFloat64Exp( a );
1850 aSign = extractFloat64Sign( a );
1851 if ( aExp == 0x7FF ) {
1852 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1853 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1855 if ( aExp == 0 ) {
1856 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1857 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1859 return
1860 packFloatx80(
1861 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1865 #endif
1868 -------------------------------------------------------------------------------
1869 Rounds the double-precision floating-point value `a' to an integer, and
1870 returns the result as a double-precision floating-point value. The
1871 operation is performed according to the IEC/IEEE Standard for Binary
1872 Floating-point Arithmetic.
1873 -------------------------------------------------------------------------------
1875 float64 float64_round_to_int( float64 a )
1877 flag aSign;
1878 int16 aExp;
1879 bits64 lastBitMask, roundBitsMask;
1880 int8 roundingMode;
1881 float64 z;
1883 aExp = extractFloat64Exp( a );
1884 if ( 0x433 <= aExp ) {
1885 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1886 return propagateFloat64NaN( a, a );
1888 return a;
1890 if ( aExp <= 0x3FE ) {
1891 if ( (bits64) ( a<<1 ) == 0 ) return a;
1892 float_exception_flags |= float_flag_inexact;
1893 aSign = extractFloat64Sign( a );
1894 switch ( float_rounding_mode ) {
1895 case float_round_nearest_even:
1896 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1897 return packFloat64( aSign, 0x3FF, 0 );
1899 break;
1900 case float_round_down:
1901 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1902 case float_round_up:
1903 return
1904 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1906 return packFloat64( aSign, 0, 0 );
1908 lastBitMask = 1;
1909 lastBitMask <<= 0x433 - aExp;
1910 roundBitsMask = lastBitMask - 1;
1911 z = a;
1912 roundingMode = float_rounding_mode;
1913 if ( roundingMode == float_round_nearest_even ) {
1914 z += lastBitMask>>1;
1915 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1917 else if ( roundingMode != float_round_to_zero ) {
1918 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1919 z += roundBitsMask;
1922 z &= ~ roundBitsMask;
1923 if ( z != a ) float_exception_flags |= float_flag_inexact;
1924 return z;
1929 -------------------------------------------------------------------------------
1930 Returns the result of adding the absolute values of the double-precision
1931 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1932 before being returned. `zSign' is ignored if the result is a NaN. The
1933 addition is performed according to the IEC/IEEE Standard for Binary
1934 Floating-point Arithmetic.
1935 -------------------------------------------------------------------------------
1937 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1939 int16 aExp, bExp, zExp;
1940 bits64 aSig, bSig, zSig;
1941 int16 expDiff;
1943 aSig = extractFloat64Frac( a );
1944 aExp = extractFloat64Exp( a );
1945 bSig = extractFloat64Frac( b );
1946 bExp = extractFloat64Exp( b );
1947 expDiff = aExp - bExp;
1948 aSig <<= 9;
1949 bSig <<= 9;
1950 if ( 0 < expDiff ) {
1951 if ( aExp == 0x7FF ) {
1952 if ( aSig ) return propagateFloat64NaN( a, b );
1953 return a;
1955 if ( bExp == 0 ) {
1956 --expDiff;
1958 else {
1959 bSig |= LIT64( 0x2000000000000000 );
1961 shift64RightJamming( bSig, expDiff, &bSig );
1962 zExp = aExp;
1964 else if ( expDiff < 0 ) {
1965 if ( bExp == 0x7FF ) {
1966 if ( bSig ) return propagateFloat64NaN( a, b );
1967 return packFloat64( zSign, 0x7FF, 0 );
1969 if ( aExp == 0 ) {
1970 ++expDiff;
1972 else {
1973 aSig |= LIT64( 0x2000000000000000 );
1975 shift64RightJamming( aSig, - expDiff, &aSig );
1976 zExp = bExp;
1978 else {
1979 if ( aExp == 0x7FF ) {
1980 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1981 return a;
1983 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1984 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1985 zExp = aExp;
1986 goto roundAndPack;
1988 aSig |= LIT64( 0x2000000000000000 );
1989 zSig = ( aSig + bSig )<<1;
1990 --zExp;
1991 if ( (sbits64) zSig < 0 ) {
1992 zSig = aSig + bSig;
1993 ++zExp;
1995 roundAndPack:
1996 return roundAndPackFloat64( zSign, zExp, zSig );
2001 -------------------------------------------------------------------------------
2002 Returns the result of subtracting the absolute values of the double-
2003 precision floating-point values `a' and `b'. If `zSign' is true, the
2004 difference is negated before being returned. `zSign' is ignored if the
2005 result is a NaN. The subtraction is performed according to the IEC/IEEE
2006 Standard for Binary Floating-point Arithmetic.
2007 -------------------------------------------------------------------------------
2009 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2011 int16 aExp, bExp, zExp;
2012 bits64 aSig, bSig, zSig;
2013 int16 expDiff;
2015 aSig = extractFloat64Frac( a );
2016 aExp = extractFloat64Exp( a );
2017 bSig = extractFloat64Frac( b );
2018 bExp = extractFloat64Exp( b );
2019 expDiff = aExp - bExp;
2020 aSig <<= 10;
2021 bSig <<= 10;
2022 if ( 0 < expDiff ) goto aExpBigger;
2023 if ( expDiff < 0 ) goto bExpBigger;
2024 if ( aExp == 0x7FF ) {
2025 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2026 float_raise( float_flag_invalid );
2027 return float64_default_nan;
2029 if ( aExp == 0 ) {
2030 aExp = 1;
2031 bExp = 1;
2033 if ( bSig < aSig ) goto aBigger;
2034 if ( aSig < bSig ) goto bBigger;
2035 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2036 bExpBigger:
2037 if ( bExp == 0x7FF ) {
2038 if ( bSig ) return propagateFloat64NaN( a, b );
2039 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2041 if ( aExp == 0 ) {
2042 ++expDiff;
2044 else {
2045 aSig |= LIT64( 0x4000000000000000 );
2047 shift64RightJamming( aSig, - expDiff, &aSig );
2048 bSig |= LIT64( 0x4000000000000000 );
2049 bBigger:
2050 zSig = bSig - aSig;
2051 zExp = bExp;
2052 zSign ^= 1;
2053 goto normalizeRoundAndPack;
2054 aExpBigger:
2055 if ( aExp == 0x7FF ) {
2056 if ( aSig ) return propagateFloat64NaN( a, b );
2057 return a;
2059 if ( bExp == 0 ) {
2060 --expDiff;
2062 else {
2063 bSig |= LIT64( 0x4000000000000000 );
2065 shift64RightJamming( bSig, expDiff, &bSig );
2066 aSig |= LIT64( 0x4000000000000000 );
2067 aBigger:
2068 zSig = aSig - bSig;
2069 zExp = aExp;
2070 normalizeRoundAndPack:
2071 --zExp;
2072 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2077 -------------------------------------------------------------------------------
2078 Returns the result of adding the double-precision floating-point values `a'
2079 and `b'. The operation is performed according to the IEC/IEEE Standard for
2080 Binary Floating-point Arithmetic.
2081 -------------------------------------------------------------------------------
2083 float64 float64_add( float64 a, float64 b )
2085 flag aSign, bSign;
2087 aSign = extractFloat64Sign( a );
2088 bSign = extractFloat64Sign( b );
2089 if ( aSign == bSign ) {
2090 return addFloat64Sigs( a, b, aSign );
2092 else {
2093 return subFloat64Sigs( a, b, aSign );
2099 -------------------------------------------------------------------------------
2100 Returns the result of subtracting the double-precision floating-point values
2101 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2102 for Binary Floating-point Arithmetic.
2103 -------------------------------------------------------------------------------
2105 float64 float64_sub( float64 a, float64 b )
2107 flag aSign, bSign;
2109 aSign = extractFloat64Sign( a );
2110 bSign = extractFloat64Sign( b );
2111 if ( aSign == bSign ) {
2112 return subFloat64Sigs( a, b, aSign );
2114 else {
2115 return addFloat64Sigs( a, b, aSign );
2121 -------------------------------------------------------------------------------
2122 Returns the result of multiplying the double-precision floating-point values
2123 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2124 for Binary Floating-point Arithmetic.
2125 -------------------------------------------------------------------------------
2127 float64 float64_mul( float64 a, float64 b )
2129 flag aSign, bSign, zSign;
2130 int16 aExp, bExp, zExp;
2131 bits64 aSig, bSig, zSig0, zSig1;
2133 aSig = extractFloat64Frac( a );
2134 aExp = extractFloat64Exp( a );
2135 aSign = extractFloat64Sign( a );
2136 bSig = extractFloat64Frac( b );
2137 bExp = extractFloat64Exp( b );
2138 bSign = extractFloat64Sign( b );
2139 zSign = aSign ^ bSign;
2140 if ( aExp == 0x7FF ) {
2141 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2142 return propagateFloat64NaN( a, b );
2144 if ( ( bExp | bSig ) == 0 ) {
2145 float_raise( float_flag_invalid );
2146 return float64_default_nan;
2148 return packFloat64( zSign, 0x7FF, 0 );
2150 if ( bExp == 0x7FF ) {
2151 if ( bSig ) return propagateFloat64NaN( a, b );
2152 if ( ( aExp | aSig ) == 0 ) {
2153 float_raise( float_flag_invalid );
2154 return float64_default_nan;
2156 return packFloat64( zSign, 0x7FF, 0 );
2158 if ( aExp == 0 ) {
2159 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2160 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2162 if ( bExp == 0 ) {
2163 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2164 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2166 zExp = aExp + bExp - 0x3FF;
2167 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2168 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2169 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2170 zSig0 |= ( zSig1 != 0 );
2171 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2172 zSig0 <<= 1;
2173 --zExp;
2175 return roundAndPackFloat64( zSign, zExp, zSig0 );
2180 -------------------------------------------------------------------------------
2181 Returns the result of dividing the double-precision floating-point value `a'
2182 by the corresponding value `b'. The operation is performed according to
2183 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2184 -------------------------------------------------------------------------------
2186 float64 float64_div( float64 a, float64 b )
2188 flag aSign, bSign, zSign;
2189 int16 aExp, bExp, zExp;
2190 bits64 aSig, bSig, zSig;
2191 bits64 rem0, rem1;
2192 bits64 term0, term1;
2194 aSig = extractFloat64Frac( a );
2195 aExp = extractFloat64Exp( a );
2196 aSign = extractFloat64Sign( a );
2197 bSig = extractFloat64Frac( b );
2198 bExp = extractFloat64Exp( b );
2199 bSign = extractFloat64Sign( b );
2200 zSign = aSign ^ bSign;
2201 if ( aExp == 0x7FF ) {
2202 if ( aSig ) return propagateFloat64NaN( a, b );
2203 if ( bExp == 0x7FF ) {
2204 if ( bSig ) return propagateFloat64NaN( a, b );
2205 float_raise( float_flag_invalid );
2206 return float64_default_nan;
2208 return packFloat64( zSign, 0x7FF, 0 );
2210 if ( bExp == 0x7FF ) {
2211 if ( bSig ) return propagateFloat64NaN( a, b );
2212 return packFloat64( zSign, 0, 0 );
2214 if ( bExp == 0 ) {
2215 if ( bSig == 0 ) {
2216 if ( ( aExp | aSig ) == 0 ) {
2217 float_raise( float_flag_invalid );
2218 return float64_default_nan;
2220 float_raise( float_flag_divbyzero );
2221 return packFloat64( zSign, 0x7FF, 0 );
2223 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2225 if ( aExp == 0 ) {
2226 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2227 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2229 zExp = aExp - bExp + 0x3FD;
2230 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2231 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2232 if ( bSig <= ( aSig + aSig ) ) {
2233 aSig >>= 1;
2234 ++zExp;
2236 zSig = estimateDiv128To64( aSig, 0, bSig );
2237 if ( ( zSig & 0x1FF ) <= 2 ) {
2238 mul64To128( bSig, zSig, &term0, &term1 );
2239 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2240 while ( (sbits64) rem0 < 0 ) {
2241 --zSig;
2242 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2244 zSig |= ( rem1 != 0 );
2246 return roundAndPackFloat64( zSign, zExp, zSig );
2251 -------------------------------------------------------------------------------
2252 Returns the remainder of the double-precision floating-point value `a'
2253 with respect to the corresponding value `b'. The operation is performed
2254 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2255 -------------------------------------------------------------------------------
2257 float64 float64_rem( float64 a, float64 b )
2259 flag aSign, bSign, zSign;
2260 int16 aExp, bExp, expDiff;
2261 bits64 aSig, bSig;
2262 bits64 q, alternateASig;
2263 sbits64 sigMean;
2265 aSig = extractFloat64Frac( a );
2266 aExp = extractFloat64Exp( a );
2267 aSign = extractFloat64Sign( a );
2268 bSig = extractFloat64Frac( b );
2269 bExp = extractFloat64Exp( b );
2270 bSign = extractFloat64Sign( b );
2271 if ( aExp == 0x7FF ) {
2272 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2273 return propagateFloat64NaN( a, b );
2275 float_raise( float_flag_invalid );
2276 return float64_default_nan;
2278 if ( bExp == 0x7FF ) {
2279 if ( bSig ) return propagateFloat64NaN( a, b );
2280 return a;
2282 if ( bExp == 0 ) {
2283 if ( bSig == 0 ) {
2284 float_raise( float_flag_invalid );
2285 return float64_default_nan;
2287 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2289 if ( aExp == 0 ) {
2290 if ( aSig == 0 ) return a;
2291 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2293 expDiff = aExp - bExp;
2294 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2295 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2296 if ( expDiff < 0 ) {
2297 if ( expDiff < -1 ) return a;
2298 aSig >>= 1;
2300 q = ( bSig <= aSig );
2301 if ( q ) aSig -= bSig;
2302 expDiff -= 64;
2303 while ( 0 < expDiff ) {
2304 q = estimateDiv128To64( aSig, 0, bSig );
2305 q = ( 2 < q ) ? q - 2 : 0;
2306 aSig = - ( ( bSig>>2 ) * q );
2307 expDiff -= 62;
2309 expDiff += 64;
2310 if ( 0 < expDiff ) {
2311 q = estimateDiv128To64( aSig, 0, bSig );
2312 q = ( 2 < q ) ? q - 2 : 0;
2313 q >>= 64 - expDiff;
2314 bSig >>= 2;
2315 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2317 else {
2318 aSig >>= 2;
2319 bSig >>= 2;
2321 do {
2322 alternateASig = aSig;
2323 ++q;
2324 aSig -= bSig;
2325 } while ( 0 <= (sbits64) aSig );
2326 sigMean = aSig + alternateASig;
2327 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2328 aSig = alternateASig;
2330 zSign = ( (sbits64) aSig < 0 );
2331 if ( zSign ) aSig = - aSig;
2332 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2337 -------------------------------------------------------------------------------
2338 Returns the square root of the double-precision floating-point value `a'.
2339 The operation is performed according to the IEC/IEEE Standard for Binary
2340 Floating-point Arithmetic.
2341 -------------------------------------------------------------------------------
2343 float64 float64_sqrt( float64 a )
2345 flag aSign;
2346 int16 aExp, zExp;
2347 bits64 aSig, zSig;
2348 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2349 //float64 z;
2351 aSig = extractFloat64Frac( a );
2352 aExp = extractFloat64Exp( a );
2353 aSign = extractFloat64Sign( a );
2354 if ( aExp == 0x7FF ) {
2355 if ( aSig ) return propagateFloat64NaN( a, a );
2356 if ( ! aSign ) return a;
2357 float_raise( float_flag_invalid );
2358 return float64_default_nan;
2360 if ( aSign ) {
2361 if ( ( aExp | aSig ) == 0 ) return a;
2362 float_raise( float_flag_invalid );
2363 return float64_default_nan;
2365 if ( aExp == 0 ) {
2366 if ( aSig == 0 ) return 0;
2367 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2369 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2370 aSig |= LIT64( 0x0010000000000000 );
2371 zSig = estimateSqrt32( aExp, aSig>>21 );
2372 zSig <<= 31;
2373 aSig <<= 9 - ( aExp & 1 );
2374 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2375 if ( ( zSig & 0x3FF ) <= 5 ) {
2376 if ( zSig < 2 ) {
2377 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2379 else {
2380 aSig <<= 2;
2381 mul64To128( zSig, zSig, &term0, &term1 );
2382 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2383 while ( (sbits64) rem0 < 0 ) {
2384 --zSig;
2385 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2386 term1 |= 1;
2387 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2389 zSig |= ( ( rem0 | rem1 ) != 0 );
2392 shift64RightJamming( zSig, 1, &zSig );
2393 return roundAndPackFloat64( 0, zExp, zSig );
2398 -------------------------------------------------------------------------------
2399 Returns 1 if the double-precision floating-point value `a' is equal to the
2400 corresponding value `b', and 0 otherwise. The comparison is performed
2401 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2402 -------------------------------------------------------------------------------
2404 flag float64_eq( float64 a, float64 b )
2407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2410 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2411 float_raise( float_flag_invalid );
2413 return 0;
2415 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2420 -------------------------------------------------------------------------------
2421 Returns 1 if the double-precision floating-point value `a' is less than or
2422 equal to the corresponding value `b', and 0 otherwise. The comparison is
2423 performed according to the IEC/IEEE Standard for Binary Floating-point
2424 Arithmetic.
2425 -------------------------------------------------------------------------------
2427 flag float64_le( float64 a, float64 b )
2429 flag aSign, bSign;
2431 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2432 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2434 float_raise( float_flag_invalid );
2435 return 0;
2437 aSign = extractFloat64Sign( a );
2438 bSign = extractFloat64Sign( b );
2439 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2440 return ( a == b ) || ( aSign ^ ( a < b ) );
2445 -------------------------------------------------------------------------------
2446 Returns 1 if the double-precision floating-point value `a' is less than
2447 the corresponding value `b', and 0 otherwise. The comparison is performed
2448 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2449 -------------------------------------------------------------------------------
2451 flag float64_lt( float64 a, float64 b )
2453 flag aSign, bSign;
2455 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2456 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2458 float_raise( float_flag_invalid );
2459 return 0;
2461 aSign = extractFloat64Sign( a );
2462 bSign = extractFloat64Sign( b );
2463 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2464 return ( a != b ) && ( aSign ^ ( a < b ) );
2469 -------------------------------------------------------------------------------
2470 Returns 1 if the double-precision floating-point value `a' is equal to the
2471 corresponding value `b', and 0 otherwise. The invalid exception is raised
2472 if either operand is a NaN. Otherwise, the comparison is performed
2473 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2474 -------------------------------------------------------------------------------
2476 flag float64_eq_signaling( float64 a, float64 b )
2479 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2480 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2482 float_raise( float_flag_invalid );
2483 return 0;
2485 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2490 -------------------------------------------------------------------------------
2491 Returns 1 if the double-precision floating-point value `a' is less than or
2492 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2493 cause an exception. Otherwise, the comparison is performed according to the
2494 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2495 -------------------------------------------------------------------------------
2497 flag float64_le_quiet( float64 a, float64 b )
2499 flag aSign, bSign;
2500 //int16 aExp, bExp;
2502 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2503 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2505 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2506 float_raise( float_flag_invalid );
2508 return 0;
2510 aSign = extractFloat64Sign( a );
2511 bSign = extractFloat64Sign( b );
2512 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2513 return ( a == b ) || ( aSign ^ ( a < b ) );
2518 -------------------------------------------------------------------------------
2519 Returns 1 if the double-precision floating-point value `a' is less than
2520 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2521 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2522 Standard for Binary Floating-point Arithmetic.
2523 -------------------------------------------------------------------------------
2525 flag float64_lt_quiet( float64 a, float64 b )
2527 flag aSign, bSign;
2529 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2530 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2532 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2533 float_raise( float_flag_invalid );
2535 return 0;
2537 aSign = extractFloat64Sign( a );
2538 bSign = extractFloat64Sign( b );
2539 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2540 return ( a != b ) && ( aSign ^ ( a < b ) );
2544 #ifdef FLOATX80
2547 -------------------------------------------------------------------------------
2548 Returns the result of converting the extended double-precision floating-
2549 point value `a' to the 32-bit two's complement integer format. The
2550 conversion is performed according to the IEC/IEEE Standard for Binary
2551 Floating-point Arithmetic---which means in particular that the conversion
2552 is rounded according to the current rounding mode. If `a' is a NaN, the
2553 largest positive integer is returned. Otherwise, if the conversion
2554 overflows, the largest integer with the same sign as `a' is returned.
2555 -------------------------------------------------------------------------------
2557 int32 floatx80_to_int32( floatx80 a )
2559 flag aSign;
2560 int32 aExp, shiftCount;
2561 bits64 aSig;
2563 aSig = extractFloatx80Frac( a );
2564 aExp = extractFloatx80Exp( a );
2565 aSign = extractFloatx80Sign( a );
2566 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2567 shiftCount = 0x4037 - aExp;
2568 if ( shiftCount <= 0 ) shiftCount = 1;
2569 shift64RightJamming( aSig, shiftCount, &aSig );
2570 return roundAndPackInt32( aSign, aSig );
2575 -------------------------------------------------------------------------------
2576 Returns the result of converting the extended double-precision floating-
2577 point value `a' to the 32-bit two's complement integer format. The
2578 conversion is performed according to the IEC/IEEE Standard for Binary
2579 Floating-point Arithmetic, except that the conversion is always rounded
2580 toward zero. If `a' is a NaN, the largest positive integer is returned.
2581 Otherwise, if the conversion overflows, the largest integer with the same
2582 sign as `a' is returned.
2583 -------------------------------------------------------------------------------
2585 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2587 flag aSign;
2588 int32 aExp, shiftCount;
2589 bits64 aSig, savedASig;
2590 int32 z;
2592 aSig = extractFloatx80Frac( a );
2593 aExp = extractFloatx80Exp( a );
2594 aSign = extractFloatx80Sign( a );
2595 shiftCount = 0x403E - aExp;
2596 if ( shiftCount < 32 ) {
2597 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2598 goto invalid;
2600 else if ( 63 < shiftCount ) {
2601 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2602 return 0;
2604 savedASig = aSig;
2605 aSig >>= shiftCount;
2606 z = aSig;
2607 if ( aSign ) z = - z;
2608 if ( ( z < 0 ) ^ aSign ) {
2609 invalid:
2610 float_exception_flags |= float_flag_invalid;
2611 return aSign ? 0x80000000 : 0x7FFFFFFF;
2613 if ( ( aSig<<shiftCount ) != savedASig ) {
2614 float_exception_flags |= float_flag_inexact;
2616 return z;
2621 -------------------------------------------------------------------------------
2622 Returns the result of converting the extended double-precision floating-
2623 point value `a' to the single-precision floating-point format. The
2624 conversion is performed according to the IEC/IEEE Standard for Binary
2625 Floating-point Arithmetic.
2626 -------------------------------------------------------------------------------
2628 float32 floatx80_to_float32( floatx80 a )
2630 flag aSign;
2631 int32 aExp;
2632 bits64 aSig;
2634 aSig = extractFloatx80Frac( a );
2635 aExp = extractFloatx80Exp( a );
2636 aSign = extractFloatx80Sign( a );
2637 if ( aExp == 0x7FFF ) {
2638 if ( (bits64) ( aSig<<1 ) ) {
2639 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2641 return packFloat32( aSign, 0xFF, 0 );
2643 shift64RightJamming( aSig, 33, &aSig );
2644 if ( aExp || aSig ) aExp -= 0x3F81;
2645 return roundAndPackFloat32( aSign, aExp, aSig );
2650 -------------------------------------------------------------------------------
2651 Returns the result of converting the extended double-precision floating-
2652 point value `a' to the double-precision floating-point format. The
2653 conversion is performed according to the IEC/IEEE Standard for Binary
2654 Floating-point Arithmetic.
2655 -------------------------------------------------------------------------------
2657 float64 floatx80_to_float64( floatx80 a )
2659 flag aSign;
2660 int32 aExp;
2661 bits64 aSig, zSig;
2663 aSig = extractFloatx80Frac( a );
2664 aExp = extractFloatx80Exp( a );
2665 aSign = extractFloatx80Sign( a );
2666 if ( aExp == 0x7FFF ) {
2667 if ( (bits64) ( aSig<<1 ) ) {
2668 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2670 return packFloat64( aSign, 0x7FF, 0 );
2672 shift64RightJamming( aSig, 1, &zSig );
2673 if ( aExp || aSig ) aExp -= 0x3C01;
2674 return roundAndPackFloat64( aSign, aExp, zSig );
2679 -------------------------------------------------------------------------------
2680 Rounds the extended double-precision floating-point value `a' to an integer,
2681 and returns the result as an extended quadruple-precision floating-point
2682 value. The operation is performed according to the IEC/IEEE Standard for
2683 Binary Floating-point Arithmetic.
2684 -------------------------------------------------------------------------------
2686 floatx80 floatx80_round_to_int( floatx80 a )
2688 flag aSign;
2689 int32 aExp;
2690 bits64 lastBitMask, roundBitsMask;
2691 int8 roundingMode;
2692 floatx80 z;
2694 aExp = extractFloatx80Exp( a );
2695 if ( 0x403E <= aExp ) {
2696 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2697 return propagateFloatx80NaN( a, a );
2699 return a;
2701 if ( aExp <= 0x3FFE ) {
2702 if ( ( aExp == 0 )
2703 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2704 return a;
2706 float_exception_flags |= float_flag_inexact;
2707 aSign = extractFloatx80Sign( a );
2708 switch ( float_rounding_mode ) {
2709 case float_round_nearest_even:
2710 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2712 return
2713 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2715 break;
2716 case float_round_down:
2717 return
2718 aSign ?
2719 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2720 : packFloatx80( 0, 0, 0 );
2721 case float_round_up:
2722 return
2723 aSign ? packFloatx80( 1, 0, 0 )
2724 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2726 return packFloatx80( aSign, 0, 0 );
2728 lastBitMask = 1;
2729 lastBitMask <<= 0x403E - aExp;
2730 roundBitsMask = lastBitMask - 1;
2731 z = a;
2732 roundingMode = float_rounding_mode;
2733 if ( roundingMode == float_round_nearest_even ) {
2734 z.low += lastBitMask>>1;
2735 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2737 else if ( roundingMode != float_round_to_zero ) {
2738 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2739 z.low += roundBitsMask;
2742 z.low &= ~ roundBitsMask;
2743 if ( z.low == 0 ) {
2744 ++z.high;
2745 z.low = LIT64( 0x8000000000000000 );
2747 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2748 return z;
2753 -------------------------------------------------------------------------------
2754 Returns the result of adding the absolute values of the extended double-
2755 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2756 negated before being returned. `zSign' is ignored if the result is a NaN.
2757 The addition is performed according to the IEC/IEEE Standard for Binary
2758 Floating-point Arithmetic.
2759 -------------------------------------------------------------------------------
2761 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2763 int32 aExp, bExp, zExp;
2764 bits64 aSig, bSig, zSig0, zSig1;
2765 int32 expDiff;
2767 aSig = extractFloatx80Frac( a );
2768 aExp = extractFloatx80Exp( a );
2769 bSig = extractFloatx80Frac( b );
2770 bExp = extractFloatx80Exp( b );
2771 expDiff = aExp - bExp;
2772 if ( 0 < expDiff ) {
2773 if ( aExp == 0x7FFF ) {
2774 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2775 return a;
2777 if ( bExp == 0 ) --expDiff;
2778 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2779 zExp = aExp;
2781 else if ( expDiff < 0 ) {
2782 if ( bExp == 0x7FFF ) {
2783 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2784 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2786 if ( aExp == 0 ) ++expDiff;
2787 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2788 zExp = bExp;
2790 else {
2791 if ( aExp == 0x7FFF ) {
2792 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2793 return propagateFloatx80NaN( a, b );
2795 return a;
2797 zSig1 = 0;
2798 zSig0 = aSig + bSig;
2799 if ( aExp == 0 ) {
2800 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2801 goto roundAndPack;
2803 zExp = aExp;
2804 goto shiftRight1;
2807 zSig0 = aSig + bSig;
2809 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2810 shiftRight1:
2811 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2812 zSig0 |= LIT64( 0x8000000000000000 );
2813 ++zExp;
2814 roundAndPack:
2815 return
2816 roundAndPackFloatx80(
2817 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2822 -------------------------------------------------------------------------------
2823 Returns the result of subtracting the absolute values of the extended
2824 double-precision floating-point values `a' and `b'. If `zSign' is true,
2825 the difference is negated before being returned. `zSign' is ignored if the
2826 result is a NaN. The subtraction is performed according to the IEC/IEEE
2827 Standard for Binary Floating-point Arithmetic.
2828 -------------------------------------------------------------------------------
2830 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2832 int32 aExp, bExp, zExp;
2833 bits64 aSig, bSig, zSig0, zSig1;
2834 int32 expDiff;
2835 floatx80 z;
2837 aSig = extractFloatx80Frac( a );
2838 aExp = extractFloatx80Exp( a );
2839 bSig = extractFloatx80Frac( b );
2840 bExp = extractFloatx80Exp( b );
2841 expDiff = aExp - bExp;
2842 if ( 0 < expDiff ) goto aExpBigger;
2843 if ( expDiff < 0 ) goto bExpBigger;
2844 if ( aExp == 0x7FFF ) {
2845 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2846 return propagateFloatx80NaN( a, b );
2848 float_raise( float_flag_invalid );
2849 z.low = floatx80_default_nan_low;
2850 z.high = floatx80_default_nan_high;
2851 return z;
2853 if ( aExp == 0 ) {
2854 aExp = 1;
2855 bExp = 1;
2857 zSig1 = 0;
2858 if ( bSig < aSig ) goto aBigger;
2859 if ( aSig < bSig ) goto bBigger;
2860 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2861 bExpBigger:
2862 if ( bExp == 0x7FFF ) {
2863 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2864 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2866 if ( aExp == 0 ) ++expDiff;
2867 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2868 bBigger:
2869 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2870 zExp = bExp;
2871 zSign ^= 1;
2872 goto normalizeRoundAndPack;
2873 aExpBigger:
2874 if ( aExp == 0x7FFF ) {
2875 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2876 return a;
2878 if ( bExp == 0 ) --expDiff;
2879 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2880 aBigger:
2881 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2882 zExp = aExp;
2883 normalizeRoundAndPack:
2884 return
2885 normalizeRoundAndPackFloatx80(
2886 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2891 -------------------------------------------------------------------------------
2892 Returns the result of adding the extended double-precision floating-point
2893 values `a' and `b'. The operation is performed according to the IEC/IEEE
2894 Standard for Binary Floating-point Arithmetic.
2895 -------------------------------------------------------------------------------
2897 floatx80 floatx80_add( floatx80 a, floatx80 b )
2899 flag aSign, bSign;
2901 aSign = extractFloatx80Sign( a );
2902 bSign = extractFloatx80Sign( b );
2903 if ( aSign == bSign ) {
2904 return addFloatx80Sigs( a, b, aSign );
2906 else {
2907 return subFloatx80Sigs( a, b, aSign );
2913 -------------------------------------------------------------------------------
2914 Returns the result of subtracting the extended double-precision floating-
2915 point values `a' and `b'. The operation is performed according to the
2916 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2917 -------------------------------------------------------------------------------
2919 floatx80 floatx80_sub( floatx80 a, floatx80 b )
2921 flag aSign, bSign;
2923 aSign = extractFloatx80Sign( a );
2924 bSign = extractFloatx80Sign( b );
2925 if ( aSign == bSign ) {
2926 return subFloatx80Sigs( a, b, aSign );
2928 else {
2929 return addFloatx80Sigs( a, b, aSign );
2935 -------------------------------------------------------------------------------
2936 Returns the result of multiplying the extended double-precision floating-
2937 point values `a' and `b'. The operation is performed according to the
2938 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2939 -------------------------------------------------------------------------------
2941 floatx80 floatx80_mul( floatx80 a, floatx80 b )
2943 flag aSign, bSign, zSign;
2944 int32 aExp, bExp, zExp;
2945 bits64 aSig, bSig, zSig0, zSig1;
2946 floatx80 z;
2948 aSig = extractFloatx80Frac( a );
2949 aExp = extractFloatx80Exp( a );
2950 aSign = extractFloatx80Sign( a );
2951 bSig = extractFloatx80Frac( b );
2952 bExp = extractFloatx80Exp( b );
2953 bSign = extractFloatx80Sign( b );
2954 zSign = aSign ^ bSign;
2955 if ( aExp == 0x7FFF ) {
2956 if ( (bits64) ( aSig<<1 )
2957 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2958 return propagateFloatx80NaN( a, b );
2960 if ( ( bExp | bSig ) == 0 ) goto invalid;
2961 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2963 if ( bExp == 0x7FFF ) {
2964 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2965 if ( ( aExp | aSig ) == 0 ) {
2966 invalid:
2967 float_raise( float_flag_invalid );
2968 z.low = floatx80_default_nan_low;
2969 z.high = floatx80_default_nan_high;
2970 return z;
2972 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2974 if ( aExp == 0 ) {
2975 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2976 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2978 if ( bExp == 0 ) {
2979 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2980 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2982 zExp = aExp + bExp - 0x3FFE;
2983 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2984 if ( 0 < (sbits64) zSig0 ) {
2985 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2986 --zExp;
2988 return
2989 roundAndPackFloatx80(
2990 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2995 -------------------------------------------------------------------------------
2996 Returns the result of dividing the extended double-precision floating-point
2997 value `a' by the corresponding value `b'. The operation is performed
2998 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2999 -------------------------------------------------------------------------------
3001 floatx80 floatx80_div( floatx80 a, floatx80 b )
3003 flag aSign, bSign, zSign;
3004 int32 aExp, bExp, zExp;
3005 bits64 aSig, bSig, zSig0, zSig1;
3006 bits64 rem0, rem1, rem2, term0, term1, term2;
3007 floatx80 z;
3009 aSig = extractFloatx80Frac( a );
3010 aExp = extractFloatx80Exp( a );
3011 aSign = extractFloatx80Sign( a );
3012 bSig = extractFloatx80Frac( b );
3013 bExp = extractFloatx80Exp( b );
3014 bSign = extractFloatx80Sign( b );
3015 zSign = aSign ^ bSign;
3016 if ( aExp == 0x7FFF ) {
3017 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3018 if ( bExp == 0x7FFF ) {
3019 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3020 goto invalid;
3022 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3024 if ( bExp == 0x7FFF ) {
3025 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3026 return packFloatx80( zSign, 0, 0 );
3028 if ( bExp == 0 ) {
3029 if ( bSig == 0 ) {
3030 if ( ( aExp | aSig ) == 0 ) {
3031 invalid:
3032 float_raise( float_flag_invalid );
3033 z.low = floatx80_default_nan_low;
3034 z.high = floatx80_default_nan_high;
3035 return z;
3037 float_raise( float_flag_divbyzero );
3038 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3040 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3042 if ( aExp == 0 ) {
3043 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3044 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3046 zExp = aExp - bExp + 0x3FFE;
3047 rem1 = 0;
3048 if ( bSig <= aSig ) {
3049 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3050 ++zExp;
3052 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3053 mul64To128( bSig, zSig0, &term0, &term1 );
3054 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3055 while ( (sbits64) rem0 < 0 ) {
3056 --zSig0;
3057 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3059 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3060 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3061 mul64To128( bSig, zSig1, &term1, &term2 );
3062 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3063 while ( (sbits64) rem1 < 0 ) {
3064 --zSig1;
3065 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3067 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3069 return
3070 roundAndPackFloatx80(
3071 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3076 -------------------------------------------------------------------------------
3077 Returns the remainder of the extended double-precision floating-point value
3078 `a' with respect to the corresponding value `b'. The operation is performed
3079 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3080 -------------------------------------------------------------------------------
3082 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3084 flag aSign, bSign, zSign;
3085 int32 aExp, bExp, expDiff;
3086 bits64 aSig0, aSig1, bSig;
3087 bits64 q, term0, term1, alternateASig0, alternateASig1;
3088 floatx80 z;
3090 aSig0 = extractFloatx80Frac( a );
3091 aExp = extractFloatx80Exp( a );
3092 aSign = extractFloatx80Sign( a );
3093 bSig = extractFloatx80Frac( b );
3094 bExp = extractFloatx80Exp( b );
3095 bSign = extractFloatx80Sign( b );
3096 if ( aExp == 0x7FFF ) {
3097 if ( (bits64) ( aSig0<<1 )
3098 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3099 return propagateFloatx80NaN( a, b );
3101 goto invalid;
3103 if ( bExp == 0x7FFF ) {
3104 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3105 return a;
3107 if ( bExp == 0 ) {
3108 if ( bSig == 0 ) {
3109 invalid:
3110 float_raise( float_flag_invalid );
3111 z.low = floatx80_default_nan_low;
3112 z.high = floatx80_default_nan_high;
3113 return z;
3115 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3117 if ( aExp == 0 ) {
3118 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3119 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3121 bSig |= LIT64( 0x8000000000000000 );
3122 zSign = aSign;
3123 expDiff = aExp - bExp;
3124 aSig1 = 0;
3125 if ( expDiff < 0 ) {
3126 if ( expDiff < -1 ) return a;
3127 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3128 expDiff = 0;
3130 q = ( bSig <= aSig0 );
3131 if ( q ) aSig0 -= bSig;
3132 expDiff -= 64;
3133 while ( 0 < expDiff ) {
3134 q = estimateDiv128To64( aSig0, aSig1, bSig );
3135 q = ( 2 < q ) ? q - 2 : 0;
3136 mul64To128( bSig, q, &term0, &term1 );
3137 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3138 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3139 expDiff -= 62;
3141 expDiff += 64;
3142 if ( 0 < expDiff ) {
3143 q = estimateDiv128To64( aSig0, aSig1, bSig );
3144 q = ( 2 < q ) ? q - 2 : 0;
3145 q >>= 64 - expDiff;
3146 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3147 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3148 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3149 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3150 ++q;
3151 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3154 else {
3155 term1 = 0;
3156 term0 = bSig;
3158 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3159 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3160 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3161 && ( q & 1 ) )
3163 aSig0 = alternateASig0;
3164 aSig1 = alternateASig1;
3165 zSign = ! zSign;
3167 return
3168 normalizeRoundAndPackFloatx80(
3169 80, zSign, bExp + expDiff, aSig0, aSig1 );
3174 -------------------------------------------------------------------------------
3175 Returns the square root of the extended double-precision floating-point
3176 value `a'. The operation is performed according to the IEC/IEEE Standard
3177 for Binary Floating-point Arithmetic.
3178 -------------------------------------------------------------------------------
3180 floatx80 floatx80_sqrt( floatx80 a )
3182 flag aSign;
3183 int32 aExp, zExp;
3184 bits64 aSig0, aSig1, zSig0, zSig1;
3185 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3186 bits64 shiftedRem0, shiftedRem1;
3187 floatx80 z;
3189 aSig0 = extractFloatx80Frac( a );
3190 aExp = extractFloatx80Exp( a );
3191 aSign = extractFloatx80Sign( a );
3192 if ( aExp == 0x7FFF ) {
3193 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3194 if ( ! aSign ) return a;
3195 goto invalid;
3197 if ( aSign ) {
3198 if ( ( aExp | aSig0 ) == 0 ) return a;
3199 invalid:
3200 float_raise( float_flag_invalid );
3201 z.low = floatx80_default_nan_low;
3202 z.high = floatx80_default_nan_high;
3203 return z;
3205 if ( aExp == 0 ) {
3206 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3207 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3209 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3210 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3211 zSig0 <<= 31;
3212 aSig1 = 0;
3213 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3214 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3215 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3216 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3217 mul64To128( zSig0, zSig0, &term0, &term1 );
3218 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3219 while ( (sbits64) rem0 < 0 ) {
3220 --zSig0;
3221 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3222 term1 |= 1;
3223 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3225 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3226 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3227 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3228 if ( zSig1 == 0 ) zSig1 = 1;
3229 mul64To128( zSig0, zSig1, &term1, &term2 );
3230 shortShift128Left( term1, term2, 1, &term1, &term2 );
3231 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3232 mul64To128( zSig1, zSig1, &term2, &term3 );
3233 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3234 while ( (sbits64) rem1 < 0 ) {
3235 --zSig1;
3236 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3237 term3 |= 1;
3238 add192(
3239 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3241 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3243 return
3244 roundAndPackFloatx80(
3245 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3250 -------------------------------------------------------------------------------
3251 Returns 1 if the extended double-precision floating-point value `a' is
3252 equal to the corresponding value `b', and 0 otherwise. The comparison is
3253 performed according to the IEC/IEEE Standard for Binary Floating-point
3254 Arithmetic.
3255 -------------------------------------------------------------------------------
3257 flag floatx80_eq( floatx80 a, floatx80 b )
3260 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3261 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3262 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3263 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3265 if ( floatx80_is_signaling_nan( a )
3266 || floatx80_is_signaling_nan( b ) ) {
3267 float_raise( float_flag_invalid );
3269 return 0;
3271 return
3272 ( a.low == b.low )
3273 && ( ( a.high == b.high )
3274 || ( ( a.low == 0 )
3275 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3281 -------------------------------------------------------------------------------
3282 Returns 1 if the extended double-precision floating-point value `a' is
3283 less than or equal to the corresponding value `b', and 0 otherwise. The
3284 comparison is performed according to the IEC/IEEE Standard for Binary
3285 Floating-point Arithmetic.
3286 -------------------------------------------------------------------------------
3288 flag floatx80_le( floatx80 a, floatx80 b )
3290 flag aSign, bSign;
3292 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3293 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3294 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3295 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3297 float_raise( float_flag_invalid );
3298 return 0;
3300 aSign = extractFloatx80Sign( a );
3301 bSign = extractFloatx80Sign( b );
3302 if ( aSign != bSign ) {
3303 return
3304 aSign
3305 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3306 == 0 );
3308 return
3309 aSign ? le128( b.high, b.low, a.high, a.low )
3310 : le128( a.high, a.low, b.high, b.low );
3315 -------------------------------------------------------------------------------
3316 Returns 1 if the extended double-precision floating-point value `a' is
3317 less than the corresponding value `b', and 0 otherwise. The comparison
3318 is performed according to the IEC/IEEE Standard for Binary Floating-point
3319 Arithmetic.
3320 -------------------------------------------------------------------------------
3322 flag floatx80_lt( floatx80 a, floatx80 b )
3324 flag aSign, bSign;
3326 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3327 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3328 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3329 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3331 float_raise( float_flag_invalid );
3332 return 0;
3334 aSign = extractFloatx80Sign( a );
3335 bSign = extractFloatx80Sign( b );
3336 if ( aSign != bSign ) {
3337 return
3338 aSign
3339 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3340 != 0 );
3342 return
3343 aSign ? lt128( b.high, b.low, a.high, a.low )
3344 : lt128( a.high, a.low, b.high, b.low );
3349 -------------------------------------------------------------------------------
3350 Returns 1 if the extended double-precision floating-point value `a' is equal
3351 to the corresponding value `b', and 0 otherwise. The invalid exception is
3352 raised if either operand is a NaN. Otherwise, the comparison is performed
3353 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3354 -------------------------------------------------------------------------------
3356 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3359 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3360 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3361 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3362 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3364 float_raise( float_flag_invalid );
3365 return 0;
3367 return
3368 ( a.low == b.low )
3369 && ( ( a.high == b.high )
3370 || ( ( a.low == 0 )
3371 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3377 -------------------------------------------------------------------------------
3378 Returns 1 if the extended double-precision floating-point value `a' is less
3379 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3380 do not cause an exception. Otherwise, the comparison is performed according
3381 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3382 -------------------------------------------------------------------------------
3384 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3386 flag aSign, bSign;
3388 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3389 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3390 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3391 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3393 if ( floatx80_is_signaling_nan( a )
3394 || floatx80_is_signaling_nan( b ) ) {
3395 float_raise( float_flag_invalid );
3397 return 0;
3399 aSign = extractFloatx80Sign( a );
3400 bSign = extractFloatx80Sign( b );
3401 if ( aSign != bSign ) {
3402 return
3403 aSign
3404 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3405 == 0 );
3407 return
3408 aSign ? le128( b.high, b.low, a.high, a.low )
3409 : le128( a.high, a.low, b.high, b.low );
3414 -------------------------------------------------------------------------------
3415 Returns 1 if the extended double-precision floating-point value `a' is less
3416 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3417 an exception. Otherwise, the comparison is performed according to the
3418 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3419 -------------------------------------------------------------------------------
3421 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3423 flag aSign, bSign;
3425 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3426 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3427 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3428 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3430 if ( floatx80_is_signaling_nan( a )
3431 || floatx80_is_signaling_nan( b ) ) {
3432 float_raise( float_flag_invalid );
3434 return 0;
3436 aSign = extractFloatx80Sign( a );
3437 bSign = extractFloatx80Sign( b );
3438 if ( aSign != bSign ) {
3439 return
3440 aSign
3441 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3442 != 0 );
3444 return
3445 aSign ? lt128( b.high, b.low, a.high, a.low )
3446 : lt128( a.high, a.low, b.high, b.low );
3450 #endif