MOXA linux-2.6.x / linux-2.6.9-uc0 from sdlinux-moxaart.tgz
[linux-2.6.9-moxart.git] / arch / arm / nwfpe / softfloat.c
blob9d743ae29062b221acf4e74ce474e6560c3ae761
1 /*
2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include "fpa11.h"
32 //#include "milieu.h"
33 //#include "softfloat.h"
36 -------------------------------------------------------------------------------
37 Floating-point rounding mode, extended double-precision rounding precision,
38 and exception flags.
39 -------------------------------------------------------------------------------
41 int8 float_rounding_mode = float_round_nearest_even;
42 int8 floatx80_rounding_precision = 80;
43 int8 float_exception_flags;
46 -------------------------------------------------------------------------------
47 Primitive arithmetic functions, including multi-word arithmetic, and
48 division and square root approximations. (Can be specialized to target if
49 desired.)
50 -------------------------------------------------------------------------------
52 #include "softfloat-macros"
55 -------------------------------------------------------------------------------
56 Functions and definitions to determine: (1) whether tininess for underflow
57 is detected before or after rounding by default, (2) what (if anything)
58 happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 are propagated from function inputs to output. These details are target-
61 specific.
62 -------------------------------------------------------------------------------
64 #include "softfloat-specialize"
67 -------------------------------------------------------------------------------
68 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69 and 7, and returns the properly rounded 32-bit integer corresponding to the
70 input. If `zSign' is nonzero, the input is negated before being converted
71 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
72 input is simply rounded to an integer, with the inexact exception raised if
73 the input cannot be represented exactly as an integer. If the fixed-point
74 input is too large, however, the invalid exception is raised and the largest
75 positive or negative integer is returned.
76 -------------------------------------------------------------------------------
78 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
80 int8 roundingMode;
81 flag roundNearestEven;
82 int8 roundIncrement, roundBits;
83 int32 z;
85 roundingMode = float_rounding_mode;
86 roundNearestEven = ( roundingMode == float_round_nearest_even );
87 roundIncrement = 0x40;
88 if ( ! roundNearestEven ) {
89 if ( roundingMode == float_round_to_zero ) {
90 roundIncrement = 0;
92 else {
93 roundIncrement = 0x7F;
94 if ( zSign ) {
95 if ( roundingMode == float_round_up ) roundIncrement = 0;
97 else {
98 if ( roundingMode == float_round_down ) roundIncrement = 0;
102 roundBits = absZ & 0x7F;
103 absZ = ( absZ + roundIncrement )>>7;
104 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
105 z = absZ;
106 if ( zSign ) z = - z;
107 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
108 float_exception_flags |= float_flag_invalid;
109 return zSign ? 0x80000000 : 0x7FFFFFFF;
111 if ( roundBits ) float_exception_flags |= float_flag_inexact;
112 return z;
117 -------------------------------------------------------------------------------
118 Returns the fraction bits of the single-precision floating-point value `a'.
119 -------------------------------------------------------------------------------
121 INLINE bits32 extractFloat32Frac( float32 a )
124 return a & 0x007FFFFF;
129 -------------------------------------------------------------------------------
130 Returns the exponent bits of the single-precision floating-point value `a'.
131 -------------------------------------------------------------------------------
133 INLINE int16 extractFloat32Exp( float32 a )
136 return ( a>>23 ) & 0xFF;
141 -------------------------------------------------------------------------------
142 Returns the sign bit of the single-precision floating-point value `a'.
143 -------------------------------------------------------------------------------
145 #if 0 /* in softfloat.h */
146 INLINE flag extractFloat32Sign( float32 a )
149 return a>>31;
152 #endif
155 -------------------------------------------------------------------------------
156 Normalizes the subnormal single-precision floating-point value represented
157 by the denormalized significand `aSig'. The normalized exponent and
158 significand are stored at the locations pointed to by `zExpPtr' and
159 `zSigPtr', respectively.
160 -------------------------------------------------------------------------------
162 static void
163 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
165 int8 shiftCount;
167 shiftCount = countLeadingZeros32( aSig ) - 8;
168 *zSigPtr = aSig<<shiftCount;
169 *zExpPtr = 1 - shiftCount;
174 -------------------------------------------------------------------------------
175 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
176 single-precision floating-point value, returning the result. After being
177 shifted into the proper positions, the three fields are simply added
178 together to form the result. This means that any integer portion of `zSig'
179 will be added into the exponent. Since a properly normalized significand
180 will have an integer portion equal to 1, the `zExp' input should be 1 less
181 than the desired result exponent whenever `zSig' is a complete, normalized
182 significand.
183 -------------------------------------------------------------------------------
185 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
187 #if 0
188 float32 f;
189 __asm__("@ packFloat32 \n\
190 mov %0, %1, asl #31 \n\
191 orr %0, %2, asl #23 \n\
192 orr %0, %3"
193 : /* no outputs */
194 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
195 : "cc");
196 return f;
197 #else
198 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
199 #endif
203 -------------------------------------------------------------------------------
204 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
205 and significand `zSig', and returns the proper single-precision floating-
206 point value corresponding to the abstract input. Ordinarily, the abstract
207 value is simply rounded and packed into the single-precision format, with
208 the inexact exception raised if the abstract input cannot be represented
209 exactly. If the abstract value is too large, however, the overflow and
210 inexact exceptions are raised and an infinity or maximal finite value is
211 returned. If the abstract value is too small, the input value is rounded to
212 a subnormal number, and the underflow and inexact exceptions are raised if
213 the abstract input cannot be represented exactly as a subnormal single-
214 precision floating-point number.
215 The input significand `zSig' has its binary point between bits 30
216 and 29, which is 7 bits to the left of the usual location. This shifted
217 significand must be normalized or smaller. If `zSig' is not normalized,
218 `zExp' must be 0; in that case, the result returned is a subnormal number,
219 and it must not require rounding. In the usual case that `zSig' is
220 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
221 The handling of underflow and overflow follows the IEC/IEEE Standard for
222 Binary Floating-point Arithmetic.
223 -------------------------------------------------------------------------------
225 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
227 int8 roundingMode;
228 flag roundNearestEven;
229 int8 roundIncrement, roundBits;
230 flag isTiny;
232 roundingMode = float_rounding_mode;
233 roundNearestEven = ( roundingMode == float_round_nearest_even );
234 roundIncrement = 0x40;
235 if ( ! roundNearestEven ) {
236 if ( roundingMode == float_round_to_zero ) {
237 roundIncrement = 0;
239 else {
240 roundIncrement = 0x7F;
241 if ( zSign ) {
242 if ( roundingMode == float_round_up ) roundIncrement = 0;
244 else {
245 if ( roundingMode == float_round_down ) roundIncrement = 0;
249 roundBits = zSig & 0x7F;
250 if ( 0xFD <= (bits16) zExp ) {
251 if ( ( 0xFD < zExp )
252 || ( ( zExp == 0xFD )
253 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
255 float_raise( float_flag_overflow | float_flag_inexact );
256 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
258 if ( zExp < 0 ) {
259 isTiny =
260 ( float_detect_tininess == float_tininess_before_rounding )
261 || ( zExp < -1 )
262 || ( zSig + roundIncrement < 0x80000000 );
263 shift32RightJamming( zSig, - zExp, &zSig );
264 zExp = 0;
265 roundBits = zSig & 0x7F;
266 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
269 if ( roundBits ) float_exception_flags |= float_flag_inexact;
270 zSig = ( zSig + roundIncrement )>>7;
271 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
272 if ( zSig == 0 ) zExp = 0;
273 return packFloat32( zSign, zExp, zSig );
278 -------------------------------------------------------------------------------
279 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
280 and significand `zSig', and returns the proper single-precision floating-
281 point value corresponding to the abstract input. This routine is just like
282 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
283 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
284 point exponent.
285 -------------------------------------------------------------------------------
287 static float32
288 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
290 int8 shiftCount;
292 shiftCount = countLeadingZeros32( zSig ) - 1;
293 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
298 -------------------------------------------------------------------------------
299 Returns the fraction bits of the double-precision floating-point value `a'.
300 -------------------------------------------------------------------------------
302 INLINE bits64 extractFloat64Frac( float64 a )
305 return a & LIT64( 0x000FFFFFFFFFFFFF );
310 -------------------------------------------------------------------------------
311 Returns the exponent bits of the double-precision floating-point value `a'.
312 -------------------------------------------------------------------------------
314 INLINE int16 extractFloat64Exp( float64 a )
317 return ( a>>52 ) & 0x7FF;
322 -------------------------------------------------------------------------------
323 Returns the sign bit of the double-precision floating-point value `a'.
324 -------------------------------------------------------------------------------
326 #if 0 /* in softfloat.h */
327 INLINE flag extractFloat64Sign( float64 a )
330 return a>>63;
333 #endif
336 -------------------------------------------------------------------------------
337 Normalizes the subnormal double-precision floating-point value represented
338 by the denormalized significand `aSig'. The normalized exponent and
339 significand are stored at the locations pointed to by `zExpPtr' and
340 `zSigPtr', respectively.
341 -------------------------------------------------------------------------------
343 static void
344 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
346 int8 shiftCount;
348 shiftCount = countLeadingZeros64( aSig ) - 11;
349 *zSigPtr = aSig<<shiftCount;
350 *zExpPtr = 1 - shiftCount;
355 -------------------------------------------------------------------------------
356 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
357 double-precision floating-point value, returning the result. After being
358 shifted into the proper positions, the three fields are simply added
359 together to form the result. This means that any integer portion of `zSig'
360 will be added into the exponent. Since a properly normalized significand
361 will have an integer portion equal to 1, the `zExp' input should be 1 less
362 than the desired result exponent whenever `zSig' is a complete, normalized
363 significand.
364 -------------------------------------------------------------------------------
366 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
369 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
374 -------------------------------------------------------------------------------
375 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
376 and significand `zSig', and returns the proper double-precision floating-
377 point value corresponding to the abstract input. Ordinarily, the abstract
378 value is simply rounded and packed into the double-precision format, with
379 the inexact exception raised if the abstract input cannot be represented
380 exactly. If the abstract value is too large, however, the overflow and
381 inexact exceptions are raised and an infinity or maximal finite value is
382 returned. If the abstract value is too small, the input value is rounded to
383 a subnormal number, and the underflow and inexact exceptions are raised if
384 the abstract input cannot be represented exactly as a subnormal double-
385 precision floating-point number.
386 The input significand `zSig' has its binary point between bits 62
387 and 61, which is 10 bits to the left of the usual location. This shifted
388 significand must be normalized or smaller. If `zSig' is not normalized,
389 `zExp' must be 0; in that case, the result returned is a subnormal number,
390 and it must not require rounding. In the usual case that `zSig' is
391 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
392 The handling of underflow and overflow follows the IEC/IEEE Standard for
393 Binary Floating-point Arithmetic.
394 -------------------------------------------------------------------------------
396 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
398 int8 roundingMode;
399 flag roundNearestEven;
400 int16 roundIncrement, roundBits;
401 flag isTiny;
403 roundingMode = float_rounding_mode;
404 roundNearestEven = ( roundingMode == float_round_nearest_even );
405 roundIncrement = 0x200;
406 if ( ! roundNearestEven ) {
407 if ( roundingMode == float_round_to_zero ) {
408 roundIncrement = 0;
410 else {
411 roundIncrement = 0x3FF;
412 if ( zSign ) {
413 if ( roundingMode == float_round_up ) roundIncrement = 0;
415 else {
416 if ( roundingMode == float_round_down ) roundIncrement = 0;
420 roundBits = zSig & 0x3FF;
421 if ( 0x7FD <= (bits16) zExp ) {
422 if ( ( 0x7FD < zExp )
423 || ( ( zExp == 0x7FD )
424 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
426 //register int lr = __builtin_return_address(0);
427 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
428 float_raise( float_flag_overflow | float_flag_inexact );
429 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
431 if ( zExp < 0 ) {
432 isTiny =
433 ( float_detect_tininess == float_tininess_before_rounding )
434 || ( zExp < -1 )
435 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
436 shift64RightJamming( zSig, - zExp, &zSig );
437 zExp = 0;
438 roundBits = zSig & 0x3FF;
439 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
442 if ( roundBits ) float_exception_flags |= float_flag_inexact;
443 zSig = ( zSig + roundIncrement )>>10;
444 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
445 if ( zSig == 0 ) zExp = 0;
446 return packFloat64( zSign, zExp, zSig );
451 -------------------------------------------------------------------------------
452 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
453 and significand `zSig', and returns the proper double-precision floating-
454 point value corresponding to the abstract input. This routine is just like
455 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
456 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
457 point exponent.
458 -------------------------------------------------------------------------------
460 static float64
461 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
463 int8 shiftCount;
465 shiftCount = countLeadingZeros64( zSig ) - 1;
466 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
470 #ifdef FLOATX80
473 -------------------------------------------------------------------------------
474 Returns the fraction bits of the extended double-precision floating-point
475 value `a'.
476 -------------------------------------------------------------------------------
478 INLINE bits64 extractFloatx80Frac( floatx80 a )
481 return a.low;
486 -------------------------------------------------------------------------------
487 Returns the exponent bits of the extended double-precision floating-point
488 value `a'.
489 -------------------------------------------------------------------------------
491 INLINE int32 extractFloatx80Exp( floatx80 a )
494 return a.high & 0x7FFF;
499 -------------------------------------------------------------------------------
500 Returns the sign bit of the extended double-precision floating-point value
501 `a'.
502 -------------------------------------------------------------------------------
504 INLINE flag extractFloatx80Sign( floatx80 a )
507 return a.high>>15;
512 -------------------------------------------------------------------------------
513 Normalizes the subnormal extended double-precision floating-point value
514 represented by the denormalized significand `aSig'. The normalized exponent
515 and significand are stored at the locations pointed to by `zExpPtr' and
516 `zSigPtr', respectively.
517 -------------------------------------------------------------------------------
519 static void
520 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
522 int8 shiftCount;
524 shiftCount = countLeadingZeros64( aSig );
525 *zSigPtr = aSig<<shiftCount;
526 *zExpPtr = 1 - shiftCount;
531 -------------------------------------------------------------------------------
532 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
533 extended double-precision floating-point value, returning the result.
534 -------------------------------------------------------------------------------
536 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
538 floatx80 z;
540 z.low = zSig;
541 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
542 return z;
547 -------------------------------------------------------------------------------
548 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
549 and extended significand formed by the concatenation of `zSig0' and `zSig1',
550 and returns the proper extended double-precision floating-point value
551 corresponding to the abstract input. Ordinarily, the abstract value is
552 rounded and packed into the extended double-precision format, with the
553 inexact exception raised if the abstract input cannot be represented
554 exactly. If the abstract value is too large, however, the overflow and
555 inexact exceptions are raised and an infinity or maximal finite value is
556 returned. If the abstract value is too small, the input value is rounded to
557 a subnormal number, and the underflow and inexact exceptions are raised if
558 the abstract input cannot be represented exactly as a subnormal extended
559 double-precision floating-point number.
560 If `roundingPrecision' is 32 or 64, the result is rounded to the same
561 number of bits as single or double precision, respectively. Otherwise, the
562 result is rounded to the full precision of the extended double-precision
563 format.
564 The input significand must be normalized or smaller. If the input
565 significand is not normalized, `zExp' must be 0; in that case, the result
566 returned is a subnormal number, and it must not require rounding. The
567 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
568 Floating-point Arithmetic.
569 -------------------------------------------------------------------------------
571 static floatx80
572 roundAndPackFloatx80(
573 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
576 int8 roundingMode;
577 flag roundNearestEven, increment, isTiny;
578 int64 roundIncrement, roundMask, roundBits;
580 roundingMode = float_rounding_mode;
581 roundNearestEven = ( roundingMode == float_round_nearest_even );
582 if ( roundingPrecision == 80 ) goto precision80;
583 if ( roundingPrecision == 64 ) {
584 roundIncrement = LIT64( 0x0000000000000400 );
585 roundMask = LIT64( 0x00000000000007FF );
587 else if ( roundingPrecision == 32 ) {
588 roundIncrement = LIT64( 0x0000008000000000 );
589 roundMask = LIT64( 0x000000FFFFFFFFFF );
591 else {
592 goto precision80;
594 zSig0 |= ( zSig1 != 0 );
595 if ( ! roundNearestEven ) {
596 if ( roundingMode == float_round_to_zero ) {
597 roundIncrement = 0;
599 else {
600 roundIncrement = roundMask;
601 if ( zSign ) {
602 if ( roundingMode == float_round_up ) roundIncrement = 0;
604 else {
605 if ( roundingMode == float_round_down ) roundIncrement = 0;
609 roundBits = zSig0 & roundMask;
610 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
611 if ( ( 0x7FFE < zExp )
612 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
614 goto overflow;
616 if ( zExp <= 0 ) {
617 isTiny =
618 ( float_detect_tininess == float_tininess_before_rounding )
619 || ( zExp < 0 )
620 || ( zSig0 <= zSig0 + roundIncrement );
621 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
622 zExp = 0;
623 roundBits = zSig0 & roundMask;
624 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
625 if ( roundBits ) float_exception_flags |= float_flag_inexact;
626 zSig0 += roundIncrement;
627 if ( (sbits64) zSig0 < 0 ) zExp = 1;
628 roundIncrement = roundMask + 1;
629 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
630 roundMask |= roundIncrement;
632 zSig0 &= ~ roundMask;
633 return packFloatx80( zSign, zExp, zSig0 );
636 if ( roundBits ) float_exception_flags |= float_flag_inexact;
637 zSig0 += roundIncrement;
638 if ( zSig0 < roundIncrement ) {
639 ++zExp;
640 zSig0 = LIT64( 0x8000000000000000 );
642 roundIncrement = roundMask + 1;
643 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
644 roundMask |= roundIncrement;
646 zSig0 &= ~ roundMask;
647 if ( zSig0 == 0 ) zExp = 0;
648 return packFloatx80( zSign, zExp, zSig0 );
649 precision80:
650 increment = ( (sbits64) zSig1 < 0 );
651 if ( ! roundNearestEven ) {
652 if ( roundingMode == float_round_to_zero ) {
653 increment = 0;
655 else {
656 if ( zSign ) {
657 increment = ( roundingMode == float_round_down ) && zSig1;
659 else {
660 increment = ( roundingMode == float_round_up ) && zSig1;
664 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
665 if ( ( 0x7FFE < zExp )
666 || ( ( zExp == 0x7FFE )
667 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
668 && increment
671 roundMask = 0;
672 overflow:
673 float_raise( float_flag_overflow | float_flag_inexact );
674 if ( ( roundingMode == float_round_to_zero )
675 || ( zSign && ( roundingMode == float_round_up ) )
676 || ( ! zSign && ( roundingMode == float_round_down ) )
678 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
680 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
682 if ( zExp <= 0 ) {
683 isTiny =
684 ( float_detect_tininess == float_tininess_before_rounding )
685 || ( zExp < 0 )
686 || ! increment
687 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
688 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
689 zExp = 0;
690 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
691 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
692 if ( roundNearestEven ) {
693 increment = ( (sbits64) zSig1 < 0 );
695 else {
696 if ( zSign ) {
697 increment = ( roundingMode == float_round_down ) && zSig1;
699 else {
700 increment = ( roundingMode == float_round_up ) && zSig1;
703 if ( increment ) {
704 ++zSig0;
705 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
706 if ( (sbits64) zSig0 < 0 ) zExp = 1;
708 return packFloatx80( zSign, zExp, zSig0 );
711 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
712 if ( increment ) {
713 ++zSig0;
714 if ( zSig0 == 0 ) {
715 ++zExp;
716 zSig0 = LIT64( 0x8000000000000000 );
718 else {
719 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
722 else {
723 if ( zSig0 == 0 ) zExp = 0;
726 return packFloatx80( zSign, zExp, zSig0 );
730 -------------------------------------------------------------------------------
731 Takes an abstract floating-point value having sign `zSign', exponent
732 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
733 and returns the proper extended double-precision floating-point value
734 corresponding to the abstract input. This routine is just like
735 `roundAndPackFloatx80' except that the input significand does not have to be
736 normalized.
737 -------------------------------------------------------------------------------
739 static floatx80
740 normalizeRoundAndPackFloatx80(
741 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
744 int8 shiftCount;
746 if ( zSig0 == 0 ) {
747 zSig0 = zSig1;
748 zSig1 = 0;
749 zExp -= 64;
751 shiftCount = countLeadingZeros64( zSig0 );
752 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
753 zExp -= shiftCount;
754 return
755 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
759 #endif
762 -------------------------------------------------------------------------------
763 Returns the result of converting the 32-bit two's complement integer `a' to
764 the single-precision floating-point format. The conversion is performed
765 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
766 -------------------------------------------------------------------------------
768 float32 int32_to_float32( int32 a )
770 flag zSign;
772 if ( a == 0 ) return 0;
773 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
774 zSign = ( a < 0 );
775 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
780 -------------------------------------------------------------------------------
781 Returns the result of converting the 32-bit two's complement integer `a' to
782 the double-precision floating-point format. The conversion is performed
783 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
784 -------------------------------------------------------------------------------
786 float64 int32_to_float64( int32 a )
788 flag aSign;
789 uint32 absA;
790 int8 shiftCount;
791 bits64 zSig;
793 if ( a == 0 ) return 0;
794 aSign = ( a < 0 );
795 absA = aSign ? - a : a;
796 shiftCount = countLeadingZeros32( absA ) + 21;
797 zSig = absA;
798 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
802 #ifdef FLOATX80
805 -------------------------------------------------------------------------------
806 Returns the result of converting the 32-bit two's complement integer `a'
807 to the extended double-precision floating-point format. The conversion
808 is performed according to the IEC/IEEE Standard for Binary Floating-point
809 Arithmetic.
810 -------------------------------------------------------------------------------
812 floatx80 int32_to_floatx80( int32 a )
814 flag zSign;
815 uint32 absA;
816 int8 shiftCount;
817 bits64 zSig;
819 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
820 zSign = ( a < 0 );
821 absA = zSign ? - a : a;
822 shiftCount = countLeadingZeros32( absA ) + 32;
823 zSig = absA;
824 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
828 #endif
831 -------------------------------------------------------------------------------
832 Returns the result of converting the single-precision floating-point value
833 `a' to the 32-bit two's complement integer format. The conversion is
834 performed according to the IEC/IEEE Standard for Binary Floating-point
835 Arithmetic---which means in particular that the conversion is rounded
836 according to the current rounding mode. If `a' is a NaN, the largest
837 positive integer is returned. Otherwise, if the conversion overflows, the
838 largest integer with the same sign as `a' is returned.
839 -------------------------------------------------------------------------------
841 int32 float32_to_int32( float32 a )
843 flag aSign;
844 int16 aExp, shiftCount;
845 bits32 aSig;
846 bits64 zSig;
848 aSig = extractFloat32Frac( a );
849 aExp = extractFloat32Exp( a );
850 aSign = extractFloat32Sign( a );
851 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
852 if ( aExp ) aSig |= 0x00800000;
853 shiftCount = 0xAF - aExp;
854 zSig = aSig;
855 zSig <<= 32;
856 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
857 return roundAndPackInt32( aSign, zSig );
862 -------------------------------------------------------------------------------
863 Returns the result of converting the single-precision floating-point value
864 `a' to the 32-bit two's complement integer format. The conversion is
865 performed according to the IEC/IEEE Standard for Binary Floating-point
866 Arithmetic, except that the conversion is always rounded toward zero. If
867 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
868 conversion overflows, the largest integer with the same sign as `a' is
869 returned.
870 -------------------------------------------------------------------------------
872 int32 float32_to_int32_round_to_zero( float32 a )
874 flag aSign;
875 int16 aExp, shiftCount;
876 bits32 aSig;
877 int32 z;
879 aSig = extractFloat32Frac( a );
880 aExp = extractFloat32Exp( a );
881 aSign = extractFloat32Sign( a );
882 shiftCount = aExp - 0x9E;
883 if ( 0 <= shiftCount ) {
884 if ( a == 0xCF000000 ) return 0x80000000;
885 float_raise( float_flag_invalid );
886 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
887 return 0x80000000;
889 else if ( aExp <= 0x7E ) {
890 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
891 return 0;
893 aSig = ( aSig | 0x00800000 )<<8;
894 z = aSig>>( - shiftCount );
895 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
896 float_exception_flags |= float_flag_inexact;
898 return aSign ? - z : z;
903 -------------------------------------------------------------------------------
904 Returns the result of converting the single-precision floating-point value
905 `a' to the double-precision floating-point format. The conversion is
906 performed according to the IEC/IEEE Standard for Binary Floating-point
907 Arithmetic.
908 -------------------------------------------------------------------------------
910 float64 float32_to_float64( float32 a )
912 flag aSign;
913 int16 aExp;
914 bits32 aSig;
916 aSig = extractFloat32Frac( a );
917 aExp = extractFloat32Exp( a );
918 aSign = extractFloat32Sign( a );
919 if ( aExp == 0xFF ) {
920 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
921 return packFloat64( aSign, 0x7FF, 0 );
923 if ( aExp == 0 ) {
924 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
925 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
926 --aExp;
928 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
932 #ifdef FLOATX80
935 -------------------------------------------------------------------------------
936 Returns the result of converting the single-precision floating-point value
937 `a' to the extended double-precision floating-point format. The conversion
938 is performed according to the IEC/IEEE Standard for Binary Floating-point
939 Arithmetic.
940 -------------------------------------------------------------------------------
942 floatx80 float32_to_floatx80( float32 a )
944 flag aSign;
945 int16 aExp;
946 bits32 aSig;
948 aSig = extractFloat32Frac( a );
949 aExp = extractFloat32Exp( a );
950 aSign = extractFloat32Sign( a );
951 if ( aExp == 0xFF ) {
952 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
953 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
955 if ( aExp == 0 ) {
956 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
957 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
959 aSig |= 0x00800000;
960 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
964 #endif
967 -------------------------------------------------------------------------------
968 Rounds the single-precision floating-point value `a' to an integer, and
969 returns the result as a single-precision floating-point value. The
970 operation is performed according to the IEC/IEEE Standard for Binary
971 Floating-point Arithmetic.
972 -------------------------------------------------------------------------------
974 float32 float32_round_to_int( float32 a )
976 flag aSign;
977 int16 aExp;
978 bits32 lastBitMask, roundBitsMask;
979 int8 roundingMode;
980 float32 z;
982 aExp = extractFloat32Exp( a );
983 if ( 0x96 <= aExp ) {
984 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
985 return propagateFloat32NaN( a, a );
987 return a;
989 if ( aExp <= 0x7E ) {
990 if ( (bits32) ( a<<1 ) == 0 ) return a;
991 float_exception_flags |= float_flag_inexact;
992 aSign = extractFloat32Sign( a );
993 switch ( float_rounding_mode ) {
994 case float_round_nearest_even:
995 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
996 return packFloat32( aSign, 0x7F, 0 );
998 break;
999 case float_round_down:
1000 return aSign ? 0xBF800000 : 0;
1001 case float_round_up:
1002 return aSign ? 0x80000000 : 0x3F800000;
1004 return packFloat32( aSign, 0, 0 );
1006 lastBitMask = 1;
1007 lastBitMask <<= 0x96 - aExp;
1008 roundBitsMask = lastBitMask - 1;
1009 z = a;
1010 roundingMode = float_rounding_mode;
1011 if ( roundingMode == float_round_nearest_even ) {
1012 z += lastBitMask>>1;
1013 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1015 else if ( roundingMode != float_round_to_zero ) {
1016 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1017 z += roundBitsMask;
1020 z &= ~ roundBitsMask;
1021 if ( z != a ) float_exception_flags |= float_flag_inexact;
1022 return z;
1027 -------------------------------------------------------------------------------
1028 Returns the result of adding the absolute values of the single-precision
1029 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1030 before being returned. `zSign' is ignored if the result is a NaN. The
1031 addition is performed according to the IEC/IEEE Standard for Binary
1032 Floating-point Arithmetic.
1033 -------------------------------------------------------------------------------
1035 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1037 int16 aExp, bExp, zExp;
1038 bits32 aSig, bSig, zSig;
1039 int16 expDiff;
1041 aSig = extractFloat32Frac( a );
1042 aExp = extractFloat32Exp( a );
1043 bSig = extractFloat32Frac( b );
1044 bExp = extractFloat32Exp( b );
1045 expDiff = aExp - bExp;
1046 aSig <<= 6;
1047 bSig <<= 6;
1048 if ( 0 < expDiff ) {
1049 if ( aExp == 0xFF ) {
1050 if ( aSig ) return propagateFloat32NaN( a, b );
1051 return a;
1053 if ( bExp == 0 ) {
1054 --expDiff;
1056 else {
1057 bSig |= 0x20000000;
1059 shift32RightJamming( bSig, expDiff, &bSig );
1060 zExp = aExp;
1062 else if ( expDiff < 0 ) {
1063 if ( bExp == 0xFF ) {
1064 if ( bSig ) return propagateFloat32NaN( a, b );
1065 return packFloat32( zSign, 0xFF, 0 );
1067 if ( aExp == 0 ) {
1068 ++expDiff;
1070 else {
1071 aSig |= 0x20000000;
1073 shift32RightJamming( aSig, - expDiff, &aSig );
1074 zExp = bExp;
1076 else {
1077 if ( aExp == 0xFF ) {
1078 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1079 return a;
1081 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1082 zSig = 0x40000000 + aSig + bSig;
1083 zExp = aExp;
1084 goto roundAndPack;
1086 aSig |= 0x20000000;
1087 zSig = ( aSig + bSig )<<1;
1088 --zExp;
1089 if ( (sbits32) zSig < 0 ) {
1090 zSig = aSig + bSig;
1091 ++zExp;
1093 roundAndPack:
1094 return roundAndPackFloat32( zSign, zExp, zSig );
1099 -------------------------------------------------------------------------------
1100 Returns the result of subtracting the absolute values of the single-
1101 precision floating-point values `a' and `b'. If `zSign' is true, the
1102 difference is negated before being returned. `zSign' is ignored if the
1103 result is a NaN. The subtraction is performed according to the IEC/IEEE
1104 Standard for Binary Floating-point Arithmetic.
1105 -------------------------------------------------------------------------------
1107 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1109 int16 aExp, bExp, zExp;
1110 bits32 aSig, bSig, zSig;
1111 int16 expDiff;
1113 aSig = extractFloat32Frac( a );
1114 aExp = extractFloat32Exp( a );
1115 bSig = extractFloat32Frac( b );
1116 bExp = extractFloat32Exp( b );
1117 expDiff = aExp - bExp;
1118 aSig <<= 7;
1119 bSig <<= 7;
1120 if ( 0 < expDiff ) goto aExpBigger;
1121 if ( expDiff < 0 ) goto bExpBigger;
1122 if ( aExp == 0xFF ) {
1123 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1124 float_raise( float_flag_invalid );
1125 return float32_default_nan;
1127 if ( aExp == 0 ) {
1128 aExp = 1;
1129 bExp = 1;
1131 if ( bSig < aSig ) goto aBigger;
1132 if ( aSig < bSig ) goto bBigger;
1133 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1134 bExpBigger:
1135 if ( bExp == 0xFF ) {
1136 if ( bSig ) return propagateFloat32NaN( a, b );
1137 return packFloat32( zSign ^ 1, 0xFF, 0 );
1139 if ( aExp == 0 ) {
1140 ++expDiff;
1142 else {
1143 aSig |= 0x40000000;
1145 shift32RightJamming( aSig, - expDiff, &aSig );
1146 bSig |= 0x40000000;
1147 bBigger:
1148 zSig = bSig - aSig;
1149 zExp = bExp;
1150 zSign ^= 1;
1151 goto normalizeRoundAndPack;
1152 aExpBigger:
1153 if ( aExp == 0xFF ) {
1154 if ( aSig ) return propagateFloat32NaN( a, b );
1155 return a;
1157 if ( bExp == 0 ) {
1158 --expDiff;
1160 else {
1161 bSig |= 0x40000000;
1163 shift32RightJamming( bSig, expDiff, &bSig );
1164 aSig |= 0x40000000;
1165 aBigger:
1166 zSig = aSig - bSig;
1167 zExp = aExp;
1168 normalizeRoundAndPack:
1169 --zExp;
1170 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1175 -------------------------------------------------------------------------------
1176 Returns the result of adding the single-precision floating-point values `a'
1177 and `b'. The operation is performed according to the IEC/IEEE Standard for
1178 Binary Floating-point Arithmetic.
1179 -------------------------------------------------------------------------------
1181 float32 float32_add( float32 a, float32 b )
1183 flag aSign, bSign;
1185 aSign = extractFloat32Sign( a );
1186 bSign = extractFloat32Sign( b );
1187 if ( aSign == bSign ) {
1188 return addFloat32Sigs( a, b, aSign );
1190 else {
1191 return subFloat32Sigs( a, b, aSign );
1197 -------------------------------------------------------------------------------
1198 Returns the result of subtracting the single-precision floating-point values
1199 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1200 for Binary Floating-point Arithmetic.
1201 -------------------------------------------------------------------------------
1203 float32 float32_sub( float32 a, float32 b )
1205 flag aSign, bSign;
1207 aSign = extractFloat32Sign( a );
1208 bSign = extractFloat32Sign( b );
1209 if ( aSign == bSign ) {
1210 return subFloat32Sigs( a, b, aSign );
1212 else {
1213 return addFloat32Sigs( a, b, aSign );
1219 -------------------------------------------------------------------------------
1220 Returns the result of multiplying the single-precision floating-point values
1221 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1222 for Binary Floating-point Arithmetic.
1223 -------------------------------------------------------------------------------
1225 float32 float32_mul( float32 a, float32 b )
1227 flag aSign, bSign, zSign;
1228 int16 aExp, bExp, zExp;
1229 bits32 aSig, bSig;
1230 bits64 zSig64;
1231 bits32 zSig;
1233 aSig = extractFloat32Frac( a );
1234 aExp = extractFloat32Exp( a );
1235 aSign = extractFloat32Sign( a );
1236 bSig = extractFloat32Frac( b );
1237 bExp = extractFloat32Exp( b );
1238 bSign = extractFloat32Sign( b );
1239 zSign = aSign ^ bSign;
1240 if ( aExp == 0xFF ) {
1241 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1242 return propagateFloat32NaN( a, b );
1244 if ( ( bExp | bSig ) == 0 ) {
1245 float_raise( float_flag_invalid );
1246 return float32_default_nan;
1248 return packFloat32( zSign, 0xFF, 0 );
1250 if ( bExp == 0xFF ) {
1251 if ( bSig ) return propagateFloat32NaN( a, b );
1252 if ( ( aExp | aSig ) == 0 ) {
1253 float_raise( float_flag_invalid );
1254 return float32_default_nan;
1256 return packFloat32( zSign, 0xFF, 0 );
1258 if ( aExp == 0 ) {
1259 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1260 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1262 if ( bExp == 0 ) {
1263 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1264 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1266 zExp = aExp + bExp - 0x7F;
1267 aSig = ( aSig | 0x00800000 )<<7;
1268 bSig = ( bSig | 0x00800000 )<<8;
1269 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1270 zSig = zSig64;
1271 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1272 zSig <<= 1;
1273 --zExp;
1275 return roundAndPackFloat32( zSign, zExp, zSig );
1280 -------------------------------------------------------------------------------
1281 Returns the result of dividing the single-precision floating-point value `a'
1282 by the corresponding value `b'. The operation is performed according to the
1283 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1284 -------------------------------------------------------------------------------
1286 float32 float32_div( float32 a, float32 b )
1288 flag aSign, bSign, zSign;
1289 int16 aExp, bExp, zExp;
1290 bits32 aSig, bSig, zSig;
1292 aSig = extractFloat32Frac( a );
1293 aExp = extractFloat32Exp( a );
1294 aSign = extractFloat32Sign( a );
1295 bSig = extractFloat32Frac( b );
1296 bExp = extractFloat32Exp( b );
1297 bSign = extractFloat32Sign( b );
1298 zSign = aSign ^ bSign;
1299 if ( aExp == 0xFF ) {
1300 if ( aSig ) return propagateFloat32NaN( a, b );
1301 if ( bExp == 0xFF ) {
1302 if ( bSig ) return propagateFloat32NaN( a, b );
1303 float_raise( float_flag_invalid );
1304 return float32_default_nan;
1306 return packFloat32( zSign, 0xFF, 0 );
1308 if ( bExp == 0xFF ) {
1309 if ( bSig ) return propagateFloat32NaN( a, b );
1310 return packFloat32( zSign, 0, 0 );
1312 if ( bExp == 0 ) {
1313 if ( bSig == 0 ) {
1314 if ( ( aExp | aSig ) == 0 ) {
1315 float_raise( float_flag_invalid );
1316 return float32_default_nan;
1318 float_raise( float_flag_divbyzero );
1319 return packFloat32( zSign, 0xFF, 0 );
1321 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1323 if ( aExp == 0 ) {
1324 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1325 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1327 zExp = aExp - bExp + 0x7D;
1328 aSig = ( aSig | 0x00800000 )<<7;
1329 bSig = ( bSig | 0x00800000 )<<8;
1330 if ( bSig <= ( aSig + aSig ) ) {
1331 aSig >>= 1;
1332 ++zExp;
1334 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1335 if ( ( zSig & 0x3F ) == 0 ) {
1336 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1338 return roundAndPackFloat32( zSign, zExp, zSig );
1343 -------------------------------------------------------------------------------
1344 Returns the remainder of the single-precision floating-point value `a'
1345 with respect to the corresponding value `b'. The operation is performed
1346 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1347 -------------------------------------------------------------------------------
1349 float32 float32_rem( float32 a, float32 b )
1351 flag aSign, bSign, zSign;
1352 int16 aExp, bExp, expDiff;
1353 bits32 aSig, bSig;
1354 bits32 q;
1355 bits64 aSig64, bSig64, q64;
1356 bits32 alternateASig;
1357 sbits32 sigMean;
1359 aSig = extractFloat32Frac( a );
1360 aExp = extractFloat32Exp( a );
1361 aSign = extractFloat32Sign( a );
1362 bSig = extractFloat32Frac( b );
1363 bExp = extractFloat32Exp( b );
1364 bSign = extractFloat32Sign( b );
1365 if ( aExp == 0xFF ) {
1366 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1367 return propagateFloat32NaN( a, b );
1369 float_raise( float_flag_invalid );
1370 return float32_default_nan;
1372 if ( bExp == 0xFF ) {
1373 if ( bSig ) return propagateFloat32NaN( a, b );
1374 return a;
1376 if ( bExp == 0 ) {
1377 if ( bSig == 0 ) {
1378 float_raise( float_flag_invalid );
1379 return float32_default_nan;
1381 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1383 if ( aExp == 0 ) {
1384 if ( aSig == 0 ) return a;
1385 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1387 expDiff = aExp - bExp;
1388 aSig |= 0x00800000;
1389 bSig |= 0x00800000;
1390 if ( expDiff < 32 ) {
1391 aSig <<= 8;
1392 bSig <<= 8;
1393 if ( expDiff < 0 ) {
1394 if ( expDiff < -1 ) return a;
1395 aSig >>= 1;
1397 q = ( bSig <= aSig );
1398 if ( q ) aSig -= bSig;
1399 if ( 0 < expDiff ) {
1400 q = ( ( (bits64) aSig )<<32 ) / bSig;
1401 q >>= 32 - expDiff;
1402 bSig >>= 2;
1403 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1405 else {
1406 aSig >>= 2;
1407 bSig >>= 2;
1410 else {
1411 if ( bSig <= aSig ) aSig -= bSig;
1412 aSig64 = ( (bits64) aSig )<<40;
1413 bSig64 = ( (bits64) bSig )<<40;
1414 expDiff -= 64;
1415 while ( 0 < expDiff ) {
1416 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1417 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1418 aSig64 = - ( ( bSig * q64 )<<38 );
1419 expDiff -= 62;
1421 expDiff += 64;
1422 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1423 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1424 q = q64>>( 64 - expDiff );
1425 bSig <<= 6;
1426 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1428 do {
1429 alternateASig = aSig;
1430 ++q;
1431 aSig -= bSig;
1432 } while ( 0 <= (sbits32) aSig );
1433 sigMean = aSig + alternateASig;
1434 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1435 aSig = alternateASig;
1437 zSign = ( (sbits32) aSig < 0 );
1438 if ( zSign ) aSig = - aSig;
1439 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1444 -------------------------------------------------------------------------------
1445 Returns the square root of the single-precision floating-point value `a'.
1446 The operation is performed according to the IEC/IEEE Standard for Binary
1447 Floating-point Arithmetic.
1448 -------------------------------------------------------------------------------
1450 float32 float32_sqrt( float32 a )
1452 flag aSign;
1453 int16 aExp, zExp;
1454 bits32 aSig, zSig;
1455 bits64 rem, term;
1457 aSig = extractFloat32Frac( a );
1458 aExp = extractFloat32Exp( a );
1459 aSign = extractFloat32Sign( a );
1460 if ( aExp == 0xFF ) {
1461 if ( aSig ) return propagateFloat32NaN( a, 0 );
1462 if ( ! aSign ) return a;
1463 float_raise( float_flag_invalid );
1464 return float32_default_nan;
1466 if ( aSign ) {
1467 if ( ( aExp | aSig ) == 0 ) return a;
1468 float_raise( float_flag_invalid );
1469 return float32_default_nan;
1471 if ( aExp == 0 ) {
1472 if ( aSig == 0 ) return 0;
1473 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1475 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1476 aSig = ( aSig | 0x00800000 )<<8;
1477 zSig = estimateSqrt32( aExp, aSig ) + 2;
1478 if ( ( zSig & 0x7F ) <= 5 ) {
1479 if ( zSig < 2 ) {
1480 zSig = 0xFFFFFFFF;
1482 else {
1483 aSig >>= aExp & 1;
1484 term = ( (bits64) zSig ) * zSig;
1485 rem = ( ( (bits64) aSig )<<32 ) - term;
1486 while ( (sbits64) rem < 0 ) {
1487 --zSig;
1488 rem += ( ( (bits64) zSig )<<1 ) | 1;
1490 zSig |= ( rem != 0 );
1493 shift32RightJamming( zSig, 1, &zSig );
1494 return roundAndPackFloat32( 0, zExp, zSig );
1499 -------------------------------------------------------------------------------
1500 Returns 1 if the single-precision floating-point value `a' is equal to the
1501 corresponding value `b', and 0 otherwise. The comparison is performed
1502 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1503 -------------------------------------------------------------------------------
1505 flag float32_eq( float32 a, float32 b )
1508 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1509 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1511 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1512 float_raise( float_flag_invalid );
1514 return 0;
1516 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1521 -------------------------------------------------------------------------------
1522 Returns 1 if the single-precision floating-point value `a' is less than or
1523 equal to the corresponding value `b', and 0 otherwise. The comparison is
1524 performed according to the IEC/IEEE Standard for Binary Floating-point
1525 Arithmetic.
1526 -------------------------------------------------------------------------------
1528 flag float32_le( float32 a, float32 b )
1530 flag aSign, bSign;
1532 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1533 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1535 float_raise( float_flag_invalid );
1536 return 0;
1538 aSign = extractFloat32Sign( a );
1539 bSign = extractFloat32Sign( b );
1540 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1541 return ( a == b ) || ( aSign ^ ( a < b ) );
1546 -------------------------------------------------------------------------------
1547 Returns 1 if the single-precision floating-point value `a' is less than
1548 the corresponding value `b', and 0 otherwise. The comparison is performed
1549 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1550 -------------------------------------------------------------------------------
1552 flag float32_lt( float32 a, float32 b )
1554 flag aSign, bSign;
1556 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1557 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1559 float_raise( float_flag_invalid );
1560 return 0;
1562 aSign = extractFloat32Sign( a );
1563 bSign = extractFloat32Sign( b );
1564 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1565 return ( a != b ) && ( aSign ^ ( a < b ) );
1570 -------------------------------------------------------------------------------
1571 Returns 1 if the single-precision floating-point value `a' is equal to the
1572 corresponding value `b', and 0 otherwise. The invalid exception is raised
1573 if either operand is a NaN. Otherwise, the comparison is performed
1574 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1575 -------------------------------------------------------------------------------
1577 flag float32_eq_signaling( float32 a, float32 b )
1580 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1581 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1583 float_raise( float_flag_invalid );
1584 return 0;
1586 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1591 -------------------------------------------------------------------------------
1592 Returns 1 if the single-precision floating-point value `a' is less than or
1593 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1594 cause an exception. Otherwise, the comparison is performed according to the
1595 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1596 -------------------------------------------------------------------------------
1598 flag float32_le_quiet( float32 a, float32 b )
1600 flag aSign, bSign;
1601 //int16 aExp, bExp;
1603 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1604 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1606 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1607 float_raise( float_flag_invalid );
1609 return 0;
1611 aSign = extractFloat32Sign( a );
1612 bSign = extractFloat32Sign( b );
1613 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1614 return ( a == b ) || ( aSign ^ ( a < b ) );
1619 -------------------------------------------------------------------------------
1620 Returns 1 if the single-precision floating-point value `a' is less than
1621 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1622 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1623 Standard for Binary Floating-point Arithmetic.
1624 -------------------------------------------------------------------------------
1626 flag float32_lt_quiet( float32 a, float32 b )
1628 flag aSign, bSign;
1630 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1631 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1633 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1634 float_raise( float_flag_invalid );
1636 return 0;
1638 aSign = extractFloat32Sign( a );
1639 bSign = extractFloat32Sign( b );
1640 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1641 return ( a != b ) && ( aSign ^ ( a < b ) );
1646 -------------------------------------------------------------------------------
1647 Returns the result of converting the double-precision floating-point value
1648 `a' to the 32-bit two's complement integer format. The conversion is
1649 performed according to the IEC/IEEE Standard for Binary Floating-point
1650 Arithmetic---which means in particular that the conversion is rounded
1651 according to the current rounding mode. If `a' is a NaN, the largest
1652 positive integer is returned. Otherwise, if the conversion overflows, the
1653 largest integer with the same sign as `a' is returned.
1654 -------------------------------------------------------------------------------
1656 int32 float64_to_int32( float64 a )
1658 flag aSign;
1659 int16 aExp, shiftCount;
1660 bits64 aSig;
1662 aSig = extractFloat64Frac( a );
1663 aExp = extractFloat64Exp( a );
1664 aSign = extractFloat64Sign( a );
1665 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1666 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1667 shiftCount = 0x42C - aExp;
1668 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1669 return roundAndPackInt32( aSign, aSig );
1674 -------------------------------------------------------------------------------
1675 Returns the result of converting the double-precision floating-point value
1676 `a' to the 32-bit two's complement integer format. The conversion is
1677 performed according to the IEC/IEEE Standard for Binary Floating-point
1678 Arithmetic, except that the conversion is always rounded toward zero. If
1679 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1680 conversion overflows, the largest integer with the same sign as `a' is
1681 returned.
1682 -------------------------------------------------------------------------------
1684 int32 float64_to_int32_round_to_zero( float64 a )
1686 flag aSign;
1687 int16 aExp, shiftCount;
1688 bits64 aSig, savedASig;
1689 int32 z;
1691 aSig = extractFloat64Frac( a );
1692 aExp = extractFloat64Exp( a );
1693 aSign = extractFloat64Sign( a );
1694 shiftCount = 0x433 - aExp;
1695 if ( shiftCount < 21 ) {
1696 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1697 goto invalid;
1699 else if ( 52 < shiftCount ) {
1700 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1701 return 0;
1703 aSig |= LIT64( 0x0010000000000000 );
1704 savedASig = aSig;
1705 aSig >>= shiftCount;
1706 z = aSig;
1707 if ( aSign ) z = - z;
1708 if ( ( z < 0 ) ^ aSign ) {
1709 invalid:
1710 float_exception_flags |= float_flag_invalid;
1711 return aSign ? 0x80000000 : 0x7FFFFFFF;
1713 if ( ( aSig<<shiftCount ) != savedASig ) {
1714 float_exception_flags |= float_flag_inexact;
1716 return z;
1721 -------------------------------------------------------------------------------
1722 Returns the result of converting the double-precision floating-point value
1723 `a' to the 32-bit two's complement unsigned integer format. The conversion
1724 is performed according to the IEC/IEEE Standard for Binary Floating-point
1725 Arithmetic---which means in particular that the conversion is rounded
1726 according to the current rounding mode. If `a' is a NaN, the largest
1727 positive integer is returned. Otherwise, if the conversion overflows, the
1728 largest positive integer is returned.
1729 -------------------------------------------------------------------------------
1731 int32 float64_to_uint32( float64 a )
1733 flag aSign;
1734 int16 aExp, shiftCount;
1735 bits64 aSig;
1737 aSig = extractFloat64Frac( a );
1738 aExp = extractFloat64Exp( a );
1739 aSign = 0; //extractFloat64Sign( a );
1740 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1741 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1742 shiftCount = 0x42C - aExp;
1743 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1744 return roundAndPackInt32( aSign, aSig );
1748 -------------------------------------------------------------------------------
1749 Returns the result of converting the double-precision floating-point value
1750 `a' to the 32-bit two's complement integer format. The conversion is
1751 performed according to the IEC/IEEE Standard for Binary Floating-point
1752 Arithmetic, except that the conversion is always rounded toward zero. If
1753 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1754 conversion overflows, the largest positive integer is returned.
1755 -------------------------------------------------------------------------------
1757 int32 float64_to_uint32_round_to_zero( float64 a )
1759 flag aSign;
1760 int16 aExp, shiftCount;
1761 bits64 aSig, savedASig;
1762 int32 z;
1764 aSig = extractFloat64Frac( a );
1765 aExp = extractFloat64Exp( a );
1766 aSign = extractFloat64Sign( a );
1767 shiftCount = 0x433 - aExp;
1768 if ( shiftCount < 21 ) {
1769 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1770 goto invalid;
1772 else if ( 52 < shiftCount ) {
1773 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1774 return 0;
1776 aSig |= LIT64( 0x0010000000000000 );
1777 savedASig = aSig;
1778 aSig >>= shiftCount;
1779 z = aSig;
1780 if ( aSign ) z = - z;
1781 if ( ( z < 0 ) ^ aSign ) {
1782 invalid:
1783 float_exception_flags |= float_flag_invalid;
1784 return aSign ? 0x80000000 : 0x7FFFFFFF;
1786 if ( ( aSig<<shiftCount ) != savedASig ) {
1787 float_exception_flags |= float_flag_inexact;
1789 return z;
1793 -------------------------------------------------------------------------------
1794 Returns the result of converting the double-precision floating-point value
1795 `a' to the single-precision floating-point format. The conversion is
1796 performed according to the IEC/IEEE Standard for Binary Floating-point
1797 Arithmetic.
1798 -------------------------------------------------------------------------------
1800 float32 float64_to_float32( float64 a )
1802 flag aSign;
1803 int16 aExp;
1804 bits64 aSig;
1805 bits32 zSig;
1807 aSig = extractFloat64Frac( a );
1808 aExp = extractFloat64Exp( a );
1809 aSign = extractFloat64Sign( a );
1810 if ( aExp == 0x7FF ) {
1811 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1812 return packFloat32( aSign, 0xFF, 0 );
1814 shift64RightJamming( aSig, 22, &aSig );
1815 zSig = aSig;
1816 if ( aExp || zSig ) {
1817 zSig |= 0x40000000;
1818 aExp -= 0x381;
1820 return roundAndPackFloat32( aSign, aExp, zSig );
1824 #ifdef FLOATX80
1827 -------------------------------------------------------------------------------
1828 Returns the result of converting the double-precision floating-point value
1829 `a' to the extended double-precision floating-point format. The conversion
1830 is performed according to the IEC/IEEE Standard for Binary Floating-point
1831 Arithmetic.
1832 -------------------------------------------------------------------------------
1834 floatx80 float64_to_floatx80( float64 a )
1836 flag aSign;
1837 int16 aExp;
1838 bits64 aSig;
1840 aSig = extractFloat64Frac( a );
1841 aExp = extractFloat64Exp( a );
1842 aSign = extractFloat64Sign( a );
1843 if ( aExp == 0x7FF ) {
1844 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1845 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1847 if ( aExp == 0 ) {
1848 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1849 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1851 return
1852 packFloatx80(
1853 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1857 #endif
1860 -------------------------------------------------------------------------------
1861 Rounds the double-precision floating-point value `a' to an integer, and
1862 returns the result as a double-precision floating-point value. The
1863 operation is performed according to the IEC/IEEE Standard for Binary
1864 Floating-point Arithmetic.
1865 -------------------------------------------------------------------------------
1867 float64 float64_round_to_int( float64 a )
1869 flag aSign;
1870 int16 aExp;
1871 bits64 lastBitMask, roundBitsMask;
1872 int8 roundingMode;
1873 float64 z;
1875 aExp = extractFloat64Exp( a );
1876 if ( 0x433 <= aExp ) {
1877 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1878 return propagateFloat64NaN( a, a );
1880 return a;
1882 if ( aExp <= 0x3FE ) {
1883 if ( (bits64) ( a<<1 ) == 0 ) return a;
1884 float_exception_flags |= float_flag_inexact;
1885 aSign = extractFloat64Sign( a );
1886 switch ( float_rounding_mode ) {
1887 case float_round_nearest_even:
1888 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1889 return packFloat64( aSign, 0x3FF, 0 );
1891 break;
1892 case float_round_down:
1893 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1894 case float_round_up:
1895 return
1896 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1898 return packFloat64( aSign, 0, 0 );
1900 lastBitMask = 1;
1901 lastBitMask <<= 0x433 - aExp;
1902 roundBitsMask = lastBitMask - 1;
1903 z = a;
1904 roundingMode = float_rounding_mode;
1905 if ( roundingMode == float_round_nearest_even ) {
1906 z += lastBitMask>>1;
1907 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1909 else if ( roundingMode != float_round_to_zero ) {
1910 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1911 z += roundBitsMask;
1914 z &= ~ roundBitsMask;
1915 if ( z != a ) float_exception_flags |= float_flag_inexact;
1916 return z;
1921 -------------------------------------------------------------------------------
1922 Returns the result of adding the absolute values of the double-precision
1923 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1924 before being returned. `zSign' is ignored if the result is a NaN. The
1925 addition is performed according to the IEC/IEEE Standard for Binary
1926 Floating-point Arithmetic.
1927 -------------------------------------------------------------------------------
1929 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1931 int16 aExp, bExp, zExp;
1932 bits64 aSig, bSig, zSig;
1933 int16 expDiff;
1935 aSig = extractFloat64Frac( a );
1936 aExp = extractFloat64Exp( a );
1937 bSig = extractFloat64Frac( b );
1938 bExp = extractFloat64Exp( b );
1939 expDiff = aExp - bExp;
1940 aSig <<= 9;
1941 bSig <<= 9;
1942 if ( 0 < expDiff ) {
1943 if ( aExp == 0x7FF ) {
1944 if ( aSig ) return propagateFloat64NaN( a, b );
1945 return a;
1947 if ( bExp == 0 ) {
1948 --expDiff;
1950 else {
1951 bSig |= LIT64( 0x2000000000000000 );
1953 shift64RightJamming( bSig, expDiff, &bSig );
1954 zExp = aExp;
1956 else if ( expDiff < 0 ) {
1957 if ( bExp == 0x7FF ) {
1958 if ( bSig ) return propagateFloat64NaN( a, b );
1959 return packFloat64( zSign, 0x7FF, 0 );
1961 if ( aExp == 0 ) {
1962 ++expDiff;
1964 else {
1965 aSig |= LIT64( 0x2000000000000000 );
1967 shift64RightJamming( aSig, - expDiff, &aSig );
1968 zExp = bExp;
1970 else {
1971 if ( aExp == 0x7FF ) {
1972 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1973 return a;
1975 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1976 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1977 zExp = aExp;
1978 goto roundAndPack;
1980 aSig |= LIT64( 0x2000000000000000 );
1981 zSig = ( aSig + bSig )<<1;
1982 --zExp;
1983 if ( (sbits64) zSig < 0 ) {
1984 zSig = aSig + bSig;
1985 ++zExp;
1987 roundAndPack:
1988 return roundAndPackFloat64( zSign, zExp, zSig );
1993 -------------------------------------------------------------------------------
1994 Returns the result of subtracting the absolute values of the double-
1995 precision floating-point values `a' and `b'. If `zSign' is true, the
1996 difference is negated before being returned. `zSign' is ignored if the
1997 result is a NaN. The subtraction is performed according to the IEC/IEEE
1998 Standard for Binary Floating-point Arithmetic.
1999 -------------------------------------------------------------------------------
2001 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2003 int16 aExp, bExp, zExp;
2004 bits64 aSig, bSig, zSig;
2005 int16 expDiff;
2007 aSig = extractFloat64Frac( a );
2008 aExp = extractFloat64Exp( a );
2009 bSig = extractFloat64Frac( b );
2010 bExp = extractFloat64Exp( b );
2011 expDiff = aExp - bExp;
2012 aSig <<= 10;
2013 bSig <<= 10;
2014 if ( 0 < expDiff ) goto aExpBigger;
2015 if ( expDiff < 0 ) goto bExpBigger;
2016 if ( aExp == 0x7FF ) {
2017 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2018 float_raise( float_flag_invalid );
2019 return float64_default_nan;
2021 if ( aExp == 0 ) {
2022 aExp = 1;
2023 bExp = 1;
2025 if ( bSig < aSig ) goto aBigger;
2026 if ( aSig < bSig ) goto bBigger;
2027 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2028 bExpBigger:
2029 if ( bExp == 0x7FF ) {
2030 if ( bSig ) return propagateFloat64NaN( a, b );
2031 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2033 if ( aExp == 0 ) {
2034 ++expDiff;
2036 else {
2037 aSig |= LIT64( 0x4000000000000000 );
2039 shift64RightJamming( aSig, - expDiff, &aSig );
2040 bSig |= LIT64( 0x4000000000000000 );
2041 bBigger:
2042 zSig = bSig - aSig;
2043 zExp = bExp;
2044 zSign ^= 1;
2045 goto normalizeRoundAndPack;
2046 aExpBigger:
2047 if ( aExp == 0x7FF ) {
2048 if ( aSig ) return propagateFloat64NaN( a, b );
2049 return a;
2051 if ( bExp == 0 ) {
2052 --expDiff;
2054 else {
2055 bSig |= LIT64( 0x4000000000000000 );
2057 shift64RightJamming( bSig, expDiff, &bSig );
2058 aSig |= LIT64( 0x4000000000000000 );
2059 aBigger:
2060 zSig = aSig - bSig;
2061 zExp = aExp;
2062 normalizeRoundAndPack:
2063 --zExp;
2064 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2069 -------------------------------------------------------------------------------
2070 Returns the result of adding the double-precision floating-point values `a'
2071 and `b'. The operation is performed according to the IEC/IEEE Standard for
2072 Binary Floating-point Arithmetic.
2073 -------------------------------------------------------------------------------
2075 float64 float64_add( float64 a, float64 b )
2077 flag aSign, bSign;
2079 aSign = extractFloat64Sign( a );
2080 bSign = extractFloat64Sign( b );
2081 if ( aSign == bSign ) {
2082 return addFloat64Sigs( a, b, aSign );
2084 else {
2085 return subFloat64Sigs( a, b, aSign );
2091 -------------------------------------------------------------------------------
2092 Returns the result of subtracting the double-precision floating-point values
2093 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2094 for Binary Floating-point Arithmetic.
2095 -------------------------------------------------------------------------------
2097 float64 float64_sub( float64 a, float64 b )
2099 flag aSign, bSign;
2101 aSign = extractFloat64Sign( a );
2102 bSign = extractFloat64Sign( b );
2103 if ( aSign == bSign ) {
2104 return subFloat64Sigs( a, b, aSign );
2106 else {
2107 return addFloat64Sigs( a, b, aSign );
2113 -------------------------------------------------------------------------------
2114 Returns the result of multiplying the double-precision floating-point values
2115 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2116 for Binary Floating-point Arithmetic.
2117 -------------------------------------------------------------------------------
2119 float64 float64_mul( float64 a, float64 b )
2121 flag aSign, bSign, zSign;
2122 int16 aExp, bExp, zExp;
2123 bits64 aSig, bSig, zSig0, zSig1;
2125 aSig = extractFloat64Frac( a );
2126 aExp = extractFloat64Exp( a );
2127 aSign = extractFloat64Sign( a );
2128 bSig = extractFloat64Frac( b );
2129 bExp = extractFloat64Exp( b );
2130 bSign = extractFloat64Sign( b );
2131 zSign = aSign ^ bSign;
2132 if ( aExp == 0x7FF ) {
2133 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2134 return propagateFloat64NaN( a, b );
2136 if ( ( bExp | bSig ) == 0 ) {
2137 float_raise( float_flag_invalid );
2138 return float64_default_nan;
2140 return packFloat64( zSign, 0x7FF, 0 );
2142 if ( bExp == 0x7FF ) {
2143 if ( bSig ) return propagateFloat64NaN( a, b );
2144 if ( ( aExp | aSig ) == 0 ) {
2145 float_raise( float_flag_invalid );
2146 return float64_default_nan;
2148 return packFloat64( zSign, 0x7FF, 0 );
2150 if ( aExp == 0 ) {
2151 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2152 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2154 if ( bExp == 0 ) {
2155 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2156 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2158 zExp = aExp + bExp - 0x3FF;
2159 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2160 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2161 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2162 zSig0 |= ( zSig1 != 0 );
2163 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2164 zSig0 <<= 1;
2165 --zExp;
2167 return roundAndPackFloat64( zSign, zExp, zSig0 );
2172 -------------------------------------------------------------------------------
2173 Returns the result of dividing the double-precision floating-point value `a'
2174 by the corresponding value `b'. The operation is performed according to
2175 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2176 -------------------------------------------------------------------------------
2178 float64 float64_div( float64 a, float64 b )
2180 flag aSign, bSign, zSign;
2181 int16 aExp, bExp, zExp;
2182 bits64 aSig, bSig, zSig;
2183 bits64 rem0, rem1;
2184 bits64 term0, term1;
2186 aSig = extractFloat64Frac( a );
2187 aExp = extractFloat64Exp( a );
2188 aSign = extractFloat64Sign( a );
2189 bSig = extractFloat64Frac( b );
2190 bExp = extractFloat64Exp( b );
2191 bSign = extractFloat64Sign( b );
2192 zSign = aSign ^ bSign;
2193 if ( aExp == 0x7FF ) {
2194 if ( aSig ) return propagateFloat64NaN( a, b );
2195 if ( bExp == 0x7FF ) {
2196 if ( bSig ) return propagateFloat64NaN( a, b );
2197 float_raise( float_flag_invalid );
2198 return float64_default_nan;
2200 return packFloat64( zSign, 0x7FF, 0 );
2202 if ( bExp == 0x7FF ) {
2203 if ( bSig ) return propagateFloat64NaN( a, b );
2204 return packFloat64( zSign, 0, 0 );
2206 if ( bExp == 0 ) {
2207 if ( bSig == 0 ) {
2208 if ( ( aExp | aSig ) == 0 ) {
2209 float_raise( float_flag_invalid );
2210 return float64_default_nan;
2212 float_raise( float_flag_divbyzero );
2213 return packFloat64( zSign, 0x7FF, 0 );
2215 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2217 if ( aExp == 0 ) {
2218 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2219 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2221 zExp = aExp - bExp + 0x3FD;
2222 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2223 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2224 if ( bSig <= ( aSig + aSig ) ) {
2225 aSig >>= 1;
2226 ++zExp;
2228 zSig = estimateDiv128To64( aSig, 0, bSig );
2229 if ( ( zSig & 0x1FF ) <= 2 ) {
2230 mul64To128( bSig, zSig, &term0, &term1 );
2231 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2232 while ( (sbits64) rem0 < 0 ) {
2233 --zSig;
2234 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2236 zSig |= ( rem1 != 0 );
2238 return roundAndPackFloat64( zSign, zExp, zSig );
2243 -------------------------------------------------------------------------------
2244 Returns the remainder of the double-precision floating-point value `a'
2245 with respect to the corresponding value `b'. The operation is performed
2246 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2247 -------------------------------------------------------------------------------
2249 float64 float64_rem( float64 a, float64 b )
2251 flag aSign, bSign, zSign;
2252 int16 aExp, bExp, expDiff;
2253 bits64 aSig, bSig;
2254 bits64 q, alternateASig;
2255 sbits64 sigMean;
2257 aSig = extractFloat64Frac( a );
2258 aExp = extractFloat64Exp( a );
2259 aSign = extractFloat64Sign( a );
2260 bSig = extractFloat64Frac( b );
2261 bExp = extractFloat64Exp( b );
2262 bSign = extractFloat64Sign( b );
2263 if ( aExp == 0x7FF ) {
2264 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2265 return propagateFloat64NaN( a, b );
2267 float_raise( float_flag_invalid );
2268 return float64_default_nan;
2270 if ( bExp == 0x7FF ) {
2271 if ( bSig ) return propagateFloat64NaN( a, b );
2272 return a;
2274 if ( bExp == 0 ) {
2275 if ( bSig == 0 ) {
2276 float_raise( float_flag_invalid );
2277 return float64_default_nan;
2279 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2281 if ( aExp == 0 ) {
2282 if ( aSig == 0 ) return a;
2283 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2285 expDiff = aExp - bExp;
2286 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2287 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2288 if ( expDiff < 0 ) {
2289 if ( expDiff < -1 ) return a;
2290 aSig >>= 1;
2292 q = ( bSig <= aSig );
2293 if ( q ) aSig -= bSig;
2294 expDiff -= 64;
2295 while ( 0 < expDiff ) {
2296 q = estimateDiv128To64( aSig, 0, bSig );
2297 q = ( 2 < q ) ? q - 2 : 0;
2298 aSig = - ( ( bSig>>2 ) * q );
2299 expDiff -= 62;
2301 expDiff += 64;
2302 if ( 0 < expDiff ) {
2303 q = estimateDiv128To64( aSig, 0, bSig );
2304 q = ( 2 < q ) ? q - 2 : 0;
2305 q >>= 64 - expDiff;
2306 bSig >>= 2;
2307 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2309 else {
2310 aSig >>= 2;
2311 bSig >>= 2;
2313 do {
2314 alternateASig = aSig;
2315 ++q;
2316 aSig -= bSig;
2317 } while ( 0 <= (sbits64) aSig );
2318 sigMean = aSig + alternateASig;
2319 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2320 aSig = alternateASig;
2322 zSign = ( (sbits64) aSig < 0 );
2323 if ( zSign ) aSig = - aSig;
2324 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2329 -------------------------------------------------------------------------------
2330 Returns the square root of the double-precision floating-point value `a'.
2331 The operation is performed according to the IEC/IEEE Standard for Binary
2332 Floating-point Arithmetic.
2333 -------------------------------------------------------------------------------
2335 float64 float64_sqrt( float64 a )
2337 flag aSign;
2338 int16 aExp, zExp;
2339 bits64 aSig, zSig;
2340 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2341 //float64 z;
2343 aSig = extractFloat64Frac( a );
2344 aExp = extractFloat64Exp( a );
2345 aSign = extractFloat64Sign( a );
2346 if ( aExp == 0x7FF ) {
2347 if ( aSig ) return propagateFloat64NaN( a, a );
2348 if ( ! aSign ) return a;
2349 float_raise( float_flag_invalid );
2350 return float64_default_nan;
2352 if ( aSign ) {
2353 if ( ( aExp | aSig ) == 0 ) return a;
2354 float_raise( float_flag_invalid );
2355 return float64_default_nan;
2357 if ( aExp == 0 ) {
2358 if ( aSig == 0 ) return 0;
2359 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2361 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2362 aSig |= LIT64( 0x0010000000000000 );
2363 zSig = estimateSqrt32( aExp, aSig>>21 );
2364 zSig <<= 31;
2365 aSig <<= 9 - ( aExp & 1 );
2366 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2367 if ( ( zSig & 0x3FF ) <= 5 ) {
2368 if ( zSig < 2 ) {
2369 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2371 else {
2372 aSig <<= 2;
2373 mul64To128( zSig, zSig, &term0, &term1 );
2374 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2375 while ( (sbits64) rem0 < 0 ) {
2376 --zSig;
2377 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2378 term1 |= 1;
2379 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2381 zSig |= ( ( rem0 | rem1 ) != 0 );
2384 shift64RightJamming( zSig, 1, &zSig );
2385 return roundAndPackFloat64( 0, zExp, zSig );
2390 -------------------------------------------------------------------------------
2391 Returns 1 if the double-precision floating-point value `a' is equal to the
2392 corresponding value `b', and 0 otherwise. The comparison is performed
2393 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2394 -------------------------------------------------------------------------------
2396 flag float64_eq( float64 a, float64 b )
2399 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2400 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2402 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2403 float_raise( float_flag_invalid );
2405 return 0;
2407 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2412 -------------------------------------------------------------------------------
2413 Returns 1 if the double-precision floating-point value `a' is less than or
2414 equal to the corresponding value `b', and 0 otherwise. The comparison is
2415 performed according to the IEC/IEEE Standard for Binary Floating-point
2416 Arithmetic.
2417 -------------------------------------------------------------------------------
2419 flag float64_le( float64 a, float64 b )
2421 flag aSign, bSign;
2423 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2424 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2426 float_raise( float_flag_invalid );
2427 return 0;
2429 aSign = extractFloat64Sign( a );
2430 bSign = extractFloat64Sign( b );
2431 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2432 return ( a == b ) || ( aSign ^ ( a < b ) );
2437 -------------------------------------------------------------------------------
2438 Returns 1 if the double-precision floating-point value `a' is less than
2439 the corresponding value `b', and 0 otherwise. The comparison is performed
2440 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2441 -------------------------------------------------------------------------------
2443 flag float64_lt( float64 a, float64 b )
2445 flag aSign, bSign;
2447 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2448 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2450 float_raise( float_flag_invalid );
2451 return 0;
2453 aSign = extractFloat64Sign( a );
2454 bSign = extractFloat64Sign( b );
2455 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2456 return ( a != b ) && ( aSign ^ ( a < b ) );
2461 -------------------------------------------------------------------------------
2462 Returns 1 if the double-precision floating-point value `a' is equal to the
2463 corresponding value `b', and 0 otherwise. The invalid exception is raised
2464 if either operand is a NaN. Otherwise, the comparison is performed
2465 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2466 -------------------------------------------------------------------------------
2468 flag float64_eq_signaling( float64 a, float64 b )
2471 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2472 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2474 float_raise( float_flag_invalid );
2475 return 0;
2477 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2482 -------------------------------------------------------------------------------
2483 Returns 1 if the double-precision floating-point value `a' is less than or
2484 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2485 cause an exception. Otherwise, the comparison is performed according to the
2486 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2487 -------------------------------------------------------------------------------
2489 flag float64_le_quiet( float64 a, float64 b )
2491 flag aSign, bSign;
2492 //int16 aExp, bExp;
2494 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2495 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2497 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2498 float_raise( float_flag_invalid );
2500 return 0;
2502 aSign = extractFloat64Sign( a );
2503 bSign = extractFloat64Sign( b );
2504 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2505 return ( a == b ) || ( aSign ^ ( a < b ) );
2510 -------------------------------------------------------------------------------
2511 Returns 1 if the double-precision floating-point value `a' is less than
2512 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2513 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2514 Standard for Binary Floating-point Arithmetic.
2515 -------------------------------------------------------------------------------
2517 flag float64_lt_quiet( float64 a, float64 b )
2519 flag aSign, bSign;
2521 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2522 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2524 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2525 float_raise( float_flag_invalid );
2527 return 0;
2529 aSign = extractFloat64Sign( a );
2530 bSign = extractFloat64Sign( b );
2531 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2532 return ( a != b ) && ( aSign ^ ( a < b ) );
2536 #ifdef FLOATX80
2539 -------------------------------------------------------------------------------
2540 Returns the result of converting the extended double-precision floating-
2541 point value `a' to the 32-bit two's complement integer format. The
2542 conversion is performed according to the IEC/IEEE Standard for Binary
2543 Floating-point Arithmetic---which means in particular that the conversion
2544 is rounded according to the current rounding mode. If `a' is a NaN, the
2545 largest positive integer is returned. Otherwise, if the conversion
2546 overflows, the largest integer with the same sign as `a' is returned.
2547 -------------------------------------------------------------------------------
2549 int32 floatx80_to_int32( floatx80 a )
2551 flag aSign;
2552 int32 aExp, shiftCount;
2553 bits64 aSig;
2555 aSig = extractFloatx80Frac( a );
2556 aExp = extractFloatx80Exp( a );
2557 aSign = extractFloatx80Sign( a );
2558 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2559 shiftCount = 0x4037 - aExp;
2560 if ( shiftCount <= 0 ) shiftCount = 1;
2561 shift64RightJamming( aSig, shiftCount, &aSig );
2562 return roundAndPackInt32( aSign, aSig );
2567 -------------------------------------------------------------------------------
2568 Returns the result of converting the extended double-precision floating-
2569 point value `a' to the 32-bit two's complement integer format. The
2570 conversion is performed according to the IEC/IEEE Standard for Binary
2571 Floating-point Arithmetic, except that the conversion is always rounded
2572 toward zero. If `a' is a NaN, the largest positive integer is returned.
2573 Otherwise, if the conversion overflows, the largest integer with the same
2574 sign as `a' is returned.
2575 -------------------------------------------------------------------------------
2577 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2579 flag aSign;
2580 int32 aExp, shiftCount;
2581 bits64 aSig, savedASig;
2582 int32 z;
2584 aSig = extractFloatx80Frac( a );
2585 aExp = extractFloatx80Exp( a );
2586 aSign = extractFloatx80Sign( a );
2587 shiftCount = 0x403E - aExp;
2588 if ( shiftCount < 32 ) {
2589 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2590 goto invalid;
2592 else if ( 63 < shiftCount ) {
2593 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2594 return 0;
2596 savedASig = aSig;
2597 aSig >>= shiftCount;
2598 z = aSig;
2599 if ( aSign ) z = - z;
2600 if ( ( z < 0 ) ^ aSign ) {
2601 invalid:
2602 float_exception_flags |= float_flag_invalid;
2603 return aSign ? 0x80000000 : 0x7FFFFFFF;
2605 if ( ( aSig<<shiftCount ) != savedASig ) {
2606 float_exception_flags |= float_flag_inexact;
2608 return z;
2613 -------------------------------------------------------------------------------
2614 Returns the result of converting the extended double-precision floating-
2615 point value `a' to the single-precision floating-point format. The
2616 conversion is performed according to the IEC/IEEE Standard for Binary
2617 Floating-point Arithmetic.
2618 -------------------------------------------------------------------------------
2620 float32 floatx80_to_float32( floatx80 a )
2622 flag aSign;
2623 int32 aExp;
2624 bits64 aSig;
2626 aSig = extractFloatx80Frac( a );
2627 aExp = extractFloatx80Exp( a );
2628 aSign = extractFloatx80Sign( a );
2629 if ( aExp == 0x7FFF ) {
2630 if ( (bits64) ( aSig<<1 ) ) {
2631 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2633 return packFloat32( aSign, 0xFF, 0 );
2635 shift64RightJamming( aSig, 33, &aSig );
2636 if ( aExp || aSig ) aExp -= 0x3F81;
2637 return roundAndPackFloat32( aSign, aExp, aSig );
2642 -------------------------------------------------------------------------------
2643 Returns the result of converting the extended double-precision floating-
2644 point value `a' to the double-precision floating-point format. The
2645 conversion is performed according to the IEC/IEEE Standard for Binary
2646 Floating-point Arithmetic.
2647 -------------------------------------------------------------------------------
2649 float64 floatx80_to_float64( floatx80 a )
2651 flag aSign;
2652 int32 aExp;
2653 bits64 aSig, zSig;
2655 aSig = extractFloatx80Frac( a );
2656 aExp = extractFloatx80Exp( a );
2657 aSign = extractFloatx80Sign( a );
2658 if ( aExp == 0x7FFF ) {
2659 if ( (bits64) ( aSig<<1 ) ) {
2660 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2662 return packFloat64( aSign, 0x7FF, 0 );
2664 shift64RightJamming( aSig, 1, &zSig );
2665 if ( aExp || aSig ) aExp -= 0x3C01;
2666 return roundAndPackFloat64( aSign, aExp, zSig );
2671 -------------------------------------------------------------------------------
2672 Rounds the extended double-precision floating-point value `a' to an integer,
2673 and returns the result as an extended quadruple-precision floating-point
2674 value. The operation is performed according to the IEC/IEEE Standard for
2675 Binary Floating-point Arithmetic.
2676 -------------------------------------------------------------------------------
2678 floatx80 floatx80_round_to_int( floatx80 a )
2680 flag aSign;
2681 int32 aExp;
2682 bits64 lastBitMask, roundBitsMask;
2683 int8 roundingMode;
2684 floatx80 z;
2686 aExp = extractFloatx80Exp( a );
2687 if ( 0x403E <= aExp ) {
2688 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2689 return propagateFloatx80NaN( a, a );
2691 return a;
2693 if ( aExp <= 0x3FFE ) {
2694 if ( ( aExp == 0 )
2695 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2696 return a;
2698 float_exception_flags |= float_flag_inexact;
2699 aSign = extractFloatx80Sign( a );
2700 switch ( float_rounding_mode ) {
2701 case float_round_nearest_even:
2702 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2704 return
2705 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2707 break;
2708 case float_round_down:
2709 return
2710 aSign ?
2711 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2712 : packFloatx80( 0, 0, 0 );
2713 case float_round_up:
2714 return
2715 aSign ? packFloatx80( 1, 0, 0 )
2716 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2718 return packFloatx80( aSign, 0, 0 );
2720 lastBitMask = 1;
2721 lastBitMask <<= 0x403E - aExp;
2722 roundBitsMask = lastBitMask - 1;
2723 z = a;
2724 roundingMode = float_rounding_mode;
2725 if ( roundingMode == float_round_nearest_even ) {
2726 z.low += lastBitMask>>1;
2727 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2729 else if ( roundingMode != float_round_to_zero ) {
2730 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2731 z.low += roundBitsMask;
2734 z.low &= ~ roundBitsMask;
2735 if ( z.low == 0 ) {
2736 ++z.high;
2737 z.low = LIT64( 0x8000000000000000 );
2739 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2740 return z;
2745 -------------------------------------------------------------------------------
2746 Returns the result of adding the absolute values of the extended double-
2747 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2748 negated before being returned. `zSign' is ignored if the result is a NaN.
2749 The addition is performed according to the IEC/IEEE Standard for Binary
2750 Floating-point Arithmetic.
2751 -------------------------------------------------------------------------------
2753 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2755 int32 aExp, bExp, zExp;
2756 bits64 aSig, bSig, zSig0, zSig1;
2757 int32 expDiff;
2759 aSig = extractFloatx80Frac( a );
2760 aExp = extractFloatx80Exp( a );
2761 bSig = extractFloatx80Frac( b );
2762 bExp = extractFloatx80Exp( b );
2763 expDiff = aExp - bExp;
2764 if ( 0 < expDiff ) {
2765 if ( aExp == 0x7FFF ) {
2766 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2767 return a;
2769 if ( bExp == 0 ) --expDiff;
2770 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2771 zExp = aExp;
2773 else if ( expDiff < 0 ) {
2774 if ( bExp == 0x7FFF ) {
2775 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2778 if ( aExp == 0 ) ++expDiff;
2779 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2780 zExp = bExp;
2782 else {
2783 if ( aExp == 0x7FFF ) {
2784 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2785 return propagateFloatx80NaN( a, b );
2787 return a;
2789 zSig1 = 0;
2790 zSig0 = aSig + bSig;
2791 if ( aExp == 0 ) {
2792 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2793 goto roundAndPack;
2795 zExp = aExp;
2796 goto shiftRight1;
2799 zSig0 = aSig + bSig;
2801 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2802 shiftRight1:
2803 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2804 zSig0 |= LIT64( 0x8000000000000000 );
2805 ++zExp;
2806 roundAndPack:
2807 return
2808 roundAndPackFloatx80(
2809 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2814 -------------------------------------------------------------------------------
2815 Returns the result of subtracting the absolute values of the extended
2816 double-precision floating-point values `a' and `b'. If `zSign' is true,
2817 the difference is negated before being returned. `zSign' is ignored if the
2818 result is a NaN. The subtraction is performed according to the IEC/IEEE
2819 Standard for Binary Floating-point Arithmetic.
2820 -------------------------------------------------------------------------------
2822 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2824 int32 aExp, bExp, zExp;
2825 bits64 aSig, bSig, zSig0, zSig1;
2826 int32 expDiff;
2827 floatx80 z;
2829 aSig = extractFloatx80Frac( a );
2830 aExp = extractFloatx80Exp( a );
2831 bSig = extractFloatx80Frac( b );
2832 bExp = extractFloatx80Exp( b );
2833 expDiff = aExp - bExp;
2834 if ( 0 < expDiff ) goto aExpBigger;
2835 if ( expDiff < 0 ) goto bExpBigger;
2836 if ( aExp == 0x7FFF ) {
2837 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2838 return propagateFloatx80NaN( a, b );
2840 float_raise( float_flag_invalid );
2841 z.low = floatx80_default_nan_low;
2842 z.high = floatx80_default_nan_high;
2843 return z;
2845 if ( aExp == 0 ) {
2846 aExp = 1;
2847 bExp = 1;
2849 zSig1 = 0;
2850 if ( bSig < aSig ) goto aBigger;
2851 if ( aSig < bSig ) goto bBigger;
2852 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2853 bExpBigger:
2854 if ( bExp == 0x7FFF ) {
2855 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2856 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2858 if ( aExp == 0 ) ++expDiff;
2859 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2860 bBigger:
2861 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2862 zExp = bExp;
2863 zSign ^= 1;
2864 goto normalizeRoundAndPack;
2865 aExpBigger:
2866 if ( aExp == 0x7FFF ) {
2867 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2868 return a;
2870 if ( bExp == 0 ) --expDiff;
2871 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2872 aBigger:
2873 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2874 zExp = aExp;
2875 normalizeRoundAndPack:
2876 return
2877 normalizeRoundAndPackFloatx80(
2878 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2883 -------------------------------------------------------------------------------
2884 Returns the result of adding the extended double-precision floating-point
2885 values `a' and `b'. The operation is performed according to the IEC/IEEE
2886 Standard for Binary Floating-point Arithmetic.
2887 -------------------------------------------------------------------------------
2889 floatx80 floatx80_add( floatx80 a, floatx80 b )
2891 flag aSign, bSign;
2893 aSign = extractFloatx80Sign( a );
2894 bSign = extractFloatx80Sign( b );
2895 if ( aSign == bSign ) {
2896 return addFloatx80Sigs( a, b, aSign );
2898 else {
2899 return subFloatx80Sigs( a, b, aSign );
2905 -------------------------------------------------------------------------------
2906 Returns the result of subtracting the extended double-precision floating-
2907 point values `a' and `b'. The operation is performed according to the
2908 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2909 -------------------------------------------------------------------------------
2911 floatx80 floatx80_sub( floatx80 a, floatx80 b )
2913 flag aSign, bSign;
2915 aSign = extractFloatx80Sign( a );
2916 bSign = extractFloatx80Sign( b );
2917 if ( aSign == bSign ) {
2918 return subFloatx80Sigs( a, b, aSign );
2920 else {
2921 return addFloatx80Sigs( a, b, aSign );
2927 -------------------------------------------------------------------------------
2928 Returns the result of multiplying the extended double-precision floating-
2929 point values `a' and `b'. The operation is performed according to the
2930 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2931 -------------------------------------------------------------------------------
2933 floatx80 floatx80_mul( floatx80 a, floatx80 b )
2935 flag aSign, bSign, zSign;
2936 int32 aExp, bExp, zExp;
2937 bits64 aSig, bSig, zSig0, zSig1;
2938 floatx80 z;
2940 aSig = extractFloatx80Frac( a );
2941 aExp = extractFloatx80Exp( a );
2942 aSign = extractFloatx80Sign( a );
2943 bSig = extractFloatx80Frac( b );
2944 bExp = extractFloatx80Exp( b );
2945 bSign = extractFloatx80Sign( b );
2946 zSign = aSign ^ bSign;
2947 if ( aExp == 0x7FFF ) {
2948 if ( (bits64) ( aSig<<1 )
2949 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2950 return propagateFloatx80NaN( a, b );
2952 if ( ( bExp | bSig ) == 0 ) goto invalid;
2953 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2955 if ( bExp == 0x7FFF ) {
2956 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2957 if ( ( aExp | aSig ) == 0 ) {
2958 invalid:
2959 float_raise( float_flag_invalid );
2960 z.low = floatx80_default_nan_low;
2961 z.high = floatx80_default_nan_high;
2962 return z;
2964 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2966 if ( aExp == 0 ) {
2967 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2968 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2970 if ( bExp == 0 ) {
2971 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2972 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2974 zExp = aExp + bExp - 0x3FFE;
2975 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2976 if ( 0 < (sbits64) zSig0 ) {
2977 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2978 --zExp;
2980 return
2981 roundAndPackFloatx80(
2982 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2987 -------------------------------------------------------------------------------
2988 Returns the result of dividing the extended double-precision floating-point
2989 value `a' by the corresponding value `b'. The operation is performed
2990 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2991 -------------------------------------------------------------------------------
2993 floatx80 floatx80_div( floatx80 a, floatx80 b )
2995 flag aSign, bSign, zSign;
2996 int32 aExp, bExp, zExp;
2997 bits64 aSig, bSig, zSig0, zSig1;
2998 bits64 rem0, rem1, rem2, term0, term1, term2;
2999 floatx80 z;
3001 aSig = extractFloatx80Frac( a );
3002 aExp = extractFloatx80Exp( a );
3003 aSign = extractFloatx80Sign( a );
3004 bSig = extractFloatx80Frac( b );
3005 bExp = extractFloatx80Exp( b );
3006 bSign = extractFloatx80Sign( b );
3007 zSign = aSign ^ bSign;
3008 if ( aExp == 0x7FFF ) {
3009 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3010 if ( bExp == 0x7FFF ) {
3011 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3012 goto invalid;
3014 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3016 if ( bExp == 0x7FFF ) {
3017 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3018 return packFloatx80( zSign, 0, 0 );
3020 if ( bExp == 0 ) {
3021 if ( bSig == 0 ) {
3022 if ( ( aExp | aSig ) == 0 ) {
3023 invalid:
3024 float_raise( float_flag_invalid );
3025 z.low = floatx80_default_nan_low;
3026 z.high = floatx80_default_nan_high;
3027 return z;
3029 float_raise( float_flag_divbyzero );
3030 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3032 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3034 if ( aExp == 0 ) {
3035 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3036 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3038 zExp = aExp - bExp + 0x3FFE;
3039 rem1 = 0;
3040 if ( bSig <= aSig ) {
3041 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3042 ++zExp;
3044 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3045 mul64To128( bSig, zSig0, &term0, &term1 );
3046 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3047 while ( (sbits64) rem0 < 0 ) {
3048 --zSig0;
3049 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3051 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3052 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3053 mul64To128( bSig, zSig1, &term1, &term2 );
3054 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3055 while ( (sbits64) rem1 < 0 ) {
3056 --zSig1;
3057 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3059 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3061 return
3062 roundAndPackFloatx80(
3063 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3068 -------------------------------------------------------------------------------
3069 Returns the remainder of the extended double-precision floating-point value
3070 `a' with respect to the corresponding value `b'. The operation is performed
3071 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3072 -------------------------------------------------------------------------------
3074 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3076 flag aSign, bSign, zSign;
3077 int32 aExp, bExp, expDiff;
3078 bits64 aSig0, aSig1, bSig;
3079 bits64 q, term0, term1, alternateASig0, alternateASig1;
3080 floatx80 z;
3082 aSig0 = extractFloatx80Frac( a );
3083 aExp = extractFloatx80Exp( a );
3084 aSign = extractFloatx80Sign( a );
3085 bSig = extractFloatx80Frac( b );
3086 bExp = extractFloatx80Exp( b );
3087 bSign = extractFloatx80Sign( b );
3088 if ( aExp == 0x7FFF ) {
3089 if ( (bits64) ( aSig0<<1 )
3090 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3091 return propagateFloatx80NaN( a, b );
3093 goto invalid;
3095 if ( bExp == 0x7FFF ) {
3096 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3097 return a;
3099 if ( bExp == 0 ) {
3100 if ( bSig == 0 ) {
3101 invalid:
3102 float_raise( float_flag_invalid );
3103 z.low = floatx80_default_nan_low;
3104 z.high = floatx80_default_nan_high;
3105 return z;
3107 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3109 if ( aExp == 0 ) {
3110 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3111 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3113 bSig |= LIT64( 0x8000000000000000 );
3114 zSign = aSign;
3115 expDiff = aExp - bExp;
3116 aSig1 = 0;
3117 if ( expDiff < 0 ) {
3118 if ( expDiff < -1 ) return a;
3119 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3120 expDiff = 0;
3122 q = ( bSig <= aSig0 );
3123 if ( q ) aSig0 -= bSig;
3124 expDiff -= 64;
3125 while ( 0 < expDiff ) {
3126 q = estimateDiv128To64( aSig0, aSig1, bSig );
3127 q = ( 2 < q ) ? q - 2 : 0;
3128 mul64To128( bSig, q, &term0, &term1 );
3129 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3130 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3131 expDiff -= 62;
3133 expDiff += 64;
3134 if ( 0 < expDiff ) {
3135 q = estimateDiv128To64( aSig0, aSig1, bSig );
3136 q = ( 2 < q ) ? q - 2 : 0;
3137 q >>= 64 - expDiff;
3138 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3139 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3140 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3141 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3142 ++q;
3143 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3146 else {
3147 term1 = 0;
3148 term0 = bSig;
3150 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3151 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3152 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3153 && ( q & 1 ) )
3155 aSig0 = alternateASig0;
3156 aSig1 = alternateASig1;
3157 zSign = ! zSign;
3159 return
3160 normalizeRoundAndPackFloatx80(
3161 80, zSign, bExp + expDiff, aSig0, aSig1 );
3166 -------------------------------------------------------------------------------
3167 Returns the square root of the extended double-precision floating-point
3168 value `a'. The operation is performed according to the IEC/IEEE Standard
3169 for Binary Floating-point Arithmetic.
3170 -------------------------------------------------------------------------------
3172 floatx80 floatx80_sqrt( floatx80 a )
3174 flag aSign;
3175 int32 aExp, zExp;
3176 bits64 aSig0, aSig1, zSig0, zSig1;
3177 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3178 bits64 shiftedRem0, shiftedRem1;
3179 floatx80 z;
3181 aSig0 = extractFloatx80Frac( a );
3182 aExp = extractFloatx80Exp( a );
3183 aSign = extractFloatx80Sign( a );
3184 if ( aExp == 0x7FFF ) {
3185 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3186 if ( ! aSign ) return a;
3187 goto invalid;
3189 if ( aSign ) {
3190 if ( ( aExp | aSig0 ) == 0 ) return a;
3191 invalid:
3192 float_raise( float_flag_invalid );
3193 z.low = floatx80_default_nan_low;
3194 z.high = floatx80_default_nan_high;
3195 return z;
3197 if ( aExp == 0 ) {
3198 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3199 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3201 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3202 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3203 zSig0 <<= 31;
3204 aSig1 = 0;
3205 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3206 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3207 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3208 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3209 mul64To128( zSig0, zSig0, &term0, &term1 );
3210 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3211 while ( (sbits64) rem0 < 0 ) {
3212 --zSig0;
3213 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3214 term1 |= 1;
3215 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3217 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3218 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3219 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3220 if ( zSig1 == 0 ) zSig1 = 1;
3221 mul64To128( zSig0, zSig1, &term1, &term2 );
3222 shortShift128Left( term1, term2, 1, &term1, &term2 );
3223 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3224 mul64To128( zSig1, zSig1, &term2, &term3 );
3225 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3226 while ( (sbits64) rem1 < 0 ) {
3227 --zSig1;
3228 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3229 term3 |= 1;
3230 add192(
3231 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3233 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3235 return
3236 roundAndPackFloatx80(
3237 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3242 -------------------------------------------------------------------------------
3243 Returns 1 if the extended double-precision floating-point value `a' is
3244 equal to the corresponding value `b', and 0 otherwise. The comparison is
3245 performed according to the IEC/IEEE Standard for Binary Floating-point
3246 Arithmetic.
3247 -------------------------------------------------------------------------------
3249 flag floatx80_eq( floatx80 a, floatx80 b )
3252 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3253 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3254 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3255 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3257 if ( floatx80_is_signaling_nan( a )
3258 || floatx80_is_signaling_nan( b ) ) {
3259 float_raise( float_flag_invalid );
3261 return 0;
3263 return
3264 ( a.low == b.low )
3265 && ( ( a.high == b.high )
3266 || ( ( a.low == 0 )
3267 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3273 -------------------------------------------------------------------------------
3274 Returns 1 if the extended double-precision floating-point value `a' is
3275 less than or equal to the corresponding value `b', and 0 otherwise. The
3276 comparison is performed according to the IEC/IEEE Standard for Binary
3277 Floating-point Arithmetic.
3278 -------------------------------------------------------------------------------
3280 flag floatx80_le( floatx80 a, floatx80 b )
3282 flag aSign, bSign;
3284 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3285 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3286 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3287 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3289 float_raise( float_flag_invalid );
3290 return 0;
3292 aSign = extractFloatx80Sign( a );
3293 bSign = extractFloatx80Sign( b );
3294 if ( aSign != bSign ) {
3295 return
3296 aSign
3297 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3298 == 0 );
3300 return
3301 aSign ? le128( b.high, b.low, a.high, a.low )
3302 : le128( a.high, a.low, b.high, b.low );
3307 -------------------------------------------------------------------------------
3308 Returns 1 if the extended double-precision floating-point value `a' is
3309 less than the corresponding value `b', and 0 otherwise. The comparison
3310 is performed according to the IEC/IEEE Standard for Binary Floating-point
3311 Arithmetic.
3312 -------------------------------------------------------------------------------
3314 flag floatx80_lt( floatx80 a, floatx80 b )
3316 flag aSign, bSign;
3318 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3319 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3320 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3321 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3323 float_raise( float_flag_invalid );
3324 return 0;
3326 aSign = extractFloatx80Sign( a );
3327 bSign = extractFloatx80Sign( b );
3328 if ( aSign != bSign ) {
3329 return
3330 aSign
3331 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3332 != 0 );
3334 return
3335 aSign ? lt128( b.high, b.low, a.high, a.low )
3336 : lt128( a.high, a.low, b.high, b.low );
3341 -------------------------------------------------------------------------------
3342 Returns 1 if the extended double-precision floating-point value `a' is equal
3343 to the corresponding value `b', and 0 otherwise. The invalid exception is
3344 raised if either operand is a NaN. Otherwise, the comparison is performed
3345 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3346 -------------------------------------------------------------------------------
3348 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3351 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3352 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3353 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3354 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3356 float_raise( float_flag_invalid );
3357 return 0;
3359 return
3360 ( a.low == b.low )
3361 && ( ( a.high == b.high )
3362 || ( ( a.low == 0 )
3363 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3369 -------------------------------------------------------------------------------
3370 Returns 1 if the extended double-precision floating-point value `a' is less
3371 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3372 do not cause an exception. Otherwise, the comparison is performed according
3373 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3374 -------------------------------------------------------------------------------
3376 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3378 flag aSign, bSign;
3380 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3381 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3382 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3383 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3385 if ( floatx80_is_signaling_nan( a )
3386 || floatx80_is_signaling_nan( b ) ) {
3387 float_raise( float_flag_invalid );
3389 return 0;
3391 aSign = extractFloatx80Sign( a );
3392 bSign = extractFloatx80Sign( b );
3393 if ( aSign != bSign ) {
3394 return
3395 aSign
3396 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3397 == 0 );
3399 return
3400 aSign ? le128( b.high, b.low, a.high, a.low )
3401 : le128( a.high, a.low, b.high, b.low );
3406 -------------------------------------------------------------------------------
3407 Returns 1 if the extended double-precision floating-point value `a' is less
3408 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3409 an exception. Otherwise, the comparison is performed according to the
3410 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3411 -------------------------------------------------------------------------------
3413 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3415 flag aSign, bSign;
3417 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3418 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3419 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3420 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3422 if ( floatx80_is_signaling_nan( a )
3423 || floatx80_is_signaling_nan( b ) ) {
3424 float_raise( float_flag_invalid );
3426 return 0;
3428 aSign = extractFloatx80Sign( a );
3429 bSign = extractFloatx80Sign( b );
3430 if ( aSign != bSign ) {
3431 return
3432 aSign
3433 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3434 != 0 );
3436 return
3437 aSign ? lt128( b.high, b.low, a.high, a.low )
3438 : lt128( a.high, a.low, b.high, b.low );
3442 #endif