Automatic merge of rsync://rsync.kernel.org/pub/scm/linux/kernel/git/gregkh/driver...
[linux-2.6/verdex.git] / arch / arm26 / nwfpe / softfloat.c
blob26c1b916e5279d40cbb9170aa43fc477d7924c05
1 /*
2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include "fpa11.h"
32 #include "milieu.h"
33 #include "softfloat.h"
36 -------------------------------------------------------------------------------
37 Floating-point rounding mode, extended double-precision rounding precision,
38 and exception flags.
39 -------------------------------------------------------------------------------
41 int8 float_rounding_mode = float_round_nearest_even;
42 int8 floatx80_rounding_precision = 80;
43 int8 float_exception_flags;
46 -------------------------------------------------------------------------------
47 Primitive arithmetic functions, including multi-word arithmetic, and
48 division and square root approximations. (Can be specialized to target if
49 desired.)
50 -------------------------------------------------------------------------------
52 #include "softfloat-macros"
55 -------------------------------------------------------------------------------
56 Functions and definitions to determine: (1) whether tininess for underflow
57 is detected before or after rounding by default, (2) what (if anything)
58 happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 are propagated from function inputs to output. These details are target-
61 specific.
62 -------------------------------------------------------------------------------
64 #include "softfloat-specialize"
67 -------------------------------------------------------------------------------
68 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
69 and 7, and returns the properly rounded 32-bit integer corresponding to the
70 input. If `zSign' is nonzero, the input is negated before being converted
71 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
72 input is simply rounded to an integer, with the inexact exception raised if
73 the input cannot be represented exactly as an integer. If the fixed-point
74 input is too large, however, the invalid exception is raised and the largest
75 positive or negative integer is returned.
76 -------------------------------------------------------------------------------
78 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
80 int8 roundingMode;
81 flag roundNearestEven;
82 int8 roundIncrement, roundBits;
83 int32 z;
85 roundingMode = float_rounding_mode;
86 roundNearestEven = ( roundingMode == float_round_nearest_even );
87 roundIncrement = 0x40;
88 if ( ! roundNearestEven ) {
89 if ( roundingMode == float_round_to_zero ) {
90 roundIncrement = 0;
92 else {
93 roundIncrement = 0x7F;
94 if ( zSign ) {
95 if ( roundingMode == float_round_up ) roundIncrement = 0;
97 else {
98 if ( roundingMode == float_round_down ) roundIncrement = 0;
102 roundBits = absZ & 0x7F;
103 absZ = ( absZ + roundIncrement )>>7;
104 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
105 z = absZ;
106 if ( zSign ) z = - z;
107 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
108 float_exception_flags |= float_flag_invalid;
109 return zSign ? 0x80000000 : 0x7FFFFFFF;
111 if ( roundBits ) float_exception_flags |= float_flag_inexact;
112 return z;
117 -------------------------------------------------------------------------------
118 Returns the fraction bits of the single-precision floating-point value `a'.
119 -------------------------------------------------------------------------------
121 INLINE bits32 extractFloat32Frac( float32 a )
124 return a & 0x007FFFFF;
129 -------------------------------------------------------------------------------
130 Returns the exponent bits of the single-precision floating-point value `a'.
131 -------------------------------------------------------------------------------
133 INLINE int16 extractFloat32Exp( float32 a )
136 return ( a>>23 ) & 0xFF;
141 -------------------------------------------------------------------------------
142 Returns the sign bit of the single-precision floating-point value `a'.
143 -------------------------------------------------------------------------------
145 INLINE flag extractFloat32Sign( float32 a )
148 return a>>31;
153 -------------------------------------------------------------------------------
154 Normalizes the subnormal single-precision floating-point value represented
155 by the denormalized significand `aSig'. The normalized exponent and
156 significand are stored at the locations pointed to by `zExpPtr' and
157 `zSigPtr', respectively.
158 -------------------------------------------------------------------------------
160 static void
161 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
163 int8 shiftCount;
165 shiftCount = countLeadingZeros32( aSig ) - 8;
166 *zSigPtr = aSig<<shiftCount;
167 *zExpPtr = 1 - shiftCount;
172 -------------------------------------------------------------------------------
173 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
174 single-precision floating-point value, returning the result. After being
175 shifted into the proper positions, the three fields are simply added
176 together to form the result. This means that any integer portion of `zSig'
177 will be added into the exponent. Since a properly normalized significand
178 will have an integer portion equal to 1, the `zExp' input should be 1 less
179 than the desired result exponent whenever `zSig' is a complete, normalized
180 significand.
181 -------------------------------------------------------------------------------
183 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
185 #if 0
186 float32 f;
187 __asm__("@ packFloat32; \n\
188 mov %0, %1, asl #31; \n\
189 orr %0, %2, asl #23; \n\
190 orr %0, %3"
191 : /* no outputs */
192 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
193 : "cc");
194 return f;
195 #else
196 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
197 #endif
201 -------------------------------------------------------------------------------
202 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
203 and significand `zSig', and returns the proper single-precision floating-
204 point value corresponding to the abstract input. Ordinarily, the abstract
205 value is simply rounded and packed into the single-precision format, with
206 the inexact exception raised if the abstract input cannot be represented
207 exactly. If the abstract value is too large, however, the overflow and
208 inexact exceptions are raised and an infinity or maximal finite value is
209 returned. If the abstract value is too small, the input value is rounded to
210 a subnormal number, and the underflow and inexact exceptions are raised if
211 the abstract input cannot be represented exactly as a subnormal single-
212 precision floating-point number.
213 The input significand `zSig' has its binary point between bits 30
214 and 29, which is 7 bits to the left of the usual location. This shifted
215 significand must be normalized or smaller. If `zSig' is not normalized,
216 `zExp' must be 0; in that case, the result returned is a subnormal number,
217 and it must not require rounding. In the usual case that `zSig' is
218 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
219 The handling of underflow and overflow follows the IEC/IEEE Standard for
220 Binary Floating-point Arithmetic.
221 -------------------------------------------------------------------------------
223 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
225 int8 roundingMode;
226 flag roundNearestEven;
227 int8 roundIncrement, roundBits;
228 flag isTiny;
230 roundingMode = float_rounding_mode;
231 roundNearestEven = ( roundingMode == float_round_nearest_even );
232 roundIncrement = 0x40;
233 if ( ! roundNearestEven ) {
234 if ( roundingMode == float_round_to_zero ) {
235 roundIncrement = 0;
237 else {
238 roundIncrement = 0x7F;
239 if ( zSign ) {
240 if ( roundingMode == float_round_up ) roundIncrement = 0;
242 else {
243 if ( roundingMode == float_round_down ) roundIncrement = 0;
247 roundBits = zSig & 0x7F;
248 if ( 0xFD <= (bits16) zExp ) {
249 if ( ( 0xFD < zExp )
250 || ( ( zExp == 0xFD )
251 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
253 float_raise( float_flag_overflow | float_flag_inexact );
254 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
256 if ( zExp < 0 ) {
257 isTiny =
258 ( float_detect_tininess == float_tininess_before_rounding )
259 || ( zExp < -1 )
260 || ( zSig + roundIncrement < 0x80000000 );
261 shift32RightJamming( zSig, - zExp, &zSig );
262 zExp = 0;
263 roundBits = zSig & 0x7F;
264 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
267 if ( roundBits ) float_exception_flags |= float_flag_inexact;
268 zSig = ( zSig + roundIncrement )>>7;
269 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
270 if ( zSig == 0 ) zExp = 0;
271 return packFloat32( zSign, zExp, zSig );
276 -------------------------------------------------------------------------------
277 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
278 and significand `zSig', and returns the proper single-precision floating-
279 point value corresponding to the abstract input. This routine is just like
280 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
281 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
282 point exponent.
283 -------------------------------------------------------------------------------
285 static float32
286 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
288 int8 shiftCount;
290 shiftCount = countLeadingZeros32( zSig ) - 1;
291 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
296 -------------------------------------------------------------------------------
297 Returns the fraction bits of the double-precision floating-point value `a'.
298 -------------------------------------------------------------------------------
300 INLINE bits64 extractFloat64Frac( float64 a )
303 return a & LIT64( 0x000FFFFFFFFFFFFF );
308 -------------------------------------------------------------------------------
309 Returns the exponent bits of the double-precision floating-point value `a'.
310 -------------------------------------------------------------------------------
312 INLINE int16 extractFloat64Exp( float64 a )
315 return ( a>>52 ) & 0x7FF;
320 -------------------------------------------------------------------------------
321 Returns the sign bit of the double-precision floating-point value `a'.
322 -------------------------------------------------------------------------------
324 INLINE flag extractFloat64Sign( float64 a )
327 return a>>63;
332 -------------------------------------------------------------------------------
333 Normalizes the subnormal double-precision floating-point value represented
334 by the denormalized significand `aSig'. The normalized exponent and
335 significand are stored at the locations pointed to by `zExpPtr' and
336 `zSigPtr', respectively.
337 -------------------------------------------------------------------------------
339 static void
340 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
342 int8 shiftCount;
344 shiftCount = countLeadingZeros64( aSig ) - 11;
345 *zSigPtr = aSig<<shiftCount;
346 *zExpPtr = 1 - shiftCount;
351 -------------------------------------------------------------------------------
352 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
353 double-precision floating-point value, returning the result. After being
354 shifted into the proper positions, the three fields are simply added
355 together to form the result. This means that any integer portion of `zSig'
356 will be added into the exponent. Since a properly normalized significand
357 will have an integer portion equal to 1, the `zExp' input should be 1 less
358 than the desired result exponent whenever `zSig' is a complete, normalized
359 significand.
360 -------------------------------------------------------------------------------
362 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
365 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
370 -------------------------------------------------------------------------------
371 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
372 and significand `zSig', and returns the proper double-precision floating-
373 point value corresponding to the abstract input. Ordinarily, the abstract
374 value is simply rounded and packed into the double-precision format, with
375 the inexact exception raised if the abstract input cannot be represented
376 exactly. If the abstract value is too large, however, the overflow and
377 inexact exceptions are raised and an infinity or maximal finite value is
378 returned. If the abstract value is too small, the input value is rounded to
379 a subnormal number, and the underflow and inexact exceptions are raised if
380 the abstract input cannot be represented exactly as a subnormal double-
381 precision floating-point number.
382 The input significand `zSig' has its binary point between bits 62
383 and 61, which is 10 bits to the left of the usual location. This shifted
384 significand must be normalized or smaller. If `zSig' is not normalized,
385 `zExp' must be 0; in that case, the result returned is a subnormal number,
386 and it must not require rounding. In the usual case that `zSig' is
387 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
388 The handling of underflow and overflow follows the IEC/IEEE Standard for
389 Binary Floating-point Arithmetic.
390 -------------------------------------------------------------------------------
392 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
394 int8 roundingMode;
395 flag roundNearestEven;
396 int16 roundIncrement, roundBits;
397 flag isTiny;
399 roundingMode = float_rounding_mode;
400 roundNearestEven = ( roundingMode == float_round_nearest_even );
401 roundIncrement = 0x200;
402 if ( ! roundNearestEven ) {
403 if ( roundingMode == float_round_to_zero ) {
404 roundIncrement = 0;
406 else {
407 roundIncrement = 0x3FF;
408 if ( zSign ) {
409 if ( roundingMode == float_round_up ) roundIncrement = 0;
411 else {
412 if ( roundingMode == float_round_down ) roundIncrement = 0;
416 roundBits = zSig & 0x3FF;
417 if ( 0x7FD <= (bits16) zExp ) {
418 if ( ( 0x7FD < zExp )
419 || ( ( zExp == 0x7FD )
420 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
422 //register int lr = __builtin_return_address(0);
423 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
424 float_raise( float_flag_overflow | float_flag_inexact );
425 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
427 if ( zExp < 0 ) {
428 isTiny =
429 ( float_detect_tininess == float_tininess_before_rounding )
430 || ( zExp < -1 )
431 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
432 shift64RightJamming( zSig, - zExp, &zSig );
433 zExp = 0;
434 roundBits = zSig & 0x3FF;
435 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
438 if ( roundBits ) float_exception_flags |= float_flag_inexact;
439 zSig = ( zSig + roundIncrement )>>10;
440 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
441 if ( zSig == 0 ) zExp = 0;
442 return packFloat64( zSign, zExp, zSig );
447 -------------------------------------------------------------------------------
448 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
449 and significand `zSig', and returns the proper double-precision floating-
450 point value corresponding to the abstract input. This routine is just like
451 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
452 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
453 point exponent.
454 -------------------------------------------------------------------------------
456 static float64
457 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
459 int8 shiftCount;
461 shiftCount = countLeadingZeros64( zSig ) - 1;
462 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
466 #ifdef FLOATX80
469 -------------------------------------------------------------------------------
470 Returns the fraction bits of the extended double-precision floating-point
471 value `a'.
472 -------------------------------------------------------------------------------
474 INLINE bits64 extractFloatx80Frac( floatx80 a )
477 return a.low;
482 -------------------------------------------------------------------------------
483 Returns the exponent bits of the extended double-precision floating-point
484 value `a'.
485 -------------------------------------------------------------------------------
487 INLINE int32 extractFloatx80Exp( floatx80 a )
490 return a.high & 0x7FFF;
495 -------------------------------------------------------------------------------
496 Returns the sign bit of the extended double-precision floating-point value
497 `a'.
498 -------------------------------------------------------------------------------
500 INLINE flag extractFloatx80Sign( floatx80 a )
503 return a.high>>15;
508 -------------------------------------------------------------------------------
509 Normalizes the subnormal extended double-precision floating-point value
510 represented by the denormalized significand `aSig'. The normalized exponent
511 and significand are stored at the locations pointed to by `zExpPtr' and
512 `zSigPtr', respectively.
513 -------------------------------------------------------------------------------
515 static void
516 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
518 int8 shiftCount;
520 shiftCount = countLeadingZeros64( aSig );
521 *zSigPtr = aSig<<shiftCount;
522 *zExpPtr = 1 - shiftCount;
527 -------------------------------------------------------------------------------
528 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
529 extended double-precision floating-point value, returning the result.
530 -------------------------------------------------------------------------------
532 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
534 floatx80 z;
536 z.low = zSig;
537 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
538 return z;
543 -------------------------------------------------------------------------------
544 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
545 and extended significand formed by the concatenation of `zSig0' and `zSig1',
546 and returns the proper extended double-precision floating-point value
547 corresponding to the abstract input. Ordinarily, the abstract value is
548 rounded and packed into the extended double-precision format, with the
549 inexact exception raised if the abstract input cannot be represented
550 exactly. If the abstract value is too large, however, the overflow and
551 inexact exceptions are raised and an infinity or maximal finite value is
552 returned. If the abstract value is too small, the input value is rounded to
553 a subnormal number, and the underflow and inexact exceptions are raised if
554 the abstract input cannot be represented exactly as a subnormal extended
555 double-precision floating-point number.
556 If `roundingPrecision' is 32 or 64, the result is rounded to the same
557 number of bits as single or double precision, respectively. Otherwise, the
558 result is rounded to the full precision of the extended double-precision
559 format.
560 The input significand must be normalized or smaller. If the input
561 significand is not normalized, `zExp' must be 0; in that case, the result
562 returned is a subnormal number, and it must not require rounding. The
563 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
564 Floating-point Arithmetic.
565 -------------------------------------------------------------------------------
567 static floatx80
568 roundAndPackFloatx80(
569 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
572 int8 roundingMode;
573 flag roundNearestEven, increment, isTiny;
574 int64 roundIncrement, roundMask, roundBits;
576 roundingMode = float_rounding_mode;
577 roundNearestEven = ( roundingMode == float_round_nearest_even );
578 if ( roundingPrecision == 80 ) goto precision80;
579 if ( roundingPrecision == 64 ) {
580 roundIncrement = LIT64( 0x0000000000000400 );
581 roundMask = LIT64( 0x00000000000007FF );
583 else if ( roundingPrecision == 32 ) {
584 roundIncrement = LIT64( 0x0000008000000000 );
585 roundMask = LIT64( 0x000000FFFFFFFFFF );
587 else {
588 goto precision80;
590 zSig0 |= ( zSig1 != 0 );
591 if ( ! roundNearestEven ) {
592 if ( roundingMode == float_round_to_zero ) {
593 roundIncrement = 0;
595 else {
596 roundIncrement = roundMask;
597 if ( zSign ) {
598 if ( roundingMode == float_round_up ) roundIncrement = 0;
600 else {
601 if ( roundingMode == float_round_down ) roundIncrement = 0;
605 roundBits = zSig0 & roundMask;
606 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
607 if ( ( 0x7FFE < zExp )
608 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
610 goto overflow;
612 if ( zExp <= 0 ) {
613 isTiny =
614 ( float_detect_tininess == float_tininess_before_rounding )
615 || ( zExp < 0 )
616 || ( zSig0 <= zSig0 + roundIncrement );
617 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
618 zExp = 0;
619 roundBits = zSig0 & roundMask;
620 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
621 if ( roundBits ) float_exception_flags |= float_flag_inexact;
622 zSig0 += roundIncrement;
623 if ( (sbits64) zSig0 < 0 ) zExp = 1;
624 roundIncrement = roundMask + 1;
625 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
626 roundMask |= roundIncrement;
628 zSig0 &= ~ roundMask;
629 return packFloatx80( zSign, zExp, zSig0 );
632 if ( roundBits ) float_exception_flags |= float_flag_inexact;
633 zSig0 += roundIncrement;
634 if ( zSig0 < roundIncrement ) {
635 ++zExp;
636 zSig0 = LIT64( 0x8000000000000000 );
638 roundIncrement = roundMask + 1;
639 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
640 roundMask |= roundIncrement;
642 zSig0 &= ~ roundMask;
643 if ( zSig0 == 0 ) zExp = 0;
644 return packFloatx80( zSign, zExp, zSig0 );
645 precision80:
646 increment = ( (sbits64) zSig1 < 0 );
647 if ( ! roundNearestEven ) {
648 if ( roundingMode == float_round_to_zero ) {
649 increment = 0;
651 else {
652 if ( zSign ) {
653 increment = ( roundingMode == float_round_down ) && zSig1;
655 else {
656 increment = ( roundingMode == float_round_up ) && zSig1;
660 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
661 if ( ( 0x7FFE < zExp )
662 || ( ( zExp == 0x7FFE )
663 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
664 && increment
667 roundMask = 0;
668 overflow:
669 float_raise( float_flag_overflow | float_flag_inexact );
670 if ( ( roundingMode == float_round_to_zero )
671 || ( zSign && ( roundingMode == float_round_up ) )
672 || ( ! zSign && ( roundingMode == float_round_down ) )
674 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
676 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
678 if ( zExp <= 0 ) {
679 isTiny =
680 ( float_detect_tininess == float_tininess_before_rounding )
681 || ( zExp < 0 )
682 || ! increment
683 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
684 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
685 zExp = 0;
686 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
687 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
688 if ( roundNearestEven ) {
689 increment = ( (sbits64) zSig1 < 0 );
691 else {
692 if ( zSign ) {
693 increment = ( roundingMode == float_round_down ) && zSig1;
695 else {
696 increment = ( roundingMode == float_round_up ) && zSig1;
699 if ( increment ) {
700 ++zSig0;
701 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
702 if ( (sbits64) zSig0 < 0 ) zExp = 1;
704 return packFloatx80( zSign, zExp, zSig0 );
707 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
708 if ( increment ) {
709 ++zSig0;
710 if ( zSig0 == 0 ) {
711 ++zExp;
712 zSig0 = LIT64( 0x8000000000000000 );
714 else {
715 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
718 else {
719 if ( zSig0 == 0 ) zExp = 0;
722 return packFloatx80( zSign, zExp, zSig0 );
726 -------------------------------------------------------------------------------
727 Takes an abstract floating-point value having sign `zSign', exponent
728 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
729 and returns the proper extended double-precision floating-point value
730 corresponding to the abstract input. This routine is just like
731 `roundAndPackFloatx80' except that the input significand does not have to be
732 normalized.
733 -------------------------------------------------------------------------------
735 static floatx80
736 normalizeRoundAndPackFloatx80(
737 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
740 int8 shiftCount;
742 if ( zSig0 == 0 ) {
743 zSig0 = zSig1;
744 zSig1 = 0;
745 zExp -= 64;
747 shiftCount = countLeadingZeros64( zSig0 );
748 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
749 zExp -= shiftCount;
750 return
751 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
755 #endif
758 -------------------------------------------------------------------------------
759 Returns the result of converting the 32-bit two's complement integer `a' to
760 the single-precision floating-point format. The conversion is performed
761 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
762 -------------------------------------------------------------------------------
764 float32 int32_to_float32( int32 a )
766 flag zSign;
768 if ( a == 0 ) return 0;
769 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
770 zSign = ( a < 0 );
771 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
776 -------------------------------------------------------------------------------
777 Returns the result of converting the 32-bit two's complement integer `a' to
778 the double-precision floating-point format. The conversion is performed
779 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
780 -------------------------------------------------------------------------------
782 float64 int32_to_float64( int32 a )
784 flag aSign;
785 uint32 absA;
786 int8 shiftCount;
787 bits64 zSig;
789 if ( a == 0 ) return 0;
790 aSign = ( a < 0 );
791 absA = aSign ? - a : a;
792 shiftCount = countLeadingZeros32( absA ) + 21;
793 zSig = absA;
794 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
798 #ifdef FLOATX80
801 -------------------------------------------------------------------------------
802 Returns the result of converting the 32-bit two's complement integer `a'
803 to the extended double-precision floating-point format. The conversion
804 is performed according to the IEC/IEEE Standard for Binary Floating-point
805 Arithmetic.
806 -------------------------------------------------------------------------------
808 floatx80 int32_to_floatx80( int32 a )
810 flag zSign;
811 uint32 absA;
812 int8 shiftCount;
813 bits64 zSig;
815 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
816 zSign = ( a < 0 );
817 absA = zSign ? - a : a;
818 shiftCount = countLeadingZeros32( absA ) + 32;
819 zSig = absA;
820 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
824 #endif
827 -------------------------------------------------------------------------------
828 Returns the result of converting the single-precision floating-point value
829 `a' to the 32-bit two's complement integer format. The conversion is
830 performed according to the IEC/IEEE Standard for Binary Floating-point
831 Arithmetic---which means in particular that the conversion is rounded
832 according to the current rounding mode. If `a' is a NaN, the largest
833 positive integer is returned. Otherwise, if the conversion overflows, the
834 largest integer with the same sign as `a' is returned.
835 -------------------------------------------------------------------------------
837 int32 float32_to_int32( float32 a )
839 flag aSign;
840 int16 aExp, shiftCount;
841 bits32 aSig;
842 bits64 zSig;
844 aSig = extractFloat32Frac( a );
845 aExp = extractFloat32Exp( a );
846 aSign = extractFloat32Sign( a );
847 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
848 if ( aExp ) aSig |= 0x00800000;
849 shiftCount = 0xAF - aExp;
850 zSig = aSig;
851 zSig <<= 32;
852 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
853 return roundAndPackInt32( aSign, zSig );
858 -------------------------------------------------------------------------------
859 Returns the result of converting the single-precision floating-point value
860 `a' to the 32-bit two's complement integer format. The conversion is
861 performed according to the IEC/IEEE Standard for Binary Floating-point
862 Arithmetic, except that the conversion is always rounded toward zero. If
863 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
864 conversion overflows, the largest integer with the same sign as `a' is
865 returned.
866 -------------------------------------------------------------------------------
868 int32 float32_to_int32_round_to_zero( float32 a )
870 flag aSign;
871 int16 aExp, shiftCount;
872 bits32 aSig;
873 int32 z;
875 aSig = extractFloat32Frac( a );
876 aExp = extractFloat32Exp( a );
877 aSign = extractFloat32Sign( a );
878 shiftCount = aExp - 0x9E;
879 if ( 0 <= shiftCount ) {
880 if ( a == 0xCF000000 ) return 0x80000000;
881 float_raise( float_flag_invalid );
882 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
883 return 0x80000000;
885 else if ( aExp <= 0x7E ) {
886 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
887 return 0;
889 aSig = ( aSig | 0x00800000 )<<8;
890 z = aSig>>( - shiftCount );
891 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
892 float_exception_flags |= float_flag_inexact;
894 return aSign ? - z : z;
899 -------------------------------------------------------------------------------
900 Returns the result of converting the single-precision floating-point value
901 `a' to the double-precision floating-point format. The conversion is
902 performed according to the IEC/IEEE Standard for Binary Floating-point
903 Arithmetic.
904 -------------------------------------------------------------------------------
906 float64 float32_to_float64( float32 a )
908 flag aSign;
909 int16 aExp;
910 bits32 aSig;
912 aSig = extractFloat32Frac( a );
913 aExp = extractFloat32Exp( a );
914 aSign = extractFloat32Sign( a );
915 if ( aExp == 0xFF ) {
916 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
917 return packFloat64( aSign, 0x7FF, 0 );
919 if ( aExp == 0 ) {
920 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
921 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
922 --aExp;
924 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
928 #ifdef FLOATX80
931 -------------------------------------------------------------------------------
932 Returns the result of converting the single-precision floating-point value
933 `a' to the extended double-precision floating-point format. The conversion
934 is performed according to the IEC/IEEE Standard for Binary Floating-point
935 Arithmetic.
936 -------------------------------------------------------------------------------
938 floatx80 float32_to_floatx80( float32 a )
940 flag aSign;
941 int16 aExp;
942 bits32 aSig;
944 aSig = extractFloat32Frac( a );
945 aExp = extractFloat32Exp( a );
946 aSign = extractFloat32Sign( a );
947 if ( aExp == 0xFF ) {
948 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
949 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
951 if ( aExp == 0 ) {
952 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
953 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
955 aSig |= 0x00800000;
956 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
960 #endif
963 -------------------------------------------------------------------------------
964 Rounds the single-precision floating-point value `a' to an integer, and
965 returns the result as a single-precision floating-point value. The
966 operation is performed according to the IEC/IEEE Standard for Binary
967 Floating-point Arithmetic.
968 -------------------------------------------------------------------------------
970 float32 float32_round_to_int( float32 a )
972 flag aSign;
973 int16 aExp;
974 bits32 lastBitMask, roundBitsMask;
975 int8 roundingMode;
976 float32 z;
978 aExp = extractFloat32Exp( a );
979 if ( 0x96 <= aExp ) {
980 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
981 return propagateFloat32NaN( a, a );
983 return a;
985 if ( aExp <= 0x7E ) {
986 if ( (bits32) ( a<<1 ) == 0 ) return a;
987 float_exception_flags |= float_flag_inexact;
988 aSign = extractFloat32Sign( a );
989 switch ( float_rounding_mode ) {
990 case float_round_nearest_even:
991 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
992 return packFloat32( aSign, 0x7F, 0 );
994 break;
995 case float_round_down:
996 return aSign ? 0xBF800000 : 0;
997 case float_round_up:
998 return aSign ? 0x80000000 : 0x3F800000;
1000 return packFloat32( aSign, 0, 0 );
1002 lastBitMask = 1;
1003 lastBitMask <<= 0x96 - aExp;
1004 roundBitsMask = lastBitMask - 1;
1005 z = a;
1006 roundingMode = float_rounding_mode;
1007 if ( roundingMode == float_round_nearest_even ) {
1008 z += lastBitMask>>1;
1009 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1011 else if ( roundingMode != float_round_to_zero ) {
1012 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1013 z += roundBitsMask;
1016 z &= ~ roundBitsMask;
1017 if ( z != a ) float_exception_flags |= float_flag_inexact;
1018 return z;
1023 -------------------------------------------------------------------------------
1024 Returns the result of adding the absolute values of the single-precision
1025 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1026 before being returned. `zSign' is ignored if the result is a NaN. The
1027 addition is performed according to the IEC/IEEE Standard for Binary
1028 Floating-point Arithmetic.
1029 -------------------------------------------------------------------------------
1031 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1033 int16 aExp, bExp, zExp;
1034 bits32 aSig, bSig, zSig;
1035 int16 expDiff;
1037 aSig = extractFloat32Frac( a );
1038 aExp = extractFloat32Exp( a );
1039 bSig = extractFloat32Frac( b );
1040 bExp = extractFloat32Exp( b );
1041 expDiff = aExp - bExp;
1042 aSig <<= 6;
1043 bSig <<= 6;
1044 if ( 0 < expDiff ) {
1045 if ( aExp == 0xFF ) {
1046 if ( aSig ) return propagateFloat32NaN( a, b );
1047 return a;
1049 if ( bExp == 0 ) {
1050 --expDiff;
1052 else {
1053 bSig |= 0x20000000;
1055 shift32RightJamming( bSig, expDiff, &bSig );
1056 zExp = aExp;
1058 else if ( expDiff < 0 ) {
1059 if ( bExp == 0xFF ) {
1060 if ( bSig ) return propagateFloat32NaN( a, b );
1061 return packFloat32( zSign, 0xFF, 0 );
1063 if ( aExp == 0 ) {
1064 ++expDiff;
1066 else {
1067 aSig |= 0x20000000;
1069 shift32RightJamming( aSig, - expDiff, &aSig );
1070 zExp = bExp;
1072 else {
1073 if ( aExp == 0xFF ) {
1074 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1075 return a;
1077 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1078 zSig = 0x40000000 + aSig + bSig;
1079 zExp = aExp;
1080 goto roundAndPack;
1082 aSig |= 0x20000000;
1083 zSig = ( aSig + bSig )<<1;
1084 --zExp;
1085 if ( (sbits32) zSig < 0 ) {
1086 zSig = aSig + bSig;
1087 ++zExp;
1089 roundAndPack:
1090 return roundAndPackFloat32( zSign, zExp, zSig );
1095 -------------------------------------------------------------------------------
1096 Returns the result of subtracting the absolute values of the single-
1097 precision floating-point values `a' and `b'. If `zSign' is true, the
1098 difference is negated before being returned. `zSign' is ignored if the
1099 result is a NaN. The subtraction is performed according to the IEC/IEEE
1100 Standard for Binary Floating-point Arithmetic.
1101 -------------------------------------------------------------------------------
1103 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1105 int16 aExp, bExp, zExp;
1106 bits32 aSig, bSig, zSig;
1107 int16 expDiff;
1109 aSig = extractFloat32Frac( a );
1110 aExp = extractFloat32Exp( a );
1111 bSig = extractFloat32Frac( b );
1112 bExp = extractFloat32Exp( b );
1113 expDiff = aExp - bExp;
1114 aSig <<= 7;
1115 bSig <<= 7;
1116 if ( 0 < expDiff ) goto aExpBigger;
1117 if ( expDiff < 0 ) goto bExpBigger;
1118 if ( aExp == 0xFF ) {
1119 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1120 float_raise( float_flag_invalid );
1121 return float32_default_nan;
1123 if ( aExp == 0 ) {
1124 aExp = 1;
1125 bExp = 1;
1127 if ( bSig < aSig ) goto aBigger;
1128 if ( aSig < bSig ) goto bBigger;
1129 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1130 bExpBigger:
1131 if ( bExp == 0xFF ) {
1132 if ( bSig ) return propagateFloat32NaN( a, b );
1133 return packFloat32( zSign ^ 1, 0xFF, 0 );
1135 if ( aExp == 0 ) {
1136 ++expDiff;
1138 else {
1139 aSig |= 0x40000000;
1141 shift32RightJamming( aSig, - expDiff, &aSig );
1142 bSig |= 0x40000000;
1143 bBigger:
1144 zSig = bSig - aSig;
1145 zExp = bExp;
1146 zSign ^= 1;
1147 goto normalizeRoundAndPack;
1148 aExpBigger:
1149 if ( aExp == 0xFF ) {
1150 if ( aSig ) return propagateFloat32NaN( a, b );
1151 return a;
1153 if ( bExp == 0 ) {
1154 --expDiff;
1156 else {
1157 bSig |= 0x40000000;
1159 shift32RightJamming( bSig, expDiff, &bSig );
1160 aSig |= 0x40000000;
1161 aBigger:
1162 zSig = aSig - bSig;
1163 zExp = aExp;
1164 normalizeRoundAndPack:
1165 --zExp;
1166 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1171 -------------------------------------------------------------------------------
1172 Returns the result of adding the single-precision floating-point values `a'
1173 and `b'. The operation is performed according to the IEC/IEEE Standard for
1174 Binary Floating-point Arithmetic.
1175 -------------------------------------------------------------------------------
1177 float32 float32_add( float32 a, float32 b )
1179 flag aSign, bSign;
1181 aSign = extractFloat32Sign( a );
1182 bSign = extractFloat32Sign( b );
1183 if ( aSign == bSign ) {
1184 return addFloat32Sigs( a, b, aSign );
1186 else {
1187 return subFloat32Sigs( a, b, aSign );
1193 -------------------------------------------------------------------------------
1194 Returns the result of subtracting the single-precision floating-point values
1195 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1196 for Binary Floating-point Arithmetic.
1197 -------------------------------------------------------------------------------
1199 float32 float32_sub( float32 a, float32 b )
1201 flag aSign, bSign;
1203 aSign = extractFloat32Sign( a );
1204 bSign = extractFloat32Sign( b );
1205 if ( aSign == bSign ) {
1206 return subFloat32Sigs( a, b, aSign );
1208 else {
1209 return addFloat32Sigs( a, b, aSign );
1215 -------------------------------------------------------------------------------
1216 Returns the result of multiplying the single-precision floating-point values
1217 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1218 for Binary Floating-point Arithmetic.
1219 -------------------------------------------------------------------------------
1221 float32 float32_mul( float32 a, float32 b )
1223 flag aSign, bSign, zSign;
1224 int16 aExp, bExp, zExp;
1225 bits32 aSig, bSig;
1226 bits64 zSig64;
1227 bits32 zSig;
1229 aSig = extractFloat32Frac( a );
1230 aExp = extractFloat32Exp( a );
1231 aSign = extractFloat32Sign( a );
1232 bSig = extractFloat32Frac( b );
1233 bExp = extractFloat32Exp( b );
1234 bSign = extractFloat32Sign( b );
1235 zSign = aSign ^ bSign;
1236 if ( aExp == 0xFF ) {
1237 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1238 return propagateFloat32NaN( a, b );
1240 if ( ( bExp | bSig ) == 0 ) {
1241 float_raise( float_flag_invalid );
1242 return float32_default_nan;
1244 return packFloat32( zSign, 0xFF, 0 );
1246 if ( bExp == 0xFF ) {
1247 if ( bSig ) return propagateFloat32NaN( a, b );
1248 if ( ( aExp | aSig ) == 0 ) {
1249 float_raise( float_flag_invalid );
1250 return float32_default_nan;
1252 return packFloat32( zSign, 0xFF, 0 );
1254 if ( aExp == 0 ) {
1255 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1256 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1258 if ( bExp == 0 ) {
1259 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1260 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1262 zExp = aExp + bExp - 0x7F;
1263 aSig = ( aSig | 0x00800000 )<<7;
1264 bSig = ( bSig | 0x00800000 )<<8;
1265 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1266 zSig = zSig64;
1267 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1268 zSig <<= 1;
1269 --zExp;
1271 return roundAndPackFloat32( zSign, zExp, zSig );
1276 -------------------------------------------------------------------------------
1277 Returns the result of dividing the single-precision floating-point value `a'
1278 by the corresponding value `b'. The operation is performed according to the
1279 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1280 -------------------------------------------------------------------------------
1282 float32 float32_div( float32 a, float32 b )
1284 flag aSign, bSign, zSign;
1285 int16 aExp, bExp, zExp;
1286 bits32 aSig, bSig, zSig;
1288 aSig = extractFloat32Frac( a );
1289 aExp = extractFloat32Exp( a );
1290 aSign = extractFloat32Sign( a );
1291 bSig = extractFloat32Frac( b );
1292 bExp = extractFloat32Exp( b );
1293 bSign = extractFloat32Sign( b );
1294 zSign = aSign ^ bSign;
1295 if ( aExp == 0xFF ) {
1296 if ( aSig ) return propagateFloat32NaN( a, b );
1297 if ( bExp == 0xFF ) {
1298 if ( bSig ) return propagateFloat32NaN( a, b );
1299 float_raise( float_flag_invalid );
1300 return float32_default_nan;
1302 return packFloat32( zSign, 0xFF, 0 );
1304 if ( bExp == 0xFF ) {
1305 if ( bSig ) return propagateFloat32NaN( a, b );
1306 return packFloat32( zSign, 0, 0 );
1308 if ( bExp == 0 ) {
1309 if ( bSig == 0 ) {
1310 if ( ( aExp | aSig ) == 0 ) {
1311 float_raise( float_flag_invalid );
1312 return float32_default_nan;
1314 float_raise( float_flag_divbyzero );
1315 return packFloat32( zSign, 0xFF, 0 );
1317 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1319 if ( aExp == 0 ) {
1320 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1321 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1323 zExp = aExp - bExp + 0x7D;
1324 aSig = ( aSig | 0x00800000 )<<7;
1325 bSig = ( bSig | 0x00800000 )<<8;
1326 if ( bSig <= ( aSig + aSig ) ) {
1327 aSig >>= 1;
1328 ++zExp;
1330 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1331 if ( ( zSig & 0x3F ) == 0 ) {
1332 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1334 return roundAndPackFloat32( zSign, zExp, zSig );
1339 -------------------------------------------------------------------------------
1340 Returns the remainder of the single-precision floating-point value `a'
1341 with respect to the corresponding value `b'. The operation is performed
1342 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1343 -------------------------------------------------------------------------------
1345 float32 float32_rem( float32 a, float32 b )
1347 flag aSign, bSign, zSign;
1348 int16 aExp, bExp, expDiff;
1349 bits32 aSig, bSig;
1350 bits32 q;
1351 bits64 aSig64, bSig64, q64;
1352 bits32 alternateASig;
1353 sbits32 sigMean;
1355 aSig = extractFloat32Frac( a );
1356 aExp = extractFloat32Exp( a );
1357 aSign = extractFloat32Sign( a );
1358 bSig = extractFloat32Frac( b );
1359 bExp = extractFloat32Exp( b );
1360 bSign = extractFloat32Sign( b );
1361 if ( aExp == 0xFF ) {
1362 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1363 return propagateFloat32NaN( a, b );
1365 float_raise( float_flag_invalid );
1366 return float32_default_nan;
1368 if ( bExp == 0xFF ) {
1369 if ( bSig ) return propagateFloat32NaN( a, b );
1370 return a;
1372 if ( bExp == 0 ) {
1373 if ( bSig == 0 ) {
1374 float_raise( float_flag_invalid );
1375 return float32_default_nan;
1377 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1379 if ( aExp == 0 ) {
1380 if ( aSig == 0 ) return a;
1381 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1383 expDiff = aExp - bExp;
1384 aSig |= 0x00800000;
1385 bSig |= 0x00800000;
1386 if ( expDiff < 32 ) {
1387 aSig <<= 8;
1388 bSig <<= 8;
1389 if ( expDiff < 0 ) {
1390 if ( expDiff < -1 ) return a;
1391 aSig >>= 1;
1393 q = ( bSig <= aSig );
1394 if ( q ) aSig -= bSig;
1395 if ( 0 < expDiff ) {
1396 q = ( ( (bits64) aSig )<<32 ) / bSig;
1397 q >>= 32 - expDiff;
1398 bSig >>= 2;
1399 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1401 else {
1402 aSig >>= 2;
1403 bSig >>= 2;
1406 else {
1407 if ( bSig <= aSig ) aSig -= bSig;
1408 aSig64 = ( (bits64) aSig )<<40;
1409 bSig64 = ( (bits64) bSig )<<40;
1410 expDiff -= 64;
1411 while ( 0 < expDiff ) {
1412 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1413 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1414 aSig64 = - ( ( bSig * q64 )<<38 );
1415 expDiff -= 62;
1417 expDiff += 64;
1418 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1419 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1420 q = q64>>( 64 - expDiff );
1421 bSig <<= 6;
1422 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1424 do {
1425 alternateASig = aSig;
1426 ++q;
1427 aSig -= bSig;
1428 } while ( 0 <= (sbits32) aSig );
1429 sigMean = aSig + alternateASig;
1430 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1431 aSig = alternateASig;
1433 zSign = ( (sbits32) aSig < 0 );
1434 if ( zSign ) aSig = - aSig;
1435 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1440 -------------------------------------------------------------------------------
1441 Returns the square root of the single-precision floating-point value `a'.
1442 The operation is performed according to the IEC/IEEE Standard for Binary
1443 Floating-point Arithmetic.
1444 -------------------------------------------------------------------------------
1446 float32 float32_sqrt( float32 a )
1448 flag aSign;
1449 int16 aExp, zExp;
1450 bits32 aSig, zSig;
1451 bits64 rem, term;
1453 aSig = extractFloat32Frac( a );
1454 aExp = extractFloat32Exp( a );
1455 aSign = extractFloat32Sign( a );
1456 if ( aExp == 0xFF ) {
1457 if ( aSig ) return propagateFloat32NaN( a, 0 );
1458 if ( ! aSign ) return a;
1459 float_raise( float_flag_invalid );
1460 return float32_default_nan;
1462 if ( aSign ) {
1463 if ( ( aExp | aSig ) == 0 ) return a;
1464 float_raise( float_flag_invalid );
1465 return float32_default_nan;
1467 if ( aExp == 0 ) {
1468 if ( aSig == 0 ) return 0;
1469 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1471 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1472 aSig = ( aSig | 0x00800000 )<<8;
1473 zSig = estimateSqrt32( aExp, aSig ) + 2;
1474 if ( ( zSig & 0x7F ) <= 5 ) {
1475 if ( zSig < 2 ) {
1476 zSig = 0xFFFFFFFF;
1478 else {
1479 aSig >>= aExp & 1;
1480 term = ( (bits64) zSig ) * zSig;
1481 rem = ( ( (bits64) aSig )<<32 ) - term;
1482 while ( (sbits64) rem < 0 ) {
1483 --zSig;
1484 rem += ( ( (bits64) zSig )<<1 ) | 1;
1486 zSig |= ( rem != 0 );
1489 shift32RightJamming( zSig, 1, &zSig );
1490 return roundAndPackFloat32( 0, zExp, zSig );
1495 -------------------------------------------------------------------------------
1496 Returns 1 if the single-precision floating-point value `a' is equal to the
1497 corresponding value `b', and 0 otherwise. The comparison is performed
1498 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1499 -------------------------------------------------------------------------------
1501 flag float32_eq( float32 a, float32 b )
1504 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1505 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1507 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1508 float_raise( float_flag_invalid );
1510 return 0;
1512 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1517 -------------------------------------------------------------------------------
1518 Returns 1 if the single-precision floating-point value `a' is less than or
1519 equal to the corresponding value `b', and 0 otherwise. The comparison is
1520 performed according to the IEC/IEEE Standard for Binary Floating-point
1521 Arithmetic.
1522 -------------------------------------------------------------------------------
1524 flag float32_le( float32 a, float32 b )
1526 flag aSign, bSign;
1528 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1529 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1531 float_raise( float_flag_invalid );
1532 return 0;
1534 aSign = extractFloat32Sign( a );
1535 bSign = extractFloat32Sign( b );
1536 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1537 return ( a == b ) || ( aSign ^ ( a < b ) );
1542 -------------------------------------------------------------------------------
1543 Returns 1 if the single-precision floating-point value `a' is less than
1544 the corresponding value `b', and 0 otherwise. The comparison is performed
1545 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1546 -------------------------------------------------------------------------------
1548 flag float32_lt( float32 a, float32 b )
1550 flag aSign, bSign;
1552 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1553 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1555 float_raise( float_flag_invalid );
1556 return 0;
1558 aSign = extractFloat32Sign( a );
1559 bSign = extractFloat32Sign( b );
1560 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1561 return ( a != b ) && ( aSign ^ ( a < b ) );
1566 -------------------------------------------------------------------------------
1567 Returns 1 if the single-precision floating-point value `a' is equal to the
1568 corresponding value `b', and 0 otherwise. The invalid exception is raised
1569 if either operand is a NaN. Otherwise, the comparison is performed
1570 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1571 -------------------------------------------------------------------------------
1573 flag float32_eq_signaling( float32 a, float32 b )
1576 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1577 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1579 float_raise( float_flag_invalid );
1580 return 0;
1582 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1587 -------------------------------------------------------------------------------
1588 Returns 1 if the single-precision floating-point value `a' is less than or
1589 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1590 cause an exception. Otherwise, the comparison is performed according to the
1591 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1592 -------------------------------------------------------------------------------
1594 flag float32_le_quiet( float32 a, float32 b )
1596 flag aSign, bSign;
1597 //int16 aExp, bExp;
1599 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1600 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1602 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1603 float_raise( float_flag_invalid );
1605 return 0;
1607 aSign = extractFloat32Sign( a );
1608 bSign = extractFloat32Sign( b );
1609 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1610 return ( a == b ) || ( aSign ^ ( a < b ) );
1615 -------------------------------------------------------------------------------
1616 Returns 1 if the single-precision floating-point value `a' is less than
1617 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1618 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1619 Standard for Binary Floating-point Arithmetic.
1620 -------------------------------------------------------------------------------
1622 flag float32_lt_quiet( float32 a, float32 b )
1624 flag aSign, bSign;
1626 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1627 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1629 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1630 float_raise( float_flag_invalid );
1632 return 0;
1634 aSign = extractFloat32Sign( a );
1635 bSign = extractFloat32Sign( b );
1636 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1637 return ( a != b ) && ( aSign ^ ( a < b ) );
1642 -------------------------------------------------------------------------------
1643 Returns the result of converting the double-precision floating-point value
1644 `a' to the 32-bit two's complement integer format. The conversion is
1645 performed according to the IEC/IEEE Standard for Binary Floating-point
1646 Arithmetic---which means in particular that the conversion is rounded
1647 according to the current rounding mode. If `a' is a NaN, the largest
1648 positive integer is returned. Otherwise, if the conversion overflows, the
1649 largest integer with the same sign as `a' is returned.
1650 -------------------------------------------------------------------------------
1652 int32 float64_to_int32( float64 a )
1654 flag aSign;
1655 int16 aExp, shiftCount;
1656 bits64 aSig;
1658 aSig = extractFloat64Frac( a );
1659 aExp = extractFloat64Exp( a );
1660 aSign = extractFloat64Sign( a );
1661 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1662 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1663 shiftCount = 0x42C - aExp;
1664 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1665 return roundAndPackInt32( aSign, aSig );
1670 -------------------------------------------------------------------------------
1671 Returns the result of converting the double-precision floating-point value
1672 `a' to the 32-bit two's complement integer format. The conversion is
1673 performed according to the IEC/IEEE Standard for Binary Floating-point
1674 Arithmetic, except that the conversion is always rounded toward zero. If
1675 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1676 conversion overflows, the largest integer with the same sign as `a' is
1677 returned.
1678 -------------------------------------------------------------------------------
1680 int32 float64_to_int32_round_to_zero( float64 a )
1682 flag aSign;
1683 int16 aExp, shiftCount;
1684 bits64 aSig, savedASig;
1685 int32 z;
1687 aSig = extractFloat64Frac( a );
1688 aExp = extractFloat64Exp( a );
1689 aSign = extractFloat64Sign( a );
1690 shiftCount = 0x433 - aExp;
1691 if ( shiftCount < 21 ) {
1692 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1693 goto invalid;
1695 else if ( 52 < shiftCount ) {
1696 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1697 return 0;
1699 aSig |= LIT64( 0x0010000000000000 );
1700 savedASig = aSig;
1701 aSig >>= shiftCount;
1702 z = aSig;
1703 if ( aSign ) z = - z;
1704 if ( ( z < 0 ) ^ aSign ) {
1705 invalid:
1706 float_exception_flags |= float_flag_invalid;
1707 return aSign ? 0x80000000 : 0x7FFFFFFF;
1709 if ( ( aSig<<shiftCount ) != savedASig ) {
1710 float_exception_flags |= float_flag_inexact;
1712 return z;
1717 -------------------------------------------------------------------------------
1718 Returns the result of converting the double-precision floating-point value
1719 `a' to the 32-bit two's complement unsigned integer format. The conversion
1720 is performed according to the IEC/IEEE Standard for Binary Floating-point
1721 Arithmetic---which means in particular that the conversion is rounded
1722 according to the current rounding mode. If `a' is a NaN, the largest
1723 positive integer is returned. Otherwise, if the conversion overflows, the
1724 largest positive integer is returned.
1725 -------------------------------------------------------------------------------
1727 int32 float64_to_uint32( float64 a )
1729 flag aSign;
1730 int16 aExp, shiftCount;
1731 bits64 aSig;
1733 aSig = extractFloat64Frac( a );
1734 aExp = extractFloat64Exp( a );
1735 aSign = 0; //extractFloat64Sign( a );
1736 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1737 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1738 shiftCount = 0x42C - aExp;
1739 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1740 return roundAndPackInt32( aSign, aSig );
1744 -------------------------------------------------------------------------------
1745 Returns the result of converting the double-precision floating-point value
1746 `a' to the 32-bit two's complement integer format. The conversion is
1747 performed according to the IEC/IEEE Standard for Binary Floating-point
1748 Arithmetic, except that the conversion is always rounded toward zero. If
1749 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1750 conversion overflows, the largest positive integer is returned.
1751 -------------------------------------------------------------------------------
1753 int32 float64_to_uint32_round_to_zero( float64 a )
1755 flag aSign;
1756 int16 aExp, shiftCount;
1757 bits64 aSig, savedASig;
1758 int32 z;
1760 aSig = extractFloat64Frac( a );
1761 aExp = extractFloat64Exp( a );
1762 aSign = extractFloat64Sign( a );
1763 shiftCount = 0x433 - aExp;
1764 if ( shiftCount < 21 ) {
1765 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1766 goto invalid;
1768 else if ( 52 < shiftCount ) {
1769 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1770 return 0;
1772 aSig |= LIT64( 0x0010000000000000 );
1773 savedASig = aSig;
1774 aSig >>= shiftCount;
1775 z = aSig;
1776 if ( aSign ) z = - z;
1777 if ( ( z < 0 ) ^ aSign ) {
1778 invalid:
1779 float_exception_flags |= float_flag_invalid;
1780 return aSign ? 0x80000000 : 0x7FFFFFFF;
1782 if ( ( aSig<<shiftCount ) != savedASig ) {
1783 float_exception_flags |= float_flag_inexact;
1785 return z;
1789 -------------------------------------------------------------------------------
1790 Returns the result of converting the double-precision floating-point value
1791 `a' to the single-precision floating-point format. The conversion is
1792 performed according to the IEC/IEEE Standard for Binary Floating-point
1793 Arithmetic.
1794 -------------------------------------------------------------------------------
1796 float32 float64_to_float32( float64 a )
1798 flag aSign;
1799 int16 aExp;
1800 bits64 aSig;
1801 bits32 zSig;
1803 aSig = extractFloat64Frac( a );
1804 aExp = extractFloat64Exp( a );
1805 aSign = extractFloat64Sign( a );
1806 if ( aExp == 0x7FF ) {
1807 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1808 return packFloat32( aSign, 0xFF, 0 );
1810 shift64RightJamming( aSig, 22, &aSig );
1811 zSig = aSig;
1812 if ( aExp || zSig ) {
1813 zSig |= 0x40000000;
1814 aExp -= 0x381;
1816 return roundAndPackFloat32( aSign, aExp, zSig );
1820 #ifdef FLOATX80
1823 -------------------------------------------------------------------------------
1824 Returns the result of converting the double-precision floating-point value
1825 `a' to the extended double-precision floating-point format. The conversion
1826 is performed according to the IEC/IEEE Standard for Binary Floating-point
1827 Arithmetic.
1828 -------------------------------------------------------------------------------
1830 floatx80 float64_to_floatx80( float64 a )
1832 flag aSign;
1833 int16 aExp;
1834 bits64 aSig;
1836 aSig = extractFloat64Frac( a );
1837 aExp = extractFloat64Exp( a );
1838 aSign = extractFloat64Sign( a );
1839 if ( aExp == 0x7FF ) {
1840 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1841 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1843 if ( aExp == 0 ) {
1844 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1845 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1847 return
1848 packFloatx80(
1849 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1853 #endif
1856 -------------------------------------------------------------------------------
1857 Rounds the double-precision floating-point value `a' to an integer, and
1858 returns the result as a double-precision floating-point value. The
1859 operation is performed according to the IEC/IEEE Standard for Binary
1860 Floating-point Arithmetic.
1861 -------------------------------------------------------------------------------
1863 float64 float64_round_to_int( float64 a )
1865 flag aSign;
1866 int16 aExp;
1867 bits64 lastBitMask, roundBitsMask;
1868 int8 roundingMode;
1869 float64 z;
1871 aExp = extractFloat64Exp( a );
1872 if ( 0x433 <= aExp ) {
1873 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1874 return propagateFloat64NaN( a, a );
1876 return a;
1878 if ( aExp <= 0x3FE ) {
1879 if ( (bits64) ( a<<1 ) == 0 ) return a;
1880 float_exception_flags |= float_flag_inexact;
1881 aSign = extractFloat64Sign( a );
1882 switch ( float_rounding_mode ) {
1883 case float_round_nearest_even:
1884 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1885 return packFloat64( aSign, 0x3FF, 0 );
1887 break;
1888 case float_round_down:
1889 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1890 case float_round_up:
1891 return
1892 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1894 return packFloat64( aSign, 0, 0 );
1896 lastBitMask = 1;
1897 lastBitMask <<= 0x433 - aExp;
1898 roundBitsMask = lastBitMask - 1;
1899 z = a;
1900 roundingMode = float_rounding_mode;
1901 if ( roundingMode == float_round_nearest_even ) {
1902 z += lastBitMask>>1;
1903 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1905 else if ( roundingMode != float_round_to_zero ) {
1906 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1907 z += roundBitsMask;
1910 z &= ~ roundBitsMask;
1911 if ( z != a ) float_exception_flags |= float_flag_inexact;
1912 return z;
1917 -------------------------------------------------------------------------------
1918 Returns the result of adding the absolute values of the double-precision
1919 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1920 before being returned. `zSign' is ignored if the result is a NaN. The
1921 addition is performed according to the IEC/IEEE Standard for Binary
1922 Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1925 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1927 int16 aExp, bExp, zExp;
1928 bits64 aSig, bSig, zSig;
1929 int16 expDiff;
1931 aSig = extractFloat64Frac( a );
1932 aExp = extractFloat64Exp( a );
1933 bSig = extractFloat64Frac( b );
1934 bExp = extractFloat64Exp( b );
1935 expDiff = aExp - bExp;
1936 aSig <<= 9;
1937 bSig <<= 9;
1938 if ( 0 < expDiff ) {
1939 if ( aExp == 0x7FF ) {
1940 if ( aSig ) return propagateFloat64NaN( a, b );
1941 return a;
1943 if ( bExp == 0 ) {
1944 --expDiff;
1946 else {
1947 bSig |= LIT64( 0x2000000000000000 );
1949 shift64RightJamming( bSig, expDiff, &bSig );
1950 zExp = aExp;
1952 else if ( expDiff < 0 ) {
1953 if ( bExp == 0x7FF ) {
1954 if ( bSig ) return propagateFloat64NaN( a, b );
1955 return packFloat64( zSign, 0x7FF, 0 );
1957 if ( aExp == 0 ) {
1958 ++expDiff;
1960 else {
1961 aSig |= LIT64( 0x2000000000000000 );
1963 shift64RightJamming( aSig, - expDiff, &aSig );
1964 zExp = bExp;
1966 else {
1967 if ( aExp == 0x7FF ) {
1968 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1969 return a;
1971 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1972 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1973 zExp = aExp;
1974 goto roundAndPack;
1976 aSig |= LIT64( 0x2000000000000000 );
1977 zSig = ( aSig + bSig )<<1;
1978 --zExp;
1979 if ( (sbits64) zSig < 0 ) {
1980 zSig = aSig + bSig;
1981 ++zExp;
1983 roundAndPack:
1984 return roundAndPackFloat64( zSign, zExp, zSig );
1989 -------------------------------------------------------------------------------
1990 Returns the result of subtracting the absolute values of the double-
1991 precision floating-point values `a' and `b'. If `zSign' is true, the
1992 difference is negated before being returned. `zSign' is ignored if the
1993 result is a NaN. The subtraction is performed according to the IEC/IEEE
1994 Standard for Binary Floating-point Arithmetic.
1995 -------------------------------------------------------------------------------
1997 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1999 int16 aExp, bExp, zExp;
2000 bits64 aSig, bSig, zSig;
2001 int16 expDiff;
2003 aSig = extractFloat64Frac( a );
2004 aExp = extractFloat64Exp( a );
2005 bSig = extractFloat64Frac( b );
2006 bExp = extractFloat64Exp( b );
2007 expDiff = aExp - bExp;
2008 aSig <<= 10;
2009 bSig <<= 10;
2010 if ( 0 < expDiff ) goto aExpBigger;
2011 if ( expDiff < 0 ) goto bExpBigger;
2012 if ( aExp == 0x7FF ) {
2013 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2014 float_raise( float_flag_invalid );
2015 return float64_default_nan;
2017 if ( aExp == 0 ) {
2018 aExp = 1;
2019 bExp = 1;
2021 if ( bSig < aSig ) goto aBigger;
2022 if ( aSig < bSig ) goto bBigger;
2023 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2024 bExpBigger:
2025 if ( bExp == 0x7FF ) {
2026 if ( bSig ) return propagateFloat64NaN( a, b );
2027 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2029 if ( aExp == 0 ) {
2030 ++expDiff;
2032 else {
2033 aSig |= LIT64( 0x4000000000000000 );
2035 shift64RightJamming( aSig, - expDiff, &aSig );
2036 bSig |= LIT64( 0x4000000000000000 );
2037 bBigger:
2038 zSig = bSig - aSig;
2039 zExp = bExp;
2040 zSign ^= 1;
2041 goto normalizeRoundAndPack;
2042 aExpBigger:
2043 if ( aExp == 0x7FF ) {
2044 if ( aSig ) return propagateFloat64NaN( a, b );
2045 return a;
2047 if ( bExp == 0 ) {
2048 --expDiff;
2050 else {
2051 bSig |= LIT64( 0x4000000000000000 );
2053 shift64RightJamming( bSig, expDiff, &bSig );
2054 aSig |= LIT64( 0x4000000000000000 );
2055 aBigger:
2056 zSig = aSig - bSig;
2057 zExp = aExp;
2058 normalizeRoundAndPack:
2059 --zExp;
2060 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2065 -------------------------------------------------------------------------------
2066 Returns the result of adding the double-precision floating-point values `a'
2067 and `b'. The operation is performed according to the IEC/IEEE Standard for
2068 Binary Floating-point Arithmetic.
2069 -------------------------------------------------------------------------------
2071 float64 float64_add( float64 a, float64 b )
2073 flag aSign, bSign;
2075 aSign = extractFloat64Sign( a );
2076 bSign = extractFloat64Sign( b );
2077 if ( aSign == bSign ) {
2078 return addFloat64Sigs( a, b, aSign );
2080 else {
2081 return subFloat64Sigs( a, b, aSign );
2087 -------------------------------------------------------------------------------
2088 Returns the result of subtracting the double-precision floating-point values
2089 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2090 for Binary Floating-point Arithmetic.
2091 -------------------------------------------------------------------------------
2093 float64 float64_sub( float64 a, float64 b )
2095 flag aSign, bSign;
2097 aSign = extractFloat64Sign( a );
2098 bSign = extractFloat64Sign( b );
2099 if ( aSign == bSign ) {
2100 return subFloat64Sigs( a, b, aSign );
2102 else {
2103 return addFloat64Sigs( a, b, aSign );
2109 -------------------------------------------------------------------------------
2110 Returns the result of multiplying the double-precision floating-point values
2111 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2112 for Binary Floating-point Arithmetic.
2113 -------------------------------------------------------------------------------
2115 float64 float64_mul( float64 a, float64 b )
2117 flag aSign, bSign, zSign;
2118 int16 aExp, bExp, zExp;
2119 bits64 aSig, bSig, zSig0, zSig1;
2121 aSig = extractFloat64Frac( a );
2122 aExp = extractFloat64Exp( a );
2123 aSign = extractFloat64Sign( a );
2124 bSig = extractFloat64Frac( b );
2125 bExp = extractFloat64Exp( b );
2126 bSign = extractFloat64Sign( b );
2127 zSign = aSign ^ bSign;
2128 if ( aExp == 0x7FF ) {
2129 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2130 return propagateFloat64NaN( a, b );
2132 if ( ( bExp | bSig ) == 0 ) {
2133 float_raise( float_flag_invalid );
2134 return float64_default_nan;
2136 return packFloat64( zSign, 0x7FF, 0 );
2138 if ( bExp == 0x7FF ) {
2139 if ( bSig ) return propagateFloat64NaN( a, b );
2140 if ( ( aExp | aSig ) == 0 ) {
2141 float_raise( float_flag_invalid );
2142 return float64_default_nan;
2144 return packFloat64( zSign, 0x7FF, 0 );
2146 if ( aExp == 0 ) {
2147 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2148 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2150 if ( bExp == 0 ) {
2151 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2152 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2154 zExp = aExp + bExp - 0x3FF;
2155 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2156 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2157 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2158 zSig0 |= ( zSig1 != 0 );
2159 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2160 zSig0 <<= 1;
2161 --zExp;
2163 return roundAndPackFloat64( zSign, zExp, zSig0 );
2168 -------------------------------------------------------------------------------
2169 Returns the result of dividing the double-precision floating-point value `a'
2170 by the corresponding value `b'. The operation is performed according to
2171 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2172 -------------------------------------------------------------------------------
2174 float64 float64_div( float64 a, float64 b )
2176 flag aSign, bSign, zSign;
2177 int16 aExp, bExp, zExp;
2178 bits64 aSig, bSig, zSig;
2179 bits64 rem0, rem1;
2180 bits64 term0, term1;
2182 aSig = extractFloat64Frac( a );
2183 aExp = extractFloat64Exp( a );
2184 aSign = extractFloat64Sign( a );
2185 bSig = extractFloat64Frac( b );
2186 bExp = extractFloat64Exp( b );
2187 bSign = extractFloat64Sign( b );
2188 zSign = aSign ^ bSign;
2189 if ( aExp == 0x7FF ) {
2190 if ( aSig ) return propagateFloat64NaN( a, b );
2191 if ( bExp == 0x7FF ) {
2192 if ( bSig ) return propagateFloat64NaN( a, b );
2193 float_raise( float_flag_invalid );
2194 return float64_default_nan;
2196 return packFloat64( zSign, 0x7FF, 0 );
2198 if ( bExp == 0x7FF ) {
2199 if ( bSig ) return propagateFloat64NaN( a, b );
2200 return packFloat64( zSign, 0, 0 );
2202 if ( bExp == 0 ) {
2203 if ( bSig == 0 ) {
2204 if ( ( aExp | aSig ) == 0 ) {
2205 float_raise( float_flag_invalid );
2206 return float64_default_nan;
2208 float_raise( float_flag_divbyzero );
2209 return packFloat64( zSign, 0x7FF, 0 );
2211 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2213 if ( aExp == 0 ) {
2214 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2215 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2217 zExp = aExp - bExp + 0x3FD;
2218 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2219 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2220 if ( bSig <= ( aSig + aSig ) ) {
2221 aSig >>= 1;
2222 ++zExp;
2224 zSig = estimateDiv128To64( aSig, 0, bSig );
2225 if ( ( zSig & 0x1FF ) <= 2 ) {
2226 mul64To128( bSig, zSig, &term0, &term1 );
2227 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2228 while ( (sbits64) rem0 < 0 ) {
2229 --zSig;
2230 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2232 zSig |= ( rem1 != 0 );
2234 return roundAndPackFloat64( zSign, zExp, zSig );
2239 -------------------------------------------------------------------------------
2240 Returns the remainder of the double-precision floating-point value `a'
2241 with respect to the corresponding value `b'. The operation is performed
2242 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2243 -------------------------------------------------------------------------------
2245 float64 float64_rem( float64 a, float64 b )
2247 flag aSign, bSign, zSign;
2248 int16 aExp, bExp, expDiff;
2249 bits64 aSig, bSig;
2250 bits64 q, alternateASig;
2251 sbits64 sigMean;
2253 aSig = extractFloat64Frac( a );
2254 aExp = extractFloat64Exp( a );
2255 aSign = extractFloat64Sign( a );
2256 bSig = extractFloat64Frac( b );
2257 bExp = extractFloat64Exp( b );
2258 bSign = extractFloat64Sign( b );
2259 if ( aExp == 0x7FF ) {
2260 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2261 return propagateFloat64NaN( a, b );
2263 float_raise( float_flag_invalid );
2264 return float64_default_nan;
2266 if ( bExp == 0x7FF ) {
2267 if ( bSig ) return propagateFloat64NaN( a, b );
2268 return a;
2270 if ( bExp == 0 ) {
2271 if ( bSig == 0 ) {
2272 float_raise( float_flag_invalid );
2273 return float64_default_nan;
2275 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2277 if ( aExp == 0 ) {
2278 if ( aSig == 0 ) return a;
2279 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2281 expDiff = aExp - bExp;
2282 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2283 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2284 if ( expDiff < 0 ) {
2285 if ( expDiff < -1 ) return a;
2286 aSig >>= 1;
2288 q = ( bSig <= aSig );
2289 if ( q ) aSig -= bSig;
2290 expDiff -= 64;
2291 while ( 0 < expDiff ) {
2292 q = estimateDiv128To64( aSig, 0, bSig );
2293 q = ( 2 < q ) ? q - 2 : 0;
2294 aSig = - ( ( bSig>>2 ) * q );
2295 expDiff -= 62;
2297 expDiff += 64;
2298 if ( 0 < expDiff ) {
2299 q = estimateDiv128To64( aSig, 0, bSig );
2300 q = ( 2 < q ) ? q - 2 : 0;
2301 q >>= 64 - expDiff;
2302 bSig >>= 2;
2303 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2305 else {
2306 aSig >>= 2;
2307 bSig >>= 2;
2309 do {
2310 alternateASig = aSig;
2311 ++q;
2312 aSig -= bSig;
2313 } while ( 0 <= (sbits64) aSig );
2314 sigMean = aSig + alternateASig;
2315 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2316 aSig = alternateASig;
2318 zSign = ( (sbits64) aSig < 0 );
2319 if ( zSign ) aSig = - aSig;
2320 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2325 -------------------------------------------------------------------------------
2326 Returns the square root of the double-precision floating-point value `a'.
2327 The operation is performed according to the IEC/IEEE Standard for Binary
2328 Floating-point Arithmetic.
2329 -------------------------------------------------------------------------------
2331 float64 float64_sqrt( float64 a )
2333 flag aSign;
2334 int16 aExp, zExp;
2335 bits64 aSig, zSig;
2336 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2337 //float64 z;
2339 aSig = extractFloat64Frac( a );
2340 aExp = extractFloat64Exp( a );
2341 aSign = extractFloat64Sign( a );
2342 if ( aExp == 0x7FF ) {
2343 if ( aSig ) return propagateFloat64NaN( a, a );
2344 if ( ! aSign ) return a;
2345 float_raise( float_flag_invalid );
2346 return float64_default_nan;
2348 if ( aSign ) {
2349 if ( ( aExp | aSig ) == 0 ) return a;
2350 float_raise( float_flag_invalid );
2351 return float64_default_nan;
2353 if ( aExp == 0 ) {
2354 if ( aSig == 0 ) return 0;
2355 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2357 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2358 aSig |= LIT64( 0x0010000000000000 );
2359 zSig = estimateSqrt32( aExp, aSig>>21 );
2360 zSig <<= 31;
2361 aSig <<= 9 - ( aExp & 1 );
2362 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2363 if ( ( zSig & 0x3FF ) <= 5 ) {
2364 if ( zSig < 2 ) {
2365 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2367 else {
2368 aSig <<= 2;
2369 mul64To128( zSig, zSig, &term0, &term1 );
2370 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2371 while ( (sbits64) rem0 < 0 ) {
2372 --zSig;
2373 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2374 term1 |= 1;
2375 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2377 zSig |= ( ( rem0 | rem1 ) != 0 );
2380 shift64RightJamming( zSig, 1, &zSig );
2381 return roundAndPackFloat64( 0, zExp, zSig );
2386 -------------------------------------------------------------------------------
2387 Returns 1 if the double-precision floating-point value `a' is equal to the
2388 corresponding value `b', and 0 otherwise. The comparison is performed
2389 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2390 -------------------------------------------------------------------------------
2392 flag float64_eq( float64 a, float64 b )
2395 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2396 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2398 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2399 float_raise( float_flag_invalid );
2401 return 0;
2403 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2408 -------------------------------------------------------------------------------
2409 Returns 1 if the double-precision floating-point value `a' is less than or
2410 equal to the corresponding value `b', and 0 otherwise. The comparison is
2411 performed according to the IEC/IEEE Standard for Binary Floating-point
2412 Arithmetic.
2413 -------------------------------------------------------------------------------
2415 flag float64_le( float64 a, float64 b )
2417 flag aSign, bSign;
2419 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2420 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2422 float_raise( float_flag_invalid );
2423 return 0;
2425 aSign = extractFloat64Sign( a );
2426 bSign = extractFloat64Sign( b );
2427 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2428 return ( a == b ) || ( aSign ^ ( a < b ) );
2433 -------------------------------------------------------------------------------
2434 Returns 1 if the double-precision floating-point value `a' is less than
2435 the corresponding value `b', and 0 otherwise. The comparison is performed
2436 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2437 -------------------------------------------------------------------------------
2439 flag float64_lt( float64 a, float64 b )
2441 flag aSign, bSign;
2443 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2444 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2446 float_raise( float_flag_invalid );
2447 return 0;
2449 aSign = extractFloat64Sign( a );
2450 bSign = extractFloat64Sign( b );
2451 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2452 return ( a != b ) && ( aSign ^ ( a < b ) );
2457 -------------------------------------------------------------------------------
2458 Returns 1 if the double-precision floating-point value `a' is equal to the
2459 corresponding value `b', and 0 otherwise. The invalid exception is raised
2460 if either operand is a NaN. Otherwise, the comparison is performed
2461 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2462 -------------------------------------------------------------------------------
2464 flag float64_eq_signaling( float64 a, float64 b )
2467 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2468 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2470 float_raise( float_flag_invalid );
2471 return 0;
2473 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2478 -------------------------------------------------------------------------------
2479 Returns 1 if the double-precision floating-point value `a' is less than or
2480 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2481 cause an exception. Otherwise, the comparison is performed according to the
2482 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2483 -------------------------------------------------------------------------------
2485 flag float64_le_quiet( float64 a, float64 b )
2487 flag aSign, bSign;
2488 //int16 aExp, bExp;
2490 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2491 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2493 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2494 float_raise( float_flag_invalid );
2496 return 0;
2498 aSign = extractFloat64Sign( a );
2499 bSign = extractFloat64Sign( b );
2500 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2501 return ( a == b ) || ( aSign ^ ( a < b ) );
2506 -------------------------------------------------------------------------------
2507 Returns 1 if the double-precision floating-point value `a' is less than
2508 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2509 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2510 Standard for Binary Floating-point Arithmetic.
2511 -------------------------------------------------------------------------------
2513 flag float64_lt_quiet( float64 a, float64 b )
2515 flag aSign, bSign;
2517 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2518 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2520 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2521 float_raise( float_flag_invalid );
2523 return 0;
2525 aSign = extractFloat64Sign( a );
2526 bSign = extractFloat64Sign( b );
2527 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2528 return ( a != b ) && ( aSign ^ ( a < b ) );
2532 #ifdef FLOATX80
2535 -------------------------------------------------------------------------------
2536 Returns the result of converting the extended double-precision floating-
2537 point value `a' to the 32-bit two's complement integer format. The
2538 conversion is performed according to the IEC/IEEE Standard for Binary
2539 Floating-point Arithmetic---which means in particular that the conversion
2540 is rounded according to the current rounding mode. If `a' is a NaN, the
2541 largest positive integer is returned. Otherwise, if the conversion
2542 overflows, the largest integer with the same sign as `a' is returned.
2543 -------------------------------------------------------------------------------
2545 int32 floatx80_to_int32( floatx80 a )
2547 flag aSign;
2548 int32 aExp, shiftCount;
2549 bits64 aSig;
2551 aSig = extractFloatx80Frac( a );
2552 aExp = extractFloatx80Exp( a );
2553 aSign = extractFloatx80Sign( a );
2554 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2555 shiftCount = 0x4037 - aExp;
2556 if ( shiftCount <= 0 ) shiftCount = 1;
2557 shift64RightJamming( aSig, shiftCount, &aSig );
2558 return roundAndPackInt32( aSign, aSig );
2563 -------------------------------------------------------------------------------
2564 Returns the result of converting the extended double-precision floating-
2565 point value `a' to the 32-bit two's complement integer format. The
2566 conversion is performed according to the IEC/IEEE Standard for Binary
2567 Floating-point Arithmetic, except that the conversion is always rounded
2568 toward zero. If `a' is a NaN, the largest positive integer is returned.
2569 Otherwise, if the conversion overflows, the largest integer with the same
2570 sign as `a' is returned.
2571 -------------------------------------------------------------------------------
2573 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2575 flag aSign;
2576 int32 aExp, shiftCount;
2577 bits64 aSig, savedASig;
2578 int32 z;
2580 aSig = extractFloatx80Frac( a );
2581 aExp = extractFloatx80Exp( a );
2582 aSign = extractFloatx80Sign( a );
2583 shiftCount = 0x403E - aExp;
2584 if ( shiftCount < 32 ) {
2585 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2586 goto invalid;
2588 else if ( 63 < shiftCount ) {
2589 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2590 return 0;
2592 savedASig = aSig;
2593 aSig >>= shiftCount;
2594 z = aSig;
2595 if ( aSign ) z = - z;
2596 if ( ( z < 0 ) ^ aSign ) {
2597 invalid:
2598 float_exception_flags |= float_flag_invalid;
2599 return aSign ? 0x80000000 : 0x7FFFFFFF;
2601 if ( ( aSig<<shiftCount ) != savedASig ) {
2602 float_exception_flags |= float_flag_inexact;
2604 return z;
2609 -------------------------------------------------------------------------------
2610 Returns the result of converting the extended double-precision floating-
2611 point value `a' to the single-precision floating-point format. The
2612 conversion is performed according to the IEC/IEEE Standard for Binary
2613 Floating-point Arithmetic.
2614 -------------------------------------------------------------------------------
2616 float32 floatx80_to_float32( floatx80 a )
2618 flag aSign;
2619 int32 aExp;
2620 bits64 aSig;
2622 aSig = extractFloatx80Frac( a );
2623 aExp = extractFloatx80Exp( a );
2624 aSign = extractFloatx80Sign( a );
2625 if ( aExp == 0x7FFF ) {
2626 if ( (bits64) ( aSig<<1 ) ) {
2627 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2629 return packFloat32( aSign, 0xFF, 0 );
2631 shift64RightJamming( aSig, 33, &aSig );
2632 if ( aExp || aSig ) aExp -= 0x3F81;
2633 return roundAndPackFloat32( aSign, aExp, aSig );
2638 -------------------------------------------------------------------------------
2639 Returns the result of converting the extended double-precision floating-
2640 point value `a' to the double-precision floating-point format. The
2641 conversion is performed according to the IEC/IEEE Standard for Binary
2642 Floating-point Arithmetic.
2643 -------------------------------------------------------------------------------
2645 float64 floatx80_to_float64( floatx80 a )
2647 flag aSign;
2648 int32 aExp;
2649 bits64 aSig, zSig;
2651 aSig = extractFloatx80Frac( a );
2652 aExp = extractFloatx80Exp( a );
2653 aSign = extractFloatx80Sign( a );
2654 if ( aExp == 0x7FFF ) {
2655 if ( (bits64) ( aSig<<1 ) ) {
2656 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2658 return packFloat64( aSign, 0x7FF, 0 );
2660 shift64RightJamming( aSig, 1, &zSig );
2661 if ( aExp || aSig ) aExp -= 0x3C01;
2662 return roundAndPackFloat64( aSign, aExp, zSig );
2667 -------------------------------------------------------------------------------
2668 Rounds the extended double-precision floating-point value `a' to an integer,
2669 and returns the result as an extended quadruple-precision floating-point
2670 value. The operation is performed according to the IEC/IEEE Standard for
2671 Binary Floating-point Arithmetic.
2672 -------------------------------------------------------------------------------
2674 floatx80 floatx80_round_to_int( floatx80 a )
2676 flag aSign;
2677 int32 aExp;
2678 bits64 lastBitMask, roundBitsMask;
2679 int8 roundingMode;
2680 floatx80 z;
2682 aExp = extractFloatx80Exp( a );
2683 if ( 0x403E <= aExp ) {
2684 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2685 return propagateFloatx80NaN( a, a );
2687 return a;
2689 if ( aExp <= 0x3FFE ) {
2690 if ( ( aExp == 0 )
2691 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2692 return a;
2694 float_exception_flags |= float_flag_inexact;
2695 aSign = extractFloatx80Sign( a );
2696 switch ( float_rounding_mode ) {
2697 case float_round_nearest_even:
2698 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2700 return
2701 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2703 break;
2704 case float_round_down:
2705 return
2706 aSign ?
2707 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2708 : packFloatx80( 0, 0, 0 );
2709 case float_round_up:
2710 return
2711 aSign ? packFloatx80( 1, 0, 0 )
2712 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2714 return packFloatx80( aSign, 0, 0 );
2716 lastBitMask = 1;
2717 lastBitMask <<= 0x403E - aExp;
2718 roundBitsMask = lastBitMask - 1;
2719 z = a;
2720 roundingMode = float_rounding_mode;
2721 if ( roundingMode == float_round_nearest_even ) {
2722 z.low += lastBitMask>>1;
2723 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2725 else if ( roundingMode != float_round_to_zero ) {
2726 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2727 z.low += roundBitsMask;
2730 z.low &= ~ roundBitsMask;
2731 if ( z.low == 0 ) {
2732 ++z.high;
2733 z.low = LIT64( 0x8000000000000000 );
2735 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2736 return z;
2741 -------------------------------------------------------------------------------
2742 Returns the result of adding the absolute values of the extended double-
2743 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2744 negated before being returned. `zSign' is ignored if the result is a NaN.
2745 The addition is performed according to the IEC/IEEE Standard for Binary
2746 Floating-point Arithmetic.
2747 -------------------------------------------------------------------------------
2749 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2751 int32 aExp, bExp, zExp;
2752 bits64 aSig, bSig, zSig0, zSig1;
2753 int32 expDiff;
2755 aSig = extractFloatx80Frac( a );
2756 aExp = extractFloatx80Exp( a );
2757 bSig = extractFloatx80Frac( b );
2758 bExp = extractFloatx80Exp( b );
2759 expDiff = aExp - bExp;
2760 if ( 0 < expDiff ) {
2761 if ( aExp == 0x7FFF ) {
2762 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2763 return a;
2765 if ( bExp == 0 ) --expDiff;
2766 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2767 zExp = aExp;
2769 else if ( expDiff < 0 ) {
2770 if ( bExp == 0x7FFF ) {
2771 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2772 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2774 if ( aExp == 0 ) ++expDiff;
2775 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2776 zExp = bExp;
2778 else {
2779 if ( aExp == 0x7FFF ) {
2780 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2781 return propagateFloatx80NaN( a, b );
2783 return a;
2785 zSig1 = 0;
2786 zSig0 = aSig + bSig;
2787 if ( aExp == 0 ) {
2788 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2789 goto roundAndPack;
2791 zExp = aExp;
2792 goto shiftRight1;
2795 zSig0 = aSig + bSig;
2797 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2798 shiftRight1:
2799 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2800 zSig0 |= LIT64( 0x8000000000000000 );
2801 ++zExp;
2802 roundAndPack:
2803 return
2804 roundAndPackFloatx80(
2805 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2810 -------------------------------------------------------------------------------
2811 Returns the result of subtracting the absolute values of the extended
2812 double-precision floating-point values `a' and `b'. If `zSign' is true,
2813 the difference is negated before being returned. `zSign' is ignored if the
2814 result is a NaN. The subtraction is performed according to the IEC/IEEE
2815 Standard for Binary Floating-point Arithmetic.
2816 -------------------------------------------------------------------------------
2818 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2820 int32 aExp, bExp, zExp;
2821 bits64 aSig, bSig, zSig0, zSig1;
2822 int32 expDiff;
2823 floatx80 z;
2825 aSig = extractFloatx80Frac( a );
2826 aExp = extractFloatx80Exp( a );
2827 bSig = extractFloatx80Frac( b );
2828 bExp = extractFloatx80Exp( b );
2829 expDiff = aExp - bExp;
2830 if ( 0 < expDiff ) goto aExpBigger;
2831 if ( expDiff < 0 ) goto bExpBigger;
2832 if ( aExp == 0x7FFF ) {
2833 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2834 return propagateFloatx80NaN( a, b );
2836 float_raise( float_flag_invalid );
2837 z.low = floatx80_default_nan_low;
2838 z.high = floatx80_default_nan_high;
2839 return z;
2841 if ( aExp == 0 ) {
2842 aExp = 1;
2843 bExp = 1;
2845 zSig1 = 0;
2846 if ( bSig < aSig ) goto aBigger;
2847 if ( aSig < bSig ) goto bBigger;
2848 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2849 bExpBigger:
2850 if ( bExp == 0x7FFF ) {
2851 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2852 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2854 if ( aExp == 0 ) ++expDiff;
2855 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2856 bBigger:
2857 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2858 zExp = bExp;
2859 zSign ^= 1;
2860 goto normalizeRoundAndPack;
2861 aExpBigger:
2862 if ( aExp == 0x7FFF ) {
2863 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2864 return a;
2866 if ( bExp == 0 ) --expDiff;
2867 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2868 aBigger:
2869 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2870 zExp = aExp;
2871 normalizeRoundAndPack:
2872 return
2873 normalizeRoundAndPackFloatx80(
2874 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2879 -------------------------------------------------------------------------------
2880 Returns the result of adding the extended double-precision floating-point
2881 values `a' and `b'. The operation is performed according to the IEC/IEEE
2882 Standard for Binary Floating-point Arithmetic.
2883 -------------------------------------------------------------------------------
2885 floatx80 floatx80_add( floatx80 a, floatx80 b )
2887 flag aSign, bSign;
2889 aSign = extractFloatx80Sign( a );
2890 bSign = extractFloatx80Sign( b );
2891 if ( aSign == bSign ) {
2892 return addFloatx80Sigs( a, b, aSign );
2894 else {
2895 return subFloatx80Sigs( a, b, aSign );
2901 -------------------------------------------------------------------------------
2902 Returns the result of subtracting the extended double-precision floating-
2903 point values `a' and `b'. The operation is performed according to the
2904 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2905 -------------------------------------------------------------------------------
2907 floatx80 floatx80_sub( floatx80 a, floatx80 b )
2909 flag aSign, bSign;
2911 aSign = extractFloatx80Sign( a );
2912 bSign = extractFloatx80Sign( b );
2913 if ( aSign == bSign ) {
2914 return subFloatx80Sigs( a, b, aSign );
2916 else {
2917 return addFloatx80Sigs( a, b, aSign );
2923 -------------------------------------------------------------------------------
2924 Returns the result of multiplying the extended double-precision floating-
2925 point values `a' and `b'. The operation is performed according to the
2926 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2927 -------------------------------------------------------------------------------
2929 floatx80 floatx80_mul( floatx80 a, floatx80 b )
2931 flag aSign, bSign, zSign;
2932 int32 aExp, bExp, zExp;
2933 bits64 aSig, bSig, zSig0, zSig1;
2934 floatx80 z;
2936 aSig = extractFloatx80Frac( a );
2937 aExp = extractFloatx80Exp( a );
2938 aSign = extractFloatx80Sign( a );
2939 bSig = extractFloatx80Frac( b );
2940 bExp = extractFloatx80Exp( b );
2941 bSign = extractFloatx80Sign( b );
2942 zSign = aSign ^ bSign;
2943 if ( aExp == 0x7FFF ) {
2944 if ( (bits64) ( aSig<<1 )
2945 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2946 return propagateFloatx80NaN( a, b );
2948 if ( ( bExp | bSig ) == 0 ) goto invalid;
2949 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2951 if ( bExp == 0x7FFF ) {
2952 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2953 if ( ( aExp | aSig ) == 0 ) {
2954 invalid:
2955 float_raise( float_flag_invalid );
2956 z.low = floatx80_default_nan_low;
2957 z.high = floatx80_default_nan_high;
2958 return z;
2960 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2962 if ( aExp == 0 ) {
2963 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2964 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2966 if ( bExp == 0 ) {
2967 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2968 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2970 zExp = aExp + bExp - 0x3FFE;
2971 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2972 if ( 0 < (sbits64) zSig0 ) {
2973 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2974 --zExp;
2976 return
2977 roundAndPackFloatx80(
2978 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2983 -------------------------------------------------------------------------------
2984 Returns the result of dividing the extended double-precision floating-point
2985 value `a' by the corresponding value `b'. The operation is performed
2986 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2987 -------------------------------------------------------------------------------
2989 floatx80 floatx80_div( floatx80 a, floatx80 b )
2991 flag aSign, bSign, zSign;
2992 int32 aExp, bExp, zExp;
2993 bits64 aSig, bSig, zSig0, zSig1;
2994 bits64 rem0, rem1, rem2, term0, term1, term2;
2995 floatx80 z;
2997 aSig = extractFloatx80Frac( a );
2998 aExp = extractFloatx80Exp( a );
2999 aSign = extractFloatx80Sign( a );
3000 bSig = extractFloatx80Frac( b );
3001 bExp = extractFloatx80Exp( b );
3002 bSign = extractFloatx80Sign( b );
3003 zSign = aSign ^ bSign;
3004 if ( aExp == 0x7FFF ) {
3005 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3006 if ( bExp == 0x7FFF ) {
3007 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3008 goto invalid;
3010 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3012 if ( bExp == 0x7FFF ) {
3013 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3014 return packFloatx80( zSign, 0, 0 );
3016 if ( bExp == 0 ) {
3017 if ( bSig == 0 ) {
3018 if ( ( aExp | aSig ) == 0 ) {
3019 invalid:
3020 float_raise( float_flag_invalid );
3021 z.low = floatx80_default_nan_low;
3022 z.high = floatx80_default_nan_high;
3023 return z;
3025 float_raise( float_flag_divbyzero );
3026 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3028 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3030 if ( aExp == 0 ) {
3031 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3032 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3034 zExp = aExp - bExp + 0x3FFE;
3035 rem1 = 0;
3036 if ( bSig <= aSig ) {
3037 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3038 ++zExp;
3040 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3041 mul64To128( bSig, zSig0, &term0, &term1 );
3042 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3043 while ( (sbits64) rem0 < 0 ) {
3044 --zSig0;
3045 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3047 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3048 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3049 mul64To128( bSig, zSig1, &term1, &term2 );
3050 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3051 while ( (sbits64) rem1 < 0 ) {
3052 --zSig1;
3053 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3055 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3057 return
3058 roundAndPackFloatx80(
3059 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3064 -------------------------------------------------------------------------------
3065 Returns the remainder of the extended double-precision floating-point value
3066 `a' with respect to the corresponding value `b'. The operation is performed
3067 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3068 -------------------------------------------------------------------------------
3070 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3072 flag aSign, bSign, zSign;
3073 int32 aExp, bExp, expDiff;
3074 bits64 aSig0, aSig1, bSig;
3075 bits64 q, term0, term1, alternateASig0, alternateASig1;
3076 floatx80 z;
3078 aSig0 = extractFloatx80Frac( a );
3079 aExp = extractFloatx80Exp( a );
3080 aSign = extractFloatx80Sign( a );
3081 bSig = extractFloatx80Frac( b );
3082 bExp = extractFloatx80Exp( b );
3083 bSign = extractFloatx80Sign( b );
3084 if ( aExp == 0x7FFF ) {
3085 if ( (bits64) ( aSig0<<1 )
3086 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3087 return propagateFloatx80NaN( a, b );
3089 goto invalid;
3091 if ( bExp == 0x7FFF ) {
3092 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3093 return a;
3095 if ( bExp == 0 ) {
3096 if ( bSig == 0 ) {
3097 invalid:
3098 float_raise( float_flag_invalid );
3099 z.low = floatx80_default_nan_low;
3100 z.high = floatx80_default_nan_high;
3101 return z;
3103 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3105 if ( aExp == 0 ) {
3106 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3107 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3109 bSig |= LIT64( 0x8000000000000000 );
3110 zSign = aSign;
3111 expDiff = aExp - bExp;
3112 aSig1 = 0;
3113 if ( expDiff < 0 ) {
3114 if ( expDiff < -1 ) return a;
3115 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3116 expDiff = 0;
3118 q = ( bSig <= aSig0 );
3119 if ( q ) aSig0 -= bSig;
3120 expDiff -= 64;
3121 while ( 0 < expDiff ) {
3122 q = estimateDiv128To64( aSig0, aSig1, bSig );
3123 q = ( 2 < q ) ? q - 2 : 0;
3124 mul64To128( bSig, q, &term0, &term1 );
3125 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3126 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3127 expDiff -= 62;
3129 expDiff += 64;
3130 if ( 0 < expDiff ) {
3131 q = estimateDiv128To64( aSig0, aSig1, bSig );
3132 q = ( 2 < q ) ? q - 2 : 0;
3133 q >>= 64 - expDiff;
3134 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3135 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3136 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3137 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3138 ++q;
3139 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3142 else {
3143 term1 = 0;
3144 term0 = bSig;
3146 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3147 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3148 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3149 && ( q & 1 ) )
3151 aSig0 = alternateASig0;
3152 aSig1 = alternateASig1;
3153 zSign = ! zSign;
3155 return
3156 normalizeRoundAndPackFloatx80(
3157 80, zSign, bExp + expDiff, aSig0, aSig1 );
3162 -------------------------------------------------------------------------------
3163 Returns the square root of the extended double-precision floating-point
3164 value `a'. The operation is performed according to the IEC/IEEE Standard
3165 for Binary Floating-point Arithmetic.
3166 -------------------------------------------------------------------------------
3168 floatx80 floatx80_sqrt( floatx80 a )
3170 flag aSign;
3171 int32 aExp, zExp;
3172 bits64 aSig0, aSig1, zSig0, zSig1;
3173 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3174 bits64 shiftedRem0, shiftedRem1;
3175 floatx80 z;
3177 aSig0 = extractFloatx80Frac( a );
3178 aExp = extractFloatx80Exp( a );
3179 aSign = extractFloatx80Sign( a );
3180 if ( aExp == 0x7FFF ) {
3181 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3182 if ( ! aSign ) return a;
3183 goto invalid;
3185 if ( aSign ) {
3186 if ( ( aExp | aSig0 ) == 0 ) return a;
3187 invalid:
3188 float_raise( float_flag_invalid );
3189 z.low = floatx80_default_nan_low;
3190 z.high = floatx80_default_nan_high;
3191 return z;
3193 if ( aExp == 0 ) {
3194 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3195 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3197 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3198 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3199 zSig0 <<= 31;
3200 aSig1 = 0;
3201 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3202 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3203 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3204 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3205 mul64To128( zSig0, zSig0, &term0, &term1 );
3206 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3207 while ( (sbits64) rem0 < 0 ) {
3208 --zSig0;
3209 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3210 term1 |= 1;
3211 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3213 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3214 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3215 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3216 if ( zSig1 == 0 ) zSig1 = 1;
3217 mul64To128( zSig0, zSig1, &term1, &term2 );
3218 shortShift128Left( term1, term2, 1, &term1, &term2 );
3219 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3220 mul64To128( zSig1, zSig1, &term2, &term3 );
3221 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3222 while ( (sbits64) rem1 < 0 ) {
3223 --zSig1;
3224 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3225 term3 |= 1;
3226 add192(
3227 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3229 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3231 return
3232 roundAndPackFloatx80(
3233 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3238 -------------------------------------------------------------------------------
3239 Returns 1 if the extended double-precision floating-point value `a' is
3240 equal to the corresponding value `b', and 0 otherwise. The comparison is
3241 performed according to the IEC/IEEE Standard for Binary Floating-point
3242 Arithmetic.
3243 -------------------------------------------------------------------------------
3245 flag floatx80_eq( floatx80 a, floatx80 b )
3248 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3249 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3250 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3251 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3253 if ( floatx80_is_signaling_nan( a )
3254 || floatx80_is_signaling_nan( b ) ) {
3255 float_raise( float_flag_invalid );
3257 return 0;
3259 return
3260 ( a.low == b.low )
3261 && ( ( a.high == b.high )
3262 || ( ( a.low == 0 )
3263 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3269 -------------------------------------------------------------------------------
3270 Returns 1 if the extended double-precision floating-point value `a' is
3271 less than or equal to the corresponding value `b', and 0 otherwise. The
3272 comparison is performed according to the IEC/IEEE Standard for Binary
3273 Floating-point Arithmetic.
3274 -------------------------------------------------------------------------------
3276 flag floatx80_le( floatx80 a, floatx80 b )
3278 flag aSign, bSign;
3280 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3281 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3282 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3283 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3285 float_raise( float_flag_invalid );
3286 return 0;
3288 aSign = extractFloatx80Sign( a );
3289 bSign = extractFloatx80Sign( b );
3290 if ( aSign != bSign ) {
3291 return
3292 aSign
3293 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3294 == 0 );
3296 return
3297 aSign ? le128( b.high, b.low, a.high, a.low )
3298 : le128( a.high, a.low, b.high, b.low );
3303 -------------------------------------------------------------------------------
3304 Returns 1 if the extended double-precision floating-point value `a' is
3305 less than the corresponding value `b', and 0 otherwise. The comparison
3306 is performed according to the IEC/IEEE Standard for Binary Floating-point
3307 Arithmetic.
3308 -------------------------------------------------------------------------------
3310 flag floatx80_lt( floatx80 a, floatx80 b )
3312 flag aSign, bSign;
3314 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3315 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3316 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3317 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3319 float_raise( float_flag_invalid );
3320 return 0;
3322 aSign = extractFloatx80Sign( a );
3323 bSign = extractFloatx80Sign( b );
3324 if ( aSign != bSign ) {
3325 return
3326 aSign
3327 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3328 != 0 );
3330 return
3331 aSign ? lt128( b.high, b.low, a.high, a.low )
3332 : lt128( a.high, a.low, b.high, b.low );
3337 -------------------------------------------------------------------------------
3338 Returns 1 if the extended double-precision floating-point value `a' is equal
3339 to the corresponding value `b', and 0 otherwise. The invalid exception is
3340 raised if either operand is a NaN. Otherwise, the comparison is performed
3341 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3342 -------------------------------------------------------------------------------
3344 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3347 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3348 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3349 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3350 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3352 float_raise( float_flag_invalid );
3353 return 0;
3355 return
3356 ( a.low == b.low )
3357 && ( ( a.high == b.high )
3358 || ( ( a.low == 0 )
3359 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3365 -------------------------------------------------------------------------------
3366 Returns 1 if the extended double-precision floating-point value `a' is less
3367 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3368 do not cause an exception. Otherwise, the comparison is performed according
3369 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3370 -------------------------------------------------------------------------------
3372 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3374 flag aSign, bSign;
3376 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3377 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3378 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3379 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3381 if ( floatx80_is_signaling_nan( a )
3382 || floatx80_is_signaling_nan( b ) ) {
3383 float_raise( float_flag_invalid );
3385 return 0;
3387 aSign = extractFloatx80Sign( a );
3388 bSign = extractFloatx80Sign( b );
3389 if ( aSign != bSign ) {
3390 return
3391 aSign
3392 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3393 == 0 );
3395 return
3396 aSign ? le128( b.high, b.low, a.high, a.low )
3397 : le128( a.high, a.low, b.high, b.low );
3402 -------------------------------------------------------------------------------
3403 Returns 1 if the extended double-precision floating-point value `a' is less
3404 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3405 an exception. Otherwise, the comparison is performed according to the
3406 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3407 -------------------------------------------------------------------------------
3409 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3411 flag aSign, bSign;
3413 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3414 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3415 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3416 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3418 if ( floatx80_is_signaling_nan( a )
3419 || floatx80_is_signaling_nan( b ) ) {
3420 float_raise( float_flag_invalid );
3422 return 0;
3424 aSign = extractFloatx80Sign( a );
3425 bSign = extractFloatx80Sign( b );
3426 if ( aSign != bSign ) {
3427 return
3428 aSign
3429 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3430 != 0 );
3432 return
3433 aSign ? lt128( b.high, b.low, a.high, a.low )
3434 : lt128( a.high, a.low, b.high, b.low );
3438 #endif