kbuild: fix silentoldconfig with make O=
[linux-2.6/verdex.git] / arch / arm / nwfpe / softfloat.c
blobf9f049132a17bffb920acb8df278f5e91b514f7a
1 /*
2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include <asm/div64.h>
33 #include "fpa11.h"
34 //#include "milieu.h"
35 //#include "softfloat.h"
38 -------------------------------------------------------------------------------
39 Primitive arithmetic functions, including multi-word arithmetic, and
40 division and square root approximations. (Can be specialized to target if
41 desired.)
42 -------------------------------------------------------------------------------
44 #include "softfloat-macros"
47 -------------------------------------------------------------------------------
48 Functions and definitions to determine: (1) whether tininess for underflow
49 is detected before or after rounding by default, (2) what (if anything)
50 happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 are propagated from function inputs to output. These details are target-
53 specific.
54 -------------------------------------------------------------------------------
56 #include "softfloat-specialize"
59 -------------------------------------------------------------------------------
60 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
61 and 7, and returns the properly rounded 32-bit integer corresponding to the
62 input. If `zSign' is nonzero, the input is negated before being converted
63 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
64 input is simply rounded to an integer, with the inexact exception raised if
65 the input cannot be represented exactly as an integer. If the fixed-point
66 input is too large, however, the invalid exception is raised and the largest
67 positive or negative integer is returned.
68 -------------------------------------------------------------------------------
70 static int32 roundAndPackInt32( struct roundingData *roundData, flag zSign, bits64 absZ )
72 int8 roundingMode;
73 flag roundNearestEven;
74 int8 roundIncrement, roundBits;
75 int32 z;
77 roundingMode = roundData->mode;
78 roundNearestEven = ( roundingMode == float_round_nearest_even );
79 roundIncrement = 0x40;
80 if ( ! roundNearestEven ) {
81 if ( roundingMode == float_round_to_zero ) {
82 roundIncrement = 0;
84 else {
85 roundIncrement = 0x7F;
86 if ( zSign ) {
87 if ( roundingMode == float_round_up ) roundIncrement = 0;
89 else {
90 if ( roundingMode == float_round_down ) roundIncrement = 0;
94 roundBits = absZ & 0x7F;
95 absZ = ( absZ + roundIncrement )>>7;
96 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
97 z = absZ;
98 if ( zSign ) z = - z;
99 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
100 roundData->exception |= float_flag_invalid;
101 return zSign ? 0x80000000 : 0x7FFFFFFF;
103 if ( roundBits ) roundData->exception |= float_flag_inexact;
104 return z;
109 -------------------------------------------------------------------------------
110 Returns the fraction bits of the single-precision floating-point value `a'.
111 -------------------------------------------------------------------------------
113 INLINE bits32 extractFloat32Frac( float32 a )
116 return a & 0x007FFFFF;
121 -------------------------------------------------------------------------------
122 Returns the exponent bits of the single-precision floating-point value `a'.
123 -------------------------------------------------------------------------------
125 INLINE int16 extractFloat32Exp( float32 a )
128 return ( a>>23 ) & 0xFF;
133 -------------------------------------------------------------------------------
134 Returns the sign bit of the single-precision floating-point value `a'.
135 -------------------------------------------------------------------------------
137 #if 0 /* in softfloat.h */
138 INLINE flag extractFloat32Sign( float32 a )
141 return a>>31;
144 #endif
147 -------------------------------------------------------------------------------
148 Normalizes the subnormal single-precision floating-point value represented
149 by the denormalized significand `aSig'. The normalized exponent and
150 significand are stored at the locations pointed to by `zExpPtr' and
151 `zSigPtr', respectively.
152 -------------------------------------------------------------------------------
154 static void
155 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
157 int8 shiftCount;
159 shiftCount = countLeadingZeros32( aSig ) - 8;
160 *zSigPtr = aSig<<shiftCount;
161 *zExpPtr = 1 - shiftCount;
166 -------------------------------------------------------------------------------
167 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
168 single-precision floating-point value, returning the result. After being
169 shifted into the proper positions, the three fields are simply added
170 together to form the result. This means that any integer portion of `zSig'
171 will be added into the exponent. Since a properly normalized significand
172 will have an integer portion equal to 1, the `zExp' input should be 1 less
173 than the desired result exponent whenever `zSig' is a complete, normalized
174 significand.
175 -------------------------------------------------------------------------------
177 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
179 #if 0
180 float32 f;
181 __asm__("@ packFloat32 \n\
182 mov %0, %1, asl #31 \n\
183 orr %0, %2, asl #23 \n\
184 orr %0, %3"
185 : /* no outputs */
186 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
187 : "cc");
188 return f;
189 #else
190 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
191 #endif
195 -------------------------------------------------------------------------------
196 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
197 and significand `zSig', and returns the proper single-precision floating-
198 point value corresponding to the abstract input. Ordinarily, the abstract
199 value is simply rounded and packed into the single-precision format, with
200 the inexact exception raised if the abstract input cannot be represented
201 exactly. If the abstract value is too large, however, the overflow and
202 inexact exceptions are raised and an infinity or maximal finite value is
203 returned. If the abstract value is too small, the input value is rounded to
204 a subnormal number, and the underflow and inexact exceptions are raised if
205 the abstract input cannot be represented exactly as a subnormal single-
206 precision floating-point number.
207 The input significand `zSig' has its binary point between bits 30
208 and 29, which is 7 bits to the left of the usual location. This shifted
209 significand must be normalized or smaller. If `zSig' is not normalized,
210 `zExp' must be 0; in that case, the result returned is a subnormal number,
211 and it must not require rounding. In the usual case that `zSig' is
212 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
213 The handling of underflow and overflow follows the IEC/IEEE Standard for
214 Binary Floating-point Arithmetic.
215 -------------------------------------------------------------------------------
217 static float32 roundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
219 int8 roundingMode;
220 flag roundNearestEven;
221 int8 roundIncrement, roundBits;
222 flag isTiny;
224 roundingMode = roundData->mode;
225 roundNearestEven = ( roundingMode == float_round_nearest_even );
226 roundIncrement = 0x40;
227 if ( ! roundNearestEven ) {
228 if ( roundingMode == float_round_to_zero ) {
229 roundIncrement = 0;
231 else {
232 roundIncrement = 0x7F;
233 if ( zSign ) {
234 if ( roundingMode == float_round_up ) roundIncrement = 0;
236 else {
237 if ( roundingMode == float_round_down ) roundIncrement = 0;
241 roundBits = zSig & 0x7F;
242 if ( 0xFD <= (bits16) zExp ) {
243 if ( ( 0xFD < zExp )
244 || ( ( zExp == 0xFD )
245 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
247 roundData->exception |= float_flag_overflow | float_flag_inexact;
248 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
250 if ( zExp < 0 ) {
251 isTiny =
252 ( float_detect_tininess == float_tininess_before_rounding )
253 || ( zExp < -1 )
254 || ( zSig + roundIncrement < 0x80000000 );
255 shift32RightJamming( zSig, - zExp, &zSig );
256 zExp = 0;
257 roundBits = zSig & 0x7F;
258 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
261 if ( roundBits ) roundData->exception |= float_flag_inexact;
262 zSig = ( zSig + roundIncrement )>>7;
263 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
264 if ( zSig == 0 ) zExp = 0;
265 return packFloat32( zSign, zExp, zSig );
270 -------------------------------------------------------------------------------
271 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
272 and significand `zSig', and returns the proper single-precision floating-
273 point value corresponding to the abstract input. This routine is just like
274 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
275 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
276 point exponent.
277 -------------------------------------------------------------------------------
279 static float32
280 normalizeRoundAndPackFloat32( struct roundingData *roundData, flag zSign, int16 zExp, bits32 zSig )
282 int8 shiftCount;
284 shiftCount = countLeadingZeros32( zSig ) - 1;
285 return roundAndPackFloat32( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
290 -------------------------------------------------------------------------------
291 Returns the fraction bits of the double-precision floating-point value `a'.
292 -------------------------------------------------------------------------------
294 INLINE bits64 extractFloat64Frac( float64 a )
297 return a & LIT64( 0x000FFFFFFFFFFFFF );
302 -------------------------------------------------------------------------------
303 Returns the exponent bits of the double-precision floating-point value `a'.
304 -------------------------------------------------------------------------------
306 INLINE int16 extractFloat64Exp( float64 a )
309 return ( a>>52 ) & 0x7FF;
314 -------------------------------------------------------------------------------
315 Returns the sign bit of the double-precision floating-point value `a'.
316 -------------------------------------------------------------------------------
318 #if 0 /* in softfloat.h */
319 INLINE flag extractFloat64Sign( float64 a )
322 return a>>63;
325 #endif
328 -------------------------------------------------------------------------------
329 Normalizes the subnormal double-precision floating-point value represented
330 by the denormalized significand `aSig'. The normalized exponent and
331 significand are stored at the locations pointed to by `zExpPtr' and
332 `zSigPtr', respectively.
333 -------------------------------------------------------------------------------
335 static void
336 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
338 int8 shiftCount;
340 shiftCount = countLeadingZeros64( aSig ) - 11;
341 *zSigPtr = aSig<<shiftCount;
342 *zExpPtr = 1 - shiftCount;
347 -------------------------------------------------------------------------------
348 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
349 double-precision floating-point value, returning the result. After being
350 shifted into the proper positions, the three fields are simply added
351 together to form the result. This means that any integer portion of `zSig'
352 will be added into the exponent. Since a properly normalized significand
353 will have an integer portion equal to 1, the `zExp' input should be 1 less
354 than the desired result exponent whenever `zSig' is a complete, normalized
355 significand.
356 -------------------------------------------------------------------------------
358 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
361 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper double-precision floating-
369 point value corresponding to the abstract input. Ordinarily, the abstract
370 value is simply rounded and packed into the double-precision format, with
371 the inexact exception raised if the abstract input cannot be represented
372 exactly. If the abstract value is too large, however, the overflow and
373 inexact exceptions are raised and an infinity or maximal finite value is
374 returned. If the abstract value is too small, the input value is rounded to
375 a subnormal number, and the underflow and inexact exceptions are raised if
376 the abstract input cannot be represented exactly as a subnormal double-
377 precision floating-point number.
378 The input significand `zSig' has its binary point between bits 62
379 and 61, which is 10 bits to the left of the usual location. This shifted
380 significand must be normalized or smaller. If `zSig' is not normalized,
381 `zExp' must be 0; in that case, the result returned is a subnormal number,
382 and it must not require rounding. In the usual case that `zSig' is
383 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
384 The handling of underflow and overflow follows the IEC/IEEE Standard for
385 Binary Floating-point Arithmetic.
386 -------------------------------------------------------------------------------
388 static float64 roundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
390 int8 roundingMode;
391 flag roundNearestEven;
392 int16 roundIncrement, roundBits;
393 flag isTiny;
395 roundingMode = roundData->mode;
396 roundNearestEven = ( roundingMode == float_round_nearest_even );
397 roundIncrement = 0x200;
398 if ( ! roundNearestEven ) {
399 if ( roundingMode == float_round_to_zero ) {
400 roundIncrement = 0;
402 else {
403 roundIncrement = 0x3FF;
404 if ( zSign ) {
405 if ( roundingMode == float_round_up ) roundIncrement = 0;
407 else {
408 if ( roundingMode == float_round_down ) roundIncrement = 0;
412 roundBits = zSig & 0x3FF;
413 if ( 0x7FD <= (bits16) zExp ) {
414 if ( ( 0x7FD < zExp )
415 || ( ( zExp == 0x7FD )
416 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
418 //register int lr = __builtin_return_address(0);
419 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
420 roundData->exception |= float_flag_overflow | float_flag_inexact;
421 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
423 if ( zExp < 0 ) {
424 isTiny =
425 ( float_detect_tininess == float_tininess_before_rounding )
426 || ( zExp < -1 )
427 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
428 shift64RightJamming( zSig, - zExp, &zSig );
429 zExp = 0;
430 roundBits = zSig & 0x3FF;
431 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
434 if ( roundBits ) roundData->exception |= float_flag_inexact;
435 zSig = ( zSig + roundIncrement )>>10;
436 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
437 if ( zSig == 0 ) zExp = 0;
438 return packFloat64( zSign, zExp, zSig );
443 -------------------------------------------------------------------------------
444 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
445 and significand `zSig', and returns the proper double-precision floating-
446 point value corresponding to the abstract input. This routine is just like
447 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
448 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
449 point exponent.
450 -------------------------------------------------------------------------------
452 static float64
453 normalizeRoundAndPackFloat64( struct roundingData *roundData, flag zSign, int16 zExp, bits64 zSig )
455 int8 shiftCount;
457 shiftCount = countLeadingZeros64( zSig ) - 1;
458 return roundAndPackFloat64( roundData, zSign, zExp - shiftCount, zSig<<shiftCount );
462 #ifdef FLOATX80
465 -------------------------------------------------------------------------------
466 Returns the fraction bits of the extended double-precision floating-point
467 value `a'.
468 -------------------------------------------------------------------------------
470 INLINE bits64 extractFloatx80Frac( floatx80 a )
473 return a.low;
478 -------------------------------------------------------------------------------
479 Returns the exponent bits of the extended double-precision floating-point
480 value `a'.
481 -------------------------------------------------------------------------------
483 INLINE int32 extractFloatx80Exp( floatx80 a )
486 return a.high & 0x7FFF;
491 -------------------------------------------------------------------------------
492 Returns the sign bit of the extended double-precision floating-point value
493 `a'.
494 -------------------------------------------------------------------------------
496 INLINE flag extractFloatx80Sign( floatx80 a )
499 return a.high>>15;
504 -------------------------------------------------------------------------------
505 Normalizes the subnormal extended double-precision floating-point value
506 represented by the denormalized significand `aSig'. The normalized exponent
507 and significand are stored at the locations pointed to by `zExpPtr' and
508 `zSigPtr', respectively.
509 -------------------------------------------------------------------------------
511 static void
512 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
514 int8 shiftCount;
516 shiftCount = countLeadingZeros64( aSig );
517 *zSigPtr = aSig<<shiftCount;
518 *zExpPtr = 1 - shiftCount;
523 -------------------------------------------------------------------------------
524 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
525 extended double-precision floating-point value, returning the result.
526 -------------------------------------------------------------------------------
528 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
530 floatx80 z;
532 z.low = zSig;
533 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
534 return z;
539 -------------------------------------------------------------------------------
540 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
541 and extended significand formed by the concatenation of `zSig0' and `zSig1',
542 and returns the proper extended double-precision floating-point value
543 corresponding to the abstract input. Ordinarily, the abstract value is
544 rounded and packed into the extended double-precision format, with the
545 inexact exception raised if the abstract input cannot be represented
546 exactly. If the abstract value is too large, however, the overflow and
547 inexact exceptions are raised and an infinity or maximal finite value is
548 returned. If the abstract value is too small, the input value is rounded to
549 a subnormal number, and the underflow and inexact exceptions are raised if
550 the abstract input cannot be represented exactly as a subnormal extended
551 double-precision floating-point number.
552 If `roundingPrecision' is 32 or 64, the result is rounded to the same
553 number of bits as single or double precision, respectively. Otherwise, the
554 result is rounded to the full precision of the extended double-precision
555 format.
556 The input significand must be normalized or smaller. If the input
557 significand is not normalized, `zExp' must be 0; in that case, the result
558 returned is a subnormal number, and it must not require rounding. The
559 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
560 Floating-point Arithmetic.
561 -------------------------------------------------------------------------------
563 static floatx80
564 roundAndPackFloatx80(
565 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
568 int8 roundingMode, roundingPrecision;
569 flag roundNearestEven, increment, isTiny;
570 int64 roundIncrement, roundMask, roundBits;
572 roundingMode = roundData->mode;
573 roundingPrecision = roundData->precision;
574 roundNearestEven = ( roundingMode == float_round_nearest_even );
575 if ( roundingPrecision == 80 ) goto precision80;
576 if ( roundingPrecision == 64 ) {
577 roundIncrement = LIT64( 0x0000000000000400 );
578 roundMask = LIT64( 0x00000000000007FF );
580 else if ( roundingPrecision == 32 ) {
581 roundIncrement = LIT64( 0x0000008000000000 );
582 roundMask = LIT64( 0x000000FFFFFFFFFF );
584 else {
585 goto precision80;
587 zSig0 |= ( zSig1 != 0 );
588 if ( ! roundNearestEven ) {
589 if ( roundingMode == float_round_to_zero ) {
590 roundIncrement = 0;
592 else {
593 roundIncrement = roundMask;
594 if ( zSign ) {
595 if ( roundingMode == float_round_up ) roundIncrement = 0;
597 else {
598 if ( roundingMode == float_round_down ) roundIncrement = 0;
602 roundBits = zSig0 & roundMask;
603 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
604 if ( ( 0x7FFE < zExp )
605 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
607 goto overflow;
609 if ( zExp <= 0 ) {
610 isTiny =
611 ( float_detect_tininess == float_tininess_before_rounding )
612 || ( zExp < 0 )
613 || ( zSig0 <= zSig0 + roundIncrement );
614 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
615 zExp = 0;
616 roundBits = zSig0 & roundMask;
617 if ( isTiny && roundBits ) roundData->exception |= float_flag_underflow;
618 if ( roundBits ) roundData->exception |= float_flag_inexact;
619 zSig0 += roundIncrement;
620 if ( (sbits64) zSig0 < 0 ) zExp = 1;
621 roundIncrement = roundMask + 1;
622 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
623 roundMask |= roundIncrement;
625 zSig0 &= ~ roundMask;
626 return packFloatx80( zSign, zExp, zSig0 );
629 if ( roundBits ) roundData->exception |= float_flag_inexact;
630 zSig0 += roundIncrement;
631 if ( zSig0 < roundIncrement ) {
632 ++zExp;
633 zSig0 = LIT64( 0x8000000000000000 );
635 roundIncrement = roundMask + 1;
636 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
637 roundMask |= roundIncrement;
639 zSig0 &= ~ roundMask;
640 if ( zSig0 == 0 ) zExp = 0;
641 return packFloatx80( zSign, zExp, zSig0 );
642 precision80:
643 increment = ( (sbits64) zSig1 < 0 );
644 if ( ! roundNearestEven ) {
645 if ( roundingMode == float_round_to_zero ) {
646 increment = 0;
648 else {
649 if ( zSign ) {
650 increment = ( roundingMode == float_round_down ) && zSig1;
652 else {
653 increment = ( roundingMode == float_round_up ) && zSig1;
657 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
658 if ( ( 0x7FFE < zExp )
659 || ( ( zExp == 0x7FFE )
660 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
661 && increment
664 roundMask = 0;
665 overflow:
666 roundData->exception |= float_flag_overflow | float_flag_inexact;
667 if ( ( roundingMode == float_round_to_zero )
668 || ( zSign && ( roundingMode == float_round_up ) )
669 || ( ! zSign && ( roundingMode == float_round_down ) )
671 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
673 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
675 if ( zExp <= 0 ) {
676 isTiny =
677 ( float_detect_tininess == float_tininess_before_rounding )
678 || ( zExp < 0 )
679 || ! increment
680 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
681 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
682 zExp = 0;
683 if ( isTiny && zSig1 ) roundData->exception |= float_flag_underflow;
684 if ( zSig1 ) roundData->exception |= float_flag_inexact;
685 if ( roundNearestEven ) {
686 increment = ( (sbits64) zSig1 < 0 );
688 else {
689 if ( zSign ) {
690 increment = ( roundingMode == float_round_down ) && zSig1;
692 else {
693 increment = ( roundingMode == float_round_up ) && zSig1;
696 if ( increment ) {
697 ++zSig0;
698 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
699 if ( (sbits64) zSig0 < 0 ) zExp = 1;
701 return packFloatx80( zSign, zExp, zSig0 );
704 if ( zSig1 ) roundData->exception |= float_flag_inexact;
705 if ( increment ) {
706 ++zSig0;
707 if ( zSig0 == 0 ) {
708 ++zExp;
709 zSig0 = LIT64( 0x8000000000000000 );
711 else {
712 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
715 else {
716 if ( zSig0 == 0 ) zExp = 0;
719 return packFloatx80( zSign, zExp, zSig0 );
723 -------------------------------------------------------------------------------
724 Takes an abstract floating-point value having sign `zSign', exponent
725 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
726 and returns the proper extended double-precision floating-point value
727 corresponding to the abstract input. This routine is just like
728 `roundAndPackFloatx80' except that the input significand does not have to be
729 normalized.
730 -------------------------------------------------------------------------------
732 static floatx80
733 normalizeRoundAndPackFloatx80(
734 struct roundingData *roundData, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
737 int8 shiftCount;
739 if ( zSig0 == 0 ) {
740 zSig0 = zSig1;
741 zSig1 = 0;
742 zExp -= 64;
744 shiftCount = countLeadingZeros64( zSig0 );
745 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
746 zExp -= shiftCount;
747 return
748 roundAndPackFloatx80( roundData, zSign, zExp, zSig0, zSig1 );
752 #endif
755 -------------------------------------------------------------------------------
756 Returns the result of converting the 32-bit two's complement integer `a' to
757 the single-precision floating-point format. The conversion is performed
758 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
759 -------------------------------------------------------------------------------
761 float32 int32_to_float32(struct roundingData *roundData, int32 a)
763 flag zSign;
765 if ( a == 0 ) return 0;
766 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
767 zSign = ( a < 0 );
768 return normalizeRoundAndPackFloat32( roundData, zSign, 0x9C, zSign ? - a : a );
773 -------------------------------------------------------------------------------
774 Returns the result of converting the 32-bit two's complement integer `a' to
775 the double-precision floating-point format. The conversion is performed
776 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
777 -------------------------------------------------------------------------------
779 float64 int32_to_float64( int32 a )
781 flag aSign;
782 uint32 absA;
783 int8 shiftCount;
784 bits64 zSig;
786 if ( a == 0 ) return 0;
787 aSign = ( a < 0 );
788 absA = aSign ? - a : a;
789 shiftCount = countLeadingZeros32( absA ) + 21;
790 zSig = absA;
791 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
795 #ifdef FLOATX80
798 -------------------------------------------------------------------------------
799 Returns the result of converting the 32-bit two's complement integer `a'
800 to the extended double-precision floating-point format. The conversion
801 is performed according to the IEC/IEEE Standard for Binary Floating-point
802 Arithmetic.
803 -------------------------------------------------------------------------------
805 floatx80 int32_to_floatx80( int32 a )
807 flag zSign;
808 uint32 absA;
809 int8 shiftCount;
810 bits64 zSig;
812 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
813 zSign = ( a < 0 );
814 absA = zSign ? - a : a;
815 shiftCount = countLeadingZeros32( absA ) + 32;
816 zSig = absA;
817 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
821 #endif
824 -------------------------------------------------------------------------------
825 Returns the result of converting the single-precision floating-point value
826 `a' to the 32-bit two's complement integer format. The conversion is
827 performed according to the IEC/IEEE Standard for Binary Floating-point
828 Arithmetic---which means in particular that the conversion is rounded
829 according to the current rounding mode. If `a' is a NaN, the largest
830 positive integer is returned. Otherwise, if the conversion overflows, the
831 largest integer with the same sign as `a' is returned.
832 -------------------------------------------------------------------------------
834 int32 float32_to_int32( struct roundingData *roundData, float32 a )
836 flag aSign;
837 int16 aExp, shiftCount;
838 bits32 aSig;
839 bits64 zSig;
841 aSig = extractFloat32Frac( a );
842 aExp = extractFloat32Exp( a );
843 aSign = extractFloat32Sign( a );
844 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
845 if ( aExp ) aSig |= 0x00800000;
846 shiftCount = 0xAF - aExp;
847 zSig = aSig;
848 zSig <<= 32;
849 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
850 return roundAndPackInt32( roundData, aSign, zSig );
855 -------------------------------------------------------------------------------
856 Returns the result of converting the single-precision floating-point value
857 `a' to the 32-bit two's complement integer format. The conversion is
858 performed according to the IEC/IEEE Standard for Binary Floating-point
859 Arithmetic, except that the conversion is always rounded toward zero. If
860 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
861 conversion overflows, the largest integer with the same sign as `a' is
862 returned.
863 -------------------------------------------------------------------------------
865 int32 float32_to_int32_round_to_zero( float32 a )
867 flag aSign;
868 int16 aExp, shiftCount;
869 bits32 aSig;
870 int32 z;
872 aSig = extractFloat32Frac( a );
873 aExp = extractFloat32Exp( a );
874 aSign = extractFloat32Sign( a );
875 shiftCount = aExp - 0x9E;
876 if ( 0 <= shiftCount ) {
877 if ( a == 0xCF000000 ) return 0x80000000;
878 float_raise( float_flag_invalid );
879 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
880 return 0x80000000;
882 else if ( aExp <= 0x7E ) {
883 if ( aExp | aSig ) float_raise( float_flag_inexact );
884 return 0;
886 aSig = ( aSig | 0x00800000 )<<8;
887 z = aSig>>( - shiftCount );
888 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
889 float_raise( float_flag_inexact );
891 return aSign ? - z : z;
896 -------------------------------------------------------------------------------
897 Returns the result of converting the single-precision floating-point value
898 `a' to the double-precision floating-point format. The conversion is
899 performed according to the IEC/IEEE Standard for Binary Floating-point
900 Arithmetic.
901 -------------------------------------------------------------------------------
903 float64 float32_to_float64( float32 a )
905 flag aSign;
906 int16 aExp;
907 bits32 aSig;
909 aSig = extractFloat32Frac( a );
910 aExp = extractFloat32Exp( a );
911 aSign = extractFloat32Sign( a );
912 if ( aExp == 0xFF ) {
913 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
914 return packFloat64( aSign, 0x7FF, 0 );
916 if ( aExp == 0 ) {
917 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
918 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
919 --aExp;
921 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
925 #ifdef FLOATX80
928 -------------------------------------------------------------------------------
929 Returns the result of converting the single-precision floating-point value
930 `a' to the extended double-precision floating-point format. The conversion
931 is performed according to the IEC/IEEE Standard for Binary Floating-point
932 Arithmetic.
933 -------------------------------------------------------------------------------
935 floatx80 float32_to_floatx80( float32 a )
937 flag aSign;
938 int16 aExp;
939 bits32 aSig;
941 aSig = extractFloat32Frac( a );
942 aExp = extractFloat32Exp( a );
943 aSign = extractFloat32Sign( a );
944 if ( aExp == 0xFF ) {
945 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
946 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
948 if ( aExp == 0 ) {
949 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
950 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
952 aSig |= 0x00800000;
953 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
957 #endif
960 -------------------------------------------------------------------------------
961 Rounds the single-precision floating-point value `a' to an integer, and
962 returns the result as a single-precision floating-point value. The
963 operation is performed according to the IEC/IEEE Standard for Binary
964 Floating-point Arithmetic.
965 -------------------------------------------------------------------------------
967 float32 float32_round_to_int( struct roundingData *roundData, float32 a )
969 flag aSign;
970 int16 aExp;
971 bits32 lastBitMask, roundBitsMask;
972 int8 roundingMode;
973 float32 z;
975 aExp = extractFloat32Exp( a );
976 if ( 0x96 <= aExp ) {
977 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
978 return propagateFloat32NaN( a, a );
980 return a;
982 roundingMode = roundData->mode;
983 if ( aExp <= 0x7E ) {
984 if ( (bits32) ( a<<1 ) == 0 ) return a;
985 roundData->exception |= float_flag_inexact;
986 aSign = extractFloat32Sign( a );
987 switch ( roundingMode ) {
988 case float_round_nearest_even:
989 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
990 return packFloat32( aSign, 0x7F, 0 );
992 break;
993 case float_round_down:
994 return aSign ? 0xBF800000 : 0;
995 case float_round_up:
996 return aSign ? 0x80000000 : 0x3F800000;
998 return packFloat32( aSign, 0, 0 );
1000 lastBitMask = 1;
1001 lastBitMask <<= 0x96 - aExp;
1002 roundBitsMask = lastBitMask - 1;
1003 z = a;
1004 if ( roundingMode == float_round_nearest_even ) {
1005 z += lastBitMask>>1;
1006 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1008 else if ( roundingMode != float_round_to_zero ) {
1009 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1010 z += roundBitsMask;
1013 z &= ~ roundBitsMask;
1014 if ( z != a ) roundData->exception |= float_flag_inexact;
1015 return z;
1020 -------------------------------------------------------------------------------
1021 Returns the result of adding the absolute values of the single-precision
1022 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1023 before being returned. `zSign' is ignored if the result is a NaN. The
1024 addition is performed according to the IEC/IEEE Standard for Binary
1025 Floating-point Arithmetic.
1026 -------------------------------------------------------------------------------
1028 static float32 addFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1030 int16 aExp, bExp, zExp;
1031 bits32 aSig, bSig, zSig;
1032 int16 expDiff;
1034 aSig = extractFloat32Frac( a );
1035 aExp = extractFloat32Exp( a );
1036 bSig = extractFloat32Frac( b );
1037 bExp = extractFloat32Exp( b );
1038 expDiff = aExp - bExp;
1039 aSig <<= 6;
1040 bSig <<= 6;
1041 if ( 0 < expDiff ) {
1042 if ( aExp == 0xFF ) {
1043 if ( aSig ) return propagateFloat32NaN( a, b );
1044 return a;
1046 if ( bExp == 0 ) {
1047 --expDiff;
1049 else {
1050 bSig |= 0x20000000;
1052 shift32RightJamming( bSig, expDiff, &bSig );
1053 zExp = aExp;
1055 else if ( expDiff < 0 ) {
1056 if ( bExp == 0xFF ) {
1057 if ( bSig ) return propagateFloat32NaN( a, b );
1058 return packFloat32( zSign, 0xFF, 0 );
1060 if ( aExp == 0 ) {
1061 ++expDiff;
1063 else {
1064 aSig |= 0x20000000;
1066 shift32RightJamming( aSig, - expDiff, &aSig );
1067 zExp = bExp;
1069 else {
1070 if ( aExp == 0xFF ) {
1071 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1072 return a;
1074 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1075 zSig = 0x40000000 + aSig + bSig;
1076 zExp = aExp;
1077 goto roundAndPack;
1079 aSig |= 0x20000000;
1080 zSig = ( aSig + bSig )<<1;
1081 --zExp;
1082 if ( (sbits32) zSig < 0 ) {
1083 zSig = aSig + bSig;
1084 ++zExp;
1086 roundAndPack:
1087 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1092 -------------------------------------------------------------------------------
1093 Returns the result of subtracting the absolute values of the single-
1094 precision floating-point values `a' and `b'. If `zSign' is true, the
1095 difference is negated before being returned. `zSign' is ignored if the
1096 result is a NaN. The subtraction is performed according to the IEC/IEEE
1097 Standard for Binary Floating-point Arithmetic.
1098 -------------------------------------------------------------------------------
1100 static float32 subFloat32Sigs( struct roundingData *roundData, float32 a, float32 b, flag zSign )
1102 int16 aExp, bExp, zExp;
1103 bits32 aSig, bSig, zSig;
1104 int16 expDiff;
1106 aSig = extractFloat32Frac( a );
1107 aExp = extractFloat32Exp( a );
1108 bSig = extractFloat32Frac( b );
1109 bExp = extractFloat32Exp( b );
1110 expDiff = aExp - bExp;
1111 aSig <<= 7;
1112 bSig <<= 7;
1113 if ( 0 < expDiff ) goto aExpBigger;
1114 if ( expDiff < 0 ) goto bExpBigger;
1115 if ( aExp == 0xFF ) {
1116 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1117 roundData->exception |= float_flag_invalid;
1118 return float32_default_nan;
1120 if ( aExp == 0 ) {
1121 aExp = 1;
1122 bExp = 1;
1124 if ( bSig < aSig ) goto aBigger;
1125 if ( aSig < bSig ) goto bBigger;
1126 return packFloat32( roundData->mode == float_round_down, 0, 0 );
1127 bExpBigger:
1128 if ( bExp == 0xFF ) {
1129 if ( bSig ) return propagateFloat32NaN( a, b );
1130 return packFloat32( zSign ^ 1, 0xFF, 0 );
1132 if ( aExp == 0 ) {
1133 ++expDiff;
1135 else {
1136 aSig |= 0x40000000;
1138 shift32RightJamming( aSig, - expDiff, &aSig );
1139 bSig |= 0x40000000;
1140 bBigger:
1141 zSig = bSig - aSig;
1142 zExp = bExp;
1143 zSign ^= 1;
1144 goto normalizeRoundAndPack;
1145 aExpBigger:
1146 if ( aExp == 0xFF ) {
1147 if ( aSig ) return propagateFloat32NaN( a, b );
1148 return a;
1150 if ( bExp == 0 ) {
1151 --expDiff;
1153 else {
1154 bSig |= 0x40000000;
1156 shift32RightJamming( bSig, expDiff, &bSig );
1157 aSig |= 0x40000000;
1158 aBigger:
1159 zSig = aSig - bSig;
1160 zExp = aExp;
1161 normalizeRoundAndPack:
1162 --zExp;
1163 return normalizeRoundAndPackFloat32( roundData, zSign, zExp, zSig );
1168 -------------------------------------------------------------------------------
1169 Returns the result of adding the single-precision floating-point values `a'
1170 and `b'. The operation is performed according to the IEC/IEEE Standard for
1171 Binary Floating-point Arithmetic.
1172 -------------------------------------------------------------------------------
1174 float32 float32_add( struct roundingData *roundData, float32 a, float32 b )
1176 flag aSign, bSign;
1178 aSign = extractFloat32Sign( a );
1179 bSign = extractFloat32Sign( b );
1180 if ( aSign == bSign ) {
1181 return addFloat32Sigs( roundData, a, b, aSign );
1183 else {
1184 return subFloat32Sigs( roundData, a, b, aSign );
1190 -------------------------------------------------------------------------------
1191 Returns the result of subtracting the single-precision floating-point values
1192 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1193 for Binary Floating-point Arithmetic.
1194 -------------------------------------------------------------------------------
1196 float32 float32_sub( struct roundingData *roundData, float32 a, float32 b )
1198 flag aSign, bSign;
1200 aSign = extractFloat32Sign( a );
1201 bSign = extractFloat32Sign( b );
1202 if ( aSign == bSign ) {
1203 return subFloat32Sigs( roundData, a, b, aSign );
1205 else {
1206 return addFloat32Sigs( roundData, a, b, aSign );
1212 -------------------------------------------------------------------------------
1213 Returns the result of multiplying the single-precision floating-point values
1214 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1215 for Binary Floating-point Arithmetic.
1216 -------------------------------------------------------------------------------
1218 float32 float32_mul( struct roundingData *roundData, float32 a, float32 b )
1220 flag aSign, bSign, zSign;
1221 int16 aExp, bExp, zExp;
1222 bits32 aSig, bSig;
1223 bits64 zSig64;
1224 bits32 zSig;
1226 aSig = extractFloat32Frac( a );
1227 aExp = extractFloat32Exp( a );
1228 aSign = extractFloat32Sign( a );
1229 bSig = extractFloat32Frac( b );
1230 bExp = extractFloat32Exp( b );
1231 bSign = extractFloat32Sign( b );
1232 zSign = aSign ^ bSign;
1233 if ( aExp == 0xFF ) {
1234 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1235 return propagateFloat32NaN( a, b );
1237 if ( ( bExp | bSig ) == 0 ) {
1238 roundData->exception |= float_flag_invalid;
1239 return float32_default_nan;
1241 return packFloat32( zSign, 0xFF, 0 );
1243 if ( bExp == 0xFF ) {
1244 if ( bSig ) return propagateFloat32NaN( a, b );
1245 if ( ( aExp | aSig ) == 0 ) {
1246 roundData->exception |= float_flag_invalid;
1247 return float32_default_nan;
1249 return packFloat32( zSign, 0xFF, 0 );
1251 if ( aExp == 0 ) {
1252 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1253 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1255 if ( bExp == 0 ) {
1256 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1257 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1259 zExp = aExp + bExp - 0x7F;
1260 aSig = ( aSig | 0x00800000 )<<7;
1261 bSig = ( bSig | 0x00800000 )<<8;
1262 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1263 zSig = zSig64;
1264 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1265 zSig <<= 1;
1266 --zExp;
1268 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1273 -------------------------------------------------------------------------------
1274 Returns the result of dividing the single-precision floating-point value `a'
1275 by the corresponding value `b'. The operation is performed according to the
1276 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1277 -------------------------------------------------------------------------------
1279 float32 float32_div( struct roundingData *roundData, float32 a, float32 b )
1281 flag aSign, bSign, zSign;
1282 int16 aExp, bExp, zExp;
1283 bits32 aSig, bSig, zSig;
1285 aSig = extractFloat32Frac( a );
1286 aExp = extractFloat32Exp( a );
1287 aSign = extractFloat32Sign( a );
1288 bSig = extractFloat32Frac( b );
1289 bExp = extractFloat32Exp( b );
1290 bSign = extractFloat32Sign( b );
1291 zSign = aSign ^ bSign;
1292 if ( aExp == 0xFF ) {
1293 if ( aSig ) return propagateFloat32NaN( a, b );
1294 if ( bExp == 0xFF ) {
1295 if ( bSig ) return propagateFloat32NaN( a, b );
1296 roundData->exception |= float_flag_invalid;
1297 return float32_default_nan;
1299 return packFloat32( zSign, 0xFF, 0 );
1301 if ( bExp == 0xFF ) {
1302 if ( bSig ) return propagateFloat32NaN( a, b );
1303 return packFloat32( zSign, 0, 0 );
1305 if ( bExp == 0 ) {
1306 if ( bSig == 0 ) {
1307 if ( ( aExp | aSig ) == 0 ) {
1308 roundData->exception |= float_flag_invalid;
1309 return float32_default_nan;
1311 roundData->exception |= float_flag_divbyzero;
1312 return packFloat32( zSign, 0xFF, 0 );
1314 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1316 if ( aExp == 0 ) {
1317 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1318 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1320 zExp = aExp - bExp + 0x7D;
1321 aSig = ( aSig | 0x00800000 )<<7;
1322 bSig = ( bSig | 0x00800000 )<<8;
1323 if ( bSig <= ( aSig + aSig ) ) {
1324 aSig >>= 1;
1325 ++zExp;
1328 bits64 tmp = ( (bits64) aSig )<<32;
1329 do_div( tmp, bSig );
1330 zSig = tmp;
1332 if ( ( zSig & 0x3F ) == 0 ) {
1333 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1335 return roundAndPackFloat32( roundData, zSign, zExp, zSig );
1340 -------------------------------------------------------------------------------
1341 Returns the remainder of the single-precision floating-point value `a'
1342 with respect to the corresponding value `b'. The operation is performed
1343 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1344 -------------------------------------------------------------------------------
1346 float32 float32_rem( struct roundingData *roundData, float32 a, float32 b )
1348 flag aSign, bSign, zSign;
1349 int16 aExp, bExp, expDiff;
1350 bits32 aSig, bSig;
1351 bits32 q;
1352 bits64 aSig64, bSig64, q64;
1353 bits32 alternateASig;
1354 sbits32 sigMean;
1356 aSig = extractFloat32Frac( a );
1357 aExp = extractFloat32Exp( a );
1358 aSign = extractFloat32Sign( a );
1359 bSig = extractFloat32Frac( b );
1360 bExp = extractFloat32Exp( b );
1361 bSign = extractFloat32Sign( b );
1362 if ( aExp == 0xFF ) {
1363 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1364 return propagateFloat32NaN( a, b );
1366 roundData->exception |= float_flag_invalid;
1367 return float32_default_nan;
1369 if ( bExp == 0xFF ) {
1370 if ( bSig ) return propagateFloat32NaN( a, b );
1371 return a;
1373 if ( bExp == 0 ) {
1374 if ( bSig == 0 ) {
1375 roundData->exception |= float_flag_invalid;
1376 return float32_default_nan;
1378 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1380 if ( aExp == 0 ) {
1381 if ( aSig == 0 ) return a;
1382 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1384 expDiff = aExp - bExp;
1385 aSig |= 0x00800000;
1386 bSig |= 0x00800000;
1387 if ( expDiff < 32 ) {
1388 aSig <<= 8;
1389 bSig <<= 8;
1390 if ( expDiff < 0 ) {
1391 if ( expDiff < -1 ) return a;
1392 aSig >>= 1;
1394 q = ( bSig <= aSig );
1395 if ( q ) aSig -= bSig;
1396 if ( 0 < expDiff ) {
1397 bits64 tmp = ( (bits64) aSig )<<32;
1398 do_div( tmp, bSig );
1399 q = tmp;
1400 q >>= 32 - expDiff;
1401 bSig >>= 2;
1402 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1404 else {
1405 aSig >>= 2;
1406 bSig >>= 2;
1409 else {
1410 if ( bSig <= aSig ) aSig -= bSig;
1411 aSig64 = ( (bits64) aSig )<<40;
1412 bSig64 = ( (bits64) bSig )<<40;
1413 expDiff -= 64;
1414 while ( 0 < expDiff ) {
1415 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1416 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1417 aSig64 = - ( ( bSig * q64 )<<38 );
1418 expDiff -= 62;
1420 expDiff += 64;
1421 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1422 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1423 q = q64>>( 64 - expDiff );
1424 bSig <<= 6;
1425 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1427 do {
1428 alternateASig = aSig;
1429 ++q;
1430 aSig -= bSig;
1431 } while ( 0 <= (sbits32) aSig );
1432 sigMean = aSig + alternateASig;
1433 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1434 aSig = alternateASig;
1436 zSign = ( (sbits32) aSig < 0 );
1437 if ( zSign ) aSig = - aSig;
1438 return normalizeRoundAndPackFloat32( roundData, aSign ^ zSign, bExp, aSig );
1443 -------------------------------------------------------------------------------
1444 Returns the square root of the single-precision floating-point value `a'.
1445 The operation is performed according to the IEC/IEEE Standard for Binary
1446 Floating-point Arithmetic.
1447 -------------------------------------------------------------------------------
1449 float32 float32_sqrt( struct roundingData *roundData, float32 a )
1451 flag aSign;
1452 int16 aExp, zExp;
1453 bits32 aSig, zSig;
1454 bits64 rem, term;
1456 aSig = extractFloat32Frac( a );
1457 aExp = extractFloat32Exp( a );
1458 aSign = extractFloat32Sign( a );
1459 if ( aExp == 0xFF ) {
1460 if ( aSig ) return propagateFloat32NaN( a, 0 );
1461 if ( ! aSign ) return a;
1462 roundData->exception |= float_flag_invalid;
1463 return float32_default_nan;
1465 if ( aSign ) {
1466 if ( ( aExp | aSig ) == 0 ) return a;
1467 roundData->exception |= float_flag_invalid;
1468 return float32_default_nan;
1470 if ( aExp == 0 ) {
1471 if ( aSig == 0 ) return 0;
1472 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1474 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1475 aSig = ( aSig | 0x00800000 )<<8;
1476 zSig = estimateSqrt32( aExp, aSig ) + 2;
1477 if ( ( zSig & 0x7F ) <= 5 ) {
1478 if ( zSig < 2 ) {
1479 zSig = 0xFFFFFFFF;
1481 else {
1482 aSig >>= aExp & 1;
1483 term = ( (bits64) zSig ) * zSig;
1484 rem = ( ( (bits64) aSig )<<32 ) - term;
1485 while ( (sbits64) rem < 0 ) {
1486 --zSig;
1487 rem += ( ( (bits64) zSig )<<1 ) | 1;
1489 zSig |= ( rem != 0 );
1492 shift32RightJamming( zSig, 1, &zSig );
1493 return roundAndPackFloat32( roundData, 0, zExp, zSig );
1498 -------------------------------------------------------------------------------
1499 Returns 1 if the single-precision floating-point value `a' is equal to the
1500 corresponding value `b', and 0 otherwise. The comparison is performed
1501 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1502 -------------------------------------------------------------------------------
1504 flag float32_eq( float32 a, float32 b )
1507 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1508 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1510 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1511 float_raise( float_flag_invalid );
1513 return 0;
1515 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1520 -------------------------------------------------------------------------------
1521 Returns 1 if the single-precision floating-point value `a' is less than or
1522 equal to the corresponding value `b', and 0 otherwise. The comparison is
1523 performed according to the IEC/IEEE Standard for Binary Floating-point
1524 Arithmetic.
1525 -------------------------------------------------------------------------------
1527 flag float32_le( float32 a, float32 b )
1529 flag aSign, bSign;
1531 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1532 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1534 float_raise( float_flag_invalid );
1535 return 0;
1537 aSign = extractFloat32Sign( a );
1538 bSign = extractFloat32Sign( b );
1539 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1540 return ( a == b ) || ( aSign ^ ( a < b ) );
1545 -------------------------------------------------------------------------------
1546 Returns 1 if the single-precision floating-point value `a' is less than
1547 the corresponding value `b', and 0 otherwise. The comparison is performed
1548 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1549 -------------------------------------------------------------------------------
1551 flag float32_lt( float32 a, float32 b )
1553 flag aSign, bSign;
1555 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1556 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1558 float_raise( float_flag_invalid );
1559 return 0;
1561 aSign = extractFloat32Sign( a );
1562 bSign = extractFloat32Sign( b );
1563 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1564 return ( a != b ) && ( aSign ^ ( a < b ) );
1569 -------------------------------------------------------------------------------
1570 Returns 1 if the single-precision floating-point value `a' is equal to the
1571 corresponding value `b', and 0 otherwise. The invalid exception is raised
1572 if either operand is a NaN. Otherwise, the comparison is performed
1573 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1574 -------------------------------------------------------------------------------
1576 flag float32_eq_signaling( float32 a, float32 b )
1579 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1580 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1582 float_raise( float_flag_invalid );
1583 return 0;
1585 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1590 -------------------------------------------------------------------------------
1591 Returns 1 if the single-precision floating-point value `a' is less than or
1592 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1593 cause an exception. Otherwise, the comparison is performed according to the
1594 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1595 -------------------------------------------------------------------------------
1597 flag float32_le_quiet( float32 a, float32 b )
1599 flag aSign, bSign;
1600 //int16 aExp, bExp;
1602 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1603 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1605 /* Do nothing, even if NaN as we're quiet */
1606 return 0;
1608 aSign = extractFloat32Sign( a );
1609 bSign = extractFloat32Sign( b );
1610 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1611 return ( a == b ) || ( aSign ^ ( a < b ) );
1616 -------------------------------------------------------------------------------
1617 Returns 1 if the single-precision floating-point value `a' is less than
1618 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1619 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1620 Standard for Binary Floating-point Arithmetic.
1621 -------------------------------------------------------------------------------
1623 flag float32_lt_quiet( float32 a, float32 b )
1625 flag aSign, bSign;
1627 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1628 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1630 /* Do nothing, even if NaN as we're quiet */
1631 return 0;
1633 aSign = extractFloat32Sign( a );
1634 bSign = extractFloat32Sign( b );
1635 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1636 return ( a != b ) && ( aSign ^ ( a < b ) );
1641 -------------------------------------------------------------------------------
1642 Returns the result of converting the double-precision floating-point value
1643 `a' to the 32-bit two's complement integer format. The conversion is
1644 performed according to the IEC/IEEE Standard for Binary Floating-point
1645 Arithmetic---which means in particular that the conversion is rounded
1646 according to the current rounding mode. If `a' is a NaN, the largest
1647 positive integer is returned. Otherwise, if the conversion overflows, the
1648 largest integer with the same sign as `a' is returned.
1649 -------------------------------------------------------------------------------
1651 int32 float64_to_int32( struct roundingData *roundData, float64 a )
1653 flag aSign;
1654 int16 aExp, shiftCount;
1655 bits64 aSig;
1657 aSig = extractFloat64Frac( a );
1658 aExp = extractFloat64Exp( a );
1659 aSign = extractFloat64Sign( a );
1660 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1661 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1662 shiftCount = 0x42C - aExp;
1663 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1664 return roundAndPackInt32( roundData, aSign, aSig );
1669 -------------------------------------------------------------------------------
1670 Returns the result of converting the double-precision floating-point value
1671 `a' to the 32-bit two's complement integer format. The conversion is
1672 performed according to the IEC/IEEE Standard for Binary Floating-point
1673 Arithmetic, except that the conversion is always rounded toward zero. If
1674 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1675 conversion overflows, the largest integer with the same sign as `a' is
1676 returned.
1677 -------------------------------------------------------------------------------
1679 int32 float64_to_int32_round_to_zero( float64 a )
1681 flag aSign;
1682 int16 aExp, shiftCount;
1683 bits64 aSig, savedASig;
1684 int32 z;
1686 aSig = extractFloat64Frac( a );
1687 aExp = extractFloat64Exp( a );
1688 aSign = extractFloat64Sign( a );
1689 shiftCount = 0x433 - aExp;
1690 if ( shiftCount < 21 ) {
1691 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1692 goto invalid;
1694 else if ( 52 < shiftCount ) {
1695 if ( aExp || aSig ) float_raise( float_flag_inexact );
1696 return 0;
1698 aSig |= LIT64( 0x0010000000000000 );
1699 savedASig = aSig;
1700 aSig >>= shiftCount;
1701 z = aSig;
1702 if ( aSign ) z = - z;
1703 if ( ( z < 0 ) ^ aSign ) {
1704 invalid:
1705 float_raise( float_flag_invalid );
1706 return aSign ? 0x80000000 : 0x7FFFFFFF;
1708 if ( ( aSig<<shiftCount ) != savedASig ) {
1709 float_raise( float_flag_inexact );
1711 return z;
1716 -------------------------------------------------------------------------------
1717 Returns the result of converting the double-precision floating-point value
1718 `a' to the 32-bit two's complement unsigned integer format. The conversion
1719 is performed according to the IEC/IEEE Standard for Binary Floating-point
1720 Arithmetic---which means in particular that the conversion is rounded
1721 according to the current rounding mode. If `a' is a NaN, the largest
1722 positive integer is returned. Otherwise, if the conversion overflows, the
1723 largest positive integer is returned.
1724 -------------------------------------------------------------------------------
1726 int32 float64_to_uint32( struct roundingData *roundData, float64 a )
1728 flag aSign;
1729 int16 aExp, shiftCount;
1730 bits64 aSig;
1732 aSig = extractFloat64Frac( a );
1733 aExp = extractFloat64Exp( a );
1734 aSign = 0; //extractFloat64Sign( a );
1735 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1736 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1737 shiftCount = 0x42C - aExp;
1738 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1739 return roundAndPackInt32( roundData, aSign, aSig );
1743 -------------------------------------------------------------------------------
1744 Returns the result of converting the double-precision floating-point value
1745 `a' to the 32-bit two's complement integer format. The conversion is
1746 performed according to the IEC/IEEE Standard for Binary Floating-point
1747 Arithmetic, except that the conversion is always rounded toward zero. If
1748 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1749 conversion overflows, the largest positive integer is returned.
1750 -------------------------------------------------------------------------------
1752 int32 float64_to_uint32_round_to_zero( float64 a )
1754 flag aSign;
1755 int16 aExp, shiftCount;
1756 bits64 aSig, savedASig;
1757 int32 z;
1759 aSig = extractFloat64Frac( a );
1760 aExp = extractFloat64Exp( a );
1761 aSign = extractFloat64Sign( a );
1762 shiftCount = 0x433 - aExp;
1763 if ( shiftCount < 21 ) {
1764 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1765 goto invalid;
1767 else if ( 52 < shiftCount ) {
1768 if ( aExp || aSig ) float_raise( float_flag_inexact );
1769 return 0;
1771 aSig |= LIT64( 0x0010000000000000 );
1772 savedASig = aSig;
1773 aSig >>= shiftCount;
1774 z = aSig;
1775 if ( aSign ) z = - z;
1776 if ( ( z < 0 ) ^ aSign ) {
1777 invalid:
1778 float_raise( float_flag_invalid );
1779 return aSign ? 0x80000000 : 0x7FFFFFFF;
1781 if ( ( aSig<<shiftCount ) != savedASig ) {
1782 float_raise( float_flag_inexact );
1784 return z;
1788 -------------------------------------------------------------------------------
1789 Returns the result of converting the double-precision floating-point value
1790 `a' to the single-precision floating-point format. The conversion is
1791 performed according to the IEC/IEEE Standard for Binary Floating-point
1792 Arithmetic.
1793 -------------------------------------------------------------------------------
1795 float32 float64_to_float32( struct roundingData *roundData, float64 a )
1797 flag aSign;
1798 int16 aExp;
1799 bits64 aSig;
1800 bits32 zSig;
1802 aSig = extractFloat64Frac( a );
1803 aExp = extractFloat64Exp( a );
1804 aSign = extractFloat64Sign( a );
1805 if ( aExp == 0x7FF ) {
1806 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1807 return packFloat32( aSign, 0xFF, 0 );
1809 shift64RightJamming( aSig, 22, &aSig );
1810 zSig = aSig;
1811 if ( aExp || zSig ) {
1812 zSig |= 0x40000000;
1813 aExp -= 0x381;
1815 return roundAndPackFloat32( roundData, aSign, aExp, zSig );
1819 #ifdef FLOATX80
1822 -------------------------------------------------------------------------------
1823 Returns the result of converting the double-precision floating-point value
1824 `a' to the extended double-precision floating-point format. The conversion
1825 is performed according to the IEC/IEEE Standard for Binary Floating-point
1826 Arithmetic.
1827 -------------------------------------------------------------------------------
1829 floatx80 float64_to_floatx80( float64 a )
1831 flag aSign;
1832 int16 aExp;
1833 bits64 aSig;
1835 aSig = extractFloat64Frac( a );
1836 aExp = extractFloat64Exp( a );
1837 aSign = extractFloat64Sign( a );
1838 if ( aExp == 0x7FF ) {
1839 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1840 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1842 if ( aExp == 0 ) {
1843 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1844 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1846 return
1847 packFloatx80(
1848 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1852 #endif
1855 -------------------------------------------------------------------------------
1856 Rounds the double-precision floating-point value `a' to an integer, and
1857 returns the result as a double-precision floating-point value. The
1858 operation is performed according to the IEC/IEEE Standard for Binary
1859 Floating-point Arithmetic.
1860 -------------------------------------------------------------------------------
1862 float64 float64_round_to_int( struct roundingData *roundData, float64 a )
1864 flag aSign;
1865 int16 aExp;
1866 bits64 lastBitMask, roundBitsMask;
1867 int8 roundingMode;
1868 float64 z;
1870 aExp = extractFloat64Exp( a );
1871 if ( 0x433 <= aExp ) {
1872 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1873 return propagateFloat64NaN( a, a );
1875 return a;
1877 if ( aExp <= 0x3FE ) {
1878 if ( (bits64) ( a<<1 ) == 0 ) return a;
1879 roundData->exception |= float_flag_inexact;
1880 aSign = extractFloat64Sign( a );
1881 switch ( roundData->mode ) {
1882 case float_round_nearest_even:
1883 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1884 return packFloat64( aSign, 0x3FF, 0 );
1886 break;
1887 case float_round_down:
1888 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1889 case float_round_up:
1890 return
1891 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1893 return packFloat64( aSign, 0, 0 );
1895 lastBitMask = 1;
1896 lastBitMask <<= 0x433 - aExp;
1897 roundBitsMask = lastBitMask - 1;
1898 z = a;
1899 roundingMode = roundData->mode;
1900 if ( roundingMode == float_round_nearest_even ) {
1901 z += lastBitMask>>1;
1902 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1904 else if ( roundingMode != float_round_to_zero ) {
1905 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1906 z += roundBitsMask;
1909 z &= ~ roundBitsMask;
1910 if ( z != a ) roundData->exception |= float_flag_inexact;
1911 return z;
1916 -------------------------------------------------------------------------------
1917 Returns the result of adding the absolute values of the double-precision
1918 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1919 before being returned. `zSign' is ignored if the result is a NaN. The
1920 addition is performed according to the IEC/IEEE Standard for Binary
1921 Floating-point Arithmetic.
1922 -------------------------------------------------------------------------------
1924 static float64 addFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1926 int16 aExp, bExp, zExp;
1927 bits64 aSig, bSig, zSig;
1928 int16 expDiff;
1930 aSig = extractFloat64Frac( a );
1931 aExp = extractFloat64Exp( a );
1932 bSig = extractFloat64Frac( b );
1933 bExp = extractFloat64Exp( b );
1934 expDiff = aExp - bExp;
1935 aSig <<= 9;
1936 bSig <<= 9;
1937 if ( 0 < expDiff ) {
1938 if ( aExp == 0x7FF ) {
1939 if ( aSig ) return propagateFloat64NaN( a, b );
1940 return a;
1942 if ( bExp == 0 ) {
1943 --expDiff;
1945 else {
1946 bSig |= LIT64( 0x2000000000000000 );
1948 shift64RightJamming( bSig, expDiff, &bSig );
1949 zExp = aExp;
1951 else if ( expDiff < 0 ) {
1952 if ( bExp == 0x7FF ) {
1953 if ( bSig ) return propagateFloat64NaN( a, b );
1954 return packFloat64( zSign, 0x7FF, 0 );
1956 if ( aExp == 0 ) {
1957 ++expDiff;
1959 else {
1960 aSig |= LIT64( 0x2000000000000000 );
1962 shift64RightJamming( aSig, - expDiff, &aSig );
1963 zExp = bExp;
1965 else {
1966 if ( aExp == 0x7FF ) {
1967 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1968 return a;
1970 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1971 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1972 zExp = aExp;
1973 goto roundAndPack;
1975 aSig |= LIT64( 0x2000000000000000 );
1976 zSig = ( aSig + bSig )<<1;
1977 --zExp;
1978 if ( (sbits64) zSig < 0 ) {
1979 zSig = aSig + bSig;
1980 ++zExp;
1982 roundAndPack:
1983 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
1988 -------------------------------------------------------------------------------
1989 Returns the result of subtracting the absolute values of the double-
1990 precision floating-point values `a' and `b'. If `zSign' is true, the
1991 difference is negated before being returned. `zSign' is ignored if the
1992 result is a NaN. The subtraction is performed according to the IEC/IEEE
1993 Standard for Binary Floating-point Arithmetic.
1994 -------------------------------------------------------------------------------
1996 static float64 subFloat64Sigs( struct roundingData *roundData, float64 a, float64 b, flag zSign )
1998 int16 aExp, bExp, zExp;
1999 bits64 aSig, bSig, zSig;
2000 int16 expDiff;
2002 aSig = extractFloat64Frac( a );
2003 aExp = extractFloat64Exp( a );
2004 bSig = extractFloat64Frac( b );
2005 bExp = extractFloat64Exp( b );
2006 expDiff = aExp - bExp;
2007 aSig <<= 10;
2008 bSig <<= 10;
2009 if ( 0 < expDiff ) goto aExpBigger;
2010 if ( expDiff < 0 ) goto bExpBigger;
2011 if ( aExp == 0x7FF ) {
2012 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2013 roundData->exception |= float_flag_invalid;
2014 return float64_default_nan;
2016 if ( aExp == 0 ) {
2017 aExp = 1;
2018 bExp = 1;
2020 if ( bSig < aSig ) goto aBigger;
2021 if ( aSig < bSig ) goto bBigger;
2022 return packFloat64( roundData->mode == float_round_down, 0, 0 );
2023 bExpBigger:
2024 if ( bExp == 0x7FF ) {
2025 if ( bSig ) return propagateFloat64NaN( a, b );
2026 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2028 if ( aExp == 0 ) {
2029 ++expDiff;
2031 else {
2032 aSig |= LIT64( 0x4000000000000000 );
2034 shift64RightJamming( aSig, - expDiff, &aSig );
2035 bSig |= LIT64( 0x4000000000000000 );
2036 bBigger:
2037 zSig = bSig - aSig;
2038 zExp = bExp;
2039 zSign ^= 1;
2040 goto normalizeRoundAndPack;
2041 aExpBigger:
2042 if ( aExp == 0x7FF ) {
2043 if ( aSig ) return propagateFloat64NaN( a, b );
2044 return a;
2046 if ( bExp == 0 ) {
2047 --expDiff;
2049 else {
2050 bSig |= LIT64( 0x4000000000000000 );
2052 shift64RightJamming( bSig, expDiff, &bSig );
2053 aSig |= LIT64( 0x4000000000000000 );
2054 aBigger:
2055 zSig = aSig - bSig;
2056 zExp = aExp;
2057 normalizeRoundAndPack:
2058 --zExp;
2059 return normalizeRoundAndPackFloat64( roundData, zSign, zExp, zSig );
2064 -------------------------------------------------------------------------------
2065 Returns the result of adding the double-precision floating-point values `a'
2066 and `b'. The operation is performed according to the IEC/IEEE Standard for
2067 Binary Floating-point Arithmetic.
2068 -------------------------------------------------------------------------------
2070 float64 float64_add( struct roundingData *roundData, float64 a, float64 b )
2072 flag aSign, bSign;
2074 aSign = extractFloat64Sign( a );
2075 bSign = extractFloat64Sign( b );
2076 if ( aSign == bSign ) {
2077 return addFloat64Sigs( roundData, a, b, aSign );
2079 else {
2080 return subFloat64Sigs( roundData, a, b, aSign );
2086 -------------------------------------------------------------------------------
2087 Returns the result of subtracting the double-precision floating-point values
2088 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2089 for Binary Floating-point Arithmetic.
2090 -------------------------------------------------------------------------------
2092 float64 float64_sub( struct roundingData *roundData, float64 a, float64 b )
2094 flag aSign, bSign;
2096 aSign = extractFloat64Sign( a );
2097 bSign = extractFloat64Sign( b );
2098 if ( aSign == bSign ) {
2099 return subFloat64Sigs( roundData, a, b, aSign );
2101 else {
2102 return addFloat64Sigs( roundData, a, b, aSign );
2108 -------------------------------------------------------------------------------
2109 Returns the result of multiplying the double-precision floating-point values
2110 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2111 for Binary Floating-point Arithmetic.
2112 -------------------------------------------------------------------------------
2114 float64 float64_mul( struct roundingData *roundData, float64 a, float64 b )
2116 flag aSign, bSign, zSign;
2117 int16 aExp, bExp, zExp;
2118 bits64 aSig, bSig, zSig0, zSig1;
2120 aSig = extractFloat64Frac( a );
2121 aExp = extractFloat64Exp( a );
2122 aSign = extractFloat64Sign( a );
2123 bSig = extractFloat64Frac( b );
2124 bExp = extractFloat64Exp( b );
2125 bSign = extractFloat64Sign( b );
2126 zSign = aSign ^ bSign;
2127 if ( aExp == 0x7FF ) {
2128 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2129 return propagateFloat64NaN( a, b );
2131 if ( ( bExp | bSig ) == 0 ) {
2132 roundData->exception |= float_flag_invalid;
2133 return float64_default_nan;
2135 return packFloat64( zSign, 0x7FF, 0 );
2137 if ( bExp == 0x7FF ) {
2138 if ( bSig ) return propagateFloat64NaN( a, b );
2139 if ( ( aExp | aSig ) == 0 ) {
2140 roundData->exception |= float_flag_invalid;
2141 return float64_default_nan;
2143 return packFloat64( zSign, 0x7FF, 0 );
2145 if ( aExp == 0 ) {
2146 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2147 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2149 if ( bExp == 0 ) {
2150 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2151 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2153 zExp = aExp + bExp - 0x3FF;
2154 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2155 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2156 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2157 zSig0 |= ( zSig1 != 0 );
2158 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2159 zSig0 <<= 1;
2160 --zExp;
2162 return roundAndPackFloat64( roundData, zSign, zExp, zSig0 );
2167 -------------------------------------------------------------------------------
2168 Returns the result of dividing the double-precision floating-point value `a'
2169 by the corresponding value `b'. The operation is performed according to
2170 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2171 -------------------------------------------------------------------------------
2173 float64 float64_div( struct roundingData *roundData, float64 a, float64 b )
2175 flag aSign, bSign, zSign;
2176 int16 aExp, bExp, zExp;
2177 bits64 aSig, bSig, zSig;
2178 bits64 rem0, rem1;
2179 bits64 term0, term1;
2181 aSig = extractFloat64Frac( a );
2182 aExp = extractFloat64Exp( a );
2183 aSign = extractFloat64Sign( a );
2184 bSig = extractFloat64Frac( b );
2185 bExp = extractFloat64Exp( b );
2186 bSign = extractFloat64Sign( b );
2187 zSign = aSign ^ bSign;
2188 if ( aExp == 0x7FF ) {
2189 if ( aSig ) return propagateFloat64NaN( a, b );
2190 if ( bExp == 0x7FF ) {
2191 if ( bSig ) return propagateFloat64NaN( a, b );
2192 roundData->exception |= float_flag_invalid;
2193 return float64_default_nan;
2195 return packFloat64( zSign, 0x7FF, 0 );
2197 if ( bExp == 0x7FF ) {
2198 if ( bSig ) return propagateFloat64NaN( a, b );
2199 return packFloat64( zSign, 0, 0 );
2201 if ( bExp == 0 ) {
2202 if ( bSig == 0 ) {
2203 if ( ( aExp | aSig ) == 0 ) {
2204 roundData->exception |= float_flag_invalid;
2205 return float64_default_nan;
2207 roundData->exception |= float_flag_divbyzero;
2208 return packFloat64( zSign, 0x7FF, 0 );
2210 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2212 if ( aExp == 0 ) {
2213 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2214 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2216 zExp = aExp - bExp + 0x3FD;
2217 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2218 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2219 if ( bSig <= ( aSig + aSig ) ) {
2220 aSig >>= 1;
2221 ++zExp;
2223 zSig = estimateDiv128To64( aSig, 0, bSig );
2224 if ( ( zSig & 0x1FF ) <= 2 ) {
2225 mul64To128( bSig, zSig, &term0, &term1 );
2226 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2227 while ( (sbits64) rem0 < 0 ) {
2228 --zSig;
2229 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2231 zSig |= ( rem1 != 0 );
2233 return roundAndPackFloat64( roundData, zSign, zExp, zSig );
2238 -------------------------------------------------------------------------------
2239 Returns the remainder of the double-precision floating-point value `a'
2240 with respect to the corresponding value `b'. The operation is performed
2241 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2242 -------------------------------------------------------------------------------
2244 float64 float64_rem( struct roundingData *roundData, float64 a, float64 b )
2246 flag aSign, bSign, zSign;
2247 int16 aExp, bExp, expDiff;
2248 bits64 aSig, bSig;
2249 bits64 q, alternateASig;
2250 sbits64 sigMean;
2252 aSig = extractFloat64Frac( a );
2253 aExp = extractFloat64Exp( a );
2254 aSign = extractFloat64Sign( a );
2255 bSig = extractFloat64Frac( b );
2256 bExp = extractFloat64Exp( b );
2257 bSign = extractFloat64Sign( b );
2258 if ( aExp == 0x7FF ) {
2259 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2260 return propagateFloat64NaN( a, b );
2262 roundData->exception |= float_flag_invalid;
2263 return float64_default_nan;
2265 if ( bExp == 0x7FF ) {
2266 if ( bSig ) return propagateFloat64NaN( a, b );
2267 return a;
2269 if ( bExp == 0 ) {
2270 if ( bSig == 0 ) {
2271 roundData->exception |= float_flag_invalid;
2272 return float64_default_nan;
2274 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2276 if ( aExp == 0 ) {
2277 if ( aSig == 0 ) return a;
2278 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2280 expDiff = aExp - bExp;
2281 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2282 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2283 if ( expDiff < 0 ) {
2284 if ( expDiff < -1 ) return a;
2285 aSig >>= 1;
2287 q = ( bSig <= aSig );
2288 if ( q ) aSig -= bSig;
2289 expDiff -= 64;
2290 while ( 0 < expDiff ) {
2291 q = estimateDiv128To64( aSig, 0, bSig );
2292 q = ( 2 < q ) ? q - 2 : 0;
2293 aSig = - ( ( bSig>>2 ) * q );
2294 expDiff -= 62;
2296 expDiff += 64;
2297 if ( 0 < expDiff ) {
2298 q = estimateDiv128To64( aSig, 0, bSig );
2299 q = ( 2 < q ) ? q - 2 : 0;
2300 q >>= 64 - expDiff;
2301 bSig >>= 2;
2302 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2304 else {
2305 aSig >>= 2;
2306 bSig >>= 2;
2308 do {
2309 alternateASig = aSig;
2310 ++q;
2311 aSig -= bSig;
2312 } while ( 0 <= (sbits64) aSig );
2313 sigMean = aSig + alternateASig;
2314 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2315 aSig = alternateASig;
2317 zSign = ( (sbits64) aSig < 0 );
2318 if ( zSign ) aSig = - aSig;
2319 return normalizeRoundAndPackFloat64( roundData, aSign ^ zSign, bExp, aSig );
2324 -------------------------------------------------------------------------------
2325 Returns the square root of the double-precision floating-point value `a'.
2326 The operation is performed according to the IEC/IEEE Standard for Binary
2327 Floating-point Arithmetic.
2328 -------------------------------------------------------------------------------
2330 float64 float64_sqrt( struct roundingData *roundData, float64 a )
2332 flag aSign;
2333 int16 aExp, zExp;
2334 bits64 aSig, zSig;
2335 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2336 //float64 z;
2338 aSig = extractFloat64Frac( a );
2339 aExp = extractFloat64Exp( a );
2340 aSign = extractFloat64Sign( a );
2341 if ( aExp == 0x7FF ) {
2342 if ( aSig ) return propagateFloat64NaN( a, a );
2343 if ( ! aSign ) return a;
2344 roundData->exception |= float_flag_invalid;
2345 return float64_default_nan;
2347 if ( aSign ) {
2348 if ( ( aExp | aSig ) == 0 ) return a;
2349 roundData->exception |= float_flag_invalid;
2350 return float64_default_nan;
2352 if ( aExp == 0 ) {
2353 if ( aSig == 0 ) return 0;
2354 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2356 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2357 aSig |= LIT64( 0x0010000000000000 );
2358 zSig = estimateSqrt32( aExp, aSig>>21 );
2359 zSig <<= 31;
2360 aSig <<= 9 - ( aExp & 1 );
2361 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2362 if ( ( zSig & 0x3FF ) <= 5 ) {
2363 if ( zSig < 2 ) {
2364 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2366 else {
2367 aSig <<= 2;
2368 mul64To128( zSig, zSig, &term0, &term1 );
2369 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2370 while ( (sbits64) rem0 < 0 ) {
2371 --zSig;
2372 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2373 term1 |= 1;
2374 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2376 zSig |= ( ( rem0 | rem1 ) != 0 );
2379 shift64RightJamming( zSig, 1, &zSig );
2380 return roundAndPackFloat64( roundData, 0, zExp, zSig );
2385 -------------------------------------------------------------------------------
2386 Returns 1 if the double-precision floating-point value `a' is equal to the
2387 corresponding value `b', and 0 otherwise. The comparison is performed
2388 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2389 -------------------------------------------------------------------------------
2391 flag float64_eq( float64 a, float64 b )
2394 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2395 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2397 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2398 float_raise( float_flag_invalid );
2400 return 0;
2402 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2407 -------------------------------------------------------------------------------
2408 Returns 1 if the double-precision floating-point value `a' is less than or
2409 equal to the corresponding value `b', and 0 otherwise. The comparison is
2410 performed according to the IEC/IEEE Standard for Binary Floating-point
2411 Arithmetic.
2412 -------------------------------------------------------------------------------
2414 flag float64_le( float64 a, float64 b )
2416 flag aSign, bSign;
2418 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2419 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2421 float_raise( float_flag_invalid );
2422 return 0;
2424 aSign = extractFloat64Sign( a );
2425 bSign = extractFloat64Sign( b );
2426 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2427 return ( a == b ) || ( aSign ^ ( a < b ) );
2432 -------------------------------------------------------------------------------
2433 Returns 1 if the double-precision floating-point value `a' is less than
2434 the corresponding value `b', and 0 otherwise. The comparison is performed
2435 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2436 -------------------------------------------------------------------------------
2438 flag float64_lt( float64 a, float64 b )
2440 flag aSign, bSign;
2442 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2443 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2445 float_raise( float_flag_invalid );
2446 return 0;
2448 aSign = extractFloat64Sign( a );
2449 bSign = extractFloat64Sign( b );
2450 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2451 return ( a != b ) && ( aSign ^ ( a < b ) );
2456 -------------------------------------------------------------------------------
2457 Returns 1 if the double-precision floating-point value `a' is equal to the
2458 corresponding value `b', and 0 otherwise. The invalid exception is raised
2459 if either operand is a NaN. Otherwise, the comparison is performed
2460 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2461 -------------------------------------------------------------------------------
2463 flag float64_eq_signaling( float64 a, float64 b )
2466 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2467 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2469 float_raise( float_flag_invalid );
2470 return 0;
2472 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2477 -------------------------------------------------------------------------------
2478 Returns 1 if the double-precision floating-point value `a' is less than or
2479 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2480 cause an exception. Otherwise, the comparison is performed according to the
2481 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2482 -------------------------------------------------------------------------------
2484 flag float64_le_quiet( float64 a, float64 b )
2486 flag aSign, bSign;
2487 //int16 aExp, bExp;
2489 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2490 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2492 /* Do nothing, even if NaN as we're quiet */
2493 return 0;
2495 aSign = extractFloat64Sign( a );
2496 bSign = extractFloat64Sign( b );
2497 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2498 return ( a == b ) || ( aSign ^ ( a < b ) );
2503 -------------------------------------------------------------------------------
2504 Returns 1 if the double-precision floating-point value `a' is less than
2505 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2506 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2507 Standard for Binary Floating-point Arithmetic.
2508 -------------------------------------------------------------------------------
2510 flag float64_lt_quiet( float64 a, float64 b )
2512 flag aSign, bSign;
2514 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2515 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2517 /* Do nothing, even if NaN as we're quiet */
2518 return 0;
2520 aSign = extractFloat64Sign( a );
2521 bSign = extractFloat64Sign( b );
2522 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2523 return ( a != b ) && ( aSign ^ ( a < b ) );
2527 #ifdef FLOATX80
2530 -------------------------------------------------------------------------------
2531 Returns the result of converting the extended double-precision floating-
2532 point value `a' to the 32-bit two's complement integer format. The
2533 conversion is performed according to the IEC/IEEE Standard for Binary
2534 Floating-point Arithmetic---which means in particular that the conversion
2535 is rounded according to the current rounding mode. If `a' is a NaN, the
2536 largest positive integer is returned. Otherwise, if the conversion
2537 overflows, the largest integer with the same sign as `a' is returned.
2538 -------------------------------------------------------------------------------
2540 int32 floatx80_to_int32( struct roundingData *roundData, floatx80 a )
2542 flag aSign;
2543 int32 aExp, shiftCount;
2544 bits64 aSig;
2546 aSig = extractFloatx80Frac( a );
2547 aExp = extractFloatx80Exp( a );
2548 aSign = extractFloatx80Sign( a );
2549 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2550 shiftCount = 0x4037 - aExp;
2551 if ( shiftCount <= 0 ) shiftCount = 1;
2552 shift64RightJamming( aSig, shiftCount, &aSig );
2553 return roundAndPackInt32( roundData, aSign, aSig );
2558 -------------------------------------------------------------------------------
2559 Returns the result of converting the extended double-precision floating-
2560 point value `a' to the 32-bit two's complement integer format. The
2561 conversion is performed according to the IEC/IEEE Standard for Binary
2562 Floating-point Arithmetic, except that the conversion is always rounded
2563 toward zero. If `a' is a NaN, the largest positive integer is returned.
2564 Otherwise, if the conversion overflows, the largest integer with the same
2565 sign as `a' is returned.
2566 -------------------------------------------------------------------------------
2568 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2570 flag aSign;
2571 int32 aExp, shiftCount;
2572 bits64 aSig, savedASig;
2573 int32 z;
2575 aSig = extractFloatx80Frac( a );
2576 aExp = extractFloatx80Exp( a );
2577 aSign = extractFloatx80Sign( a );
2578 shiftCount = 0x403E - aExp;
2579 if ( shiftCount < 32 ) {
2580 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2581 goto invalid;
2583 else if ( 63 < shiftCount ) {
2584 if ( aExp || aSig ) float_raise( float_flag_inexact );
2585 return 0;
2587 savedASig = aSig;
2588 aSig >>= shiftCount;
2589 z = aSig;
2590 if ( aSign ) z = - z;
2591 if ( ( z < 0 ) ^ aSign ) {
2592 invalid:
2593 float_raise( float_flag_invalid );
2594 return aSign ? 0x80000000 : 0x7FFFFFFF;
2596 if ( ( aSig<<shiftCount ) != savedASig ) {
2597 float_raise( float_flag_inexact );
2599 return z;
2604 -------------------------------------------------------------------------------
2605 Returns the result of converting the extended double-precision floating-
2606 point value `a' to the single-precision floating-point format. The
2607 conversion is performed according to the IEC/IEEE Standard for Binary
2608 Floating-point Arithmetic.
2609 -------------------------------------------------------------------------------
2611 float32 floatx80_to_float32( struct roundingData *roundData, floatx80 a )
2613 flag aSign;
2614 int32 aExp;
2615 bits64 aSig;
2617 aSig = extractFloatx80Frac( a );
2618 aExp = extractFloatx80Exp( a );
2619 aSign = extractFloatx80Sign( a );
2620 if ( aExp == 0x7FFF ) {
2621 if ( (bits64) ( aSig<<1 ) ) {
2622 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2624 return packFloat32( aSign, 0xFF, 0 );
2626 shift64RightJamming( aSig, 33, &aSig );
2627 if ( aExp || aSig ) aExp -= 0x3F81;
2628 return roundAndPackFloat32( roundData, aSign, aExp, aSig );
2633 -------------------------------------------------------------------------------
2634 Returns the result of converting the extended double-precision floating-
2635 point value `a' to the double-precision floating-point format. The
2636 conversion is performed according to the IEC/IEEE Standard for Binary
2637 Floating-point Arithmetic.
2638 -------------------------------------------------------------------------------
2640 float64 floatx80_to_float64( struct roundingData *roundData, floatx80 a )
2642 flag aSign;
2643 int32 aExp;
2644 bits64 aSig, zSig;
2646 aSig = extractFloatx80Frac( a );
2647 aExp = extractFloatx80Exp( a );
2648 aSign = extractFloatx80Sign( a );
2649 if ( aExp == 0x7FFF ) {
2650 if ( (bits64) ( aSig<<1 ) ) {
2651 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2653 return packFloat64( aSign, 0x7FF, 0 );
2655 shift64RightJamming( aSig, 1, &zSig );
2656 if ( aExp || aSig ) aExp -= 0x3C01;
2657 return roundAndPackFloat64( roundData, aSign, aExp, zSig );
2662 -------------------------------------------------------------------------------
2663 Rounds the extended double-precision floating-point value `a' to an integer,
2664 and returns the result as an extended quadruple-precision floating-point
2665 value. The operation is performed according to the IEC/IEEE Standard for
2666 Binary Floating-point Arithmetic.
2667 -------------------------------------------------------------------------------
2669 floatx80 floatx80_round_to_int( struct roundingData *roundData, floatx80 a )
2671 flag aSign;
2672 int32 aExp;
2673 bits64 lastBitMask, roundBitsMask;
2674 int8 roundingMode;
2675 floatx80 z;
2677 aExp = extractFloatx80Exp( a );
2678 if ( 0x403E <= aExp ) {
2679 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2680 return propagateFloatx80NaN( a, a );
2682 return a;
2684 if ( aExp <= 0x3FFE ) {
2685 if ( ( aExp == 0 )
2686 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2687 return a;
2689 roundData->exception |= float_flag_inexact;
2690 aSign = extractFloatx80Sign( a );
2691 switch ( roundData->mode ) {
2692 case float_round_nearest_even:
2693 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2695 return
2696 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2698 break;
2699 case float_round_down:
2700 return
2701 aSign ?
2702 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2703 : packFloatx80( 0, 0, 0 );
2704 case float_round_up:
2705 return
2706 aSign ? packFloatx80( 1, 0, 0 )
2707 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2709 return packFloatx80( aSign, 0, 0 );
2711 lastBitMask = 1;
2712 lastBitMask <<= 0x403E - aExp;
2713 roundBitsMask = lastBitMask - 1;
2714 z = a;
2715 roundingMode = roundData->mode;
2716 if ( roundingMode == float_round_nearest_even ) {
2717 z.low += lastBitMask>>1;
2718 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2720 else if ( roundingMode != float_round_to_zero ) {
2721 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2722 z.low += roundBitsMask;
2725 z.low &= ~ roundBitsMask;
2726 if ( z.low == 0 ) {
2727 ++z.high;
2728 z.low = LIT64( 0x8000000000000000 );
2730 if ( z.low != a.low ) roundData->exception |= float_flag_inexact;
2731 return z;
2736 -------------------------------------------------------------------------------
2737 Returns the result of adding the absolute values of the extended double-
2738 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2739 negated before being returned. `zSign' is ignored if the result is a NaN.
2740 The addition is performed according to the IEC/IEEE Standard for Binary
2741 Floating-point Arithmetic.
2742 -------------------------------------------------------------------------------
2744 static floatx80 addFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2746 int32 aExp, bExp, zExp;
2747 bits64 aSig, bSig, zSig0, zSig1;
2748 int32 expDiff;
2750 aSig = extractFloatx80Frac( a );
2751 aExp = extractFloatx80Exp( a );
2752 bSig = extractFloatx80Frac( b );
2753 bExp = extractFloatx80Exp( b );
2754 expDiff = aExp - bExp;
2755 if ( 0 < expDiff ) {
2756 if ( aExp == 0x7FFF ) {
2757 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2758 return a;
2760 if ( bExp == 0 ) --expDiff;
2761 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2762 zExp = aExp;
2764 else if ( expDiff < 0 ) {
2765 if ( bExp == 0x7FFF ) {
2766 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2767 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2769 if ( aExp == 0 ) ++expDiff;
2770 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2771 zExp = bExp;
2773 else {
2774 if ( aExp == 0x7FFF ) {
2775 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2776 return propagateFloatx80NaN( a, b );
2778 return a;
2780 zSig1 = 0;
2781 zSig0 = aSig + bSig;
2782 if ( aExp == 0 ) {
2783 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2784 goto roundAndPack;
2786 zExp = aExp;
2787 goto shiftRight1;
2790 zSig0 = aSig + bSig;
2792 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2793 shiftRight1:
2794 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2795 zSig0 |= LIT64( 0x8000000000000000 );
2796 ++zExp;
2797 roundAndPack:
2798 return
2799 roundAndPackFloatx80(
2800 roundData, zSign, zExp, zSig0, zSig1 );
2805 -------------------------------------------------------------------------------
2806 Returns the result of subtracting the absolute values of the extended
2807 double-precision floating-point values `a' and `b'. If `zSign' is true,
2808 the difference is negated before being returned. `zSign' is ignored if the
2809 result is a NaN. The subtraction is performed according to the IEC/IEEE
2810 Standard for Binary Floating-point Arithmetic.
2811 -------------------------------------------------------------------------------
2813 static floatx80 subFloatx80Sigs( struct roundingData *roundData, floatx80 a, floatx80 b, flag zSign )
2815 int32 aExp, bExp, zExp;
2816 bits64 aSig, bSig, zSig0, zSig1;
2817 int32 expDiff;
2818 floatx80 z;
2820 aSig = extractFloatx80Frac( a );
2821 aExp = extractFloatx80Exp( a );
2822 bSig = extractFloatx80Frac( b );
2823 bExp = extractFloatx80Exp( b );
2824 expDiff = aExp - bExp;
2825 if ( 0 < expDiff ) goto aExpBigger;
2826 if ( expDiff < 0 ) goto bExpBigger;
2827 if ( aExp == 0x7FFF ) {
2828 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2829 return propagateFloatx80NaN( a, b );
2831 roundData->exception |= float_flag_invalid;
2832 z.low = floatx80_default_nan_low;
2833 z.high = floatx80_default_nan_high;
2834 return z;
2836 if ( aExp == 0 ) {
2837 aExp = 1;
2838 bExp = 1;
2840 zSig1 = 0;
2841 if ( bSig < aSig ) goto aBigger;
2842 if ( aSig < bSig ) goto bBigger;
2843 return packFloatx80( roundData->mode == float_round_down, 0, 0 );
2844 bExpBigger:
2845 if ( bExp == 0x7FFF ) {
2846 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2847 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2849 if ( aExp == 0 ) ++expDiff;
2850 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2851 bBigger:
2852 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2853 zExp = bExp;
2854 zSign ^= 1;
2855 goto normalizeRoundAndPack;
2856 aExpBigger:
2857 if ( aExp == 0x7FFF ) {
2858 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2859 return a;
2861 if ( bExp == 0 ) --expDiff;
2862 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2863 aBigger:
2864 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2865 zExp = aExp;
2866 normalizeRoundAndPack:
2867 return
2868 normalizeRoundAndPackFloatx80(
2869 roundData, zSign, zExp, zSig0, zSig1 );
2874 -------------------------------------------------------------------------------
2875 Returns the result of adding the extended double-precision floating-point
2876 values `a' and `b'. The operation is performed according to the IEC/IEEE
2877 Standard for Binary Floating-point Arithmetic.
2878 -------------------------------------------------------------------------------
2880 floatx80 floatx80_add( struct roundingData *roundData, floatx80 a, floatx80 b )
2882 flag aSign, bSign;
2884 aSign = extractFloatx80Sign( a );
2885 bSign = extractFloatx80Sign( b );
2886 if ( aSign == bSign ) {
2887 return addFloatx80Sigs( roundData, a, b, aSign );
2889 else {
2890 return subFloatx80Sigs( roundData, a, b, aSign );
2896 -------------------------------------------------------------------------------
2897 Returns the result of subtracting the extended double-precision floating-
2898 point values `a' and `b'. The operation is performed according to the
2899 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2900 -------------------------------------------------------------------------------
2902 floatx80 floatx80_sub( struct roundingData *roundData, floatx80 a, floatx80 b )
2904 flag aSign, bSign;
2906 aSign = extractFloatx80Sign( a );
2907 bSign = extractFloatx80Sign( b );
2908 if ( aSign == bSign ) {
2909 return subFloatx80Sigs( roundData, a, b, aSign );
2911 else {
2912 return addFloatx80Sigs( roundData, a, b, aSign );
2918 -------------------------------------------------------------------------------
2919 Returns the result of multiplying the extended double-precision floating-
2920 point values `a' and `b'. The operation is performed according to the
2921 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2922 -------------------------------------------------------------------------------
2924 floatx80 floatx80_mul( struct roundingData *roundData, floatx80 a, floatx80 b )
2926 flag aSign, bSign, zSign;
2927 int32 aExp, bExp, zExp;
2928 bits64 aSig, bSig, zSig0, zSig1;
2929 floatx80 z;
2931 aSig = extractFloatx80Frac( a );
2932 aExp = extractFloatx80Exp( a );
2933 aSign = extractFloatx80Sign( a );
2934 bSig = extractFloatx80Frac( b );
2935 bExp = extractFloatx80Exp( b );
2936 bSign = extractFloatx80Sign( b );
2937 zSign = aSign ^ bSign;
2938 if ( aExp == 0x7FFF ) {
2939 if ( (bits64) ( aSig<<1 )
2940 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2941 return propagateFloatx80NaN( a, b );
2943 if ( ( bExp | bSig ) == 0 ) goto invalid;
2944 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2946 if ( bExp == 0x7FFF ) {
2947 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2948 if ( ( aExp | aSig ) == 0 ) {
2949 invalid:
2950 roundData->exception |= float_flag_invalid;
2951 z.low = floatx80_default_nan_low;
2952 z.high = floatx80_default_nan_high;
2953 return z;
2955 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2957 if ( aExp == 0 ) {
2958 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2959 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2961 if ( bExp == 0 ) {
2962 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2963 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2965 zExp = aExp + bExp - 0x3FFE;
2966 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2967 if ( 0 < (sbits64) zSig0 ) {
2968 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2969 --zExp;
2971 return
2972 roundAndPackFloatx80(
2973 roundData, zSign, zExp, zSig0, zSig1 );
2978 -------------------------------------------------------------------------------
2979 Returns the result of dividing the extended double-precision floating-point
2980 value `a' by the corresponding value `b'. The operation is performed
2981 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2982 -------------------------------------------------------------------------------
2984 floatx80 floatx80_div( struct roundingData *roundData, floatx80 a, floatx80 b )
2986 flag aSign, bSign, zSign;
2987 int32 aExp, bExp, zExp;
2988 bits64 aSig, bSig, zSig0, zSig1;
2989 bits64 rem0, rem1, rem2, term0, term1, term2;
2990 floatx80 z;
2992 aSig = extractFloatx80Frac( a );
2993 aExp = extractFloatx80Exp( a );
2994 aSign = extractFloatx80Sign( a );
2995 bSig = extractFloatx80Frac( b );
2996 bExp = extractFloatx80Exp( b );
2997 bSign = extractFloatx80Sign( b );
2998 zSign = aSign ^ bSign;
2999 if ( aExp == 0x7FFF ) {
3000 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3001 if ( bExp == 0x7FFF ) {
3002 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3003 goto invalid;
3005 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3007 if ( bExp == 0x7FFF ) {
3008 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3009 return packFloatx80( zSign, 0, 0 );
3011 if ( bExp == 0 ) {
3012 if ( bSig == 0 ) {
3013 if ( ( aExp | aSig ) == 0 ) {
3014 invalid:
3015 roundData->exception |= float_flag_invalid;
3016 z.low = floatx80_default_nan_low;
3017 z.high = floatx80_default_nan_high;
3018 return z;
3020 roundData->exception |= float_flag_divbyzero;
3021 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3023 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3025 if ( aExp == 0 ) {
3026 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3027 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3029 zExp = aExp - bExp + 0x3FFE;
3030 rem1 = 0;
3031 if ( bSig <= aSig ) {
3032 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3033 ++zExp;
3035 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3036 mul64To128( bSig, zSig0, &term0, &term1 );
3037 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3038 while ( (sbits64) rem0 < 0 ) {
3039 --zSig0;
3040 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3042 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3043 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3044 mul64To128( bSig, zSig1, &term1, &term2 );
3045 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3046 while ( (sbits64) rem1 < 0 ) {
3047 --zSig1;
3048 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3050 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3052 return
3053 roundAndPackFloatx80(
3054 roundData, zSign, zExp, zSig0, zSig1 );
3059 -------------------------------------------------------------------------------
3060 Returns the remainder of the extended double-precision floating-point value
3061 `a' with respect to the corresponding value `b'. The operation is performed
3062 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3063 -------------------------------------------------------------------------------
3065 floatx80 floatx80_rem( struct roundingData *roundData, floatx80 a, floatx80 b )
3067 flag aSign, bSign, zSign;
3068 int32 aExp, bExp, expDiff;
3069 bits64 aSig0, aSig1, bSig;
3070 bits64 q, term0, term1, alternateASig0, alternateASig1;
3071 floatx80 z;
3073 aSig0 = extractFloatx80Frac( a );
3074 aExp = extractFloatx80Exp( a );
3075 aSign = extractFloatx80Sign( a );
3076 bSig = extractFloatx80Frac( b );
3077 bExp = extractFloatx80Exp( b );
3078 bSign = extractFloatx80Sign( b );
3079 if ( aExp == 0x7FFF ) {
3080 if ( (bits64) ( aSig0<<1 )
3081 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3082 return propagateFloatx80NaN( a, b );
3084 goto invalid;
3086 if ( bExp == 0x7FFF ) {
3087 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3088 return a;
3090 if ( bExp == 0 ) {
3091 if ( bSig == 0 ) {
3092 invalid:
3093 roundData->exception |= float_flag_invalid;
3094 z.low = floatx80_default_nan_low;
3095 z.high = floatx80_default_nan_high;
3096 return z;
3098 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3100 if ( aExp == 0 ) {
3101 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3102 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3104 bSig |= LIT64( 0x8000000000000000 );
3105 zSign = aSign;
3106 expDiff = aExp - bExp;
3107 aSig1 = 0;
3108 if ( expDiff < 0 ) {
3109 if ( expDiff < -1 ) return a;
3110 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3111 expDiff = 0;
3113 q = ( bSig <= aSig0 );
3114 if ( q ) aSig0 -= bSig;
3115 expDiff -= 64;
3116 while ( 0 < expDiff ) {
3117 q = estimateDiv128To64( aSig0, aSig1, bSig );
3118 q = ( 2 < q ) ? q - 2 : 0;
3119 mul64To128( bSig, q, &term0, &term1 );
3120 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3121 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3122 expDiff -= 62;
3124 expDiff += 64;
3125 if ( 0 < expDiff ) {
3126 q = estimateDiv128To64( aSig0, aSig1, bSig );
3127 q = ( 2 < q ) ? q - 2 : 0;
3128 q >>= 64 - expDiff;
3129 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3130 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3131 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3132 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3133 ++q;
3134 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3137 else {
3138 term1 = 0;
3139 term0 = bSig;
3141 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3142 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3143 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3144 && ( q & 1 ) )
3146 aSig0 = alternateASig0;
3147 aSig1 = alternateASig1;
3148 zSign = ! zSign;
3151 return
3152 normalizeRoundAndPackFloatx80(
3153 roundData, zSign, bExp + expDiff, aSig0, aSig1 );
3158 -------------------------------------------------------------------------------
3159 Returns the square root of the extended double-precision floating-point
3160 value `a'. The operation is performed according to the IEC/IEEE Standard
3161 for Binary Floating-point Arithmetic.
3162 -------------------------------------------------------------------------------
3164 floatx80 floatx80_sqrt( struct roundingData *roundData, floatx80 a )
3166 flag aSign;
3167 int32 aExp, zExp;
3168 bits64 aSig0, aSig1, zSig0, zSig1;
3169 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3170 bits64 shiftedRem0, shiftedRem1;
3171 floatx80 z;
3173 aSig0 = extractFloatx80Frac( a );
3174 aExp = extractFloatx80Exp( a );
3175 aSign = extractFloatx80Sign( a );
3176 if ( aExp == 0x7FFF ) {
3177 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3178 if ( ! aSign ) return a;
3179 goto invalid;
3181 if ( aSign ) {
3182 if ( ( aExp | aSig0 ) == 0 ) return a;
3183 invalid:
3184 roundData->exception |= float_flag_invalid;
3185 z.low = floatx80_default_nan_low;
3186 z.high = floatx80_default_nan_high;
3187 return z;
3189 if ( aExp == 0 ) {
3190 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3191 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3193 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3194 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3195 zSig0 <<= 31;
3196 aSig1 = 0;
3197 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3198 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3199 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3200 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3201 mul64To128( zSig0, zSig0, &term0, &term1 );
3202 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3203 while ( (sbits64) rem0 < 0 ) {
3204 --zSig0;
3205 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3206 term1 |= 1;
3207 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3209 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3210 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3211 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3212 if ( zSig1 == 0 ) zSig1 = 1;
3213 mul64To128( zSig0, zSig1, &term1, &term2 );
3214 shortShift128Left( term1, term2, 1, &term1, &term2 );
3215 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3216 mul64To128( zSig1, zSig1, &term2, &term3 );
3217 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3218 while ( (sbits64) rem1 < 0 ) {
3219 --zSig1;
3220 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3221 term3 |= 1;
3222 add192(
3223 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3225 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3227 return
3228 roundAndPackFloatx80(
3229 roundData, 0, zExp, zSig0, zSig1 );
3234 -------------------------------------------------------------------------------
3235 Returns 1 if the extended double-precision floating-point value `a' is
3236 equal to the corresponding value `b', and 0 otherwise. The comparison is
3237 performed according to the IEC/IEEE Standard for Binary Floating-point
3238 Arithmetic.
3239 -------------------------------------------------------------------------------
3241 flag floatx80_eq( floatx80 a, floatx80 b )
3244 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3245 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3246 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3247 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3249 if ( floatx80_is_signaling_nan( a )
3250 || floatx80_is_signaling_nan( b ) ) {
3251 float_raise( float_flag_invalid );
3253 return 0;
3255 return
3256 ( a.low == b.low )
3257 && ( ( a.high == b.high )
3258 || ( ( a.low == 0 )
3259 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3265 -------------------------------------------------------------------------------
3266 Returns 1 if the extended double-precision floating-point value `a' is
3267 less than or equal to the corresponding value `b', and 0 otherwise. The
3268 comparison is performed according to the IEC/IEEE Standard for Binary
3269 Floating-point Arithmetic.
3270 -------------------------------------------------------------------------------
3272 flag floatx80_le( floatx80 a, floatx80 b )
3274 flag aSign, bSign;
3276 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3277 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3278 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3279 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3281 float_raise( float_flag_invalid );
3282 return 0;
3284 aSign = extractFloatx80Sign( a );
3285 bSign = extractFloatx80Sign( b );
3286 if ( aSign != bSign ) {
3287 return
3288 aSign
3289 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3290 == 0 );
3292 return
3293 aSign ? le128( b.high, b.low, a.high, a.low )
3294 : le128( a.high, a.low, b.high, b.low );
3299 -------------------------------------------------------------------------------
3300 Returns 1 if the extended double-precision floating-point value `a' is
3301 less than the corresponding value `b', and 0 otherwise. The comparison
3302 is performed according to the IEC/IEEE Standard for Binary Floating-point
3303 Arithmetic.
3304 -------------------------------------------------------------------------------
3306 flag floatx80_lt( floatx80 a, floatx80 b )
3308 flag aSign, bSign;
3310 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3311 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3312 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3313 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3315 float_raise( float_flag_invalid );
3316 return 0;
3318 aSign = extractFloatx80Sign( a );
3319 bSign = extractFloatx80Sign( b );
3320 if ( aSign != bSign ) {
3321 return
3322 aSign
3323 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3324 != 0 );
3326 return
3327 aSign ? lt128( b.high, b.low, a.high, a.low )
3328 : lt128( a.high, a.low, b.high, b.low );
3333 -------------------------------------------------------------------------------
3334 Returns 1 if the extended double-precision floating-point value `a' is equal
3335 to the corresponding value `b', and 0 otherwise. The invalid exception is
3336 raised if either operand is a NaN. Otherwise, the comparison is performed
3337 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3338 -------------------------------------------------------------------------------
3340 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3343 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3344 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3345 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3346 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3348 float_raise( float_flag_invalid );
3349 return 0;
3351 return
3352 ( a.low == b.low )
3353 && ( ( a.high == b.high )
3354 || ( ( a.low == 0 )
3355 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3361 -------------------------------------------------------------------------------
3362 Returns 1 if the extended double-precision floating-point value `a' is less
3363 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3364 do not cause an exception. Otherwise, the comparison is performed according
3365 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3366 -------------------------------------------------------------------------------
3368 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3370 flag aSign, bSign;
3372 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3373 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3374 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3375 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3377 /* Do nothing, even if NaN as we're quiet */
3378 return 0;
3380 aSign = extractFloatx80Sign( a );
3381 bSign = extractFloatx80Sign( b );
3382 if ( aSign != bSign ) {
3383 return
3384 aSign
3385 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3386 == 0 );
3388 return
3389 aSign ? le128( b.high, b.low, a.high, a.low )
3390 : le128( a.high, a.low, b.high, b.low );
3395 -------------------------------------------------------------------------------
3396 Returns 1 if the extended double-precision floating-point value `a' is less
3397 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3398 an exception. Otherwise, the comparison is performed according to the
3399 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3400 -------------------------------------------------------------------------------
3402 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3404 flag aSign, bSign;
3406 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3407 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3408 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3409 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3411 /* Do nothing, even if NaN as we're quiet */
3412 return 0;
3414 aSign = extractFloatx80Sign( a );
3415 bSign = extractFloatx80Sign( b );
3416 if ( aSign != bSign ) {
3417 return
3418 aSign
3419 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3420 != 0 );
3422 return
3423 aSign ? lt128( b.high, b.low, a.high, a.low )
3424 : lt128( a.high, a.low, b.high, b.low );
3428 #endif