* add p cc
[mascara-docs.git] / i386 / linux / linux-2.3.21 / arch / arm / nwfpe / softfloat.c
bloba7fc76cc8d38ef64d1c52a1093e3b9ce9aa4dadd
1 /*
2 ===============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
28 ===============================================================================
31 #include "milieu.h"
32 #include "softfloat.h"
35 -------------------------------------------------------------------------------
36 Floating-point rounding mode, extended double-precision rounding precision,
37 and exception flags.
38 -------------------------------------------------------------------------------
40 int8 float_rounding_mode = float_round_nearest_even;
41 int8 floatx80_rounding_precision = 80;
42 int8 float_exception_flags = 0;
45 -------------------------------------------------------------------------------
46 Primitive arithmetic functions, including multi-word arithmetic, and
47 division and square root approximations. (Can be specialized to target if
48 desired.)
49 -------------------------------------------------------------------------------
51 #include "softfloat-macros"
54 -------------------------------------------------------------------------------
55 Functions and definitions to determine: (1) whether tininess for underflow
56 is detected before or after rounding by default, (2) what (if anything)
57 happens when exceptions are raised, (3) how signaling NaNs are distinguished
58 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
59 are propagated from function inputs to output. These details are target-
60 specific.
61 -------------------------------------------------------------------------------
63 #include "softfloat-specialize"
66 -------------------------------------------------------------------------------
67 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
68 and 7, and returns the properly rounded 32-bit integer corresponding to the
69 input. If `zSign' is nonzero, the input is negated before being converted
70 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
71 input is simply rounded to an integer, with the inexact exception raised if
72 the input cannot be represented exactly as an integer. If the fixed-point
73 input is too large, however, the invalid exception is raised and the largest
74 positive or negative integer is returned.
75 -------------------------------------------------------------------------------
77 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
79 int8 roundingMode;
80 flag roundNearestEven;
81 int8 roundIncrement, roundBits;
82 int32 z;
84 roundingMode = float_rounding_mode;
85 roundNearestEven = ( roundingMode == float_round_nearest_even );
86 roundIncrement = 0x40;
87 if ( ! roundNearestEven ) {
88 if ( roundingMode == float_round_to_zero ) {
89 roundIncrement = 0;
91 else {
92 roundIncrement = 0x7F;
93 if ( zSign ) {
94 if ( roundingMode == float_round_up ) roundIncrement = 0;
96 else {
97 if ( roundingMode == float_round_down ) roundIncrement = 0;
101 roundBits = absZ & 0x7F;
102 absZ = ( absZ + roundIncrement )>>7;
103 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
104 z = absZ;
105 if ( zSign ) z = - z;
106 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
107 float_exception_flags |= float_flag_invalid;
108 return zSign ? 0x80000000 : 0x7FFFFFFF;
110 if ( roundBits ) float_exception_flags |= float_flag_inexact;
111 return z;
116 -------------------------------------------------------------------------------
117 Returns the fraction bits of the single-precision floating-point value `a'.
118 -------------------------------------------------------------------------------
120 INLINE bits32 extractFloat32Frac( float32 a )
123 return a & 0x007FFFFF;
128 -------------------------------------------------------------------------------
129 Returns the exponent bits of the single-precision floating-point value `a'.
130 -------------------------------------------------------------------------------
132 INLINE int16 extractFloat32Exp( float32 a )
135 return ( a>>23 ) & 0xFF;
140 -------------------------------------------------------------------------------
141 Returns the sign bit of the single-precision floating-point value `a'.
142 -------------------------------------------------------------------------------
144 INLINE flag extractFloat32Sign( float32 a )
147 return a>>31;
152 -------------------------------------------------------------------------------
153 Normalizes the subnormal single-precision floating-point value represented
154 by the denormalized significand `aSig'. The normalized exponent and
155 significand are stored at the locations pointed to by `zExpPtr' and
156 `zSigPtr', respectively.
157 -------------------------------------------------------------------------------
159 static void
160 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
162 int8 shiftCount;
164 shiftCount = countLeadingZeros32( aSig ) - 8;
165 *zSigPtr = aSig<<shiftCount;
166 *zExpPtr = 1 - shiftCount;
171 -------------------------------------------------------------------------------
172 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
173 single-precision floating-point value, returning the result. After being
174 shifted into the proper positions, the three fields are simply added
175 together to form the result. This means that any integer portion of `zSig'
176 will be added into the exponent. Since a properly normalized significand
177 will have an integer portion equal to 1, the `zExp' input should be 1 less
178 than the desired result exponent whenever `zSig' is a complete, normalized
179 significand.
180 -------------------------------------------------------------------------------
182 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
184 #if 0
185 float32 f;
186 __asm__("@ packFloat32;
187 mov %0, %1, asl #31;
188 orr %0, %2, asl #23;
189 orr %0, %3"
190 : /* no outputs */
191 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
192 : "cc");
193 return f;
194 #else
195 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
196 #endif
200 -------------------------------------------------------------------------------
201 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
202 and significand `zSig', and returns the proper single-precision floating-
203 point value corresponding to the abstract input. Ordinarily, the abstract
204 value is simply rounded and packed into the single-precision format, with
205 the inexact exception raised if the abstract input cannot be represented
206 exactly. If the abstract value is too large, however, the overflow and
207 inexact exceptions are raised and an infinity or maximal finite value is
208 returned. If the abstract value is too small, the input value is rounded to
209 a subnormal number, and the underflow and inexact exceptions are raised if
210 the abstract input cannot be represented exactly as a subnormal single-
211 precision floating-point number.
212 The input significand `zSig' has its binary point between bits 30
213 and 29, which is 7 bits to the left of the usual location. This shifted
214 significand must be normalized or smaller. If `zSig' is not normalized,
215 `zExp' must be 0; in that case, the result returned is a subnormal number,
216 and it must not require rounding. In the usual case that `zSig' is
217 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
218 The handling of underflow and overflow follows the IEC/IEEE Standard for
219 Binary Floating-point Arithmetic.
220 -------------------------------------------------------------------------------
222 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
224 int8 roundingMode;
225 flag roundNearestEven;
226 int8 roundIncrement, roundBits;
227 flag isTiny;
229 roundingMode = float_rounding_mode;
230 roundNearestEven = ( roundingMode == float_round_nearest_even );
231 roundIncrement = 0x40;
232 if ( ! roundNearestEven ) {
233 if ( roundingMode == float_round_to_zero ) {
234 roundIncrement = 0;
236 else {
237 roundIncrement = 0x7F;
238 if ( zSign ) {
239 if ( roundingMode == float_round_up ) roundIncrement = 0;
241 else {
242 if ( roundingMode == float_round_down ) roundIncrement = 0;
246 roundBits = zSig & 0x7F;
247 if ( 0xFD <= (bits16) zExp ) {
248 if ( ( 0xFD < zExp )
249 || ( ( zExp == 0xFD )
250 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
252 float_raise( float_flag_overflow | float_flag_inexact );
253 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
255 if ( zExp < 0 ) {
256 isTiny =
257 ( float_detect_tininess == float_tininess_before_rounding )
258 || ( zExp < -1 )
259 || ( zSig + roundIncrement < 0x80000000 );
260 shift32RightJamming( zSig, - zExp, &zSig );
261 zExp = 0;
262 roundBits = zSig & 0x7F;
263 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
266 if ( roundBits ) float_exception_flags |= float_flag_inexact;
267 zSig = ( zSig + roundIncrement )>>7;
268 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
269 if ( zSig == 0 ) zExp = 0;
270 return packFloat32( zSign, zExp, zSig );
275 -------------------------------------------------------------------------------
276 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
277 and significand `zSig', and returns the proper single-precision floating-
278 point value corresponding to the abstract input. This routine is just like
279 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
280 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
281 point exponent.
282 -------------------------------------------------------------------------------
284 static float32
285 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
287 int8 shiftCount;
289 shiftCount = countLeadingZeros32( zSig ) - 1;
290 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
295 -------------------------------------------------------------------------------
296 Returns the fraction bits of the double-precision floating-point value `a'.
297 -------------------------------------------------------------------------------
299 INLINE bits64 extractFloat64Frac( float64 a )
302 return a & LIT64( 0x000FFFFFFFFFFFFF );
307 -------------------------------------------------------------------------------
308 Returns the exponent bits of the double-precision floating-point value `a'.
309 -------------------------------------------------------------------------------
311 INLINE int16 extractFloat64Exp( float64 a )
314 return ( a>>52 ) & 0x7FF;
319 -------------------------------------------------------------------------------
320 Returns the sign bit of the double-precision floating-point value `a'.
321 -------------------------------------------------------------------------------
323 INLINE flag extractFloat64Sign( float64 a )
326 return a>>63;
331 -------------------------------------------------------------------------------
332 Normalizes the subnormal double-precision floating-point value represented
333 by the denormalized significand `aSig'. The normalized exponent and
334 significand are stored at the locations pointed to by `zExpPtr' and
335 `zSigPtr', respectively.
336 -------------------------------------------------------------------------------
338 static void
339 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
341 int8 shiftCount;
343 shiftCount = countLeadingZeros64( aSig ) - 11;
344 *zSigPtr = aSig<<shiftCount;
345 *zExpPtr = 1 - shiftCount;
350 -------------------------------------------------------------------------------
351 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
352 double-precision floating-point value, returning the result. After being
353 shifted into the proper positions, the three fields are simply added
354 together to form the result. This means that any integer portion of `zSig'
355 will be added into the exponent. Since a properly normalized significand
356 will have an integer portion equal to 1, the `zExp' input should be 1 less
357 than the desired result exponent whenever `zSig' is a complete, normalized
358 significand.
359 -------------------------------------------------------------------------------
361 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
364 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
369 -------------------------------------------------------------------------------
370 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371 and significand `zSig', and returns the proper double-precision floating-
372 point value corresponding to the abstract input. Ordinarily, the abstract
373 value is simply rounded and packed into the double-precision format, with
374 the inexact exception raised if the abstract input cannot be represented
375 exactly. If the abstract value is too large, however, the overflow and
376 inexact exceptions are raised and an infinity or maximal finite value is
377 returned. If the abstract value is too small, the input value is rounded to
378 a subnormal number, and the underflow and inexact exceptions are raised if
379 the abstract input cannot be represented exactly as a subnormal double-
380 precision floating-point number.
381 The input significand `zSig' has its binary point between bits 62
382 and 61, which is 10 bits to the left of the usual location. This shifted
383 significand must be normalized or smaller. If `zSig' is not normalized,
384 `zExp' must be 0; in that case, the result returned is a subnormal number,
385 and it must not require rounding. In the usual case that `zSig' is
386 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
387 The handling of underflow and overflow follows the IEC/IEEE Standard for
388 Binary Floating-point Arithmetic.
389 -------------------------------------------------------------------------------
391 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
393 int8 roundingMode;
394 flag roundNearestEven;
395 int16 roundIncrement, roundBits;
396 flag isTiny;
398 roundingMode = float_rounding_mode;
399 roundNearestEven = ( roundingMode == float_round_nearest_even );
400 roundIncrement = 0x200;
401 if ( ! roundNearestEven ) {
402 if ( roundingMode == float_round_to_zero ) {
403 roundIncrement = 0;
405 else {
406 roundIncrement = 0x3FF;
407 if ( zSign ) {
408 if ( roundingMode == float_round_up ) roundIncrement = 0;
410 else {
411 if ( roundingMode == float_round_down ) roundIncrement = 0;
415 roundBits = zSig & 0x3FF;
416 if ( 0x7FD <= (bits16) zExp ) {
417 if ( ( 0x7FD < zExp )
418 || ( ( zExp == 0x7FD )
419 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
421 //register int lr;
422 //__asm__("mov %0, lr" :: "g" (lr));
423 //fp_printk("roundAndPackFloat64 called from 0x%08x\n",lr);
424 float_raise( float_flag_overflow | float_flag_inexact );
425 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
427 if ( zExp < 0 ) {
428 isTiny =
429 ( float_detect_tininess == float_tininess_before_rounding )
430 || ( zExp < -1 )
431 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
432 shift64RightJamming( zSig, - zExp, &zSig );
433 zExp = 0;
434 roundBits = zSig & 0x3FF;
435 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
438 if ( roundBits ) float_exception_flags |= float_flag_inexact;
439 zSig = ( zSig + roundIncrement )>>10;
440 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
441 if ( zSig == 0 ) zExp = 0;
442 return packFloat64( zSign, zExp, zSig );
447 -------------------------------------------------------------------------------
448 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
449 and significand `zSig', and returns the proper double-precision floating-
450 point value corresponding to the abstract input. This routine is just like
451 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
452 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
453 point exponent.
454 -------------------------------------------------------------------------------
456 static float64
457 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
459 int8 shiftCount;
461 shiftCount = countLeadingZeros64( zSig ) - 1;
462 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
466 #ifdef FLOATX80
469 -------------------------------------------------------------------------------
470 Returns the fraction bits of the extended double-precision floating-point
471 value `a'.
472 -------------------------------------------------------------------------------
474 INLINE bits64 extractFloatx80Frac( floatx80 a )
477 return a.low;
482 -------------------------------------------------------------------------------
483 Returns the exponent bits of the extended double-precision floating-point
484 value `a'.
485 -------------------------------------------------------------------------------
487 INLINE int32 extractFloatx80Exp( floatx80 a )
490 return a.high & 0x7FFF;
495 -------------------------------------------------------------------------------
496 Returns the sign bit of the extended double-precision floating-point value
497 `a'.
498 -------------------------------------------------------------------------------
500 INLINE flag extractFloatx80Sign( floatx80 a )
503 return a.high>>15;
508 -------------------------------------------------------------------------------
509 Normalizes the subnormal extended double-precision floating-point value
510 represented by the denormalized significand `aSig'. The normalized exponent
511 and significand are stored at the locations pointed to by `zExpPtr' and
512 `zSigPtr', respectively.
513 -------------------------------------------------------------------------------
515 static void
516 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
518 int8 shiftCount;
520 shiftCount = countLeadingZeros64( aSig );
521 *zSigPtr = aSig<<shiftCount;
522 *zExpPtr = 1 - shiftCount;
527 -------------------------------------------------------------------------------
528 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
529 extended double-precision floating-point value, returning the result.
530 -------------------------------------------------------------------------------
532 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
534 floatx80 z;
536 z.low = zSig;
537 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
538 return z;
543 -------------------------------------------------------------------------------
544 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
545 and extended significand formed by the concatenation of `zSig0' and `zSig1',
546 and returns the proper extended double-precision floating-point value
547 corresponding to the abstract input. Ordinarily, the abstract value is
548 rounded and packed into the extended double-precision format, with the
549 inexact exception raised if the abstract input cannot be represented
550 exactly. If the abstract value is too large, however, the overflow and
551 inexact exceptions are raised and an infinity or maximal finite value is
552 returned. If the abstract value is too small, the input value is rounded to
553 a subnormal number, and the underflow and inexact exceptions are raised if
554 the abstract input cannot be represented exactly as a subnormal extended
555 double-precision floating-point number.
556 If `roundingPrecision' is 32 or 64, the result is rounded to the same
557 number of bits as single or double precision, respectively. Otherwise, the
558 result is rounded to the full precision of the extended double-precision
559 format.
560 The input significand must be normalized or smaller. If the input
561 significand is not normalized, `zExp' must be 0; in that case, the result
562 returned is a subnormal number, and it must not require rounding. The
563 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
564 Floating-point Arithmetic.
565 -------------------------------------------------------------------------------
567 static floatx80
568 roundAndPackFloatx80(
569 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
572 int8 roundingMode;
573 flag roundNearestEven, increment, isTiny;
574 int64 roundIncrement, roundMask, roundBits;
576 roundingMode = float_rounding_mode;
577 roundNearestEven = ( roundingMode == float_round_nearest_even );
578 if ( roundingPrecision == 80 ) goto precision80;
579 if ( roundingPrecision == 64 ) {
580 roundIncrement = LIT64( 0x0000000000000400 );
581 roundMask = LIT64( 0x00000000000007FF );
583 else if ( roundingPrecision == 32 ) {
584 roundIncrement = LIT64( 0x0000008000000000 );
585 roundMask = LIT64( 0x000000FFFFFFFFFF );
587 else {
588 goto precision80;
590 zSig0 |= ( zSig1 != 0 );
591 if ( ! roundNearestEven ) {
592 if ( roundingMode == float_round_to_zero ) {
593 roundIncrement = 0;
595 else {
596 roundIncrement = roundMask;
597 if ( zSign ) {
598 if ( roundingMode == float_round_up ) roundIncrement = 0;
600 else {
601 if ( roundingMode == float_round_down ) roundIncrement = 0;
605 roundBits = zSig0 & roundMask;
606 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
607 if ( ( 0x7FFE < zExp )
608 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
610 goto overflow;
612 if ( zExp <= 0 ) {
613 isTiny =
614 ( float_detect_tininess == float_tininess_before_rounding )
615 || ( zExp < 0 )
616 || ( zSig0 <= zSig0 + roundIncrement );
617 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
618 zExp = 0;
619 roundBits = zSig0 & roundMask;
620 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
621 if ( roundBits ) float_exception_flags |= float_flag_inexact;
622 zSig0 += roundIncrement;
623 if ( (sbits64) zSig0 < 0 ) zExp = 1;
624 roundIncrement = roundMask + 1;
625 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
626 roundMask |= roundIncrement;
628 zSig0 &= ~ roundMask;
629 return packFloatx80( zSign, zExp, zSig0 );
632 if ( roundBits ) float_exception_flags |= float_flag_inexact;
633 zSig0 += roundIncrement;
634 if ( zSig0 < roundIncrement ) {
635 ++zExp;
636 zSig0 = LIT64( 0x8000000000000000 );
638 roundIncrement = roundMask + 1;
639 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
640 roundMask |= roundIncrement;
642 zSig0 &= ~ roundMask;
643 if ( zSig0 == 0 ) zExp = 0;
644 return packFloatx80( zSign, zExp, zSig0 );
645 precision80:
646 increment = ( (sbits64) zSig1 < 0 );
647 if ( ! roundNearestEven ) {
648 if ( roundingMode == float_round_to_zero ) {
649 increment = 0;
651 else {
652 if ( zSign ) {
653 increment = ( roundingMode == float_round_down ) && zSig1;
655 else {
656 increment = ( roundingMode == float_round_up ) && zSig1;
660 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
661 if ( ( 0x7FFE < zExp )
662 || ( ( zExp == 0x7FFE )
663 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
664 && increment
667 roundMask = 0;
668 overflow:
669 float_raise( float_flag_overflow | float_flag_inexact );
670 if ( ( roundingMode == float_round_to_zero )
671 || ( zSign && ( roundingMode == float_round_up ) )
672 || ( ! zSign && ( roundingMode == float_round_down ) )
674 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
676 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
678 if ( zExp <= 0 ) {
679 isTiny =
680 ( float_detect_tininess == float_tininess_before_rounding )
681 || ( zExp < 0 )
682 || ! increment
683 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
684 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
685 zExp = 0;
686 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
687 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
688 if ( roundNearestEven ) {
689 increment = ( (sbits64) zSig1 < 0 );
691 else {
692 if ( zSign ) {
693 increment = ( roundingMode == float_round_down ) && zSig1;
695 else {
696 increment = ( roundingMode == float_round_up ) && zSig1;
699 if ( increment ) {
700 ++zSig0;
701 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
702 if ( (sbits64) zSig0 < 0 ) zExp = 1;
704 return packFloatx80( zSign, zExp, zSig0 );
707 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
708 if ( increment ) {
709 ++zSig0;
710 if ( zSig0 == 0 ) {
711 ++zExp;
712 zSig0 = LIT64( 0x8000000000000000 );
714 else {
715 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
718 else {
719 if ( zSig0 == 0 ) zExp = 0;
722 return packFloatx80( zSign, zExp, zSig0 );
726 -------------------------------------------------------------------------------
727 Takes an abstract floating-point value having sign `zSign', exponent
728 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
729 and returns the proper extended double-precision floating-point value
730 corresponding to the abstract input. This routine is just like
731 `roundAndPackFloatx80' except that the input significand does not have to be
732 normalized.
733 -------------------------------------------------------------------------------
735 static floatx80
736 normalizeRoundAndPackFloatx80(
737 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
740 int8 shiftCount;
742 if ( zSig0 == 0 ) {
743 zSig0 = zSig1;
744 zSig1 = 0;
745 zExp -= 64;
747 shiftCount = countLeadingZeros64( zSig0 );
748 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
749 zExp -= shiftCount;
750 return
751 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
755 #endif
757 #ifdef FLOAT128
760 -------------------------------------------------------------------------------
761 Returns the least-significant 64 fraction bits of the quadruple-precision
762 floating-point value `a'.
763 -------------------------------------------------------------------------------
765 INLINE bits64 extractFloat128Frac1( float128 a )
768 return a.low;
773 -------------------------------------------------------------------------------
774 Returns the most-significant 48 fraction bits of the quadruple-precision
775 floating-point value `a'.
776 -------------------------------------------------------------------------------
778 INLINE bits64 extractFloat128Frac0( float128 a )
781 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
786 -------------------------------------------------------------------------------
787 Returns the exponent bits of the quadruple-precision floating-point value
788 `a'.
789 -------------------------------------------------------------------------------
791 INLINE int32 extractFloat128Exp( float128 a )
794 return ( a.high>>48 ) & 0x7FFF;
799 -------------------------------------------------------------------------------
800 Returns the sign bit of the quadruple-precision floating-point value `a'.
801 -------------------------------------------------------------------------------
803 INLINE flag extractFloat128Sign( float128 a )
806 return a.high>>63;
811 -------------------------------------------------------------------------------
812 Normalizes the subnormal quadruple-precision floating-point value
813 represented by the denormalized significand formed by the concatenation of
814 `aSig0' and `aSig1'. The normalized exponent is stored at the location
815 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
816 significand are stored at the location pointed to by `zSig0Ptr', and the
817 least significant 64 bits of the normalized significand are stored at the
818 location pointed to by `zSig1Ptr'.
819 -------------------------------------------------------------------------------
821 static void
822 normalizeFloat128Subnormal(
823 bits64 aSig0,
824 bits64 aSig1,
825 int32 *zExpPtr,
826 bits64 *zSig0Ptr,
827 bits64 *zSig1Ptr
830 int8 shiftCount;
832 if ( aSig0 == 0 ) {
833 shiftCount = countLeadingZeros64( aSig1 ) - 15;
834 if ( shiftCount < 0 ) {
835 *zSig0Ptr = aSig1>>( - shiftCount );
836 *zSig1Ptr = aSig1<<( shiftCount & 63 );
838 else {
839 *zSig0Ptr = aSig1<<shiftCount;
840 *zSig1Ptr = 0;
842 *zExpPtr = - shiftCount - 63;
844 else {
845 shiftCount = countLeadingZeros64( aSig0 ) - 15;
846 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
847 *zExpPtr = 1 - shiftCount;
853 -------------------------------------------------------------------------------
854 Packs the sign `zSign', the exponent `zExp', and the significand formed
855 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
856 floating-point value, returning the result. After being shifted into the
857 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
858 added together to form the most significant 32 bits of the result. This
859 means that any integer portion of `zSig0' will be added into the exponent.
860 Since a properly normalized significand will have an integer portion equal
861 to 1, the `zExp' input should be 1 less than the desired result exponent
862 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
863 significand.
864 -------------------------------------------------------------------------------
866 INLINE float128
867 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
869 float128 z;
871 z.low = zSig1;
872 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
873 return z;
878 -------------------------------------------------------------------------------
879 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
880 and extended significand formed by the concatenation of `zSig0', `zSig1',
881 and `zSig2', and returns the proper quadruple-precision floating-point value
882 corresponding to the abstract input. Ordinarily, the abstract value is
883 simply rounded and packed into the quadruple-precision format, with the
884 inexact exception raised if the abstract input cannot be represented
885 exactly. If the abstract value is too large, however, the overflow and
886 inexact exceptions are raised and an infinity or maximal finite value is
887 returned. If the abstract value is too small, the input value is rounded to
888 a subnormal number, and the underflow and inexact exceptions are raised if
889 the abstract input cannot be represented exactly as a subnormal quadruple-
890 precision floating-point number.
891 The input significand must be normalized or smaller. If the input
892 significand is not normalized, `zExp' must be 0; in that case, the result
893 returned is a subnormal number, and it must not require rounding. In the
894 usual case that the input significand is normalized, `zExp' must be 1 less
895 than the ``true'' floating-point exponent. The handling of underflow and
896 overflow follows the IEC/IEEE Standard for Binary Floating-point Arithmetic.
897 -------------------------------------------------------------------------------
899 static float128
900 roundAndPackFloat128(
901 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
903 int8 roundingMode;
904 flag roundNearestEven, increment, isTiny;
906 roundingMode = float_rounding_mode;
907 roundNearestEven = ( roundingMode == float_round_nearest_even );
908 increment = ( (sbits64) zSig2 < 0 );
909 if ( ! roundNearestEven ) {
910 if ( roundingMode == float_round_to_zero ) {
911 increment = 0;
913 else {
914 if ( zSign ) {
915 increment = ( roundingMode == float_round_down ) && zSig2;
917 else {
918 increment = ( roundingMode == float_round_up ) && zSig2;
922 if ( 0x7FFD <= (bits32) zExp ) {
923 if ( ( 0x7FFD < zExp )
924 || ( ( zExp == 0x7FFD )
925 && eq128(
926 LIT64( 0x0001FFFFFFFFFFFF ),
927 LIT64( 0xFFFFFFFFFFFFFFFF ),
928 zSig0,
929 zSig1
931 && increment
934 float_raise( float_flag_overflow | float_flag_inexact );
935 if ( ( roundingMode == float_round_to_zero )
936 || ( zSign && ( roundingMode == float_round_up ) )
937 || ( ! zSign && ( roundingMode == float_round_down ) )
939 return
940 packFloat128(
941 zSign,
942 0x7FFE,
943 LIT64( 0x0000FFFFFFFFFFFF ),
944 LIT64( 0xFFFFFFFFFFFFFFFF )
947 return packFloat128( zSign, 0x7FFF, 0, 0 );
949 if ( zExp < 0 ) {
950 isTiny =
951 ( float_detect_tininess == float_tininess_before_rounding )
952 || ( zExp < -1 )
953 || ! increment
954 || lt128(
955 zSig0,
956 zSig1,
957 LIT64( 0x0001FFFFFFFFFFFF ),
958 LIT64( 0xFFFFFFFFFFFFFFFF )
960 shift128ExtraRightJamming(
961 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
962 zExp = 0;
963 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
964 if ( roundNearestEven ) {
965 increment = ( (sbits64) zSig2 < 0 );
967 else {
968 if ( zSign ) {
969 increment = ( roundingMode == float_round_down ) && zSig2;
971 else {
972 increment = ( roundingMode == float_round_up ) && zSig2;
977 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
978 if ( increment ) {
979 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
980 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
982 else {
983 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
985 return packFloat128( zSign, zExp, zSig0, zSig1 );
990 -------------------------------------------------------------------------------
991 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
992 and significand formed by the concatenation of `zSig0' and `zSig1', and
993 returns the proper quadruple-precision floating-point value corresponding to
994 the abstract input. This routine is just like `roundAndPackFloat128' except
995 that the input significand has fewer bits and does not have to be normalized
996 in any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
997 point exponent.
998 -------------------------------------------------------------------------------
1000 static float128
1001 normalizeRoundAndPackFloat128(
1002 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1004 int8 shiftCount;
1005 bits64 zSig2;
1007 if ( zSig0 == 0 ) {
1008 zSig0 = zSig1;
1009 zSig1 = 0;
1010 zExp -= 64;
1012 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1013 if ( 0 <= shiftCount ) {
1014 zSig2 = 0;
1015 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1017 else {
1018 shift128ExtraRightJamming(
1019 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1021 zExp -= shiftCount;
1022 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1026 #endif
1029 -------------------------------------------------------------------------------
1030 Returns the result of converting the 32-bit two's complement integer `a' to
1031 the single-precision floating-point format. The conversion is performed
1032 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1033 -------------------------------------------------------------------------------
1035 float32 int32_to_float32( int32 a )
1037 flag zSign;
1039 if ( a == 0 ) return 0;
1040 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1041 zSign = ( a < 0 );
1042 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1047 -------------------------------------------------------------------------------
1048 Returns the result of converting the 32-bit two's complement integer `a' to
1049 the double-precision floating-point format. The conversion is performed
1050 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1051 -------------------------------------------------------------------------------
1053 float64 int32_to_float64( int32 a )
1055 flag aSign;
1056 uint32 absA;
1057 int8 shiftCount;
1058 bits64 zSig;
1060 if ( a == 0 ) return 0;
1061 aSign = ( a < 0 );
1062 absA = aSign ? - a : a;
1063 shiftCount = countLeadingZeros32( absA ) + 21;
1064 zSig = absA;
1065 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
1069 #ifdef FLOATX80
1072 -------------------------------------------------------------------------------
1073 Returns the result of converting the 32-bit two's complement integer `a'
1074 to the extended double-precision floating-point format. The conversion
1075 is performed according to the IEC/IEEE Standard for Binary Floating-point
1076 Arithmetic.
1077 -------------------------------------------------------------------------------
1079 floatx80 int32_to_floatx80( int32 a )
1081 flag zSign;
1082 uint32 absA;
1083 int8 shiftCount;
1084 bits64 zSig;
1086 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1087 zSign = ( a < 0 );
1088 absA = zSign ? - a : a;
1089 shiftCount = countLeadingZeros32( absA ) + 32;
1090 zSig = absA;
1091 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1095 #endif
1097 #ifdef FLOAT128
1100 -------------------------------------------------------------------------------
1101 Returns the result of converting the 32-bit two's complement integer `a' to
1102 the quadruple-precision floating-point format. The conversion is performed
1103 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1104 -------------------------------------------------------------------------------
1106 float128 int32_to_float128( int32 a )
1108 flag zSign;
1109 uint32 absA;
1110 int8 shiftCount;
1111 bits64 zSig0;
1113 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1114 zSign = ( a < 0 );
1115 absA = zSign ? - a : a;
1116 shiftCount = countLeadingZeros32( absA ) + 17;
1117 zSig0 = absA;
1118 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1122 #endif
1125 -------------------------------------------------------------------------------
1126 Returns the result of converting the single-precision floating-point value
1127 `a' to the 32-bit two's complement integer format. The conversion is
1128 performed according to the IEC/IEEE Standard for Binary Floating-point
1129 Arithmetic---which means in particular that the conversion is rounded
1130 according to the current rounding mode. If `a' is a NaN, the largest
1131 positive integer is returned. Otherwise, if the conversion overflows, the
1132 largest integer with the same sign as `a' is returned.
1133 -------------------------------------------------------------------------------
1135 int32 float32_to_int32( float32 a )
1137 flag aSign;
1138 int16 aExp, shiftCount;
1139 bits32 aSig;
1140 bits64 zSig;
1142 aSig = extractFloat32Frac( a );
1143 aExp = extractFloat32Exp( a );
1144 aSign = extractFloat32Sign( a );
1145 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1146 if ( aExp ) aSig |= 0x00800000;
1147 shiftCount = 0xAF - aExp;
1148 zSig = aSig;
1149 zSig <<= 32;
1150 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
1151 return roundAndPackInt32( aSign, zSig );
1156 -------------------------------------------------------------------------------
1157 Returns the result of converting the single-precision floating-point value
1158 `a' to the 32-bit two's complement integer format. The conversion is
1159 performed according to the IEC/IEEE Standard for Binary Floating-point
1160 Arithmetic, except that the conversion is always rounded toward zero. If
1161 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1162 conversion overflows, the largest integer with the same sign as `a' is
1163 returned.
1164 -------------------------------------------------------------------------------
1166 int32 float32_to_int32_round_to_zero( float32 a )
1168 flag aSign;
1169 int16 aExp, shiftCount;
1170 bits32 aSig;
1171 int32 z;
1173 aSig = extractFloat32Frac( a );
1174 aExp = extractFloat32Exp( a );
1175 aSign = extractFloat32Sign( a );
1176 shiftCount = aExp - 0x9E;
1177 if ( 0 <= shiftCount ) {
1178 if ( a == 0xCF000000 ) return 0x80000000;
1179 float_raise( float_flag_invalid );
1180 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1181 return 0x80000000;
1183 else if ( aExp <= 0x7E ) {
1184 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1185 return 0;
1187 aSig = ( aSig | 0x00800000 )<<8;
1188 z = aSig>>( - shiftCount );
1189 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1190 float_exception_flags |= float_flag_inexact;
1192 return aSign ? - z : z;
1197 -------------------------------------------------------------------------------
1198 Returns the result of converting the single-precision floating-point value
1199 `a' to the double-precision floating-point format. The conversion is
1200 performed according to the IEC/IEEE Standard for Binary Floating-point
1201 Arithmetic.
1202 -------------------------------------------------------------------------------
1204 float64 float32_to_float64( float32 a )
1206 flag aSign;
1207 int16 aExp;
1208 bits32 aSig;
1210 aSig = extractFloat32Frac( a );
1211 aExp = extractFloat32Exp( a );
1212 aSign = extractFloat32Sign( a );
1213 if ( aExp == 0xFF ) {
1214 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1215 return packFloat64( aSign, 0x7FF, 0 );
1217 if ( aExp == 0 ) {
1218 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1219 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1220 --aExp;
1222 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1226 #ifdef FLOATX80
1229 -------------------------------------------------------------------------------
1230 Returns the result of converting the single-precision floating-point value
1231 `a' to the extended double-precision floating-point format. The conversion
1232 is performed according to the IEC/IEEE Standard for Binary Floating-point
1233 Arithmetic.
1234 -------------------------------------------------------------------------------
1236 floatx80 float32_to_floatx80( float32 a )
1238 flag aSign;
1239 int16 aExp;
1240 bits32 aSig;
1242 aSig = extractFloat32Frac( a );
1243 aExp = extractFloat32Exp( a );
1244 aSign = extractFloat32Sign( a );
1245 if ( aExp == 0xFF ) {
1246 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1247 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1249 if ( aExp == 0 ) {
1250 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1251 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1253 aSig |= 0x00800000;
1254 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1258 #endif
1260 #ifdef FLOAT128
1263 -------------------------------------------------------------------------------
1264 Returns the result of converting the single-precision floating-point value
1265 `a' to the double-precision floating-point format. The conversion is
1266 performed according to the IEC/IEEE Standard for Binary Floating-point
1267 Arithmetic.
1268 -------------------------------------------------------------------------------
1270 float128 float32_to_float128( float32 a )
1272 flag aSign;
1273 int16 aExp;
1274 bits32 aSig;
1276 aSig = extractFloat32Frac( a );
1277 aExp = extractFloat32Exp( a );
1278 aSign = extractFloat32Sign( a );
1279 if ( aExp == 0xFF ) {
1280 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1281 return packFloat128( aSign, 0x7FFF, 0, 0 );
1283 if ( aExp == 0 ) {
1284 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1285 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1286 --aExp;
1288 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1292 #endif
1295 -------------------------------------------------------------------------------
1296 Rounds the single-precision floating-point value `a' to an integer, and
1297 returns the result as a single-precision floating-point value. The
1298 operation is performed according to the IEC/IEEE Standard for Binary
1299 Floating-point Arithmetic.
1300 -------------------------------------------------------------------------------
1302 float32 float32_round_to_int( float32 a )
1304 flag aSign;
1305 int16 aExp;
1306 bits32 lastBitMask, roundBitsMask;
1307 int8 roundingMode;
1308 float32 z;
1310 aExp = extractFloat32Exp( a );
1311 if ( 0x96 <= aExp ) {
1312 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1313 return propagateFloat32NaN( a, a );
1315 return a;
1317 if ( aExp <= 0x7E ) {
1318 if ( (bits32) ( a<<1 ) == 0 ) return a;
1319 float_exception_flags |= float_flag_inexact;
1320 aSign = extractFloat32Sign( a );
1321 switch ( float_rounding_mode ) {
1322 case float_round_nearest_even:
1323 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1324 return packFloat32( aSign, 0x7F, 0 );
1326 break;
1327 case float_round_down:
1328 return aSign ? 0xBF800000 : 0;
1329 case float_round_up:
1330 return aSign ? 0x80000000 : 0x3F800000;
1332 return packFloat32( aSign, 0, 0 );
1334 lastBitMask = 1;
1335 lastBitMask <<= 0x96 - aExp;
1336 roundBitsMask = lastBitMask - 1;
1337 z = a;
1338 roundingMode = float_rounding_mode;
1339 if ( roundingMode == float_round_nearest_even ) {
1340 z += lastBitMask>>1;
1341 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1343 else if ( roundingMode != float_round_to_zero ) {
1344 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1345 z += roundBitsMask;
1348 z &= ~ roundBitsMask;
1349 if ( z != a ) float_exception_flags |= float_flag_inexact;
1350 return z;
1355 -------------------------------------------------------------------------------
1356 Returns the result of adding the absolute values of the single-precision
1357 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1358 before being returned. `zSign' is ignored if the result is a NaN. The
1359 addition is performed according to the IEC/IEEE Standard for Binary
1360 Floating-point Arithmetic.
1361 -------------------------------------------------------------------------------
1363 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1365 int16 aExp, bExp, zExp;
1366 bits32 aSig, bSig, zSig;
1367 int16 expDiff;
1369 aSig = extractFloat32Frac( a );
1370 aExp = extractFloat32Exp( a );
1371 bSig = extractFloat32Frac( b );
1372 bExp = extractFloat32Exp( b );
1373 expDiff = aExp - bExp;
1374 aSig <<= 6;
1375 bSig <<= 6;
1376 if ( 0 < expDiff ) {
1377 if ( aExp == 0xFF ) {
1378 if ( aSig ) return propagateFloat32NaN( a, b );
1379 return a;
1381 if ( bExp == 0 ) {
1382 --expDiff;
1384 else {
1385 bSig |= 0x20000000;
1387 shift32RightJamming( bSig, expDiff, &bSig );
1388 zExp = aExp;
1390 else if ( expDiff < 0 ) {
1391 if ( bExp == 0xFF ) {
1392 if ( bSig ) return propagateFloat32NaN( a, b );
1393 return packFloat32( zSign, 0xFF, 0 );
1395 if ( aExp == 0 ) {
1396 ++expDiff;
1398 else {
1399 aSig |= 0x20000000;
1401 shift32RightJamming( aSig, - expDiff, &aSig );
1402 zExp = bExp;
1404 else {
1405 if ( aExp == 0xFF ) {
1406 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1407 return a;
1409 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1410 zSig = 0x40000000 + aSig + bSig;
1411 zExp = aExp;
1412 goto roundAndPack;
1414 aSig |= 0x20000000;
1415 zSig = ( aSig + bSig )<<1;
1416 --zExp;
1417 if ( (sbits32) zSig < 0 ) {
1418 zSig = aSig + bSig;
1419 ++zExp;
1421 roundAndPack:
1422 return roundAndPackFloat32( zSign, zExp, zSig );
1427 -------------------------------------------------------------------------------
1428 Returns the result of subtracting the absolute values of the single-
1429 precision floating-point values `a' and `b'. If `zSign' is true, the
1430 difference is negated before being returned. `zSign' is ignored if the
1431 result is a NaN. The subtraction is performed according to the IEC/IEEE
1432 Standard for Binary Floating-point Arithmetic.
1433 -------------------------------------------------------------------------------
1435 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1437 int16 aExp, bExp, zExp;
1438 bits32 aSig, bSig, zSig;
1439 int16 expDiff;
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 bSig = extractFloat32Frac( b );
1444 bExp = extractFloat32Exp( b );
1445 expDiff = aExp - bExp;
1446 aSig <<= 7;
1447 bSig <<= 7;
1448 if ( 0 < expDiff ) goto aExpBigger;
1449 if ( expDiff < 0 ) goto bExpBigger;
1450 if ( aExp == 0xFF ) {
1451 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1452 float_raise( float_flag_invalid );
1453 return float32_default_nan;
1455 if ( aExp == 0 ) {
1456 aExp = 1;
1457 bExp = 1;
1459 if ( bSig < aSig ) goto aBigger;
1460 if ( aSig < bSig ) goto bBigger;
1461 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1462 bExpBigger:
1463 if ( bExp == 0xFF ) {
1464 if ( bSig ) return propagateFloat32NaN( a, b );
1465 return packFloat32( zSign ^ 1, 0xFF, 0 );
1467 if ( aExp == 0 ) {
1468 ++expDiff;
1470 else {
1471 aSig |= 0x40000000;
1473 shift32RightJamming( aSig, - expDiff, &aSig );
1474 bSig |= 0x40000000;
1475 bBigger:
1476 zSig = bSig - aSig;
1477 zExp = bExp;
1478 zSign ^= 1;
1479 goto normalizeRoundAndPack;
1480 aExpBigger:
1481 if ( aExp == 0xFF ) {
1482 if ( aSig ) return propagateFloat32NaN( a, b );
1483 return a;
1485 if ( bExp == 0 ) {
1486 --expDiff;
1488 else {
1489 bSig |= 0x40000000;
1491 shift32RightJamming( bSig, expDiff, &bSig );
1492 aSig |= 0x40000000;
1493 aBigger:
1494 zSig = aSig - bSig;
1495 zExp = aExp;
1496 normalizeRoundAndPack:
1497 --zExp;
1498 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1503 -------------------------------------------------------------------------------
1504 Returns the result of adding the single-precision floating-point values `a'
1505 and `b'. The operation is performed according to the IEC/IEEE Standard for
1506 Binary Floating-point Arithmetic.
1507 -------------------------------------------------------------------------------
1509 float32 float32_add( float32 a, float32 b )
1511 flag aSign, bSign;
1513 aSign = extractFloat32Sign( a );
1514 bSign = extractFloat32Sign( b );
1515 if ( aSign == bSign ) {
1516 return addFloat32Sigs( a, b, aSign );
1518 else {
1519 return subFloat32Sigs( a, b, aSign );
1525 -------------------------------------------------------------------------------
1526 Returns the result of subtracting the single-precision floating-point values
1527 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1528 for Binary Floating-point Arithmetic.
1529 -------------------------------------------------------------------------------
1531 float32 float32_sub( float32 a, float32 b )
1533 flag aSign, bSign;
1535 aSign = extractFloat32Sign( a );
1536 bSign = extractFloat32Sign( b );
1537 if ( aSign == bSign ) {
1538 return subFloat32Sigs( a, b, aSign );
1540 else {
1541 return addFloat32Sigs( a, b, aSign );
1547 -------------------------------------------------------------------------------
1548 Returns the result of multiplying the single-precision floating-point values
1549 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1550 for Binary Floating-point Arithmetic.
1551 -------------------------------------------------------------------------------
1553 float32 float32_mul( float32 a, float32 b )
1555 flag aSign, bSign, zSign;
1556 int16 aExp, bExp, zExp;
1557 bits32 aSig, bSig;
1558 bits64 zSig64;
1559 bits32 zSig;
1561 aSig = extractFloat32Frac( a );
1562 aExp = extractFloat32Exp( a );
1563 aSign = extractFloat32Sign( a );
1564 bSig = extractFloat32Frac( b );
1565 bExp = extractFloat32Exp( b );
1566 bSign = extractFloat32Sign( b );
1567 zSign = aSign ^ bSign;
1568 if ( aExp == 0xFF ) {
1569 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1570 return propagateFloat32NaN( a, b );
1572 if ( ( bExp | bSig ) == 0 ) {
1573 float_raise( float_flag_invalid );
1574 return float32_default_nan;
1576 return packFloat32( zSign, 0xFF, 0 );
1578 if ( bExp == 0xFF ) {
1579 if ( bSig ) return propagateFloat32NaN( a, b );
1580 if ( ( aExp | aSig ) == 0 ) {
1581 float_raise( float_flag_invalid );
1582 return float32_default_nan;
1584 return packFloat32( zSign, 0xFF, 0 );
1586 if ( aExp == 0 ) {
1587 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1588 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1590 if ( bExp == 0 ) {
1591 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1592 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1594 zExp = aExp + bExp - 0x7F;
1595 aSig = ( aSig | 0x00800000 )<<7;
1596 bSig = ( bSig | 0x00800000 )<<8;
1597 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1598 zSig = zSig64;
1599 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1600 zSig <<= 1;
1601 --zExp;
1603 return roundAndPackFloat32( zSign, zExp, zSig );
1608 -------------------------------------------------------------------------------
1609 Returns the result of dividing the single-precision floating-point value `a'
1610 by the corresponding value `b'. The operation is performed according to the
1611 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1612 -------------------------------------------------------------------------------
1614 float32 float32_div( float32 a, float32 b )
1616 flag aSign, bSign, zSign;
1617 int16 aExp, bExp, zExp;
1618 bits32 aSig, bSig, zSig;
1620 aSig = extractFloat32Frac( a );
1621 aExp = extractFloat32Exp( a );
1622 aSign = extractFloat32Sign( a );
1623 bSig = extractFloat32Frac( b );
1624 bExp = extractFloat32Exp( b );
1625 bSign = extractFloat32Sign( b );
1626 zSign = aSign ^ bSign;
1627 if ( aExp == 0xFF ) {
1628 if ( aSig ) return propagateFloat32NaN( a, b );
1629 if ( bExp == 0xFF ) {
1630 if ( bSig ) return propagateFloat32NaN( a, b );
1631 float_raise( float_flag_invalid );
1632 return float32_default_nan;
1634 return packFloat32( zSign, 0xFF, 0 );
1636 if ( bExp == 0xFF ) {
1637 if ( bSig ) return propagateFloat32NaN( a, b );
1638 return packFloat32( zSign, 0, 0 );
1640 if ( bExp == 0 ) {
1641 if ( bSig == 0 ) {
1642 if ( ( aExp | aSig ) == 0 ) {
1643 float_raise( float_flag_invalid );
1644 return float32_default_nan;
1646 float_raise( float_flag_divbyzero );
1647 return packFloat32( zSign, 0xFF, 0 );
1649 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1651 if ( aExp == 0 ) {
1652 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1653 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1655 zExp = aExp - bExp + 0x7D;
1656 aSig = ( aSig | 0x00800000 )<<7;
1657 bSig = ( bSig | 0x00800000 )<<8;
1658 if ( bSig <= ( aSig + aSig ) ) {
1659 aSig >>= 1;
1660 ++zExp;
1662 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1663 if ( ( zSig & 0x3F ) == 0 ) {
1664 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1666 return roundAndPackFloat32( zSign, zExp, zSig );
1671 -------------------------------------------------------------------------------
1672 Returns the remainder of the single-precision floating-point value `a'
1673 with respect to the corresponding value `b'. The operation is performed
1674 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1675 -------------------------------------------------------------------------------
1677 float32 float32_rem( float32 a, float32 b )
1679 flag aSign, bSign, zSign;
1680 int16 aExp, bExp, expDiff;
1681 bits32 aSig, bSig;
1682 bits32 q;
1683 bits64 aSig64, bSig64, q64;
1684 bits32 alternateASig;
1685 sbits32 sigMean;
1687 aSig = extractFloat32Frac( a );
1688 aExp = extractFloat32Exp( a );
1689 aSign = extractFloat32Sign( a );
1690 bSig = extractFloat32Frac( b );
1691 bExp = extractFloat32Exp( b );
1692 bSign = extractFloat32Sign( b );
1693 if ( aExp == 0xFF ) {
1694 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1695 return propagateFloat32NaN( a, b );
1697 float_raise( float_flag_invalid );
1698 return float32_default_nan;
1700 if ( bExp == 0xFF ) {
1701 if ( bSig ) return propagateFloat32NaN( a, b );
1702 return a;
1704 if ( bExp == 0 ) {
1705 if ( bSig == 0 ) {
1706 float_raise( float_flag_invalid );
1707 return float32_default_nan;
1709 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1711 if ( aExp == 0 ) {
1712 if ( aSig == 0 ) return a;
1713 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1715 expDiff = aExp - bExp;
1716 aSig |= 0x00800000;
1717 bSig |= 0x00800000;
1718 if ( expDiff < 32 ) {
1719 aSig <<= 8;
1720 bSig <<= 8;
1721 if ( expDiff < 0 ) {
1722 if ( expDiff < -1 ) return a;
1723 aSig >>= 1;
1725 q = ( bSig <= aSig );
1726 if ( q ) aSig -= bSig;
1727 if ( 0 < expDiff ) {
1728 q = ( ( (bits64) aSig )<<32 ) / bSig;
1729 q >>= 32 - expDiff;
1730 bSig >>= 2;
1731 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1733 else {
1734 aSig >>= 2;
1735 bSig >>= 2;
1738 else {
1739 if ( bSig <= aSig ) aSig -= bSig;
1740 aSig64 = ( (bits64) aSig )<<40;
1741 bSig64 = ( (bits64) bSig )<<40;
1742 expDiff -= 64;
1743 while ( 0 < expDiff ) {
1744 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1745 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1746 aSig64 = - ( ( bSig * q64 )<<38 );
1747 expDiff -= 62;
1749 expDiff += 64;
1750 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1751 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1752 q = q64>>( 64 - expDiff );
1753 bSig <<= 6;
1754 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1756 do {
1757 alternateASig = aSig;
1758 ++q;
1759 aSig -= bSig;
1760 } while ( 0 <= (sbits32) aSig );
1761 sigMean = aSig + alternateASig;
1762 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1763 aSig = alternateASig;
1765 zSign = ( (sbits32) aSig < 0 );
1766 if ( zSign ) aSig = - aSig;
1767 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1772 -------------------------------------------------------------------------------
1773 Returns the square root of the single-precision floating-point value `a'.
1774 The operation is performed according to the IEC/IEEE Standard for Binary
1775 Floating-point Arithmetic.
1776 -------------------------------------------------------------------------------
1778 float32 float32_sqrt( float32 a )
1780 flag aSign;
1781 int16 aExp, zExp;
1782 bits32 aSig, zSig;
1783 bits64 rem, term;
1785 aSig = extractFloat32Frac( a );
1786 aExp = extractFloat32Exp( a );
1787 aSign = extractFloat32Sign( a );
1788 if ( aExp == 0xFF ) {
1789 if ( aSig ) return propagateFloat32NaN( a, 0 );
1790 if ( ! aSign ) return a;
1791 float_raise( float_flag_invalid );
1792 return float32_default_nan;
1794 if ( aSign ) {
1795 if ( ( aExp | aSig ) == 0 ) return a;
1796 float_raise( float_flag_invalid );
1797 return float32_default_nan;
1799 if ( aExp == 0 ) {
1800 if ( aSig == 0 ) return 0;
1801 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1803 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1804 aSig = ( aSig | 0x00800000 )<<8;
1805 zSig = estimateSqrt32( aExp, aSig ) + 2;
1806 if ( ( zSig & 0x7F ) <= 5 ) {
1807 if ( zSig < 2 ) {
1808 zSig = 0xFFFFFFFF;
1810 else {
1811 aSig >>= aExp & 1;
1812 term = ( (bits64) zSig ) * zSig;
1813 rem = ( ( (bits64) aSig )<<32 ) - term;
1814 while ( (sbits64) rem < 0 ) {
1815 --zSig;
1816 rem += ( ( (bits64) zSig )<<1 ) | 1;
1818 zSig |= ( rem != 0 );
1821 shift32RightJamming( zSig, 1, &zSig );
1822 return roundAndPackFloat32( 0, zExp, zSig );
1827 -------------------------------------------------------------------------------
1828 Returns 1 if the single-precision floating-point value `a' is equal to the
1829 corresponding value `b', and 0 otherwise. The comparison is performed
1830 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1831 -------------------------------------------------------------------------------
1833 flag float32_eq( float32 a, float32 b )
1836 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1837 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1839 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1840 float_raise( float_flag_invalid );
1842 return 0;
1844 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1849 -------------------------------------------------------------------------------
1850 Returns 1 if the single-precision floating-point value `a' is less than or
1851 equal to the corresponding value `b', and 0 otherwise. The comparison is
1852 performed according to the IEC/IEEE Standard for Binary Floating-point
1853 Arithmetic.
1854 -------------------------------------------------------------------------------
1856 flag float32_le( float32 a, float32 b )
1858 flag aSign, bSign;
1860 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1861 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1863 float_raise( float_flag_invalid );
1864 return 0;
1866 aSign = extractFloat32Sign( a );
1867 bSign = extractFloat32Sign( b );
1868 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1869 return ( a == b ) || ( aSign ^ ( a < b ) );
1874 -------------------------------------------------------------------------------
1875 Returns 1 if the single-precision floating-point value `a' is less than
1876 the corresponding value `b', and 0 otherwise. The comparison is performed
1877 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1878 -------------------------------------------------------------------------------
1880 flag float32_lt( float32 a, float32 b )
1882 flag aSign, bSign;
1884 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1885 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1887 float_raise( float_flag_invalid );
1888 return 0;
1890 aSign = extractFloat32Sign( a );
1891 bSign = extractFloat32Sign( b );
1892 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1893 return ( a != b ) && ( aSign ^ ( a < b ) );
1898 -------------------------------------------------------------------------------
1899 Returns 1 if the single-precision floating-point value `a' is equal to the
1900 corresponding value `b', and 0 otherwise. The invalid exception is raised
1901 if either operand is a NaN. Otherwise, the comparison is performed
1902 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1903 -------------------------------------------------------------------------------
1905 flag float32_eq_signaling( float32 a, float32 b )
1908 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1909 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1911 float_raise( float_flag_invalid );
1912 return 0;
1914 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1919 -------------------------------------------------------------------------------
1920 Returns 1 if the single-precision floating-point value `a' is less than or
1921 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1922 cause an exception. Otherwise, the comparison is performed according to the
1923 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1924 -------------------------------------------------------------------------------
1926 flag float32_le_quiet( float32 a, float32 b )
1928 flag aSign, bSign;
1929 //int16 aExp, bExp;
1931 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1932 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1934 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1935 float_raise( float_flag_invalid );
1937 return 0;
1939 aSign = extractFloat32Sign( a );
1940 bSign = extractFloat32Sign( b );
1941 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1942 return ( a == b ) || ( aSign ^ ( a < b ) );
1947 -------------------------------------------------------------------------------
1948 Returns 1 if the single-precision floating-point value `a' is less than
1949 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1950 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1951 Standard for Binary Floating-point Arithmetic.
1952 -------------------------------------------------------------------------------
1954 flag float32_lt_quiet( float32 a, float32 b )
1956 flag aSign, bSign;
1958 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1959 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1961 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1962 float_raise( float_flag_invalid );
1964 return 0;
1966 aSign = extractFloat32Sign( a );
1967 bSign = extractFloat32Sign( b );
1968 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1969 return ( a != b ) && ( aSign ^ ( a < b ) );
1974 -------------------------------------------------------------------------------
1975 Returns the result of converting the double-precision floating-point value
1976 `a' to the 32-bit two's complement integer format. The conversion is
1977 performed according to the IEC/IEEE Standard for Binary Floating-point
1978 Arithmetic---which means in particular that the conversion is rounded
1979 according to the current rounding mode. If `a' is a NaN, the largest
1980 positive integer is returned. Otherwise, if the conversion overflows, the
1981 largest integer with the same sign as `a' is returned.
1982 -------------------------------------------------------------------------------
1984 int32 float64_to_int32( float64 a )
1986 flag aSign;
1987 int16 aExp, shiftCount;
1988 bits64 aSig;
1990 aSig = extractFloat64Frac( a );
1991 aExp = extractFloat64Exp( a );
1992 aSign = extractFloat64Sign( a );
1993 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1994 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1995 shiftCount = 0x42C - aExp;
1996 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1997 return roundAndPackInt32( aSign, aSig );
2002 -------------------------------------------------------------------------------
2003 Returns the result of converting the double-precision floating-point value
2004 `a' to the 32-bit two's complement integer format. The conversion is
2005 performed according to the IEC/IEEE Standard for Binary Floating-point
2006 Arithmetic, except that the conversion is always rounded toward zero. If
2007 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2008 conversion overflows, the largest integer with the same sign as `a' is
2009 returned.
2010 -------------------------------------------------------------------------------
2012 int32 float64_to_int32_round_to_zero( float64 a )
2014 flag aSign;
2015 int16 aExp, shiftCount;
2016 bits64 aSig, savedASig;
2017 int32 z;
2019 aSig = extractFloat64Frac( a );
2020 aExp = extractFloat64Exp( a );
2021 aSign = extractFloat64Sign( a );
2022 shiftCount = 0x433 - aExp;
2023 if ( shiftCount < 21 ) {
2024 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2025 goto invalid;
2027 else if ( 52 < shiftCount ) {
2028 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2029 return 0;
2031 aSig |= LIT64( 0x0010000000000000 );
2032 savedASig = aSig;
2033 aSig >>= shiftCount;
2034 z = aSig;
2035 if ( aSign ) z = - z;
2036 if ( ( z < 0 ) ^ aSign ) {
2037 invalid:
2038 float_exception_flags |= float_flag_invalid;
2039 return aSign ? 0x80000000 : 0x7FFFFFFF;
2041 if ( ( aSig<<shiftCount ) != savedASig ) {
2042 float_exception_flags |= float_flag_inexact;
2044 return z;
2049 -------------------------------------------------------------------------------
2050 Returns the result of converting the double-precision floating-point value
2051 `a' to the 32-bit two's complement unsigned integer format. The conversion
2052 is performed according to the IEC/IEEE Standard for Binary Floating-point
2053 Arithmetic---which means in particular that the conversion is rounded
2054 according to the current rounding mode. If `a' is a NaN, the largest
2055 positive integer is returned. Otherwise, if the conversion overflows, the
2056 largest positive integer is returned.
2057 -------------------------------------------------------------------------------
2059 int32 float64_to_uint32( float64 a )
2061 flag aSign;
2062 int16 aExp, shiftCount;
2063 bits64 aSig;
2065 aSig = extractFloat64Frac( a );
2066 aExp = extractFloat64Exp( a );
2067 aSign = 0; //extractFloat64Sign( a );
2068 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2069 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2070 shiftCount = 0x42C - aExp;
2071 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2072 return roundAndPackInt32( aSign, aSig );
2076 -------------------------------------------------------------------------------
2077 Returns the result of converting the double-precision floating-point value
2078 `a' to the 32-bit two's complement integer format. The conversion is
2079 performed according to the IEC/IEEE Standard for Binary Floating-point
2080 Arithmetic, except that the conversion is always rounded toward zero. If
2081 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2082 conversion overflows, the largest positive integer is returned.
2083 -------------------------------------------------------------------------------
2085 int32 float64_to_uint32_round_to_zero( float64 a )
2087 flag aSign;
2088 int16 aExp, shiftCount;
2089 bits64 aSig, savedASig;
2090 int32 z;
2092 aSig = extractFloat64Frac( a );
2093 aExp = extractFloat64Exp( a );
2094 aSign = extractFloat64Sign( a );
2095 shiftCount = 0x433 - aExp;
2096 if ( shiftCount < 21 ) {
2097 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2098 goto invalid;
2100 else if ( 52 < shiftCount ) {
2101 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2102 return 0;
2104 aSig |= LIT64( 0x0010000000000000 );
2105 savedASig = aSig;
2106 aSig >>= shiftCount;
2107 z = aSig;
2108 if ( aSign ) z = - z;
2109 if ( ( z < 0 ) ^ aSign ) {
2110 invalid:
2111 float_exception_flags |= float_flag_invalid;
2112 return aSign ? 0x80000000 : 0x7FFFFFFF;
2114 if ( ( aSig<<shiftCount ) != savedASig ) {
2115 float_exception_flags |= float_flag_inexact;
2117 return z;
2121 -------------------------------------------------------------------------------
2122 Returns the result of converting the double-precision floating-point value
2123 `a' to the single-precision floating-point format. The conversion is
2124 performed according to the IEC/IEEE Standard for Binary Floating-point
2125 Arithmetic.
2126 -------------------------------------------------------------------------------
2128 float32 float64_to_float32( float64 a )
2130 flag aSign;
2131 int16 aExp;
2132 bits64 aSig;
2133 bits32 zSig;
2135 aSig = extractFloat64Frac( a );
2136 aExp = extractFloat64Exp( a );
2137 aSign = extractFloat64Sign( a );
2138 if ( aExp == 0x7FF ) {
2139 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2140 return packFloat32( aSign, 0xFF, 0 );
2142 shift64RightJamming( aSig, 22, &aSig );
2143 zSig = aSig;
2144 if ( aExp || zSig ) {
2145 zSig |= 0x40000000;
2146 aExp -= 0x381;
2148 return roundAndPackFloat32( aSign, aExp, zSig );
2152 #ifdef FLOATX80
2155 -------------------------------------------------------------------------------
2156 Returns the result of converting the double-precision floating-point value
2157 `a' to the extended double-precision floating-point format. The conversion
2158 is performed according to the IEC/IEEE Standard for Binary Floating-point
2159 Arithmetic.
2160 -------------------------------------------------------------------------------
2162 floatx80 float64_to_floatx80( float64 a )
2164 flag aSign;
2165 int16 aExp;
2166 bits64 aSig;
2168 aSig = extractFloat64Frac( a );
2169 aExp = extractFloat64Exp( a );
2170 aSign = extractFloat64Sign( a );
2171 if ( aExp == 0x7FF ) {
2172 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2173 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2175 if ( aExp == 0 ) {
2176 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2177 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2179 return
2180 packFloatx80(
2181 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2185 #endif
2187 #ifdef FLOAT128
2190 -------------------------------------------------------------------------------
2191 Returns the result of converting the double-precision floating-point value
2192 `a' to the quadruple-precision floating-point format. The conversion is
2193 performed according to the IEC/IEEE Standard for Binary Floating-point
2194 Arithmetic.
2195 -------------------------------------------------------------------------------
2197 float128 float64_to_float128( float64 a )
2199 flag aSign;
2200 int16 aExp;
2201 bits64 aSig, zSig0, zSig1;
2203 aSig = extractFloat64Frac( a );
2204 aExp = extractFloat64Exp( a );
2205 aSign = extractFloat64Sign( a );
2206 if ( aExp == 0x7FF ) {
2207 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2208 return packFloat128( aSign, 0x7FFF, 0, 0 );
2210 if ( aExp == 0 ) {
2211 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2212 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2213 --aExp;
2215 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2216 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2220 #endif
2223 -------------------------------------------------------------------------------
2224 Rounds the double-precision floating-point value `a' to an integer, and
2225 returns the result as a double-precision floating-point value. The
2226 operation is performed according to the IEC/IEEE Standard for Binary
2227 Floating-point Arithmetic.
2228 -------------------------------------------------------------------------------
2230 float64 float64_round_to_int( float64 a )
2232 flag aSign;
2233 int16 aExp;
2234 bits64 lastBitMask, roundBitsMask;
2235 int8 roundingMode;
2236 float64 z;
2238 aExp = extractFloat64Exp( a );
2239 if ( 0x433 <= aExp ) {
2240 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2241 return propagateFloat64NaN( a, a );
2243 return a;
2245 if ( aExp <= 0x3FE ) {
2246 if ( (bits64) ( a<<1 ) == 0 ) return a;
2247 float_exception_flags |= float_flag_inexact;
2248 aSign = extractFloat64Sign( a );
2249 switch ( float_rounding_mode ) {
2250 case float_round_nearest_even:
2251 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2252 return packFloat64( aSign, 0x3FF, 0 );
2254 break;
2255 case float_round_down:
2256 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2257 case float_round_up:
2258 return
2259 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2261 return packFloat64( aSign, 0, 0 );
2263 lastBitMask = 1;
2264 lastBitMask <<= 0x433 - aExp;
2265 roundBitsMask = lastBitMask - 1;
2266 z = a;
2267 roundingMode = float_rounding_mode;
2268 if ( roundingMode == float_round_nearest_even ) {
2269 z += lastBitMask>>1;
2270 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2272 else if ( roundingMode != float_round_to_zero ) {
2273 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2274 z += roundBitsMask;
2277 z &= ~ roundBitsMask;
2278 if ( z != a ) float_exception_flags |= float_flag_inexact;
2279 return z;
2284 -------------------------------------------------------------------------------
2285 Returns the result of adding the absolute values of the double-precision
2286 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
2287 before being returned. `zSign' is ignored if the result is a NaN. The
2288 addition is performed according to the IEC/IEEE Standard for Binary
2289 Floating-point Arithmetic.
2290 -------------------------------------------------------------------------------
2292 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2294 int16 aExp, bExp, zExp;
2295 bits64 aSig, bSig, zSig;
2296 int16 expDiff;
2298 aSig = extractFloat64Frac( a );
2299 aExp = extractFloat64Exp( a );
2300 bSig = extractFloat64Frac( b );
2301 bExp = extractFloat64Exp( b );
2302 expDiff = aExp - bExp;
2303 aSig <<= 9;
2304 bSig <<= 9;
2305 if ( 0 < expDiff ) {
2306 if ( aExp == 0x7FF ) {
2307 if ( aSig ) return propagateFloat64NaN( a, b );
2308 return a;
2310 if ( bExp == 0 ) {
2311 --expDiff;
2313 else {
2314 bSig |= LIT64( 0x2000000000000000 );
2316 shift64RightJamming( bSig, expDiff, &bSig );
2317 zExp = aExp;
2319 else if ( expDiff < 0 ) {
2320 if ( bExp == 0x7FF ) {
2321 if ( bSig ) return propagateFloat64NaN( a, b );
2322 return packFloat64( zSign, 0x7FF, 0 );
2324 if ( aExp == 0 ) {
2325 ++expDiff;
2327 else {
2328 aSig |= LIT64( 0x2000000000000000 );
2330 shift64RightJamming( aSig, - expDiff, &aSig );
2331 zExp = bExp;
2333 else {
2334 if ( aExp == 0x7FF ) {
2335 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2336 return a;
2338 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2339 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2340 zExp = aExp;
2341 goto roundAndPack;
2343 aSig |= LIT64( 0x2000000000000000 );
2344 zSig = ( aSig + bSig )<<1;
2345 --zExp;
2346 if ( (sbits64) zSig < 0 ) {
2347 zSig = aSig + bSig;
2348 ++zExp;
2350 roundAndPack:
2351 return roundAndPackFloat64( zSign, zExp, zSig );
2356 -------------------------------------------------------------------------------
2357 Returns the result of subtracting the absolute values of the double-
2358 precision floating-point values `a' and `b'. If `zSign' is true, the
2359 difference is negated before being returned. `zSign' is ignored if the
2360 result is a NaN. The subtraction is performed according to the IEC/IEEE
2361 Standard for Binary Floating-point Arithmetic.
2362 -------------------------------------------------------------------------------
2364 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2366 int16 aExp, bExp, zExp;
2367 bits64 aSig, bSig, zSig;
2368 int16 expDiff;
2370 aSig = extractFloat64Frac( a );
2371 aExp = extractFloat64Exp( a );
2372 bSig = extractFloat64Frac( b );
2373 bExp = extractFloat64Exp( b );
2374 expDiff = aExp - bExp;
2375 aSig <<= 10;
2376 bSig <<= 10;
2377 if ( 0 < expDiff ) goto aExpBigger;
2378 if ( expDiff < 0 ) goto bExpBigger;
2379 if ( aExp == 0x7FF ) {
2380 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2381 float_raise( float_flag_invalid );
2382 return float64_default_nan;
2384 if ( aExp == 0 ) {
2385 aExp = 1;
2386 bExp = 1;
2388 if ( bSig < aSig ) goto aBigger;
2389 if ( aSig < bSig ) goto bBigger;
2390 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2391 bExpBigger:
2392 if ( bExp == 0x7FF ) {
2393 if ( bSig ) return propagateFloat64NaN( a, b );
2394 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2396 if ( aExp == 0 ) {
2397 ++expDiff;
2399 else {
2400 aSig |= LIT64( 0x4000000000000000 );
2402 shift64RightJamming( aSig, - expDiff, &aSig );
2403 bSig |= LIT64( 0x4000000000000000 );
2404 bBigger:
2405 zSig = bSig - aSig;
2406 zExp = bExp;
2407 zSign ^= 1;
2408 goto normalizeRoundAndPack;
2409 aExpBigger:
2410 if ( aExp == 0x7FF ) {
2411 if ( aSig ) return propagateFloat64NaN( a, b );
2412 return a;
2414 if ( bExp == 0 ) {
2415 --expDiff;
2417 else {
2418 bSig |= LIT64( 0x4000000000000000 );
2420 shift64RightJamming( bSig, expDiff, &bSig );
2421 aSig |= LIT64( 0x4000000000000000 );
2422 aBigger:
2423 zSig = aSig - bSig;
2424 zExp = aExp;
2425 normalizeRoundAndPack:
2426 --zExp;
2427 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2432 -------------------------------------------------------------------------------
2433 Returns the result of adding the double-precision floating-point values `a'
2434 and `b'. The operation is performed according to the IEC/IEEE Standard for
2435 Binary Floating-point Arithmetic.
2436 -------------------------------------------------------------------------------
2438 float64 float64_add( float64 a, float64 b )
2440 flag aSign, bSign;
2442 aSign = extractFloat64Sign( a );
2443 bSign = extractFloat64Sign( b );
2444 if ( aSign == bSign ) {
2445 return addFloat64Sigs( a, b, aSign );
2447 else {
2448 return subFloat64Sigs( a, b, aSign );
2454 -------------------------------------------------------------------------------
2455 Returns the result of subtracting the double-precision floating-point values
2456 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2457 for Binary Floating-point Arithmetic.
2458 -------------------------------------------------------------------------------
2460 float64 float64_sub( float64 a, float64 b )
2462 flag aSign, bSign;
2464 aSign = extractFloat64Sign( a );
2465 bSign = extractFloat64Sign( b );
2466 if ( aSign == bSign ) {
2467 return subFloat64Sigs( a, b, aSign );
2469 else {
2470 return addFloat64Sigs( a, b, aSign );
2476 -------------------------------------------------------------------------------
2477 Returns the result of multiplying the double-precision floating-point values
2478 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2479 for Binary Floating-point Arithmetic.
2480 -------------------------------------------------------------------------------
2482 float64 float64_mul( float64 a, float64 b )
2484 flag aSign, bSign, zSign;
2485 int16 aExp, bExp, zExp;
2486 bits64 aSig, bSig, zSig0, zSig1;
2488 aSig = extractFloat64Frac( a );
2489 aExp = extractFloat64Exp( a );
2490 aSign = extractFloat64Sign( a );
2491 bSig = extractFloat64Frac( b );
2492 bExp = extractFloat64Exp( b );
2493 bSign = extractFloat64Sign( b );
2494 zSign = aSign ^ bSign;
2495 if ( aExp == 0x7FF ) {
2496 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2497 return propagateFloat64NaN( a, b );
2499 if ( ( bExp | bSig ) == 0 ) {
2500 float_raise( float_flag_invalid );
2501 return float64_default_nan;
2503 return packFloat64( zSign, 0x7FF, 0 );
2505 if ( bExp == 0x7FF ) {
2506 if ( bSig ) return propagateFloat64NaN( a, b );
2507 if ( ( aExp | aSig ) == 0 ) {
2508 float_raise( float_flag_invalid );
2509 return float64_default_nan;
2511 return packFloat64( zSign, 0x7FF, 0 );
2513 if ( aExp == 0 ) {
2514 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2515 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2517 if ( bExp == 0 ) {
2518 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2519 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2521 zExp = aExp + bExp - 0x3FF;
2522 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2523 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2524 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2525 zSig0 |= ( zSig1 != 0 );
2526 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2527 zSig0 <<= 1;
2528 --zExp;
2530 return roundAndPackFloat64( zSign, zExp, zSig0 );
2535 -------------------------------------------------------------------------------
2536 Returns the result of dividing the double-precision floating-point value `a'
2537 by the corresponding value `b'. The operation is performed according to
2538 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2539 -------------------------------------------------------------------------------
2541 float64 float64_div( float64 a, float64 b )
2543 flag aSign, bSign, zSign;
2544 int16 aExp, bExp, zExp;
2545 bits64 aSig, bSig, zSig;
2546 bits64 rem0, rem1;
2547 bits64 term0, term1;
2549 aSig = extractFloat64Frac( a );
2550 aExp = extractFloat64Exp( a );
2551 aSign = extractFloat64Sign( a );
2552 bSig = extractFloat64Frac( b );
2553 bExp = extractFloat64Exp( b );
2554 bSign = extractFloat64Sign( b );
2555 zSign = aSign ^ bSign;
2556 if ( aExp == 0x7FF ) {
2557 if ( aSig ) return propagateFloat64NaN( a, b );
2558 if ( bExp == 0x7FF ) {
2559 if ( bSig ) return propagateFloat64NaN( a, b );
2560 float_raise( float_flag_invalid );
2561 return float64_default_nan;
2563 return packFloat64( zSign, 0x7FF, 0 );
2565 if ( bExp == 0x7FF ) {
2566 if ( bSig ) return propagateFloat64NaN( a, b );
2567 return packFloat64( zSign, 0, 0 );
2569 if ( bExp == 0 ) {
2570 if ( bSig == 0 ) {
2571 if ( ( aExp | aSig ) == 0 ) {
2572 float_raise( float_flag_invalid );
2573 return float64_default_nan;
2575 float_raise( float_flag_divbyzero );
2576 return packFloat64( zSign, 0x7FF, 0 );
2578 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2580 if ( aExp == 0 ) {
2581 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2582 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2584 zExp = aExp - bExp + 0x3FD;
2585 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2586 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2587 if ( bSig <= ( aSig + aSig ) ) {
2588 aSig >>= 1;
2589 ++zExp;
2591 zSig = estimateDiv128To64( aSig, 0, bSig );
2592 if ( ( zSig & 0x1FF ) <= 2 ) {
2593 mul64To128( bSig, zSig, &term0, &term1 );
2594 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2595 while ( (sbits64) rem0 < 0 ) {
2596 --zSig;
2597 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2599 zSig |= ( rem1 != 0 );
2601 return roundAndPackFloat64( zSign, zExp, zSig );
2606 -------------------------------------------------------------------------------
2607 Returns the remainder of the double-precision floating-point value `a'
2608 with respect to the corresponding value `b'. The operation is performed
2609 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2610 -------------------------------------------------------------------------------
2612 float64 float64_rem( float64 a, float64 b )
2614 flag aSign, bSign, zSign;
2615 int16 aExp, bExp, expDiff;
2616 bits64 aSig, bSig;
2617 bits64 q, alternateASig;
2618 sbits64 sigMean;
2620 aSig = extractFloat64Frac( a );
2621 aExp = extractFloat64Exp( a );
2622 aSign = extractFloat64Sign( a );
2623 bSig = extractFloat64Frac( b );
2624 bExp = extractFloat64Exp( b );
2625 bSign = extractFloat64Sign( b );
2626 if ( aExp == 0x7FF ) {
2627 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2628 return propagateFloat64NaN( a, b );
2630 float_raise( float_flag_invalid );
2631 return float64_default_nan;
2633 if ( bExp == 0x7FF ) {
2634 if ( bSig ) return propagateFloat64NaN( a, b );
2635 return a;
2637 if ( bExp == 0 ) {
2638 if ( bSig == 0 ) {
2639 float_raise( float_flag_invalid );
2640 return float64_default_nan;
2642 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2644 if ( aExp == 0 ) {
2645 if ( aSig == 0 ) return a;
2646 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2648 expDiff = aExp - bExp;
2649 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2650 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2651 if ( expDiff < 0 ) {
2652 if ( expDiff < -1 ) return a;
2653 aSig >>= 1;
2655 q = ( bSig <= aSig );
2656 if ( q ) aSig -= bSig;
2657 expDiff -= 64;
2658 while ( 0 < expDiff ) {
2659 q = estimateDiv128To64( aSig, 0, bSig );
2660 q = ( 2 < q ) ? q - 2 : 0;
2661 aSig = - ( ( bSig>>2 ) * q );
2662 expDiff -= 62;
2664 expDiff += 64;
2665 if ( 0 < expDiff ) {
2666 q = estimateDiv128To64( aSig, 0, bSig );
2667 q = ( 2 < q ) ? q - 2 : 0;
2668 q >>= 64 - expDiff;
2669 bSig >>= 2;
2670 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2672 else {
2673 aSig >>= 2;
2674 bSig >>= 2;
2676 do {
2677 alternateASig = aSig;
2678 ++q;
2679 aSig -= bSig;
2680 } while ( 0 <= (sbits64) aSig );
2681 sigMean = aSig + alternateASig;
2682 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2683 aSig = alternateASig;
2685 zSign = ( (sbits64) aSig < 0 );
2686 if ( zSign ) aSig = - aSig;
2687 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2692 -------------------------------------------------------------------------------
2693 Returns the square root of the double-precision floating-point value `a'.
2694 The operation is performed according to the IEC/IEEE Standard for Binary
2695 Floating-point Arithmetic.
2696 -------------------------------------------------------------------------------
2698 float64 float64_sqrt( float64 a )
2700 flag aSign;
2701 int16 aExp, zExp;
2702 bits64 aSig, zSig;
2703 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2704 //float64 z;
2706 aSig = extractFloat64Frac( a );
2707 aExp = extractFloat64Exp( a );
2708 aSign = extractFloat64Sign( a );
2709 if ( aExp == 0x7FF ) {
2710 if ( aSig ) return propagateFloat64NaN( a, a );
2711 if ( ! aSign ) return a;
2712 float_raise( float_flag_invalid );
2713 return float64_default_nan;
2715 if ( aSign ) {
2716 if ( ( aExp | aSig ) == 0 ) return a;
2717 float_raise( float_flag_invalid );
2718 return float64_default_nan;
2720 if ( aExp == 0 ) {
2721 if ( aSig == 0 ) return 0;
2722 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2724 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2725 aSig |= LIT64( 0x0010000000000000 );
2726 zSig = estimateSqrt32( aExp, aSig>>21 );
2727 zSig <<= 31;
2728 aSig <<= 9 - ( aExp & 1 );
2729 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2730 if ( ( zSig & 0x3FF ) <= 5 ) {
2731 if ( zSig < 2 ) {
2732 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2734 else {
2735 aSig <<= 2;
2736 mul64To128( zSig, zSig, &term0, &term1 );
2737 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2738 while ( (sbits64) rem0 < 0 ) {
2739 --zSig;
2740 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2741 term1 |= 1;
2742 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2744 zSig |= ( ( rem0 | rem1 ) != 0 );
2747 shift64RightJamming( zSig, 1, &zSig );
2748 return roundAndPackFloat64( 0, zExp, zSig );
2753 -------------------------------------------------------------------------------
2754 Returns 1 if the double-precision floating-point value `a' is equal to the
2755 corresponding value `b', and 0 otherwise. The comparison is performed
2756 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2757 -------------------------------------------------------------------------------
2759 flag float64_eq( float64 a, float64 b )
2762 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2763 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2765 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2766 float_raise( float_flag_invalid );
2768 return 0;
2770 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2775 -------------------------------------------------------------------------------
2776 Returns 1 if the double-precision floating-point value `a' is less than or
2777 equal to the corresponding value `b', and 0 otherwise. The comparison is
2778 performed according to the IEC/IEEE Standard for Binary Floating-point
2779 Arithmetic.
2780 -------------------------------------------------------------------------------
2782 flag float64_le( float64 a, float64 b )
2784 flag aSign, bSign;
2786 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2787 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2789 float_raise( float_flag_invalid );
2790 return 0;
2792 aSign = extractFloat64Sign( a );
2793 bSign = extractFloat64Sign( b );
2794 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2795 return ( a == b ) || ( aSign ^ ( a < b ) );
2800 -------------------------------------------------------------------------------
2801 Returns 1 if the double-precision floating-point value `a' is less than
2802 the corresponding value `b', and 0 otherwise. The comparison is performed
2803 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2804 -------------------------------------------------------------------------------
2806 flag float64_lt( float64 a, float64 b )
2808 flag aSign, bSign;
2810 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2811 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2813 float_raise( float_flag_invalid );
2814 return 0;
2816 aSign = extractFloat64Sign( a );
2817 bSign = extractFloat64Sign( b );
2818 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2819 return ( a != b ) && ( aSign ^ ( a < b ) );
2824 -------------------------------------------------------------------------------
2825 Returns 1 if the double-precision floating-point value `a' is equal to the
2826 corresponding value `b', and 0 otherwise. The invalid exception is raised
2827 if either operand is a NaN. Otherwise, the comparison is performed
2828 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2829 -------------------------------------------------------------------------------
2831 flag float64_eq_signaling( float64 a, float64 b )
2834 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2835 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2837 float_raise( float_flag_invalid );
2838 return 0;
2840 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2845 -------------------------------------------------------------------------------
2846 Returns 1 if the double-precision floating-point value `a' is less than or
2847 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2848 cause an exception. Otherwise, the comparison is performed according to the
2849 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2850 -------------------------------------------------------------------------------
2852 flag float64_le_quiet( float64 a, float64 b )
2854 flag aSign, bSign;
2855 //int16 aExp, bExp;
2857 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2858 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2860 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2861 float_raise( float_flag_invalid );
2863 return 0;
2865 aSign = extractFloat64Sign( a );
2866 bSign = extractFloat64Sign( b );
2867 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2868 return ( a == b ) || ( aSign ^ ( a < b ) );
2873 -------------------------------------------------------------------------------
2874 Returns 1 if the double-precision floating-point value `a' is less than
2875 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2876 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2877 Standard for Binary Floating-point Arithmetic.
2878 -------------------------------------------------------------------------------
2880 flag float64_lt_quiet( float64 a, float64 b )
2882 flag aSign, bSign;
2884 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2885 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2887 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2888 float_raise( float_flag_invalid );
2890 return 0;
2892 aSign = extractFloat64Sign( a );
2893 bSign = extractFloat64Sign( b );
2894 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2895 return ( a != b ) && ( aSign ^ ( a < b ) );
2899 #ifdef FLOATX80
2902 -------------------------------------------------------------------------------
2903 Returns the result of converting the extended double-precision floating-
2904 point value `a' to the 32-bit two's complement integer format. The
2905 conversion is performed according to the IEC/IEEE Standard for Binary
2906 Floating-point Arithmetic---which means in particular that the conversion
2907 is rounded according to the current rounding mode. If `a' is a NaN, the
2908 largest positive integer is returned. Otherwise, if the conversion
2909 overflows, the largest integer with the same sign as `a' is returned.
2910 -------------------------------------------------------------------------------
2912 int32 floatx80_to_int32( floatx80 a )
2914 flag aSign;
2915 int32 aExp, shiftCount;
2916 bits64 aSig;
2918 aSig = extractFloatx80Frac( a );
2919 aExp = extractFloatx80Exp( a );
2920 aSign = extractFloatx80Sign( a );
2921 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2922 shiftCount = 0x4037 - aExp;
2923 if ( shiftCount <= 0 ) shiftCount = 1;
2924 shift64RightJamming( aSig, shiftCount, &aSig );
2925 return roundAndPackInt32( aSign, aSig );
2930 -------------------------------------------------------------------------------
2931 Returns the result of converting the extended double-precision floating-
2932 point value `a' to the 32-bit two's complement integer format. The
2933 conversion is performed according to the IEC/IEEE Standard for Binary
2934 Floating-point Arithmetic, except that the conversion is always rounded
2935 toward zero. If `a' is a NaN, the largest positive integer is returned.
2936 Otherwise, if the conversion overflows, the largest integer with the same
2937 sign as `a' is returned.
2938 -------------------------------------------------------------------------------
2940 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2942 flag aSign;
2943 int32 aExp, shiftCount;
2944 bits64 aSig, savedASig;
2945 int32 z;
2947 aSig = extractFloatx80Frac( a );
2948 aExp = extractFloatx80Exp( a );
2949 aSign = extractFloatx80Sign( a );
2950 shiftCount = 0x403E - aExp;
2951 if ( shiftCount < 32 ) {
2952 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2953 goto invalid;
2955 else if ( 63 < shiftCount ) {
2956 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2957 return 0;
2959 savedASig = aSig;
2960 aSig >>= shiftCount;
2961 z = aSig;
2962 if ( aSign ) z = - z;
2963 if ( ( z < 0 ) ^ aSign ) {
2964 invalid:
2965 float_exception_flags |= float_flag_invalid;
2966 return aSign ? 0x80000000 : 0x7FFFFFFF;
2968 if ( ( aSig<<shiftCount ) != savedASig ) {
2969 float_exception_flags |= float_flag_inexact;
2971 return z;
2976 -------------------------------------------------------------------------------
2977 Returns the result of converting the extended double-precision floating-
2978 point value `a' to the single-precision floating-point format. The
2979 conversion is performed according to the IEC/IEEE Standard for Binary
2980 Floating-point Arithmetic.
2981 -------------------------------------------------------------------------------
2983 float32 floatx80_to_float32( floatx80 a )
2985 flag aSign;
2986 int32 aExp;
2987 bits64 aSig;
2989 aSig = extractFloatx80Frac( a );
2990 aExp = extractFloatx80Exp( a );
2991 aSign = extractFloatx80Sign( a );
2992 if ( aExp == 0x7FFF ) {
2993 if ( (bits64) ( aSig<<1 ) ) {
2994 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2996 return packFloat32( aSign, 0xFF, 0 );
2998 shift64RightJamming( aSig, 33, &aSig );
2999 if ( aExp || aSig ) aExp -= 0x3F81;
3000 return roundAndPackFloat32( aSign, aExp, aSig );
3005 -------------------------------------------------------------------------------
3006 Returns the result of converting the extended double-precision floating-
3007 point value `a' to the double-precision floating-point format. The
3008 conversion is performed according to the IEC/IEEE Standard for Binary
3009 Floating-point Arithmetic.
3010 -------------------------------------------------------------------------------
3012 float64 floatx80_to_float64( floatx80 a )
3014 flag aSign;
3015 int32 aExp;
3016 bits64 aSig, zSig;
3018 aSig = extractFloatx80Frac( a );
3019 aExp = extractFloatx80Exp( a );
3020 aSign = extractFloatx80Sign( a );
3021 if ( aExp == 0x7FFF ) {
3022 if ( (bits64) ( aSig<<1 ) ) {
3023 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3025 return packFloat64( aSign, 0x7FF, 0 );
3027 shift64RightJamming( aSig, 1, &zSig );
3028 if ( aExp || aSig ) aExp -= 0x3C01;
3029 return roundAndPackFloat64( aSign, aExp, zSig );
3033 #ifdef FLOAT128
3036 -------------------------------------------------------------------------------
3037 Returns the result of converting the extended double-precision floating-
3038 point value `a' to the quadruple-precision floating-point format. The
3039 conversion is performed according to the IEC/IEEE Standard for Binary
3040 Floating-point Arithmetic.
3041 -------------------------------------------------------------------------------
3043 float128 floatx80_to_float128( floatx80 a )
3045 flag aSign;
3046 int16 aExp;
3047 bits64 aSig, zSig0, zSig1;
3049 aSig = extractFloatx80Frac( a );
3050 aExp = extractFloatx80Exp( a );
3051 aSign = extractFloatx80Sign( a );
3052 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3053 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3055 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3056 return packFloat128( aSign, aExp, zSig0, zSig1 );
3060 #endif
3063 -------------------------------------------------------------------------------
3064 Rounds the extended double-precision floating-point value `a' to an integer,
3065 and returns the result as an extended quadruple-precision floating-point
3066 value. The operation is performed according to the IEC/IEEE Standard for
3067 Binary Floating-point Arithmetic.
3068 -------------------------------------------------------------------------------
3070 floatx80 floatx80_round_to_int( floatx80 a )
3072 flag aSign;
3073 int32 aExp;
3074 bits64 lastBitMask, roundBitsMask;
3075 int8 roundingMode;
3076 floatx80 z;
3078 aExp = extractFloatx80Exp( a );
3079 if ( 0x403E <= aExp ) {
3080 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3081 return propagateFloatx80NaN( a, a );
3083 return a;
3085 if ( aExp <= 0x3FFE ) {
3086 if ( ( aExp == 0 )
3087 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3088 return a;
3090 float_exception_flags |= float_flag_inexact;
3091 aSign = extractFloatx80Sign( a );
3092 switch ( float_rounding_mode ) {
3093 case float_round_nearest_even:
3094 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3096 return
3097 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3099 break;
3100 case float_round_down:
3101 return
3102 aSign ?
3103 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3104 : packFloatx80( 0, 0, 0 );
3105 case float_round_up:
3106 return
3107 aSign ? packFloatx80( 1, 0, 0 )
3108 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3110 return packFloatx80( aSign, 0, 0 );
3112 lastBitMask = 1;
3113 lastBitMask <<= 0x403E - aExp;
3114 roundBitsMask = lastBitMask - 1;
3115 z = a;
3116 roundingMode = float_rounding_mode;
3117 if ( roundingMode == float_round_nearest_even ) {
3118 z.low += lastBitMask>>1;
3119 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3121 else if ( roundingMode != float_round_to_zero ) {
3122 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3123 z.low += roundBitsMask;
3126 z.low &= ~ roundBitsMask;
3127 if ( z.low == 0 ) {
3128 ++z.high;
3129 z.low = LIT64( 0x8000000000000000 );
3131 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3132 return z;
3137 -------------------------------------------------------------------------------
3138 Returns the result of adding the absolute values of the extended double-
3139 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
3140 negated before being returned. `zSign' is ignored if the result is a NaN.
3141 The addition is performed according to the IEC/IEEE Standard for Binary
3142 Floating-point Arithmetic.
3143 -------------------------------------------------------------------------------
3145 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3147 int32 aExp, bExp, zExp;
3148 bits64 aSig, bSig, zSig0, zSig1;
3149 int32 expDiff;
3151 aSig = extractFloatx80Frac( a );
3152 aExp = extractFloatx80Exp( a );
3153 bSig = extractFloatx80Frac( b );
3154 bExp = extractFloatx80Exp( b );
3155 expDiff = aExp - bExp;
3156 if ( 0 < expDiff ) {
3157 if ( aExp == 0x7FFF ) {
3158 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3159 return a;
3161 if ( bExp == 0 ) --expDiff;
3162 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3163 zExp = aExp;
3165 else if ( expDiff < 0 ) {
3166 if ( bExp == 0x7FFF ) {
3167 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3168 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3170 if ( aExp == 0 ) ++expDiff;
3171 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3172 zExp = bExp;
3174 else {
3175 if ( aExp == 0x7FFF ) {
3176 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3177 return propagateFloatx80NaN( a, b );
3179 return a;
3181 zSig1 = 0;
3182 zSig0 = aSig + bSig;
3183 if ( aExp == 0 ) {
3184 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3185 goto roundAndPack;
3187 zExp = aExp;
3188 goto shiftRight1;
3191 zSig0 = aSig + bSig;
3193 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3194 shiftRight1:
3195 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3196 zSig0 |= LIT64( 0x8000000000000000 );
3197 ++zExp;
3198 roundAndPack:
3199 return
3200 roundAndPackFloatx80(
3201 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3206 -------------------------------------------------------------------------------
3207 Returns the result of subtracting the absolute values of the extended
3208 double-precision floating-point values `a' and `b'. If `zSign' is true,
3209 the difference is negated before being returned. `zSign' is ignored if the
3210 result is a NaN. The subtraction is performed according to the IEC/IEEE
3211 Standard for Binary Floating-point Arithmetic.
3212 -------------------------------------------------------------------------------
3214 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3216 int32 aExp, bExp, zExp;
3217 bits64 aSig, bSig, zSig0, zSig1;
3218 int32 expDiff;
3219 floatx80 z;
3221 aSig = extractFloatx80Frac( a );
3222 aExp = extractFloatx80Exp( a );
3223 bSig = extractFloatx80Frac( b );
3224 bExp = extractFloatx80Exp( b );
3225 expDiff = aExp - bExp;
3226 if ( 0 < expDiff ) goto aExpBigger;
3227 if ( expDiff < 0 ) goto bExpBigger;
3228 if ( aExp == 0x7FFF ) {
3229 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3230 return propagateFloatx80NaN( a, b );
3232 float_raise( float_flag_invalid );
3233 z.low = floatx80_default_nan_low;
3234 z.high = floatx80_default_nan_high;
3235 return z;
3237 if ( aExp == 0 ) {
3238 aExp = 1;
3239 bExp = 1;
3241 zSig1 = 0;
3242 if ( bSig < aSig ) goto aBigger;
3243 if ( aSig < bSig ) goto bBigger;
3244 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3245 bExpBigger:
3246 if ( bExp == 0x7FFF ) {
3247 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3248 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3250 if ( aExp == 0 ) ++expDiff;
3251 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3252 bBigger:
3253 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3254 zExp = bExp;
3255 zSign ^= 1;
3256 goto normalizeRoundAndPack;
3257 aExpBigger:
3258 if ( aExp == 0x7FFF ) {
3259 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3260 return a;
3262 if ( bExp == 0 ) --expDiff;
3263 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3264 aBigger:
3265 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3266 zExp = aExp;
3267 normalizeRoundAndPack:
3268 return
3269 normalizeRoundAndPackFloatx80(
3270 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3275 -------------------------------------------------------------------------------
3276 Returns the result of adding the extended double-precision floating-point
3277 values `a' and `b'. The operation is performed according to the IEC/IEEE
3278 Standard for Binary Floating-point Arithmetic.
3279 -------------------------------------------------------------------------------
3281 floatx80 floatx80_add( floatx80 a, floatx80 b )
3283 flag aSign, bSign;
3285 aSign = extractFloatx80Sign( a );
3286 bSign = extractFloatx80Sign( b );
3287 if ( aSign == bSign ) {
3288 return addFloatx80Sigs( a, b, aSign );
3290 else {
3291 return subFloatx80Sigs( a, b, aSign );
3297 -------------------------------------------------------------------------------
3298 Returns the result of subtracting the extended double-precision floating-
3299 point values `a' and `b'. The operation is performed according to the
3300 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3301 -------------------------------------------------------------------------------
3303 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3305 flag aSign, bSign;
3307 aSign = extractFloatx80Sign( a );
3308 bSign = extractFloatx80Sign( b );
3309 if ( aSign == bSign ) {
3310 return subFloatx80Sigs( a, b, aSign );
3312 else {
3313 return addFloatx80Sigs( a, b, aSign );
3319 -------------------------------------------------------------------------------
3320 Returns the result of multiplying the extended double-precision floating-
3321 point values `a' and `b'. The operation is performed according to the
3322 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3323 -------------------------------------------------------------------------------
3325 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3327 flag aSign, bSign, zSign;
3328 int32 aExp, bExp, zExp;
3329 bits64 aSig, bSig, zSig0, zSig1;
3330 floatx80 z;
3332 aSig = extractFloatx80Frac( a );
3333 aExp = extractFloatx80Exp( a );
3334 aSign = extractFloatx80Sign( a );
3335 bSig = extractFloatx80Frac( b );
3336 bExp = extractFloatx80Exp( b );
3337 bSign = extractFloatx80Sign( b );
3338 zSign = aSign ^ bSign;
3339 if ( aExp == 0x7FFF ) {
3340 if ( (bits64) ( aSig<<1 )
3341 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3342 return propagateFloatx80NaN( a, b );
3344 if ( ( bExp | bSig ) == 0 ) goto invalid;
3345 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3347 if ( bExp == 0x7FFF ) {
3348 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3349 if ( ( aExp | aSig ) == 0 ) {
3350 invalid:
3351 float_raise( float_flag_invalid );
3352 z.low = floatx80_default_nan_low;
3353 z.high = floatx80_default_nan_high;
3354 return z;
3356 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3358 if ( aExp == 0 ) {
3359 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3360 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3362 if ( bExp == 0 ) {
3363 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3364 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3366 zExp = aExp + bExp - 0x3FFE;
3367 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3368 if ( 0 < (sbits64) zSig0 ) {
3369 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3370 --zExp;
3372 return
3373 roundAndPackFloatx80(
3374 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3379 -------------------------------------------------------------------------------
3380 Returns the result of dividing the extended double-precision floating-point
3381 value `a' by the corresponding value `b'. The operation is performed
3382 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3383 -------------------------------------------------------------------------------
3385 floatx80 floatx80_div( floatx80 a, floatx80 b )
3387 flag aSign, bSign, zSign;
3388 int32 aExp, bExp, zExp;
3389 bits64 aSig, bSig, zSig0, zSig1;
3390 bits64 rem0, rem1, rem2, term0, term1, term2;
3391 floatx80 z;
3393 aSig = extractFloatx80Frac( a );
3394 aExp = extractFloatx80Exp( a );
3395 aSign = extractFloatx80Sign( a );
3396 bSig = extractFloatx80Frac( b );
3397 bExp = extractFloatx80Exp( b );
3398 bSign = extractFloatx80Sign( b );
3399 zSign = aSign ^ bSign;
3400 if ( aExp == 0x7FFF ) {
3401 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3402 if ( bExp == 0x7FFF ) {
3403 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3404 goto invalid;
3406 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3408 if ( bExp == 0x7FFF ) {
3409 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3410 return packFloatx80( zSign, 0, 0 );
3412 if ( bExp == 0 ) {
3413 if ( bSig == 0 ) {
3414 if ( ( aExp | aSig ) == 0 ) {
3415 invalid:
3416 float_raise( float_flag_invalid );
3417 z.low = floatx80_default_nan_low;
3418 z.high = floatx80_default_nan_high;
3419 return z;
3421 float_raise( float_flag_divbyzero );
3422 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3424 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3426 if ( aExp == 0 ) {
3427 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3428 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3430 zExp = aExp - bExp + 0x3FFE;
3431 rem1 = 0;
3432 if ( bSig <= aSig ) {
3433 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3434 ++zExp;
3436 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3437 mul64To128( bSig, zSig0, &term0, &term1 );
3438 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3439 while ( (sbits64) rem0 < 0 ) {
3440 --zSig0;
3441 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3443 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3444 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3445 mul64To128( bSig, zSig1, &term1, &term2 );
3446 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3447 while ( (sbits64) rem1 < 0 ) {
3448 --zSig1;
3449 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3451 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3453 return
3454 roundAndPackFloatx80(
3455 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3460 -------------------------------------------------------------------------------
3461 Returns the remainder of the extended double-precision floating-point value
3462 `a' with respect to the corresponding value `b'. The operation is performed
3463 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3464 -------------------------------------------------------------------------------
3466 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3468 flag aSign, bSign, zSign;
3469 int32 aExp, bExp, expDiff;
3470 bits64 aSig0, aSig1, bSig;
3471 bits64 q, term0, term1, alternateASig0, alternateASig1;
3472 floatx80 z;
3474 aSig0 = extractFloatx80Frac( a );
3475 aExp = extractFloatx80Exp( a );
3476 aSign = extractFloatx80Sign( a );
3477 bSig = extractFloatx80Frac( b );
3478 bExp = extractFloatx80Exp( b );
3479 bSign = extractFloatx80Sign( b );
3480 if ( aExp == 0x7FFF ) {
3481 if ( (bits64) ( aSig0<<1 )
3482 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3483 return propagateFloatx80NaN( a, b );
3485 goto invalid;
3487 if ( bExp == 0x7FFF ) {
3488 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3489 return a;
3491 if ( bExp == 0 ) {
3492 if ( bSig == 0 ) {
3493 invalid:
3494 float_raise( float_flag_invalid );
3495 z.low = floatx80_default_nan_low;
3496 z.high = floatx80_default_nan_high;
3497 return z;
3499 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3501 if ( aExp == 0 ) {
3502 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3503 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3505 bSig |= LIT64( 0x8000000000000000 );
3506 zSign = aSign;
3507 expDiff = aExp - bExp;
3508 aSig1 = 0;
3509 if ( expDiff < 0 ) {
3510 if ( expDiff < -1 ) return a;
3511 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3512 expDiff = 0;
3514 q = ( bSig <= aSig0 );
3515 if ( q ) aSig0 -= bSig;
3516 expDiff -= 64;
3517 while ( 0 < expDiff ) {
3518 q = estimateDiv128To64( aSig0, aSig1, bSig );
3519 q = ( 2 < q ) ? q - 2 : 0;
3520 mul64To128( bSig, q, &term0, &term1 );
3521 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3522 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3523 expDiff -= 62;
3525 expDiff += 64;
3526 if ( 0 < expDiff ) {
3527 q = estimateDiv128To64( aSig0, aSig1, bSig );
3528 q = ( 2 < q ) ? q - 2 : 0;
3529 q >>= 64 - expDiff;
3530 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3531 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3532 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3533 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3534 ++q;
3535 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3538 else {
3539 term1 = 0;
3540 term0 = bSig;
3542 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3543 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3544 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3545 && ( q & 1 ) )
3547 aSig0 = alternateASig0;
3548 aSig1 = alternateASig1;
3549 zSign = ! zSign;
3551 return
3552 normalizeRoundAndPackFloatx80(
3553 80, zSign, bExp + expDiff, aSig0, aSig1 );
3558 -------------------------------------------------------------------------------
3559 Returns the square root of the extended double-precision floating-point
3560 value `a'. The operation is performed according to the IEC/IEEE Standard
3561 for Binary Floating-point Arithmetic.
3562 -------------------------------------------------------------------------------
3564 floatx80 floatx80_sqrt( floatx80 a )
3566 flag aSign;
3567 int32 aExp, zExp;
3568 bits64 aSig0, aSig1, zSig0, zSig1;
3569 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3570 bits64 shiftedRem0, shiftedRem1;
3571 floatx80 z;
3573 aSig0 = extractFloatx80Frac( a );
3574 aExp = extractFloatx80Exp( a );
3575 aSign = extractFloatx80Sign( a );
3576 if ( aExp == 0x7FFF ) {
3577 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3578 if ( ! aSign ) return a;
3579 goto invalid;
3581 if ( aSign ) {
3582 if ( ( aExp | aSig0 ) == 0 ) return a;
3583 invalid:
3584 float_raise( float_flag_invalid );
3585 z.low = floatx80_default_nan_low;
3586 z.high = floatx80_default_nan_high;
3587 return z;
3589 if ( aExp == 0 ) {
3590 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3591 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3593 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3594 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3595 zSig0 <<= 31;
3596 aSig1 = 0;
3597 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3598 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3599 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3600 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3601 mul64To128( zSig0, zSig0, &term0, &term1 );
3602 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3603 while ( (sbits64) rem0 < 0 ) {
3604 --zSig0;
3605 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3606 term1 |= 1;
3607 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3609 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3610 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3611 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3612 if ( zSig1 == 0 ) zSig1 = 1;
3613 mul64To128( zSig0, zSig1, &term1, &term2 );
3614 shortShift128Left( term1, term2, 1, &term1, &term2 );
3615 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3616 mul64To128( zSig1, zSig1, &term2, &term3 );
3617 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3618 while ( (sbits64) rem1 < 0 ) {
3619 --zSig1;
3620 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3621 term3 |= 1;
3622 add192(
3623 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3625 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3627 return
3628 roundAndPackFloatx80(
3629 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3634 -------------------------------------------------------------------------------
3635 Returns 1 if the extended double-precision floating-point value `a' is
3636 equal to the corresponding value `b', and 0 otherwise. The comparison is
3637 performed according to the IEC/IEEE Standard for Binary Floating-point
3638 Arithmetic.
3639 -------------------------------------------------------------------------------
3641 flag floatx80_eq( floatx80 a, floatx80 b )
3644 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3645 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3646 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3647 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3649 if ( floatx80_is_signaling_nan( a )
3650 || floatx80_is_signaling_nan( b ) ) {
3651 float_raise( float_flag_invalid );
3653 return 0;
3655 return
3656 ( a.low == b.low )
3657 && ( ( a.high == b.high )
3658 || ( ( a.low == 0 )
3659 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3665 -------------------------------------------------------------------------------
3666 Returns 1 if the extended double-precision floating-point value `a' is
3667 less than or equal to the corresponding value `b', and 0 otherwise. The
3668 comparison is performed according to the IEC/IEEE Standard for Binary
3669 Floating-point Arithmetic.
3670 -------------------------------------------------------------------------------
3672 flag floatx80_le( floatx80 a, floatx80 b )
3674 flag aSign, bSign;
3676 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3677 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3678 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3679 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3681 float_raise( float_flag_invalid );
3682 return 0;
3684 aSign = extractFloatx80Sign( a );
3685 bSign = extractFloatx80Sign( b );
3686 if ( aSign != bSign ) {
3687 return
3688 aSign
3689 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3690 == 0 );
3692 return
3693 aSign ? le128( b.high, b.low, a.high, a.low )
3694 : le128( a.high, a.low, b.high, b.low );
3699 -------------------------------------------------------------------------------
3700 Returns 1 if the extended double-precision floating-point value `a' is
3701 less than the corresponding value `b', and 0 otherwise. The comparison
3702 is performed according to the IEC/IEEE Standard for Binary Floating-point
3703 Arithmetic.
3704 -------------------------------------------------------------------------------
3706 flag floatx80_lt( floatx80 a, floatx80 b )
3708 flag aSign, bSign;
3710 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3711 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3712 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3713 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3715 float_raise( float_flag_invalid );
3716 return 0;
3718 aSign = extractFloatx80Sign( a );
3719 bSign = extractFloatx80Sign( b );
3720 if ( aSign != bSign ) {
3721 return
3722 aSign
3723 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3724 != 0 );
3726 return
3727 aSign ? lt128( b.high, b.low, a.high, a.low )
3728 : lt128( a.high, a.low, b.high, b.low );
3733 -------------------------------------------------------------------------------
3734 Returns 1 if the extended double-precision floating-point value `a' is equal
3735 to the corresponding value `b', and 0 otherwise. The invalid exception is
3736 raised if either operand is a NaN. Otherwise, the comparison is performed
3737 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3738 -------------------------------------------------------------------------------
3740 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3743 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3744 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3745 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3746 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3748 float_raise( float_flag_invalid );
3749 return 0;
3751 return
3752 ( a.low == b.low )
3753 && ( ( a.high == b.high )
3754 || ( ( a.low == 0 )
3755 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3761 -------------------------------------------------------------------------------
3762 Returns 1 if the extended double-precision floating-point value `a' is less
3763 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3764 do not cause an exception. Otherwise, the comparison is performed according
3765 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3766 -------------------------------------------------------------------------------
3768 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3770 flag aSign, bSign;
3772 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3773 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3774 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3775 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3777 if ( floatx80_is_signaling_nan( a )
3778 || floatx80_is_signaling_nan( b ) ) {
3779 float_raise( float_flag_invalid );
3781 return 0;
3783 aSign = extractFloatx80Sign( a );
3784 bSign = extractFloatx80Sign( b );
3785 if ( aSign != bSign ) {
3786 return
3787 aSign
3788 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3789 == 0 );
3791 return
3792 aSign ? le128( b.high, b.low, a.high, a.low )
3793 : le128( a.high, a.low, b.high, b.low );
3798 -------------------------------------------------------------------------------
3799 Returns 1 if the extended double-precision floating-point value `a' is less
3800 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3801 an exception. Otherwise, the comparison is performed according to the
3802 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3803 -------------------------------------------------------------------------------
3805 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3807 flag aSign, bSign;
3809 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3810 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3811 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3812 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3814 if ( floatx80_is_signaling_nan( a )
3815 || floatx80_is_signaling_nan( b ) ) {
3816 float_raise( float_flag_invalid );
3818 return 0;
3820 aSign = extractFloatx80Sign( a );
3821 bSign = extractFloatx80Sign( b );
3822 if ( aSign != bSign ) {
3823 return
3824 aSign
3825 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3826 != 0 );
3828 return
3829 aSign ? lt128( b.high, b.low, a.high, a.low )
3830 : lt128( a.high, a.low, b.high, b.low );
3834 #endif
3836 #ifdef FLOAT128
3839 -------------------------------------------------------------------------------
3840 Returns the result of converting the quadruple-precision floating-point
3841 value `a' to the 32-bit two's complement integer format. The conversion
3842 is performed according to the IEC/IEEE Standard for Binary Floating-point
3843 Arithmetic---which means in particular that the conversion is rounded
3844 according to the current rounding mode. If `a' is a NaN, the largest
3845 positive integer is returned. Otherwise, if the conversion overflows, the
3846 largest integer with the same sign as `a' is returned.
3847 -------------------------------------------------------------------------------
3849 int32 float128_to_int32( float128 a )
3851 flag aSign;
3852 int32 aExp, shiftCount;
3853 bits64 aSig0, aSig1;
3855 aSig1 = extractFloat128Frac1( a );
3856 aSig0 = extractFloat128Frac0( a );
3857 aExp = extractFloat128Exp( a );
3858 aSign = extractFloat128Sign( a );
3859 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3860 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3861 aSig0 |= ( aSig1 != 0 );
3862 shiftCount = 0x4028 - aExp;
3863 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3864 return roundAndPackInt32( aSign, aSig0 );
3869 -------------------------------------------------------------------------------
3870 Returns the result of converting the quadruple-precision floating-point
3871 value `a' to the 32-bit two's complement integer format. The conversion
3872 is performed according to the IEC/IEEE Standard for Binary Floating-point
3873 Arithmetic, except that the conversion is always rounded toward zero. If
3874 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
3875 conversion overflows, the largest integer with the same sign as `a' is
3876 returned.
3877 -------------------------------------------------------------------------------
3879 int32 float128_to_int32_round_to_zero( float128 a )
3881 flag aSign;
3882 int32 aExp, shiftCount;
3883 bits64 aSig0, aSig1, savedASig;
3884 int32 z;
3886 aSig1 = extractFloat128Frac1( a );
3887 aSig0 = extractFloat128Frac0( a );
3888 aExp = extractFloat128Exp( a );
3889 aSign = extractFloat128Sign( a );
3890 aSig0 |= ( aSig1 != 0 );
3891 shiftCount = 0x402F - aExp;
3892 if ( shiftCount < 17 ) {
3893 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3894 goto invalid;
3896 else if ( 48 < shiftCount ) {
3897 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3898 return 0;
3900 aSig0 |= LIT64( 0x0001000000000000 );
3901 savedASig = aSig0;
3902 aSig0 >>= shiftCount;
3903 z = aSig0;
3904 if ( aSign ) z = - z;
3905 if ( ( z < 0 ) ^ aSign ) {
3906 invalid:
3907 float_exception_flags |= float_flag_invalid;
3908 return aSign ? 0x80000000 : 0x7FFFFFFF;
3910 if ( ( aSig0<<shiftCount ) != savedASig ) {
3911 float_exception_flags |= float_flag_inexact;
3913 return z;
3918 -------------------------------------------------------------------------------
3919 Returns the result of converting the quadruple-precision floating-point
3920 value `a' to the single-precision floating-point format. The conversion
3921 is performed according to the IEC/IEEE Standard for Binary Floating-point
3922 Arithmetic.
3923 -------------------------------------------------------------------------------
3925 float32 float128_to_float32( float128 a )
3927 flag aSign;
3928 int32 aExp;
3929 bits64 aSig0, aSig1;
3930 bits32 zSig;
3932 aSig1 = extractFloat128Frac1( a );
3933 aSig0 = extractFloat128Frac0( a );
3934 aExp = extractFloat128Exp( a );
3935 aSign = extractFloat128Sign( a );
3936 if ( aExp == 0x7FFF ) {
3937 if ( aSig0 | aSig1 ) {
3938 return commonNaNToFloat32( float128ToCommonNaN( a ) );
3940 return packFloat32( aSign, 0xFF, 0 );
3942 aSig0 |= ( aSig1 != 0 );
3943 shift64RightJamming( aSig0, 18, &aSig0 );
3944 zSig = aSig0;
3945 if ( aExp || zSig ) {
3946 zSig |= 0x40000000;
3947 aExp -= 0x3F81;
3949 return roundAndPackFloat32( aSign, aExp, zSig );
3954 -------------------------------------------------------------------------------
3955 Returns the result of converting the quadruple-precision floating-point
3956 value `a' to the double-precision floating-point format. The conversion
3957 is performed according to the IEC/IEEE Standard for Binary Floating-point
3958 Arithmetic.
3959 -------------------------------------------------------------------------------
3961 float64 float128_to_float64( float128 a )
3963 flag aSign;
3964 int32 aExp;
3965 bits64 aSig0, aSig1;
3967 aSig1 = extractFloat128Frac1( a );
3968 aSig0 = extractFloat128Frac0( a );
3969 aExp = extractFloat128Exp( a );
3970 aSign = extractFloat128Sign( a );
3971 if ( aExp == 0x7FFF ) {
3972 if ( aSig0 | aSig1 ) {
3973 return commonNaNToFloat64( float128ToCommonNaN( a ) );
3975 return packFloat64( aSign, 0x7FF, 0 );
3977 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
3978 aSig0 |= ( aSig1 != 0 );
3979 if ( aExp || aSig0 ) {
3980 aSig0 |= LIT64( 0x4000000000000000 );
3981 aExp -= 0x3C01;
3983 return roundAndPackFloat64( aSign, aExp, aSig0 );
3987 #ifdef FLOATX80
3990 -------------------------------------------------------------------------------
3991 Returns the result of converting the quadruple-precision floating-point
3992 value `a' to the extended double-precision floating-point format. The
3993 conversion is performed according to the IEC/IEEE Standard for Binary
3994 Floating-point Arithmetic.
3995 -------------------------------------------------------------------------------
3997 floatx80 float128_to_floatx80( float128 a )
3999 flag aSign;
4000 int32 aExp;
4001 bits64 aSig0, aSig1;
4003 aSig1 = extractFloat128Frac1( a );
4004 aSig0 = extractFloat128Frac0( a );
4005 aExp = extractFloat128Exp( a );
4006 aSign = extractFloat128Sign( a );
4007 if ( aExp == 0x7FFF ) {
4008 if ( aSig0 | aSig1 ) {
4009 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4011 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4013 if ( aExp == 0 ) {
4014 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4015 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4017 else {
4018 aSig0 |= LIT64( 0x0001000000000000 );
4020 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4021 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4025 #endif
4028 -------------------------------------------------------------------------------
4029 Rounds the quadruple-precision floating-point value `a' to an integer, and
4030 returns the result as a quadruple-precision floating-point value. The
4031 operation is performed according to the IEC/IEEE Standard for Binary
4032 Floating-point Arithmetic.
4033 -------------------------------------------------------------------------------
4035 float128 float128_round_to_int( float128 a )
4037 flag aSign;
4038 int32 aExp;
4039 bits64 lastBitMask, roundBitsMask;
4040 int8 roundingMode;
4041 float128 z;
4043 aExp = extractFloat128Exp( a );
4044 if ( 0x402F <= aExp ) {
4045 if ( 0x406F <= aExp ) {
4046 if ( ( aExp == 0x7FFF )
4047 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4049 return propagateFloat128NaN( a, a );
4051 return a;
4053 lastBitMask = 1;
4054 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4055 roundBitsMask = lastBitMask - 1;
4056 z = a;
4057 roundingMode = float_rounding_mode;
4058 if ( roundingMode == float_round_nearest_even ) {
4059 if ( lastBitMask ) {
4060 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4061 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4063 else {
4064 if ( (sbits64) z.low < 0 ) {
4065 ++z.high;
4066 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4070 else if ( roundingMode != float_round_to_zero ) {
4071 if ( extractFloat128Sign( z )
4072 ^ ( roundingMode == float_round_up ) ) {
4073 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4076 z.low &= ~ roundBitsMask;
4078 else {
4079 if ( aExp <= 0x3FFE ) {
4080 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4081 float_exception_flags |= float_flag_inexact;
4082 aSign = extractFloat128Sign( a );
4083 switch ( float_rounding_mode ) {
4084 case float_round_nearest_even:
4085 if ( ( aExp == 0x3FFE )
4086 && ( extractFloat128Frac0( a )
4087 | extractFloat128Frac1( a ) )
4089 return packFloat128( aSign, 0x3FFF, 0, 0 );
4091 break;
4092 case float_round_down:
4093 return
4094 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4095 : packFloat128( 0, 0, 0, 0 );
4096 case float_round_up:
4097 return
4098 aSign ? packFloat128( 1, 0, 0, 0 )
4099 : packFloat128( 0, 0x3FFF, 0, 0 );
4101 return packFloat128( aSign, 0, 0, 0 );
4103 lastBitMask = 1;
4104 lastBitMask <<= 0x402F - aExp;
4105 roundBitsMask = lastBitMask - 1;
4106 z.low = 0;
4107 z.high = a.high;
4108 roundingMode = float_rounding_mode;
4109 if ( roundingMode == float_round_nearest_even ) {
4110 z.high += lastBitMask>>1;
4111 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4112 z.high &= ~ lastBitMask;
4115 else if ( roundingMode != float_round_to_zero ) {
4116 if ( extractFloat128Sign( z )
4117 ^ ( roundingMode == float_round_up ) ) {
4118 z.high |= ( a.low != 0 );
4119 z.high += roundBitsMask;
4122 z.high &= ~ roundBitsMask;
4124 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4125 float_exception_flags |= float_flag_inexact;
4127 return z;
4132 -------------------------------------------------------------------------------
4133 Returns the result of adding the absolute values of the quadruple-precision
4134 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
4135 before being returned. `zSign' is ignored if the result is a NaN. The
4136 addition is performed according to the IEC/IEEE Standard for Binary
4137 Floating-point Arithmetic.
4138 -------------------------------------------------------------------------------
4140 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4142 int32 aExp, bExp, zExp;
4143 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4144 int32 expDiff;
4146 aSig1 = extractFloat128Frac1( a );
4147 aSig0 = extractFloat128Frac0( a );
4148 aExp = extractFloat128Exp( a );
4149 bSig1 = extractFloat128Frac1( b );
4150 bSig0 = extractFloat128Frac0( b );
4151 bExp = extractFloat128Exp( b );
4152 expDiff = aExp - bExp;
4153 if ( 0 < expDiff ) {
4154 if ( aExp == 0x7FFF ) {
4155 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4156 return a;
4158 if ( bExp == 0 ) {
4159 --expDiff;
4161 else {
4162 bSig0 |= LIT64( 0x0001000000000000 );
4164 shift128ExtraRightJamming(
4165 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4166 zExp = aExp;
4168 else if ( expDiff < 0 ) {
4169 if ( bExp == 0x7FFF ) {
4170 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4171 return packFloat128( zSign, 0x7FFF, 0, 0 );
4173 if ( aExp == 0 ) {
4174 ++expDiff;
4176 else {
4177 aSig0 |= LIT64( 0x0001000000000000 );
4179 shift128ExtraRightJamming(
4180 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4181 zExp = bExp;
4183 else {
4184 if ( aExp == 0x7FFF ) {
4185 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4186 return propagateFloat128NaN( a, b );
4188 return a;
4190 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4191 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4192 zSig2 = 0;
4193 zSig0 |= LIT64( 0x0002000000000000 );
4194 zExp = aExp;
4195 goto shiftRight1;
4197 aSig0 |= LIT64( 0x0001000000000000 );
4198 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4199 --zExp;
4200 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4201 ++zExp;
4202 shiftRight1:
4203 shift128ExtraRightJamming(
4204 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4205 roundAndPack:
4206 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4211 -------------------------------------------------------------------------------
4212 Returns the result of subtracting the absolute values of the quadruple-
4213 precision floating-point values `a' and `b'. If `zSign' is true, the
4214 difference is negated before being returned. `zSign' is ignored if the
4215 result is a NaN. The subtraction is performed according to the IEC/IEEE
4216 Standard for Binary Floating-point Arithmetic.
4217 -------------------------------------------------------------------------------
4219 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4221 int32 aExp, bExp, zExp;
4222 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4223 int32 expDiff;
4224 float128 z;
4226 aSig1 = extractFloat128Frac1( a );
4227 aSig0 = extractFloat128Frac0( a );
4228 aExp = extractFloat128Exp( a );
4229 bSig1 = extractFloat128Frac1( b );
4230 bSig0 = extractFloat128Frac0( b );
4231 bExp = extractFloat128Exp( b );
4232 expDiff = aExp - bExp;
4233 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4234 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4235 if ( 0 < expDiff ) goto aExpBigger;
4236 if ( expDiff < 0 ) goto bExpBigger;
4237 if ( aExp == 0x7FFF ) {
4238 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4239 return propagateFloat128NaN( a, b );
4241 float_raise( float_flag_invalid );
4242 z.low = float128_default_nan_low;
4243 z.high = float128_default_nan_high;
4244 return z;
4246 if ( aExp == 0 ) {
4247 aExp = 1;
4248 bExp = 1;
4250 if ( bSig0 < aSig0 ) goto aBigger;
4251 if ( aSig0 < bSig0 ) goto bBigger;
4252 if ( bSig1 < aSig1 ) goto aBigger;
4253 if ( aSig1 < bSig1 ) goto bBigger;
4254 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4255 bExpBigger:
4256 if ( bExp == 0x7FFF ) {
4257 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4258 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4260 if ( aExp == 0 ) {
4261 ++expDiff;
4263 else {
4264 aSig0 |= LIT64( 0x4000000000000000 );
4266 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4267 bSig0 |= LIT64( 0x4000000000000000 );
4268 bBigger:
4269 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4270 zExp = bExp;
4271 zSign ^= 1;
4272 goto normalizeRoundAndPack;
4273 aExpBigger:
4274 if ( aExp == 0x7FFF ) {
4275 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4276 return a;
4278 if ( bExp == 0 ) {
4279 --expDiff;
4281 else {
4282 bSig0 |= LIT64( 0x4000000000000000 );
4284 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4285 aSig0 |= LIT64( 0x4000000000000000 );
4286 aBigger:
4287 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4288 zExp = aExp;
4289 normalizeRoundAndPack:
4290 --zExp;
4291 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4296 -------------------------------------------------------------------------------
4297 Returns the result of adding the quadruple-precision floating-point values
4298 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4299 for Binary Floating-point Arithmetic.
4300 -------------------------------------------------------------------------------
4302 float128 float128_add( float128 a, float128 b )
4304 flag aSign, bSign;
4306 aSign = extractFloat128Sign( a );
4307 bSign = extractFloat128Sign( b );
4308 if ( aSign == bSign ) {
4309 return addFloat128Sigs( a, b, aSign );
4311 else {
4312 return subFloat128Sigs( a, b, aSign );
4318 -------------------------------------------------------------------------------
4319 Returns the result of subtracting the quadruple-precision floating-point
4320 values `a' and `b'. The operation is performed according to the IEC/IEEE
4321 Standard for Binary Floating-point Arithmetic.
4322 -------------------------------------------------------------------------------
4324 float128 float128_sub( float128 a, float128 b )
4326 flag aSign, bSign;
4328 aSign = extractFloat128Sign( a );
4329 bSign = extractFloat128Sign( b );
4330 if ( aSign == bSign ) {
4331 return subFloat128Sigs( a, b, aSign );
4333 else {
4334 return addFloat128Sigs( a, b, aSign );
4340 -------------------------------------------------------------------------------
4341 Returns the result of multiplying the quadruple-precision floating-point
4342 values `a' and `b'. The operation is performed according to the IEC/IEEE
4343 Standard for Binary Floating-point Arithmetic.
4344 -------------------------------------------------------------------------------
4346 float128 float128_mul( float128 a, float128 b )
4348 flag aSign, bSign, zSign;
4349 int32 aExp, bExp, zExp;
4350 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4351 float128 z;
4353 aSig1 = extractFloat128Frac1( a );
4354 aSig0 = extractFloat128Frac0( a );
4355 aExp = extractFloat128Exp( a );
4356 aSign = extractFloat128Sign( a );
4357 bSig1 = extractFloat128Frac1( b );
4358 bSig0 = extractFloat128Frac0( b );
4359 bExp = extractFloat128Exp( b );
4360 bSign = extractFloat128Sign( b );
4361 zSign = aSign ^ bSign;
4362 if ( aExp == 0x7FFF ) {
4363 if ( ( aSig0 | aSig1 )
4364 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4365 return propagateFloat128NaN( a, b );
4367 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4368 return packFloat128( zSign, 0x7FFF, 0, 0 );
4370 if ( bExp == 0x7FFF ) {
4371 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4372 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4373 invalid:
4374 float_raise( float_flag_invalid );
4375 z.low = float128_default_nan_low;
4376 z.high = float128_default_nan_high;
4377 return z;
4379 return packFloat128( zSign, 0x7FFF, 0, 0 );
4381 if ( aExp == 0 ) {
4382 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4383 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4385 if ( bExp == 0 ) {
4386 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4387 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4389 zExp = aExp + bExp - 0x4000;
4390 aSig0 |= LIT64( 0x0001000000000000 );
4391 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4392 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4393 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4394 zSig2 |= ( zSig3 != 0 );
4395 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4396 shift128ExtraRightJamming(
4397 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4398 ++zExp;
4400 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4405 -------------------------------------------------------------------------------
4406 Returns the result of dividing the quadruple-precision floating-point value
4407 `a' by the corresponding value `b'. The operation is performed according to
4408 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4409 -------------------------------------------------------------------------------
4411 float128 float128_div( float128 a, float128 b )
4413 flag aSign, bSign, zSign;
4414 int32 aExp, bExp, zExp;
4415 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4416 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4417 float128 z;
4419 aSig1 = extractFloat128Frac1( a );
4420 aSig0 = extractFloat128Frac0( a );
4421 aExp = extractFloat128Exp( a );
4422 aSign = extractFloat128Sign( a );
4423 bSig1 = extractFloat128Frac1( b );
4424 bSig0 = extractFloat128Frac0( b );
4425 bExp = extractFloat128Exp( b );
4426 bSign = extractFloat128Sign( b );
4427 zSign = aSign ^ bSign;
4428 if ( aExp == 0x7FFF ) {
4429 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4430 if ( bExp == 0x7FFF ) {
4431 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4432 goto invalid;
4434 return packFloat128( zSign, 0x7FFF, 0, 0 );
4436 if ( bExp == 0x7FFF ) {
4437 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4438 return packFloat128( zSign, 0, 0, 0 );
4440 if ( bExp == 0 ) {
4441 if ( ( bSig0 | bSig1 ) == 0 ) {
4442 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4443 invalid:
4444 float_raise( float_flag_invalid );
4445 z.low = float128_default_nan_low;
4446 z.high = float128_default_nan_high;
4447 return z;
4449 float_raise( float_flag_divbyzero );
4450 return packFloat128( zSign, 0x7FFF, 0, 0 );
4452 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4454 if ( aExp == 0 ) {
4455 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4456 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4458 zExp = aExp - bExp + 0x3FFD;
4459 shortShift128Left(
4460 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4461 shortShift128Left(
4462 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4463 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4464 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4465 ++zExp;
4467 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4468 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4469 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4470 while ( (sbits64) rem0 < 0 ) {
4471 --zSig0;
4472 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4474 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4475 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4476 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4477 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4478 while ( (sbits64) rem1 < 0 ) {
4479 --zSig1;
4480 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4482 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4484 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4485 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4490 -------------------------------------------------------------------------------
4491 Returns the remainder of the quadruple-precision floating-point value `a'
4492 with respect to the corresponding value `b'. The operation is performed
4493 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4494 -------------------------------------------------------------------------------
4496 float128 float128_rem( float128 a, float128 b )
4498 flag aSign, bSign, zSign;
4499 int32 aExp, bExp, expDiff;
4500 bits64 aSig0, aSig1, bSig0, bSig1;
4501 bits64 q, term0, term1, term2, allZero, alternateASig0, alternateASig1;
4502 bits64 sigMean1;
4503 sbits64 sigMean0;
4504 float128 z;
4506 aSig1 = extractFloat128Frac1( a );
4507 aSig0 = extractFloat128Frac0( a );
4508 aExp = extractFloat128Exp( a );
4509 aSign = extractFloat128Sign( a );
4510 bSig1 = extractFloat128Frac1( b );
4511 bSig0 = extractFloat128Frac0( b );
4512 bExp = extractFloat128Exp( b );
4513 bSign = extractFloat128Sign( b );
4514 if ( aExp == 0x7FFF ) {
4515 if ( ( aSig0 | aSig1 )
4516 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4517 return propagateFloat128NaN( a, b );
4519 goto invalid;
4521 if ( bExp == 0x7FFF ) {
4522 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4523 return a;
4525 if ( bExp == 0 ) {
4526 if ( ( bSig0 | bSig1 ) == 0 ) {
4527 invalid:
4528 float_raise( float_flag_invalid );
4529 z.low = float128_default_nan_low;
4530 z.high = float128_default_nan_high;
4531 return z;
4533 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4535 if ( aExp == 0 ) {
4536 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4537 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4539 expDiff = aExp - bExp;
4540 if ( expDiff < -1 ) return a;
4541 shortShift128Left(
4542 aSig0 | LIT64( 0x0001000000000000 ),
4543 aSig1,
4544 15 - ( expDiff < 0 ),
4545 &aSig0,
4546 &aSig1
4548 shortShift128Left(
4549 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4550 q = le128( bSig0, bSig1, aSig0, aSig1 );
4551 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4552 expDiff -= 64;
4553 while ( 0 < expDiff ) {
4554 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4555 q = ( 4 < q ) ? q - 4 : 0;
4556 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4557 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4558 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4559 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4560 expDiff -= 61;
4562 if ( -64 < expDiff ) {
4563 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4564 q = ( 4 < q ) ? q - 4 : 0;
4565 q >>= - expDiff;
4566 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4567 expDiff += 52;
4568 if ( expDiff < 0 ) {
4569 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4571 else {
4572 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4574 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4575 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4577 else {
4578 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4579 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4581 do {
4582 alternateASig0 = aSig0;
4583 alternateASig1 = aSig1;
4584 ++q;
4585 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4586 } while ( 0 <= (sbits64) aSig0 );
4587 add128(
4588 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4589 if ( ( sigMean0 < 0 )
4590 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4591 aSig0 = alternateASig0;
4592 aSig1 = alternateASig1;
4594 zSign = ( (sbits64) aSig0 < 0 );
4595 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4596 return
4597 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4602 -------------------------------------------------------------------------------
4603 Returns the square root of the quadruple-precision floating-point value `a'.
4604 The operation is performed according to the IEC/IEEE Standard for Binary
4605 Floating-point Arithmetic.
4606 -------------------------------------------------------------------------------
4608 float128 float128_sqrt( float128 a )
4610 flag aSign;
4611 int32 aExp, zExp;
4612 bits64 aSig0, aSig1, zSig0, zSig1, zSig2;
4613 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4614 bits64 shiftedRem0, shiftedRem1;
4615 float128 z;
4617 aSig1 = extractFloat128Frac1( a );
4618 aSig0 = extractFloat128Frac0( a );
4619 aExp = extractFloat128Exp( a );
4620 aSign = extractFloat128Sign( a );
4621 if ( aExp == 0x7FFF ) {
4622 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4623 if ( ! aSign ) return a;
4624 goto invalid;
4626 if ( aSign ) {
4627 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4628 invalid:
4629 float_raise( float_flag_invalid );
4630 z.low = float128_default_nan_low;
4631 z.high = float128_default_nan_high;
4632 return z;
4634 if ( aExp == 0 ) {
4635 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4636 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4638 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4639 aSig0 |= LIT64( 0x0001000000000000 );
4640 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4641 zSig0 <<= 31;
4642 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4643 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
4644 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
4645 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
4646 mul64To128( zSig0, zSig0, &term0, &term1 );
4647 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4648 while ( (sbits64) rem0 < 0 ) {
4649 --zSig0;
4650 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
4651 term1 |= 1;
4652 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
4654 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
4655 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
4656 if ( ( zSig1 & 0x3FFF ) <= 5 ) {
4657 if ( zSig1 == 0 ) zSig1 = 1;
4658 mul64To128( zSig0, zSig1, &term1, &term2 );
4659 shortShift128Left( term1, term2, 1, &term1, &term2 );
4660 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4661 mul64To128( zSig1, zSig1, &term2, &term3 );
4662 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4663 while ( (sbits64) rem1 < 0 ) {
4664 --zSig1;
4665 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
4666 term3 |= 1;
4667 add192(
4668 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
4670 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4672 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4673 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4678 -------------------------------------------------------------------------------
4679 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4680 the corresponding value `b', and 0 otherwise. The comparison is performed
4681 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4682 -------------------------------------------------------------------------------
4684 flag float128_eq( float128 a, float128 b )
4687 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4688 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4689 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4690 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4692 if ( float128_is_signaling_nan( a )
4693 || float128_is_signaling_nan( b ) ) {
4694 float_raise( float_flag_invalid );
4696 return 0;
4698 return
4699 ( a.low == b.low )
4700 && ( ( a.high == b.high )
4701 || ( ( a.low == 0 )
4702 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4708 -------------------------------------------------------------------------------
4709 Returns 1 if the quadruple-precision floating-point value `a' is less than
4710 or equal to the corresponding value `b', and 0 otherwise. The comparison
4711 is performed according to the IEC/IEEE Standard for Binary Floating-point
4712 Arithmetic.
4713 -------------------------------------------------------------------------------
4715 flag float128_le( float128 a, float128 b )
4717 flag aSign, bSign;
4719 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4720 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4721 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4722 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4724 float_raise( float_flag_invalid );
4725 return 0;
4727 aSign = extractFloat128Sign( a );
4728 bSign = extractFloat128Sign( b );
4729 if ( aSign != bSign ) {
4730 return
4731 aSign
4732 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4733 == 0 );
4735 return
4736 aSign ? le128( b.high, b.low, a.high, a.low )
4737 : le128( a.high, a.low, b.high, b.low );
4742 -------------------------------------------------------------------------------
4743 Returns 1 if the quadruple-precision floating-point value `a' is less than
4744 the corresponding value `b', and 0 otherwise. The comparison is performed
4745 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4746 -------------------------------------------------------------------------------
4748 flag float128_lt( float128 a, float128 b )
4750 flag aSign, bSign;
4752 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4753 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4754 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4755 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4757 float_raise( float_flag_invalid );
4758 return 0;
4760 aSign = extractFloat128Sign( a );
4761 bSign = extractFloat128Sign( b );
4762 if ( aSign != bSign ) {
4763 return
4764 aSign
4765 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4766 != 0 );
4768 return
4769 aSign ? lt128( b.high, b.low, a.high, a.low )
4770 : lt128( a.high, a.low, b.high, b.low );
4775 -------------------------------------------------------------------------------
4776 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4777 the corresponding value `b', and 0 otherwise. The invalid exception is
4778 raised if either operand is a NaN. Otherwise, the comparison is performed
4779 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4780 -------------------------------------------------------------------------------
4782 flag float128_eq_signaling( float128 a, float128 b )
4785 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4786 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4787 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4788 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4790 float_raise( float_flag_invalid );
4791 return 0;
4793 return
4794 ( a.low == b.low )
4795 && ( ( a.high == b.high )
4796 || ( ( a.low == 0 )
4797 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4803 -------------------------------------------------------------------------------
4804 Returns 1 if the quadruple-precision floating-point value `a' is less than
4805 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4806 cause an exception. Otherwise, the comparison is performed according to the
4807 IEC/IEEE Standard for Binary Floating-point Arithmetic.
4808 -------------------------------------------------------------------------------
4810 flag float128_le_quiet( float128 a, float128 b )
4812 flag aSign, bSign;
4814 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4815 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4816 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4817 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4819 if ( float128_is_signaling_nan( a )
4820 || float128_is_signaling_nan( b ) ) {
4821 float_raise( float_flag_invalid );
4823 return 0;
4825 aSign = extractFloat128Sign( a );
4826 bSign = extractFloat128Sign( b );
4827 if ( aSign != bSign ) {
4828 return
4829 aSign
4830 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4831 == 0 );
4833 return
4834 aSign ? le128( b.high, b.low, a.high, a.low )
4835 : le128( a.high, a.low, b.high, b.low );
4840 -------------------------------------------------------------------------------
4841 Returns 1 if the quadruple-precision floating-point value `a' is less than
4842 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4843 exception. Otherwise, the comparison is performed according to the IEC/IEEE
4844 Standard for Binary Floating-point Arithmetic.
4845 -------------------------------------------------------------------------------
4847 flag float128_lt_quiet( float128 a, float128 b )
4849 flag aSign, bSign;
4851 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4852 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4853 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4854 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4856 if ( float128_is_signaling_nan( a )
4857 || float128_is_signaling_nan( b ) ) {
4858 float_raise( float_flag_invalid );
4860 return 0;
4862 aSign = extractFloat128Sign( a );
4863 bSign = extractFloat128Sign( b );
4864 if ( aSign != bSign ) {
4865 return
4866 aSign
4867 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4868 != 0 );
4870 return
4871 aSign ? lt128( b.high, b.low, a.high, a.low )
4872 : lt128( a.high, a.low, b.high, b.low );
4876 #endif