Fix up mix of man(7)/mdoc(7).
[netbsd-mini2440.git] / sys / lib / libkern / softfloat.c
blobaffdef8268d1b9ec432db12067c0b8d69d899bd5
1 /* $NetBSD: softfloat.c,v 1.2.6.3 2004/09/21 13:35:54 skrll Exp $ */
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
9 /*
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 /* If you need this in a boot program, you have bigger problems... */
48 #ifndef _STANDALONE
50 #include <sys/cdefs.h>
51 #if defined(LIBC_SCCS) && !defined(lint)
52 __RCSID("$NetBSD: softfloat.c,v 1.2.6.3 2004/09/21 13:35:54 skrll Exp $");
53 #endif /* LIBC_SCCS and not lint */
55 #ifdef SOFTFLOAT_FOR_GCC
56 #include "softfloat-for-gcc.h"
57 #endif
59 #include "milieu.h"
60 #include "softfloat.h"
63 * Conversions between floats as stored in memory and floats as
64 * SoftFloat uses them
66 #ifndef FLOAT64_DEMANGLE
67 #define FLOAT64_DEMANGLE(a) (a)
68 #endif
69 #ifndef FLOAT64_MANGLE
70 #define FLOAT64_MANGLE(a) (a)
71 #endif
74 -------------------------------------------------------------------------------
75 Floating-point rounding mode, extended double-precision rounding precision,
76 and exception flags.
77 -------------------------------------------------------------------------------
81 * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
82 * Right now, it does not. I've removed all other dynamic global
83 * variables. [ross]
85 #ifdef FLOATX80
86 int8 floatx80_rounding_precision = 80;
87 #endif
90 -------------------------------------------------------------------------------
91 Primitive arithmetic functions, including multi-word arithmetic, and
92 division and square root approximations. (Can be specialized to target if
93 desired.)
94 -------------------------------------------------------------------------------
96 #include "softfloat-macros.h"
99 -------------------------------------------------------------------------------
100 Functions and definitions to determine: (1) whether tininess for underflow
101 is detected before or after rounding by default, (2) what (if anything)
102 happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 are propagated from function inputs to output. These details are target-
105 specific.
106 -------------------------------------------------------------------------------
108 #include "softfloat-specialize.h"
110 #ifndef SOFTFLOAT_FOR_GCC /* Not used */
112 -------------------------------------------------------------------------------
113 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
114 and 7, and returns the properly rounded 32-bit integer corresponding to the
115 input. If `zSign' is 1, the input is negated before being converted to an
116 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
117 is simply rounded to an integer, with the inexact exception raised if the
118 input cannot be represented exactly as an integer. However, if the fixed-
119 point input is too large, the invalid exception is raised and the largest
120 positive or negative integer is returned.
121 -------------------------------------------------------------------------------
123 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
125 int8 roundingMode;
126 flag roundNearestEven;
127 int8 roundIncrement, roundBits;
128 int32 z;
130 roundingMode = float_rounding_mode();
131 roundNearestEven = ( roundingMode == float_round_nearest_even );
132 roundIncrement = 0x40;
133 if ( ! roundNearestEven ) {
134 if ( roundingMode == float_round_to_zero ) {
135 roundIncrement = 0;
137 else {
138 roundIncrement = 0x7F;
139 if ( zSign ) {
140 if ( roundingMode == float_round_up ) roundIncrement = 0;
142 else {
143 if ( roundingMode == float_round_down ) roundIncrement = 0;
147 roundBits = absZ & 0x7F;
148 absZ = ( absZ + roundIncrement )>>7;
149 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
150 z = absZ;
151 if ( zSign ) z = - z;
152 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
153 float_raise( float_flag_invalid );
154 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
156 if ( roundBits ) float_set_inexact();
157 return z;
162 -------------------------------------------------------------------------------
163 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
164 `absZ1', with binary point between bits 63 and 64 (between the input words),
165 and returns the properly rounded 64-bit integer corresponding to the input.
166 If `zSign' is 1, the input is negated before being converted to an integer.
167 Ordinarily, the fixed-point input is simply rounded to an integer, with
168 the inexact exception raised if the input cannot be represented exactly as
169 an integer. However, if the fixed-point input is too large, the invalid
170 exception is raised and the largest positive or negative integer is
171 returned.
172 -------------------------------------------------------------------------------
174 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
176 int8 roundingMode;
177 flag roundNearestEven, increment;
178 int64 z;
180 roundingMode = float_rounding_mode();
181 roundNearestEven = ( roundingMode == float_round_nearest_even );
182 increment = ( (sbits64) absZ1 < 0 );
183 if ( ! roundNearestEven ) {
184 if ( roundingMode == float_round_to_zero ) {
185 increment = 0;
187 else {
188 if ( zSign ) {
189 increment = ( roundingMode == float_round_down ) && absZ1;
191 else {
192 increment = ( roundingMode == float_round_up ) && absZ1;
196 if ( increment ) {
197 ++absZ0;
198 if ( absZ0 == 0 ) goto overflow;
199 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
201 z = absZ0;
202 if ( zSign ) z = - z;
203 if ( z && ( ( z < 0 ) ^ zSign ) ) {
204 overflow:
205 float_raise( float_flag_invalid );
206 return
207 zSign ? (sbits64) LIT64( 0x8000000000000000 )
208 : LIT64( 0x7FFFFFFFFFFFFFFF );
210 if ( absZ1 ) float_set_inexact();
211 return z;
214 #endif
217 -------------------------------------------------------------------------------
218 Returns the fraction bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
221 INLINE bits32 extractFloat32Frac( float32 a )
224 return a & 0x007FFFFF;
229 -------------------------------------------------------------------------------
230 Returns the exponent bits of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
233 INLINE int16 extractFloat32Exp( float32 a )
236 return ( a>>23 ) & 0xFF;
241 -------------------------------------------------------------------------------
242 Returns the sign bit of the single-precision floating-point value `a'.
243 -------------------------------------------------------------------------------
245 INLINE flag extractFloat32Sign( float32 a )
248 return a>>31;
253 -------------------------------------------------------------------------------
254 Normalizes the subnormal single-precision floating-point value represented
255 by the denormalized significand `aSig'. The normalized exponent and
256 significand are stored at the locations pointed to by `zExpPtr' and
257 `zSigPtr', respectively.
258 -------------------------------------------------------------------------------
260 static void
261 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
263 int8 shiftCount;
265 shiftCount = countLeadingZeros32( aSig ) - 8;
266 *zSigPtr = aSig<<shiftCount;
267 *zExpPtr = 1 - shiftCount;
272 -------------------------------------------------------------------------------
273 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
274 single-precision floating-point value, returning the result. After being
275 shifted into the proper positions, the three fields are simply added
276 together to form the result. This means that any integer portion of `zSig'
277 will be added into the exponent. Since a properly normalized significand
278 will have an integer portion equal to 1, the `zExp' input should be 1 less
279 than the desired result exponent whenever `zSig' is a complete, normalized
280 significand.
281 -------------------------------------------------------------------------------
283 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
286 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
291 -------------------------------------------------------------------------------
292 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
293 and significand `zSig', and returns the proper single-precision floating-
294 point value corresponding to the abstract input. Ordinarily, the abstract
295 value is simply rounded and packed into the single-precision format, with
296 the inexact exception raised if the abstract input cannot be represented
297 exactly. However, if the abstract value is too large, the overflow and
298 inexact exceptions are raised and an infinity or maximal finite value is
299 returned. If the abstract value is too small, the input value is rounded to
300 a subnormal number, and the underflow and inexact exceptions are raised if
301 the abstract input cannot be represented exactly as a subnormal single-
302 precision floating-point number.
303 The input significand `zSig' has its binary point between bits 30
304 and 29, which is 7 bits to the left of the usual location. This shifted
305 significand must be normalized or smaller. If `zSig' is not normalized,
306 `zExp' must be 0; in that case, the result returned is a subnormal number,
307 and it must not require rounding. In the usual case that `zSig' is
308 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
309 The handling of underflow and overflow follows the IEC/IEEE Standard for
310 Binary Floating-Point Arithmetic.
311 -------------------------------------------------------------------------------
313 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
315 int8 roundingMode;
316 flag roundNearestEven;
317 int8 roundIncrement, roundBits;
318 flag isTiny;
320 roundingMode = float_rounding_mode();
321 roundNearestEven = ( roundingMode == float_round_nearest_even );
322 roundIncrement = 0x40;
323 if ( ! roundNearestEven ) {
324 if ( roundingMode == float_round_to_zero ) {
325 roundIncrement = 0;
327 else {
328 roundIncrement = 0x7F;
329 if ( zSign ) {
330 if ( roundingMode == float_round_up ) roundIncrement = 0;
332 else {
333 if ( roundingMode == float_round_down ) roundIncrement = 0;
337 roundBits = zSig & 0x7F;
338 if ( 0xFD <= (bits16) zExp ) {
339 if ( ( 0xFD < zExp )
340 || ( ( zExp == 0xFD )
341 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
343 float_raise( float_flag_overflow | float_flag_inexact );
344 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
346 if ( zExp < 0 ) {
347 isTiny =
348 ( float_detect_tininess == float_tininess_before_rounding )
349 || ( zExp < -1 )
350 || ( zSig + roundIncrement < 0x80000000 );
351 shift32RightJamming( zSig, - zExp, &zSig );
352 zExp = 0;
353 roundBits = zSig & 0x7F;
354 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
357 if ( roundBits ) float_set_inexact();
358 zSig = ( zSig + roundIncrement )>>7;
359 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
360 if ( zSig == 0 ) zExp = 0;
361 return packFloat32( zSign, zExp, zSig );
366 -------------------------------------------------------------------------------
367 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 and significand `zSig', and returns the proper single-precision floating-
369 point value corresponding to the abstract input. This routine is just like
370 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
371 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
372 floating-point exponent.
373 -------------------------------------------------------------------------------
375 static float32
376 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
378 int8 shiftCount;
380 shiftCount = countLeadingZeros32( zSig ) - 1;
381 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
386 -------------------------------------------------------------------------------
387 Returns the fraction bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
390 INLINE bits64 extractFloat64Frac( float64 a )
393 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
398 -------------------------------------------------------------------------------
399 Returns the exponent bits of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
402 INLINE int16 extractFloat64Exp( float64 a )
405 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
410 -------------------------------------------------------------------------------
411 Returns the sign bit of the double-precision floating-point value `a'.
412 -------------------------------------------------------------------------------
414 INLINE flag extractFloat64Sign( float64 a )
417 return FLOAT64_DEMANGLE(a)>>63;
422 -------------------------------------------------------------------------------
423 Normalizes the subnormal double-precision floating-point value represented
424 by the denormalized significand `aSig'. The normalized exponent and
425 significand are stored at the locations pointed to by `zExpPtr' and
426 `zSigPtr', respectively.
427 -------------------------------------------------------------------------------
429 static void
430 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
432 int8 shiftCount;
434 shiftCount = countLeadingZeros64( aSig ) - 11;
435 *zSigPtr = aSig<<shiftCount;
436 *zExpPtr = 1 - shiftCount;
441 -------------------------------------------------------------------------------
442 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
443 double-precision floating-point value, returning the result. After being
444 shifted into the proper positions, the three fields are simply added
445 together to form the result. This means that any integer portion of `zSig'
446 will be added into the exponent. Since a properly normalized significand
447 will have an integer portion equal to 1, the `zExp' input should be 1 less
448 than the desired result exponent whenever `zSig' is a complete, normalized
449 significand.
450 -------------------------------------------------------------------------------
452 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
455 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
456 ( ( (bits64) zExp )<<52 ) + zSig );
461 -------------------------------------------------------------------------------
462 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
463 and significand `zSig', and returns the proper double-precision floating-
464 point value corresponding to the abstract input. Ordinarily, the abstract
465 value is simply rounded and packed into the double-precision format, with
466 the inexact exception raised if the abstract input cannot be represented
467 exactly. However, if the abstract value is too large, the overflow and
468 inexact exceptions are raised and an infinity or maximal finite value is
469 returned. If the abstract value is too small, the input value is rounded to
470 a subnormal number, and the underflow and inexact exceptions are raised if
471 the abstract input cannot be represented exactly as a subnormal double-
472 precision floating-point number.
473 The input significand `zSig' has its binary point between bits 62
474 and 61, which is 10 bits to the left of the usual location. This shifted
475 significand must be normalized or smaller. If `zSig' is not normalized,
476 `zExp' must be 0; in that case, the result returned is a subnormal number,
477 and it must not require rounding. In the usual case that `zSig' is
478 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
479 The handling of underflow and overflow follows the IEC/IEEE Standard for
480 Binary Floating-Point Arithmetic.
481 -------------------------------------------------------------------------------
483 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
485 int8 roundingMode;
486 flag roundNearestEven;
487 int16 roundIncrement, roundBits;
488 flag isTiny;
490 roundingMode = float_rounding_mode();
491 roundNearestEven = ( roundingMode == float_round_nearest_even );
492 roundIncrement = 0x200;
493 if ( ! roundNearestEven ) {
494 if ( roundingMode == float_round_to_zero ) {
495 roundIncrement = 0;
497 else {
498 roundIncrement = 0x3FF;
499 if ( zSign ) {
500 if ( roundingMode == float_round_up ) roundIncrement = 0;
502 else {
503 if ( roundingMode == float_round_down ) roundIncrement = 0;
507 roundBits = zSig & 0x3FF;
508 if ( 0x7FD <= (bits16) zExp ) {
509 if ( ( 0x7FD < zExp )
510 || ( ( zExp == 0x7FD )
511 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
513 float_raise( float_flag_overflow | float_flag_inexact );
514 return FLOAT64_MANGLE(
515 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
516 ( roundIncrement == 0 ));
518 if ( zExp < 0 ) {
519 isTiny =
520 ( float_detect_tininess == float_tininess_before_rounding )
521 || ( zExp < -1 )
522 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig, - zExp, &zSig );
524 zExp = 0;
525 roundBits = zSig & 0x3FF;
526 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
529 if ( roundBits ) float_set_inexact();
530 zSig = ( zSig + roundIncrement )>>10;
531 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
532 if ( zSig == 0 ) zExp = 0;
533 return packFloat64( zSign, zExp, zSig );
538 -------------------------------------------------------------------------------
539 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
540 and significand `zSig', and returns the proper double-precision floating-
541 point value corresponding to the abstract input. This routine is just like
542 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
543 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
544 floating-point exponent.
545 -------------------------------------------------------------------------------
547 static float64
548 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
550 int8 shiftCount;
552 shiftCount = countLeadingZeros64( zSig ) - 1;
553 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
557 #ifdef FLOATX80
560 -------------------------------------------------------------------------------
561 Returns the fraction bits of the extended double-precision floating-point
562 value `a'.
563 -------------------------------------------------------------------------------
565 INLINE bits64 extractFloatx80Frac( floatx80 a )
568 return a.low;
573 -------------------------------------------------------------------------------
574 Returns the exponent bits of the extended double-precision floating-point
575 value `a'.
576 -------------------------------------------------------------------------------
578 INLINE int32 extractFloatx80Exp( floatx80 a )
581 return a.high & 0x7FFF;
586 -------------------------------------------------------------------------------
587 Returns the sign bit of the extended double-precision floating-point value
588 `a'.
589 -------------------------------------------------------------------------------
591 INLINE flag extractFloatx80Sign( floatx80 a )
594 return a.high>>15;
599 -------------------------------------------------------------------------------
600 Normalizes the subnormal extended double-precision floating-point value
601 represented by the denormalized significand `aSig'. The normalized exponent
602 and significand are stored at the locations pointed to by `zExpPtr' and
603 `zSigPtr', respectively.
604 -------------------------------------------------------------------------------
606 static void
607 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
609 int8 shiftCount;
611 shiftCount = countLeadingZeros64( aSig );
612 *zSigPtr = aSig<<shiftCount;
613 *zExpPtr = 1 - shiftCount;
618 -------------------------------------------------------------------------------
619 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
620 extended double-precision floating-point value, returning the result.
621 -------------------------------------------------------------------------------
623 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
625 floatx80 z;
627 z.low = zSig;
628 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
629 return z;
634 -------------------------------------------------------------------------------
635 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
636 and extended significand formed by the concatenation of `zSig0' and `zSig1',
637 and returns the proper extended double-precision floating-point value
638 corresponding to the abstract input. Ordinarily, the abstract value is
639 rounded and packed into the extended double-precision format, with the
640 inexact exception raised if the abstract input cannot be represented
641 exactly. However, if the abstract value is too large, the overflow and
642 inexact exceptions are raised and an infinity or maximal finite value is
643 returned. If the abstract value is too small, the input value is rounded to
644 a subnormal number, and the underflow and inexact exceptions are raised if
645 the abstract input cannot be represented exactly as a subnormal extended
646 double-precision floating-point number.
647 If `roundingPrecision' is 32 or 64, the result is rounded to the same
648 number of bits as single or double precision, respectively. Otherwise, the
649 result is rounded to the full precision of the extended double-precision
650 format.
651 The input significand must be normalized or smaller. If the input
652 significand is not normalized, `zExp' must be 0; in that case, the result
653 returned is a subnormal number, and it must not require rounding. The
654 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
655 Floating-Point Arithmetic.
656 -------------------------------------------------------------------------------
658 static floatx80
659 roundAndPackFloatx80(
660 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
663 int8 roundingMode;
664 flag roundNearestEven, increment, isTiny;
665 int64 roundIncrement, roundMask, roundBits;
667 roundingMode = float_rounding_mode();
668 roundNearestEven = ( roundingMode == float_round_nearest_even );
669 if ( roundingPrecision == 80 ) goto precision80;
670 if ( roundingPrecision == 64 ) {
671 roundIncrement = LIT64( 0x0000000000000400 );
672 roundMask = LIT64( 0x00000000000007FF );
674 else if ( roundingPrecision == 32 ) {
675 roundIncrement = LIT64( 0x0000008000000000 );
676 roundMask = LIT64( 0x000000FFFFFFFFFF );
678 else {
679 goto precision80;
681 zSig0 |= ( zSig1 != 0 );
682 if ( ! roundNearestEven ) {
683 if ( roundingMode == float_round_to_zero ) {
684 roundIncrement = 0;
686 else {
687 roundIncrement = roundMask;
688 if ( zSign ) {
689 if ( roundingMode == float_round_up ) roundIncrement = 0;
691 else {
692 if ( roundingMode == float_round_down ) roundIncrement = 0;
696 roundBits = zSig0 & roundMask;
697 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
698 if ( ( 0x7FFE < zExp )
699 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
701 goto overflow;
703 if ( zExp <= 0 ) {
704 isTiny =
705 ( float_detect_tininess == float_tininess_before_rounding )
706 || ( zExp < 0 )
707 || ( zSig0 <= zSig0 + roundIncrement );
708 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
709 zExp = 0;
710 roundBits = zSig0 & roundMask;
711 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
712 if ( roundBits ) float_set_inexact();
713 zSig0 += roundIncrement;
714 if ( (sbits64) zSig0 < 0 ) zExp = 1;
715 roundIncrement = roundMask + 1;
716 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
717 roundMask |= roundIncrement;
719 zSig0 &= ~ roundMask;
720 return packFloatx80( zSign, zExp, zSig0 );
723 if ( roundBits ) float_set_inexact();
724 zSig0 += roundIncrement;
725 if ( zSig0 < roundIncrement ) {
726 ++zExp;
727 zSig0 = LIT64( 0x8000000000000000 );
729 roundIncrement = roundMask + 1;
730 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
731 roundMask |= roundIncrement;
733 zSig0 &= ~ roundMask;
734 if ( zSig0 == 0 ) zExp = 0;
735 return packFloatx80( zSign, zExp, zSig0 );
736 precision80:
737 increment = ( (sbits64) zSig1 < 0 );
738 if ( ! roundNearestEven ) {
739 if ( roundingMode == float_round_to_zero ) {
740 increment = 0;
742 else {
743 if ( zSign ) {
744 increment = ( roundingMode == float_round_down ) && zSig1;
746 else {
747 increment = ( roundingMode == float_round_up ) && zSig1;
751 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
752 if ( ( 0x7FFE < zExp )
753 || ( ( zExp == 0x7FFE )
754 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
755 && increment
758 roundMask = 0;
759 overflow:
760 float_raise( float_flag_overflow | float_flag_inexact );
761 if ( ( roundingMode == float_round_to_zero )
762 || ( zSign && ( roundingMode == float_round_up ) )
763 || ( ! zSign && ( roundingMode == float_round_down ) )
765 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
767 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
769 if ( zExp <= 0 ) {
770 isTiny =
771 ( float_detect_tininess == float_tininess_before_rounding )
772 || ( zExp < 0 )
773 || ! increment
774 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
775 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
776 zExp = 0;
777 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
778 if ( zSig1 ) float_set_inexact();
779 if ( roundNearestEven ) {
780 increment = ( (sbits64) zSig1 < 0 );
782 else {
783 if ( zSign ) {
784 increment = ( roundingMode == float_round_down ) && zSig1;
786 else {
787 increment = ( roundingMode == float_round_up ) && zSig1;
790 if ( increment ) {
791 ++zSig0;
792 zSig0 &=
793 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
794 if ( (sbits64) zSig0 < 0 ) zExp = 1;
796 return packFloatx80( zSign, zExp, zSig0 );
799 if ( zSig1 ) float_set_inexact();
800 if ( increment ) {
801 ++zSig0;
802 if ( zSig0 == 0 ) {
803 ++zExp;
804 zSig0 = LIT64( 0x8000000000000000 );
806 else {
807 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
810 else {
811 if ( zSig0 == 0 ) zExp = 0;
813 return packFloatx80( zSign, zExp, zSig0 );
818 -------------------------------------------------------------------------------
819 Takes an abstract floating-point value having sign `zSign', exponent
820 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
821 and returns the proper extended double-precision floating-point value
822 corresponding to the abstract input. This routine is just like
823 `roundAndPackFloatx80' except that the input significand does not have to be
824 normalized.
825 -------------------------------------------------------------------------------
827 static floatx80
828 normalizeRoundAndPackFloatx80(
829 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
832 int8 shiftCount;
834 if ( zSig0 == 0 ) {
835 zSig0 = zSig1;
836 zSig1 = 0;
837 zExp -= 64;
839 shiftCount = countLeadingZeros64( zSig0 );
840 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
841 zExp -= shiftCount;
842 return
843 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
847 #endif
849 #ifdef FLOAT128
852 -------------------------------------------------------------------------------
853 Returns the least-significant 64 fraction bits of the quadruple-precision
854 floating-point value `a'.
855 -------------------------------------------------------------------------------
857 INLINE bits64 extractFloat128Frac1( float128 a )
860 return a.low;
865 -------------------------------------------------------------------------------
866 Returns the most-significant 48 fraction bits of the quadruple-precision
867 floating-point value `a'.
868 -------------------------------------------------------------------------------
870 INLINE bits64 extractFloat128Frac0( float128 a )
873 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
878 -------------------------------------------------------------------------------
879 Returns the exponent bits of the quadruple-precision floating-point value
880 `a'.
881 -------------------------------------------------------------------------------
883 INLINE int32 extractFloat128Exp( float128 a )
886 return ( a.high>>48 ) & 0x7FFF;
891 -------------------------------------------------------------------------------
892 Returns the sign bit of the quadruple-precision floating-point value `a'.
893 -------------------------------------------------------------------------------
895 INLINE flag extractFloat128Sign( float128 a )
898 return a.high>>63;
903 -------------------------------------------------------------------------------
904 Normalizes the subnormal quadruple-precision floating-point value
905 represented by the denormalized significand formed by the concatenation of
906 `aSig0' and `aSig1'. The normalized exponent is stored at the location
907 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
908 significand are stored at the location pointed to by `zSig0Ptr', and the
909 least significant 64 bits of the normalized significand are stored at the
910 location pointed to by `zSig1Ptr'.
911 -------------------------------------------------------------------------------
913 static void
914 normalizeFloat128Subnormal(
915 bits64 aSig0,
916 bits64 aSig1,
917 int32 *zExpPtr,
918 bits64 *zSig0Ptr,
919 bits64 *zSig1Ptr
922 int8 shiftCount;
924 if ( aSig0 == 0 ) {
925 shiftCount = countLeadingZeros64( aSig1 ) - 15;
926 if ( shiftCount < 0 ) {
927 *zSig0Ptr = aSig1>>( - shiftCount );
928 *zSig1Ptr = aSig1<<( shiftCount & 63 );
930 else {
931 *zSig0Ptr = aSig1<<shiftCount;
932 *zSig1Ptr = 0;
934 *zExpPtr = - shiftCount - 63;
936 else {
937 shiftCount = countLeadingZeros64( aSig0 ) - 15;
938 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
939 *zExpPtr = 1 - shiftCount;
945 -------------------------------------------------------------------------------
946 Packs the sign `zSign', the exponent `zExp', and the significand formed
947 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
948 floating-point value, returning the result. After being shifted into the
949 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
950 added together to form the most significant 32 bits of the result. This
951 means that any integer portion of `zSig0' will be added into the exponent.
952 Since a properly normalized significand will have an integer portion equal
953 to 1, the `zExp' input should be 1 less than the desired result exponent
954 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
955 significand.
956 -------------------------------------------------------------------------------
958 INLINE float128
959 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
961 float128 z;
963 z.low = zSig1;
964 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
965 return z;
970 -------------------------------------------------------------------------------
971 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
972 and extended significand formed by the concatenation of `zSig0', `zSig1',
973 and `zSig2', and returns the proper quadruple-precision floating-point value
974 corresponding to the abstract input. Ordinarily, the abstract value is
975 simply rounded and packed into the quadruple-precision format, with the
976 inexact exception raised if the abstract input cannot be represented
977 exactly. However, if the abstract value is too large, the overflow and
978 inexact exceptions are raised and an infinity or maximal finite value is
979 returned. If the abstract value is too small, the input value is rounded to
980 a subnormal number, and the underflow and inexact exceptions are raised if
981 the abstract input cannot be represented exactly as a subnormal quadruple-
982 precision floating-point number.
983 The input significand must be normalized or smaller. If the input
984 significand is not normalized, `zExp' must be 0; in that case, the result
985 returned is a subnormal number, and it must not require rounding. In the
986 usual case that the input significand is normalized, `zExp' must be 1 less
987 than the ``true'' floating-point exponent. The handling of underflow and
988 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
989 -------------------------------------------------------------------------------
991 static float128
992 roundAndPackFloat128(
993 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
995 int8 roundingMode;
996 flag roundNearestEven, increment, isTiny;
998 roundingMode = float_rounding_mode();
999 roundNearestEven = ( roundingMode == float_round_nearest_even );
1000 increment = ( (sbits64) zSig2 < 0 );
1001 if ( ! roundNearestEven ) {
1002 if ( roundingMode == float_round_to_zero ) {
1003 increment = 0;
1005 else {
1006 if ( zSign ) {
1007 increment = ( roundingMode == float_round_down ) && zSig2;
1009 else {
1010 increment = ( roundingMode == float_round_up ) && zSig2;
1014 if ( 0x7FFD <= (bits32) zExp ) {
1015 if ( ( 0x7FFD < zExp )
1016 || ( ( zExp == 0x7FFD )
1017 && eq128(
1018 LIT64( 0x0001FFFFFFFFFFFF ),
1019 LIT64( 0xFFFFFFFFFFFFFFFF ),
1020 zSig0,
1021 zSig1
1023 && increment
1026 float_raise( float_flag_overflow | float_flag_inexact );
1027 if ( ( roundingMode == float_round_to_zero )
1028 || ( zSign && ( roundingMode == float_round_up ) )
1029 || ( ! zSign && ( roundingMode == float_round_down ) )
1031 return
1032 packFloat128(
1033 zSign,
1034 0x7FFE,
1035 LIT64( 0x0000FFFFFFFFFFFF ),
1036 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 return packFloat128( zSign, 0x7FFF, 0, 0 );
1041 if ( zExp < 0 ) {
1042 isTiny =
1043 ( float_detect_tininess == float_tininess_before_rounding )
1044 || ( zExp < -1 )
1045 || ! increment
1046 || lt128(
1047 zSig0,
1048 zSig1,
1049 LIT64( 0x0001FFFFFFFFFFFF ),
1050 LIT64( 0xFFFFFFFFFFFFFFFF )
1052 shift128ExtraRightJamming(
1053 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1054 zExp = 0;
1055 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1056 if ( roundNearestEven ) {
1057 increment = ( (sbits64) zSig2 < 0 );
1059 else {
1060 if ( zSign ) {
1061 increment = ( roundingMode == float_round_down ) && zSig2;
1063 else {
1064 increment = ( roundingMode == float_round_up ) && zSig2;
1069 if ( zSig2 ) float_set_inexact();
1070 if ( increment ) {
1071 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1072 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1074 else {
1075 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1077 return packFloat128( zSign, zExp, zSig0, zSig1 );
1082 -------------------------------------------------------------------------------
1083 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1084 and significand formed by the concatenation of `zSig0' and `zSig1', and
1085 returns the proper quadruple-precision floating-point value corresponding
1086 to the abstract input. This routine is just like `roundAndPackFloat128'
1087 except that the input significand has fewer bits and does not have to be
1088 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1089 point exponent.
1090 -------------------------------------------------------------------------------
1092 static float128
1093 normalizeRoundAndPackFloat128(
1094 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1096 int8 shiftCount;
1097 bits64 zSig2;
1099 if ( zSig0 == 0 ) {
1100 zSig0 = zSig1;
1101 zSig1 = 0;
1102 zExp -= 64;
1104 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1105 if ( 0 <= shiftCount ) {
1106 zSig2 = 0;
1107 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1109 else {
1110 shift128ExtraRightJamming(
1111 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1113 zExp -= shiftCount;
1114 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1118 #endif
1121 -------------------------------------------------------------------------------
1122 Returns the result of converting the 32-bit two's complement integer `a'
1123 to the single-precision floating-point format. The conversion is performed
1124 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1125 -------------------------------------------------------------------------------
1127 float32 int32_to_float32( int32 a )
1129 flag zSign;
1131 if ( a == 0 ) return 0;
1132 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1133 zSign = ( a < 0 );
1134 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1139 -------------------------------------------------------------------------------
1140 Returns the result of converting the 32-bit two's complement integer `a'
1141 to the double-precision floating-point format. The conversion is performed
1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1143 -------------------------------------------------------------------------------
1145 float64 int32_to_float64( int32 a )
1147 flag zSign;
1148 uint32 absA;
1149 int8 shiftCount;
1150 bits64 zSig;
1152 if ( a == 0 ) return 0;
1153 zSign = ( a < 0 );
1154 absA = zSign ? - a : a;
1155 shiftCount = countLeadingZeros32( absA ) + 21;
1156 zSig = absA;
1157 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1161 #ifdef FLOATX80
1164 -------------------------------------------------------------------------------
1165 Returns the result of converting the 32-bit two's complement integer `a'
1166 to the extended double-precision floating-point format. The conversion
1167 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1168 Arithmetic.
1169 -------------------------------------------------------------------------------
1171 floatx80 int32_to_floatx80( int32 a )
1173 flag zSign;
1174 uint32 absA;
1175 int8 shiftCount;
1176 bits64 zSig;
1178 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1179 zSign = ( a < 0 );
1180 absA = zSign ? - a : a;
1181 shiftCount = countLeadingZeros32( absA ) + 32;
1182 zSig = absA;
1183 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1187 #endif
1189 #ifdef FLOAT128
1192 -------------------------------------------------------------------------------
1193 Returns the result of converting the 32-bit two's complement integer `a' to
1194 the quadruple-precision floating-point format. The conversion is performed
1195 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1196 -------------------------------------------------------------------------------
1198 float128 int32_to_float128( int32 a )
1200 flag zSign;
1201 uint32 absA;
1202 int8 shiftCount;
1203 bits64 zSig0;
1205 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1206 zSign = ( a < 0 );
1207 absA = zSign ? - a : a;
1208 shiftCount = countLeadingZeros32( absA ) + 17;
1209 zSig0 = absA;
1210 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1214 #endif
1216 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1218 -------------------------------------------------------------------------------
1219 Returns the result of converting the 64-bit two's complement integer `a'
1220 to the single-precision floating-point format. The conversion is performed
1221 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1222 -------------------------------------------------------------------------------
1224 float32 int64_to_float32( int64 a )
1226 flag zSign;
1227 uint64 absA;
1228 int8 shiftCount;
1230 if ( a == 0 ) return 0;
1231 zSign = ( a < 0 );
1232 absA = zSign ? - a : a;
1233 shiftCount = countLeadingZeros64( absA ) - 40;
1234 if ( 0 <= shiftCount ) {
1235 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1237 else {
1238 shiftCount += 7;
1239 if ( shiftCount < 0 ) {
1240 shift64RightJamming( absA, - shiftCount, &absA );
1242 else {
1243 absA <<= shiftCount;
1245 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1251 -------------------------------------------------------------------------------
1252 Returns the result of converting the 64-bit two's complement integer `a'
1253 to the double-precision floating-point format. The conversion is performed
1254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 -------------------------------------------------------------------------------
1257 float64 int64_to_float64( int64 a )
1259 flag zSign;
1261 if ( a == 0 ) return 0;
1262 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1265 zSign = ( a < 0 );
1266 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1270 #ifdef FLOATX80
1273 -------------------------------------------------------------------------------
1274 Returns the result of converting the 64-bit two's complement integer `a'
1275 to the extended double-precision floating-point format. The conversion
1276 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 Arithmetic.
1278 -------------------------------------------------------------------------------
1280 floatx80 int64_to_floatx80( int64 a )
1282 flag zSign;
1283 uint64 absA;
1284 int8 shiftCount;
1286 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1287 zSign = ( a < 0 );
1288 absA = zSign ? - a : a;
1289 shiftCount = countLeadingZeros64( absA );
1290 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1294 #endif
1296 #ifdef FLOAT128
1299 -------------------------------------------------------------------------------
1300 Returns the result of converting the 64-bit two's complement integer `a' to
1301 the quadruple-precision floating-point format. The conversion is performed
1302 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1303 -------------------------------------------------------------------------------
1305 float128 int64_to_float128( int64 a )
1307 flag zSign;
1308 uint64 absA;
1309 int8 shiftCount;
1310 int32 zExp;
1311 bits64 zSig0, zSig1;
1313 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1314 zSign = ( a < 0 );
1315 absA = zSign ? - a : a;
1316 shiftCount = countLeadingZeros64( absA ) + 49;
1317 zExp = 0x406E - shiftCount;
1318 if ( 64 <= shiftCount ) {
1319 zSig1 = 0;
1320 zSig0 = absA;
1321 shiftCount -= 64;
1323 else {
1324 zSig1 = absA;
1325 zSig0 = 0;
1327 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1328 return packFloat128( zSign, zExp, zSig0, zSig1 );
1332 #endif
1333 #endif /* !SOFTFLOAT_FOR_GCC */
1335 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1337 -------------------------------------------------------------------------------
1338 Returns the result of converting the single-precision floating-point value
1339 `a' to the 32-bit two's complement integer format. The conversion is
1340 performed according to the IEC/IEEE Standard for Binary Floating-Point
1341 Arithmetic---which means in particular that the conversion is rounded
1342 according to the current rounding mode. If `a' is a NaN, the largest
1343 positive integer is returned. Otherwise, if the conversion overflows, the
1344 largest integer with the same sign as `a' is returned.
1345 -------------------------------------------------------------------------------
1347 int32 float32_to_int32( float32 a )
1349 flag aSign;
1350 int16 aExp, shiftCount;
1351 bits32 aSig;
1352 bits64 aSig64;
1354 aSig = extractFloat32Frac( a );
1355 aExp = extractFloat32Exp( a );
1356 aSign = extractFloat32Sign( a );
1357 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1358 if ( aExp ) aSig |= 0x00800000;
1359 shiftCount = 0xAF - aExp;
1360 aSig64 = aSig;
1361 aSig64 <<= 32;
1362 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1363 return roundAndPackInt32( aSign, aSig64 );
1366 #endif /* !SOFTFLOAT_FOR_GCC */
1369 -------------------------------------------------------------------------------
1370 Returns the result of converting the single-precision floating-point value
1371 `a' to the 32-bit two's complement integer format. The conversion is
1372 performed according to the IEC/IEEE Standard for Binary Floating-Point
1373 Arithmetic, except that the conversion is always rounded toward zero.
1374 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1375 the conversion overflows, the largest integer with the same sign as `a' is
1376 returned.
1377 -------------------------------------------------------------------------------
1379 int32 float32_to_int32_round_to_zero( float32 a )
1381 flag aSign;
1382 int16 aExp, shiftCount;
1383 bits32 aSig;
1384 int32 z;
1386 aSig = extractFloat32Frac( a );
1387 aExp = extractFloat32Exp( a );
1388 aSign = extractFloat32Sign( a );
1389 shiftCount = aExp - 0x9E;
1390 if ( 0 <= shiftCount ) {
1391 if ( a != 0xCF000000 ) {
1392 float_raise( float_flag_invalid );
1393 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1395 return (sbits32) 0x80000000;
1397 else if ( aExp <= 0x7E ) {
1398 if ( aExp | aSig ) float_set_inexact();
1399 return 0;
1401 aSig = ( aSig | 0x00800000 )<<8;
1402 z = aSig>>( - shiftCount );
1403 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1404 float_set_inexact();
1406 if ( aSign ) z = - z;
1407 return z;
1411 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1413 -------------------------------------------------------------------------------
1414 Returns the result of converting the single-precision floating-point value
1415 `a' to the 64-bit two's complement integer format. The conversion is
1416 performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 Arithmetic---which means in particular that the conversion is rounded
1418 according to the current rounding mode. If `a' is a NaN, the largest
1419 positive integer is returned. Otherwise, if the conversion overflows, the
1420 largest integer with the same sign as `a' is returned.
1421 -------------------------------------------------------------------------------
1423 int64 float32_to_int64( float32 a )
1425 flag aSign;
1426 int16 aExp, shiftCount;
1427 bits32 aSig;
1428 bits64 aSig64, aSigExtra;
1430 aSig = extractFloat32Frac( a );
1431 aExp = extractFloat32Exp( a );
1432 aSign = extractFloat32Sign( a );
1433 shiftCount = 0xBE - aExp;
1434 if ( shiftCount < 0 ) {
1435 float_raise( float_flag_invalid );
1436 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1437 return LIT64( 0x7FFFFFFFFFFFFFFF );
1439 return (sbits64) LIT64( 0x8000000000000000 );
1441 if ( aExp ) aSig |= 0x00800000;
1442 aSig64 = aSig;
1443 aSig64 <<= 40;
1444 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1445 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1450 -------------------------------------------------------------------------------
1451 Returns the result of converting the single-precision floating-point value
1452 `a' to the 64-bit two's complement integer format. The conversion is
1453 performed according to the IEC/IEEE Standard for Binary Floating-Point
1454 Arithmetic, except that the conversion is always rounded toward zero. If
1455 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1456 conversion overflows, the largest integer with the same sign as `a' is
1457 returned.
1458 -------------------------------------------------------------------------------
1460 int64 float32_to_int64_round_to_zero( float32 a )
1462 flag aSign;
1463 int16 aExp, shiftCount;
1464 bits32 aSig;
1465 bits64 aSig64;
1466 int64 z;
1468 aSig = extractFloat32Frac( a );
1469 aExp = extractFloat32Exp( a );
1470 aSign = extractFloat32Sign( a );
1471 shiftCount = aExp - 0xBE;
1472 if ( 0 <= shiftCount ) {
1473 if ( a != 0xDF000000 ) {
1474 float_raise( float_flag_invalid );
1475 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1476 return LIT64( 0x7FFFFFFFFFFFFFFF );
1479 return (sbits64) LIT64( 0x8000000000000000 );
1481 else if ( aExp <= 0x7E ) {
1482 if ( aExp | aSig ) float_set_inexact();
1483 return 0;
1485 aSig64 = aSig | 0x00800000;
1486 aSig64 <<= 40;
1487 z = aSig64>>( - shiftCount );
1488 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1489 float_set_inexact();
1491 if ( aSign ) z = - z;
1492 return z;
1495 #endif /* !SOFTFLOAT_FOR_GCC */
1498 -------------------------------------------------------------------------------
1499 Returns the result of converting the single-precision floating-point value
1500 `a' to the double-precision floating-point format. The conversion is
1501 performed according to the IEC/IEEE Standard for Binary Floating-Point
1502 Arithmetic.
1503 -------------------------------------------------------------------------------
1505 float64 float32_to_float64( float32 a )
1507 flag aSign;
1508 int16 aExp;
1509 bits32 aSig;
1511 aSig = extractFloat32Frac( a );
1512 aExp = extractFloat32Exp( a );
1513 aSign = extractFloat32Sign( a );
1514 if ( aExp == 0xFF ) {
1515 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1516 return packFloat64( aSign, 0x7FF, 0 );
1518 if ( aExp == 0 ) {
1519 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1520 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1521 --aExp;
1523 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1527 #ifdef FLOATX80
1530 -------------------------------------------------------------------------------
1531 Returns the result of converting the single-precision floating-point value
1532 `a' to the extended double-precision floating-point format. The conversion
1533 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1534 Arithmetic.
1535 -------------------------------------------------------------------------------
1537 floatx80 float32_to_floatx80( float32 a )
1539 flag aSign;
1540 int16 aExp;
1541 bits32 aSig;
1543 aSig = extractFloat32Frac( a );
1544 aExp = extractFloat32Exp( a );
1545 aSign = extractFloat32Sign( a );
1546 if ( aExp == 0xFF ) {
1547 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1548 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1550 if ( aExp == 0 ) {
1551 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1552 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1554 aSig |= 0x00800000;
1555 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1559 #endif
1561 #ifdef FLOAT128
1564 -------------------------------------------------------------------------------
1565 Returns the result of converting the single-precision floating-point value
1566 `a' to the double-precision floating-point format. The conversion is
1567 performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 Arithmetic.
1569 -------------------------------------------------------------------------------
1571 float128 float32_to_float128( float32 a )
1573 flag aSign;
1574 int16 aExp;
1575 bits32 aSig;
1577 aSig = extractFloat32Frac( a );
1578 aExp = extractFloat32Exp( a );
1579 aSign = extractFloat32Sign( a );
1580 if ( aExp == 0xFF ) {
1581 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1582 return packFloat128( aSign, 0x7FFF, 0, 0 );
1584 if ( aExp == 0 ) {
1585 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1586 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587 --aExp;
1589 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1593 #endif
1595 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1597 -------------------------------------------------------------------------------
1598 Rounds the single-precision floating-point value `a' to an integer, and
1599 returns the result as a single-precision floating-point value. The
1600 operation is performed according to the IEC/IEEE Standard for Binary
1601 Floating-Point Arithmetic.
1602 -------------------------------------------------------------------------------
1604 float32 float32_round_to_int( float32 a )
1606 flag aSign;
1607 int16 aExp;
1608 bits32 lastBitMask, roundBitsMask;
1609 int8 roundingMode;
1610 float32 z;
1612 aExp = extractFloat32Exp( a );
1613 if ( 0x96 <= aExp ) {
1614 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1615 return propagateFloat32NaN( a, a );
1617 return a;
1619 if ( aExp <= 0x7E ) {
1620 if ( (bits32) ( a<<1 ) == 0 ) return a;
1621 float_set_inexact();
1622 aSign = extractFloat32Sign( a );
1623 switch ( float_rounding_mode() ) {
1624 case float_round_nearest_even:
1625 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1626 return packFloat32( aSign, 0x7F, 0 );
1628 break;
1629 case float_round_down:
1630 return aSign ? 0xBF800000 : 0;
1631 case float_round_up:
1632 return aSign ? 0x80000000 : 0x3F800000;
1634 return packFloat32( aSign, 0, 0 );
1636 lastBitMask = 1;
1637 lastBitMask <<= 0x96 - aExp;
1638 roundBitsMask = lastBitMask - 1;
1639 z = a;
1640 roundingMode = float_rounding_mode();
1641 if ( roundingMode == float_round_nearest_even ) {
1642 z += lastBitMask>>1;
1643 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1645 else if ( roundingMode != float_round_to_zero ) {
1646 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1647 z += roundBitsMask;
1650 z &= ~ roundBitsMask;
1651 if ( z != a ) float_set_inexact();
1652 return z;
1655 #endif /* !SOFTFLOAT_FOR_GCC */
1658 -------------------------------------------------------------------------------
1659 Returns the result of adding the absolute values of the single-precision
1660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1661 before being returned. `zSign' is ignored if the result is a NaN.
1662 The addition is performed according to the IEC/IEEE Standard for Binary
1663 Floating-Point Arithmetic.
1664 -------------------------------------------------------------------------------
1666 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1668 int16 aExp, bExp, zExp;
1669 bits32 aSig, bSig, zSig;
1670 int16 expDiff;
1672 aSig = extractFloat32Frac( a );
1673 aExp = extractFloat32Exp( a );
1674 bSig = extractFloat32Frac( b );
1675 bExp = extractFloat32Exp( b );
1676 expDiff = aExp - bExp;
1677 aSig <<= 6;
1678 bSig <<= 6;
1679 if ( 0 < expDiff ) {
1680 if ( aExp == 0xFF ) {
1681 if ( aSig ) return propagateFloat32NaN( a, b );
1682 return a;
1684 if ( bExp == 0 ) {
1685 --expDiff;
1687 else {
1688 bSig |= 0x20000000;
1690 shift32RightJamming( bSig, expDiff, &bSig );
1691 zExp = aExp;
1693 else if ( expDiff < 0 ) {
1694 if ( bExp == 0xFF ) {
1695 if ( bSig ) return propagateFloat32NaN( a, b );
1696 return packFloat32( zSign, 0xFF, 0 );
1698 if ( aExp == 0 ) {
1699 ++expDiff;
1701 else {
1702 aSig |= 0x20000000;
1704 shift32RightJamming( aSig, - expDiff, &aSig );
1705 zExp = bExp;
1707 else {
1708 if ( aExp == 0xFF ) {
1709 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1710 return a;
1712 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1713 zSig = 0x40000000 + aSig + bSig;
1714 zExp = aExp;
1715 goto roundAndPack;
1717 aSig |= 0x20000000;
1718 zSig = ( aSig + bSig )<<1;
1719 --zExp;
1720 if ( (sbits32) zSig < 0 ) {
1721 zSig = aSig + bSig;
1722 ++zExp;
1724 roundAndPack:
1725 return roundAndPackFloat32( zSign, zExp, zSig );
1730 -------------------------------------------------------------------------------
1731 Returns the result of subtracting the absolute values of the single-
1732 precision floating-point values `a' and `b'. If `zSign' is 1, the
1733 difference is negated before being returned. `zSign' is ignored if the
1734 result is a NaN. The subtraction is performed according to the IEC/IEEE
1735 Standard for Binary Floating-Point Arithmetic.
1736 -------------------------------------------------------------------------------
1738 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1740 int16 aExp, bExp, zExp;
1741 bits32 aSig, bSig, zSig;
1742 int16 expDiff;
1744 aSig = extractFloat32Frac( a );
1745 aExp = extractFloat32Exp( a );
1746 bSig = extractFloat32Frac( b );
1747 bExp = extractFloat32Exp( b );
1748 expDiff = aExp - bExp;
1749 aSig <<= 7;
1750 bSig <<= 7;
1751 if ( 0 < expDiff ) goto aExpBigger;
1752 if ( expDiff < 0 ) goto bExpBigger;
1753 if ( aExp == 0xFF ) {
1754 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1755 float_raise( float_flag_invalid );
1756 return float32_default_nan;
1758 if ( aExp == 0 ) {
1759 aExp = 1;
1760 bExp = 1;
1762 if ( bSig < aSig ) goto aBigger;
1763 if ( aSig < bSig ) goto bBigger;
1764 return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1765 bExpBigger:
1766 if ( bExp == 0xFF ) {
1767 if ( bSig ) return propagateFloat32NaN( a, b );
1768 return packFloat32( zSign ^ 1, 0xFF, 0 );
1770 if ( aExp == 0 ) {
1771 ++expDiff;
1773 else {
1774 aSig |= 0x40000000;
1776 shift32RightJamming( aSig, - expDiff, &aSig );
1777 bSig |= 0x40000000;
1778 bBigger:
1779 zSig = bSig - aSig;
1780 zExp = bExp;
1781 zSign ^= 1;
1782 goto normalizeRoundAndPack;
1783 aExpBigger:
1784 if ( aExp == 0xFF ) {
1785 if ( aSig ) return propagateFloat32NaN( a, b );
1786 return a;
1788 if ( bExp == 0 ) {
1789 --expDiff;
1791 else {
1792 bSig |= 0x40000000;
1794 shift32RightJamming( bSig, expDiff, &bSig );
1795 aSig |= 0x40000000;
1796 aBigger:
1797 zSig = aSig - bSig;
1798 zExp = aExp;
1799 normalizeRoundAndPack:
1800 --zExp;
1801 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1806 -------------------------------------------------------------------------------
1807 Returns the result of adding the single-precision floating-point values `a'
1808 and `b'. The operation is performed according to the IEC/IEEE Standard for
1809 Binary Floating-Point Arithmetic.
1810 -------------------------------------------------------------------------------
1812 float32 float32_add( float32 a, float32 b )
1814 flag aSign, bSign;
1816 aSign = extractFloat32Sign( a );
1817 bSign = extractFloat32Sign( b );
1818 if ( aSign == bSign ) {
1819 return addFloat32Sigs( a, b, aSign );
1821 else {
1822 return subFloat32Sigs( a, b, aSign );
1828 -------------------------------------------------------------------------------
1829 Returns the result of subtracting the single-precision floating-point values
1830 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1831 for Binary Floating-Point Arithmetic.
1832 -------------------------------------------------------------------------------
1834 float32 float32_sub( float32 a, float32 b )
1836 flag aSign, bSign;
1838 aSign = extractFloat32Sign( a );
1839 bSign = extractFloat32Sign( b );
1840 if ( aSign == bSign ) {
1841 return subFloat32Sigs( a, b, aSign );
1843 else {
1844 return addFloat32Sigs( a, b, aSign );
1850 -------------------------------------------------------------------------------
1851 Returns the result of multiplying the single-precision floating-point values
1852 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1853 for Binary Floating-Point Arithmetic.
1854 -------------------------------------------------------------------------------
1856 float32 float32_mul( float32 a, float32 b )
1858 flag aSign, bSign, zSign;
1859 int16 aExp, bExp, zExp;
1860 bits32 aSig, bSig;
1861 bits64 zSig64;
1862 bits32 zSig;
1864 aSig = extractFloat32Frac( a );
1865 aExp = extractFloat32Exp( a );
1866 aSign = extractFloat32Sign( a );
1867 bSig = extractFloat32Frac( b );
1868 bExp = extractFloat32Exp( b );
1869 bSign = extractFloat32Sign( b );
1870 zSign = aSign ^ bSign;
1871 if ( aExp == 0xFF ) {
1872 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1873 return propagateFloat32NaN( a, b );
1875 if ( ( bExp | bSig ) == 0 ) {
1876 float_raise( float_flag_invalid );
1877 return float32_default_nan;
1879 return packFloat32( zSign, 0xFF, 0 );
1881 if ( bExp == 0xFF ) {
1882 if ( bSig ) return propagateFloat32NaN( a, b );
1883 if ( ( aExp | aSig ) == 0 ) {
1884 float_raise( float_flag_invalid );
1885 return float32_default_nan;
1887 return packFloat32( zSign, 0xFF, 0 );
1889 if ( aExp == 0 ) {
1890 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1891 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893 if ( bExp == 0 ) {
1894 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1895 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1897 zExp = aExp + bExp - 0x7F;
1898 aSig = ( aSig | 0x00800000 )<<7;
1899 bSig = ( bSig | 0x00800000 )<<8;
1900 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1901 zSig = zSig64;
1902 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1903 zSig <<= 1;
1904 --zExp;
1906 return roundAndPackFloat32( zSign, zExp, zSig );
1911 -------------------------------------------------------------------------------
1912 Returns the result of dividing the single-precision floating-point value `a'
1913 by the corresponding value `b'. The operation is performed according to the
1914 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1915 -------------------------------------------------------------------------------
1917 float32 float32_div( float32 a, float32 b )
1919 flag aSign, bSign, zSign;
1920 int16 aExp, bExp, zExp;
1921 bits32 aSig, bSig, zSig;
1923 aSig = extractFloat32Frac( a );
1924 aExp = extractFloat32Exp( a );
1925 aSign = extractFloat32Sign( a );
1926 bSig = extractFloat32Frac( b );
1927 bExp = extractFloat32Exp( b );
1928 bSign = extractFloat32Sign( b );
1929 zSign = aSign ^ bSign;
1930 if ( aExp == 0xFF ) {
1931 if ( aSig ) return propagateFloat32NaN( a, b );
1932 if ( bExp == 0xFF ) {
1933 if ( bSig ) return propagateFloat32NaN( a, b );
1934 float_raise( float_flag_invalid );
1935 return float32_default_nan;
1937 return packFloat32( zSign, 0xFF, 0 );
1939 if ( bExp == 0xFF ) {
1940 if ( bSig ) return propagateFloat32NaN( a, b );
1941 return packFloat32( zSign, 0, 0 );
1943 if ( bExp == 0 ) {
1944 if ( bSig == 0 ) {
1945 if ( ( aExp | aSig ) == 0 ) {
1946 float_raise( float_flag_invalid );
1947 return float32_default_nan;
1949 float_raise( float_flag_divbyzero );
1950 return packFloat32( zSign, 0xFF, 0 );
1952 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1954 if ( aExp == 0 ) {
1955 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1956 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1958 zExp = aExp - bExp + 0x7D;
1959 aSig = ( aSig | 0x00800000 )<<7;
1960 bSig = ( bSig | 0x00800000 )<<8;
1961 if ( bSig <= ( aSig + aSig ) ) {
1962 aSig >>= 1;
1963 ++zExp;
1965 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1966 if ( ( zSig & 0x3F ) == 0 ) {
1967 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1969 return roundAndPackFloat32( zSign, zExp, zSig );
1973 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1975 -------------------------------------------------------------------------------
1976 Returns the remainder of the single-precision floating-point value `a'
1977 with respect to the corresponding value `b'. The operation is performed
1978 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1979 -------------------------------------------------------------------------------
1981 float32 float32_rem( float32 a, float32 b )
1983 flag aSign, bSign, zSign;
1984 int16 aExp, bExp, expDiff;
1985 bits32 aSig, bSig;
1986 bits32 q;
1987 bits64 aSig64, bSig64, q64;
1988 bits32 alternateASig;
1989 sbits32 sigMean;
1991 aSig = extractFloat32Frac( a );
1992 aExp = extractFloat32Exp( a );
1993 aSign = extractFloat32Sign( a );
1994 bSig = extractFloat32Frac( b );
1995 bExp = extractFloat32Exp( b );
1996 bSign = extractFloat32Sign( b );
1997 if ( aExp == 0xFF ) {
1998 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1999 return propagateFloat32NaN( a, b );
2001 float_raise( float_flag_invalid );
2002 return float32_default_nan;
2004 if ( bExp == 0xFF ) {
2005 if ( bSig ) return propagateFloat32NaN( a, b );
2006 return a;
2008 if ( bExp == 0 ) {
2009 if ( bSig == 0 ) {
2010 float_raise( float_flag_invalid );
2011 return float32_default_nan;
2013 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2015 if ( aExp == 0 ) {
2016 if ( aSig == 0 ) return a;
2017 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2019 expDiff = aExp - bExp;
2020 aSig |= 0x00800000;
2021 bSig |= 0x00800000;
2022 if ( expDiff < 32 ) {
2023 aSig <<= 8;
2024 bSig <<= 8;
2025 if ( expDiff < 0 ) {
2026 if ( expDiff < -1 ) return a;
2027 aSig >>= 1;
2029 q = ( bSig <= aSig );
2030 if ( q ) aSig -= bSig;
2031 if ( 0 < expDiff ) {
2032 q = ( ( (bits64) aSig )<<32 ) / bSig;
2033 q >>= 32 - expDiff;
2034 bSig >>= 2;
2035 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2037 else {
2038 aSig >>= 2;
2039 bSig >>= 2;
2042 else {
2043 if ( bSig <= aSig ) aSig -= bSig;
2044 aSig64 = ( (bits64) aSig )<<40;
2045 bSig64 = ( (bits64) bSig )<<40;
2046 expDiff -= 64;
2047 while ( 0 < expDiff ) {
2048 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2049 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2050 aSig64 = - ( ( bSig * q64 )<<38 );
2051 expDiff -= 62;
2053 expDiff += 64;
2054 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2055 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2056 q = q64>>( 64 - expDiff );
2057 bSig <<= 6;
2058 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2060 do {
2061 alternateASig = aSig;
2062 ++q;
2063 aSig -= bSig;
2064 } while ( 0 <= (sbits32) aSig );
2065 sigMean = aSig + alternateASig;
2066 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2067 aSig = alternateASig;
2069 zSign = ( (sbits32) aSig < 0 );
2070 if ( zSign ) aSig = - aSig;
2071 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2074 #endif /* !SOFTFLOAT_FOR_GCC */
2076 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2078 -------------------------------------------------------------------------------
2079 Returns the square root of the single-precision floating-point value `a'.
2080 The operation is performed according to the IEC/IEEE Standard for Binary
2081 Floating-Point Arithmetic.
2082 -------------------------------------------------------------------------------
2084 float32 float32_sqrt( float32 a )
2086 flag aSign;
2087 int16 aExp, zExp;
2088 bits32 aSig, zSig;
2089 bits64 rem, term;
2091 aSig = extractFloat32Frac( a );
2092 aExp = extractFloat32Exp( a );
2093 aSign = extractFloat32Sign( a );
2094 if ( aExp == 0xFF ) {
2095 if ( aSig ) return propagateFloat32NaN( a, 0 );
2096 if ( ! aSign ) return a;
2097 float_raise( float_flag_invalid );
2098 return float32_default_nan;
2100 if ( aSign ) {
2101 if ( ( aExp | aSig ) == 0 ) return a;
2102 float_raise( float_flag_invalid );
2103 return float32_default_nan;
2105 if ( aExp == 0 ) {
2106 if ( aSig == 0 ) return 0;
2107 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2109 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2110 aSig = ( aSig | 0x00800000 )<<8;
2111 zSig = estimateSqrt32( aExp, aSig ) + 2;
2112 if ( ( zSig & 0x7F ) <= 5 ) {
2113 if ( zSig < 2 ) {
2114 zSig = 0x7FFFFFFF;
2115 goto roundAndPack;
2117 aSig >>= aExp & 1;
2118 term = ( (bits64) zSig ) * zSig;
2119 rem = ( ( (bits64) aSig )<<32 ) - term;
2120 while ( (sbits64) rem < 0 ) {
2121 --zSig;
2122 rem += ( ( (bits64) zSig )<<1 ) | 1;
2124 zSig |= ( rem != 0 );
2126 shift32RightJamming( zSig, 1, &zSig );
2127 roundAndPack:
2128 return roundAndPackFloat32( 0, zExp, zSig );
2131 #endif /* !SOFTFLOAT_FOR_GCC */
2134 -------------------------------------------------------------------------------
2135 Returns 1 if the single-precision floating-point value `a' is equal to
2136 the corresponding value `b', and 0 otherwise. The comparison is performed
2137 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2138 -------------------------------------------------------------------------------
2140 flag float32_eq( float32 a, float32 b )
2143 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2144 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2146 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2147 float_raise( float_flag_invalid );
2149 return 0;
2151 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2156 -------------------------------------------------------------------------------
2157 Returns 1 if the single-precision floating-point value `a' is less than
2158 or equal to the corresponding value `b', and 0 otherwise. The comparison
2159 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2160 Arithmetic.
2161 -------------------------------------------------------------------------------
2163 flag float32_le( float32 a, float32 b )
2165 flag aSign, bSign;
2167 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2168 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2170 float_raise( float_flag_invalid );
2171 return 0;
2173 aSign = extractFloat32Sign( a );
2174 bSign = extractFloat32Sign( b );
2175 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2176 return ( a == b ) || ( aSign ^ ( a < b ) );
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the single-precision floating-point value `a' is less than
2183 the corresponding value `b', and 0 otherwise. The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2187 flag float32_lt( float32 a, float32 b )
2189 flag aSign, bSign;
2191 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2192 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2194 float_raise( float_flag_invalid );
2195 return 0;
2197 aSign = extractFloat32Sign( a );
2198 bSign = extractFloat32Sign( b );
2199 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2200 return ( a != b ) && ( aSign ^ ( a < b ) );
2204 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2206 -------------------------------------------------------------------------------
2207 Returns 1 if the single-precision floating-point value `a' is equal to
2208 the corresponding value `b', and 0 otherwise. The invalid exception is
2209 raised if either operand is a NaN. Otherwise, the comparison is performed
2210 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2211 -------------------------------------------------------------------------------
2213 flag float32_eq_signaling( float32 a, float32 b )
2216 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2217 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2219 float_raise( float_flag_invalid );
2220 return 0;
2222 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2227 -------------------------------------------------------------------------------
2228 Returns 1 if the single-precision floating-point value `a' is less than or
2229 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2230 cause an exception. Otherwise, the comparison is performed according to the
2231 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2232 -------------------------------------------------------------------------------
2234 flag float32_le_quiet( float32 a, float32 b )
2236 flag aSign, bSign;
2238 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2239 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2241 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2242 float_raise( float_flag_invalid );
2244 return 0;
2246 aSign = extractFloat32Sign( a );
2247 bSign = extractFloat32Sign( b );
2248 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2249 return ( a == b ) || ( aSign ^ ( a < b ) );
2254 -------------------------------------------------------------------------------
2255 Returns 1 if the single-precision floating-point value `a' is less than
2256 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2257 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2258 Standard for Binary Floating-Point Arithmetic.
2259 -------------------------------------------------------------------------------
2261 flag float32_lt_quiet( float32 a, float32 b )
2263 flag aSign, bSign;
2265 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2266 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2268 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2269 float_raise( float_flag_invalid );
2271 return 0;
2273 aSign = extractFloat32Sign( a );
2274 bSign = extractFloat32Sign( b );
2275 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2276 return ( a != b ) && ( aSign ^ ( a < b ) );
2279 #endif /* !SOFTFLOAT_FOR_GCC */
2281 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2283 -------------------------------------------------------------------------------
2284 Returns the result of converting the double-precision floating-point value
2285 `a' to the 32-bit two's complement integer format. The conversion is
2286 performed according to the IEC/IEEE Standard for Binary Floating-Point
2287 Arithmetic---which means in particular that the conversion is rounded
2288 according to the current rounding mode. If `a' is a NaN, the largest
2289 positive integer is returned. Otherwise, if the conversion overflows, the
2290 largest integer with the same sign as `a' is returned.
2291 -------------------------------------------------------------------------------
2293 int32 float64_to_int32( float64 a )
2295 flag aSign;
2296 int16 aExp, shiftCount;
2297 bits64 aSig;
2299 aSig = extractFloat64Frac( a );
2300 aExp = extractFloat64Exp( a );
2301 aSign = extractFloat64Sign( a );
2302 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2303 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2304 shiftCount = 0x42C - aExp;
2305 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2306 return roundAndPackInt32( aSign, aSig );
2309 #endif /* !SOFTFLOAT_FOR_GCC */
2312 -------------------------------------------------------------------------------
2313 Returns the result of converting the double-precision floating-point value
2314 `a' to the 32-bit two's complement integer format. The conversion is
2315 performed according to the IEC/IEEE Standard for Binary Floating-Point
2316 Arithmetic, except that the conversion is always rounded toward zero.
2317 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2318 the conversion overflows, the largest integer with the same sign as `a' is
2319 returned.
2320 -------------------------------------------------------------------------------
2322 int32 float64_to_int32_round_to_zero( float64 a )
2324 flag aSign;
2325 int16 aExp, shiftCount;
2326 bits64 aSig, savedASig;
2327 int32 z;
2329 aSig = extractFloat64Frac( a );
2330 aExp = extractFloat64Exp( a );
2331 aSign = extractFloat64Sign( a );
2332 if ( 0x41E < aExp ) {
2333 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2334 goto invalid;
2336 else if ( aExp < 0x3FF ) {
2337 if ( aExp || aSig ) float_set_inexact();
2338 return 0;
2340 aSig |= LIT64( 0x0010000000000000 );
2341 shiftCount = 0x433 - aExp;
2342 savedASig = aSig;
2343 aSig >>= shiftCount;
2344 z = aSig;
2345 if ( aSign ) z = - z;
2346 if ( ( z < 0 ) ^ aSign ) {
2347 invalid:
2348 float_raise( float_flag_invalid );
2349 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2351 if ( ( aSig<<shiftCount ) != savedASig ) {
2352 float_set_inexact();
2354 return z;
2358 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2360 -------------------------------------------------------------------------------
2361 Returns the result of converting the double-precision floating-point value
2362 `a' to the 64-bit two's complement integer format. The conversion is
2363 performed according to the IEC/IEEE Standard for Binary Floating-Point
2364 Arithmetic---which means in particular that the conversion is rounded
2365 according to the current rounding mode. If `a' is a NaN, the largest
2366 positive integer is returned. Otherwise, if the conversion overflows, the
2367 largest integer with the same sign as `a' is returned.
2368 -------------------------------------------------------------------------------
2370 int64 float64_to_int64( float64 a )
2372 flag aSign;
2373 int16 aExp, shiftCount;
2374 bits64 aSig, aSigExtra;
2376 aSig = extractFloat64Frac( a );
2377 aExp = extractFloat64Exp( a );
2378 aSign = extractFloat64Sign( a );
2379 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2380 shiftCount = 0x433 - aExp;
2381 if ( shiftCount <= 0 ) {
2382 if ( 0x43E < aExp ) {
2383 float_raise( float_flag_invalid );
2384 if ( ! aSign
2385 || ( ( aExp == 0x7FF )
2386 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2388 return LIT64( 0x7FFFFFFFFFFFFFFF );
2390 return (sbits64) LIT64( 0x8000000000000000 );
2392 aSigExtra = 0;
2393 aSig <<= - shiftCount;
2395 else {
2396 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2398 return roundAndPackInt64( aSign, aSig, aSigExtra );
2403 -------------------------------------------------------------------------------
2404 Returns the result of converting the double-precision floating-point value
2405 `a' to the 64-bit two's complement integer format. The conversion is
2406 performed according to the IEC/IEEE Standard for Binary Floating-Point
2407 Arithmetic, except that the conversion is always rounded toward zero.
2408 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2409 the conversion overflows, the largest integer with the same sign as `a' is
2410 returned.
2411 -------------------------------------------------------------------------------
2413 int64 float64_to_int64_round_to_zero( float64 a )
2415 flag aSign;
2416 int16 aExp, shiftCount;
2417 bits64 aSig;
2418 int64 z;
2420 aSig = extractFloat64Frac( a );
2421 aExp = extractFloat64Exp( a );
2422 aSign = extractFloat64Sign( a );
2423 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2424 shiftCount = aExp - 0x433;
2425 if ( 0 <= shiftCount ) {
2426 if ( 0x43E <= aExp ) {
2427 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2428 float_raise( float_flag_invalid );
2429 if ( ! aSign
2430 || ( ( aExp == 0x7FF )
2431 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2433 return LIT64( 0x7FFFFFFFFFFFFFFF );
2436 return (sbits64) LIT64( 0x8000000000000000 );
2438 z = aSig<<shiftCount;
2440 else {
2441 if ( aExp < 0x3FE ) {
2442 if ( aExp | aSig ) float_set_inexact();
2443 return 0;
2445 z = aSig>>( - shiftCount );
2446 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2447 float_set_inexact();
2450 if ( aSign ) z = - z;
2451 return z;
2454 #endif /* !SOFTFLOAT_FOR_GCC */
2457 -------------------------------------------------------------------------------
2458 Returns the result of converting the double-precision floating-point value
2459 `a' to the single-precision floating-point format. The conversion is
2460 performed according to the IEC/IEEE Standard for Binary Floating-Point
2461 Arithmetic.
2462 -------------------------------------------------------------------------------
2464 float32 float64_to_float32( float64 a )
2466 flag aSign;
2467 int16 aExp;
2468 bits64 aSig;
2469 bits32 zSig;
2471 aSig = extractFloat64Frac( a );
2472 aExp = extractFloat64Exp( a );
2473 aSign = extractFloat64Sign( a );
2474 if ( aExp == 0x7FF ) {
2475 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2476 return packFloat32( aSign, 0xFF, 0 );
2478 shift64RightJamming( aSig, 22, &aSig );
2479 zSig = aSig;
2480 if ( aExp || zSig ) {
2481 zSig |= 0x40000000;
2482 aExp -= 0x381;
2484 return roundAndPackFloat32( aSign, aExp, zSig );
2488 #ifdef FLOATX80
2491 -------------------------------------------------------------------------------
2492 Returns the result of converting the double-precision floating-point value
2493 `a' to the extended double-precision floating-point format. The conversion
2494 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2495 Arithmetic.
2496 -------------------------------------------------------------------------------
2498 floatx80 float64_to_floatx80( float64 a )
2500 flag aSign;
2501 int16 aExp;
2502 bits64 aSig;
2504 aSig = extractFloat64Frac( a );
2505 aExp = extractFloat64Exp( a );
2506 aSign = extractFloat64Sign( a );
2507 if ( aExp == 0x7FF ) {
2508 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2509 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2511 if ( aExp == 0 ) {
2512 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2513 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2515 return
2516 packFloatx80(
2517 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2521 #endif
2523 #ifdef FLOAT128
2526 -------------------------------------------------------------------------------
2527 Returns the result of converting the double-precision floating-point value
2528 `a' to the quadruple-precision floating-point format. The conversion is
2529 performed according to the IEC/IEEE Standard for Binary Floating-Point
2530 Arithmetic.
2531 -------------------------------------------------------------------------------
2533 float128 float64_to_float128( float64 a )
2535 flag aSign;
2536 int16 aExp;
2537 bits64 aSig, zSig0, zSig1;
2539 aSig = extractFloat64Frac( a );
2540 aExp = extractFloat64Exp( a );
2541 aSign = extractFloat64Sign( a );
2542 if ( aExp == 0x7FF ) {
2543 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2544 return packFloat128( aSign, 0x7FFF, 0, 0 );
2546 if ( aExp == 0 ) {
2547 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2548 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2549 --aExp;
2551 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2552 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2556 #endif
2558 #ifndef SOFTFLOAT_FOR_GCC
2560 -------------------------------------------------------------------------------
2561 Rounds the double-precision floating-point value `a' to an integer, and
2562 returns the result as a double-precision floating-point value. The
2563 operation is performed according to the IEC/IEEE Standard for Binary
2564 Floating-Point Arithmetic.
2565 -------------------------------------------------------------------------------
2567 float64 float64_round_to_int( float64 a )
2569 flag aSign;
2570 int16 aExp;
2571 bits64 lastBitMask, roundBitsMask;
2572 int8 roundingMode;
2573 float64 z;
2575 aExp = extractFloat64Exp( a );
2576 if ( 0x433 <= aExp ) {
2577 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2578 return propagateFloat64NaN( a, a );
2580 return a;
2582 if ( aExp < 0x3FF ) {
2583 if ( (bits64) ( a<<1 ) == 0 ) return a;
2584 float_set_inexact();
2585 aSign = extractFloat64Sign( a );
2586 switch ( float_rounding_mode() ) {
2587 case float_round_nearest_even:
2588 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2589 return packFloat64( aSign, 0x3FF, 0 );
2591 break;
2592 case float_round_down:
2593 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2594 case float_round_up:
2595 return
2596 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2598 return packFloat64( aSign, 0, 0 );
2600 lastBitMask = 1;
2601 lastBitMask <<= 0x433 - aExp;
2602 roundBitsMask = lastBitMask - 1;
2603 z = a;
2604 roundingMode = float_rounding_mode();
2605 if ( roundingMode == float_round_nearest_even ) {
2606 z += lastBitMask>>1;
2607 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2609 else if ( roundingMode != float_round_to_zero ) {
2610 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2611 z += roundBitsMask;
2614 z &= ~ roundBitsMask;
2615 if ( z != a ) float_set_inexact();
2616 return z;
2619 #endif
2622 -------------------------------------------------------------------------------
2623 Returns the result of adding the absolute values of the double-precision
2624 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2625 before being returned. `zSign' is ignored if the result is a NaN.
2626 The addition is performed according to the IEC/IEEE Standard for Binary
2627 Floating-Point Arithmetic.
2628 -------------------------------------------------------------------------------
2630 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2632 int16 aExp, bExp, zExp;
2633 bits64 aSig, bSig, zSig;
2634 int16 expDiff;
2636 aSig = extractFloat64Frac( a );
2637 aExp = extractFloat64Exp( a );
2638 bSig = extractFloat64Frac( b );
2639 bExp = extractFloat64Exp( b );
2640 expDiff = aExp - bExp;
2641 aSig <<= 9;
2642 bSig <<= 9;
2643 if ( 0 < expDiff ) {
2644 if ( aExp == 0x7FF ) {
2645 if ( aSig ) return propagateFloat64NaN( a, b );
2646 return a;
2648 if ( bExp == 0 ) {
2649 --expDiff;
2651 else {
2652 bSig |= LIT64( 0x2000000000000000 );
2654 shift64RightJamming( bSig, expDiff, &bSig );
2655 zExp = aExp;
2657 else if ( expDiff < 0 ) {
2658 if ( bExp == 0x7FF ) {
2659 if ( bSig ) return propagateFloat64NaN( a, b );
2660 return packFloat64( zSign, 0x7FF, 0 );
2662 if ( aExp == 0 ) {
2663 ++expDiff;
2665 else {
2666 aSig |= LIT64( 0x2000000000000000 );
2668 shift64RightJamming( aSig, - expDiff, &aSig );
2669 zExp = bExp;
2671 else {
2672 if ( aExp == 0x7FF ) {
2673 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2674 return a;
2676 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2677 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2678 zExp = aExp;
2679 goto roundAndPack;
2681 aSig |= LIT64( 0x2000000000000000 );
2682 zSig = ( aSig + bSig )<<1;
2683 --zExp;
2684 if ( (sbits64) zSig < 0 ) {
2685 zSig = aSig + bSig;
2686 ++zExp;
2688 roundAndPack:
2689 return roundAndPackFloat64( zSign, zExp, zSig );
2694 -------------------------------------------------------------------------------
2695 Returns the result of subtracting the absolute values of the double-
2696 precision floating-point values `a' and `b'. If `zSign' is 1, the
2697 difference is negated before being returned. `zSign' is ignored if the
2698 result is a NaN. The subtraction is performed according to the IEC/IEEE
2699 Standard for Binary Floating-Point Arithmetic.
2700 -------------------------------------------------------------------------------
2702 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2704 int16 aExp, bExp, zExp;
2705 bits64 aSig, bSig, zSig;
2706 int16 expDiff;
2708 aSig = extractFloat64Frac( a );
2709 aExp = extractFloat64Exp( a );
2710 bSig = extractFloat64Frac( b );
2711 bExp = extractFloat64Exp( b );
2712 expDiff = aExp - bExp;
2713 aSig <<= 10;
2714 bSig <<= 10;
2715 if ( 0 < expDiff ) goto aExpBigger;
2716 if ( expDiff < 0 ) goto bExpBigger;
2717 if ( aExp == 0x7FF ) {
2718 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2719 float_raise( float_flag_invalid );
2720 return float64_default_nan;
2722 if ( aExp == 0 ) {
2723 aExp = 1;
2724 bExp = 1;
2726 if ( bSig < aSig ) goto aBigger;
2727 if ( aSig < bSig ) goto bBigger;
2728 return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2729 bExpBigger:
2730 if ( bExp == 0x7FF ) {
2731 if ( bSig ) return propagateFloat64NaN( a, b );
2732 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2734 if ( aExp == 0 ) {
2735 ++expDiff;
2737 else {
2738 aSig |= LIT64( 0x4000000000000000 );
2740 shift64RightJamming( aSig, - expDiff, &aSig );
2741 bSig |= LIT64( 0x4000000000000000 );
2742 bBigger:
2743 zSig = bSig - aSig;
2744 zExp = bExp;
2745 zSign ^= 1;
2746 goto normalizeRoundAndPack;
2747 aExpBigger:
2748 if ( aExp == 0x7FF ) {
2749 if ( aSig ) return propagateFloat64NaN( a, b );
2750 return a;
2752 if ( bExp == 0 ) {
2753 --expDiff;
2755 else {
2756 bSig |= LIT64( 0x4000000000000000 );
2758 shift64RightJamming( bSig, expDiff, &bSig );
2759 aSig |= LIT64( 0x4000000000000000 );
2760 aBigger:
2761 zSig = aSig - bSig;
2762 zExp = aExp;
2763 normalizeRoundAndPack:
2764 --zExp;
2765 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2770 -------------------------------------------------------------------------------
2771 Returns the result of adding the double-precision floating-point values `a'
2772 and `b'. The operation is performed according to the IEC/IEEE Standard for
2773 Binary Floating-Point Arithmetic.
2774 -------------------------------------------------------------------------------
2776 float64 float64_add( float64 a, float64 b )
2778 flag aSign, bSign;
2780 aSign = extractFloat64Sign( a );
2781 bSign = extractFloat64Sign( b );
2782 if ( aSign == bSign ) {
2783 return addFloat64Sigs( a, b, aSign );
2785 else {
2786 return subFloat64Sigs( a, b, aSign );
2792 -------------------------------------------------------------------------------
2793 Returns the result of subtracting the double-precision floating-point values
2794 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2795 for Binary Floating-Point Arithmetic.
2796 -------------------------------------------------------------------------------
2798 float64 float64_sub( float64 a, float64 b )
2800 flag aSign, bSign;
2802 aSign = extractFloat64Sign( a );
2803 bSign = extractFloat64Sign( b );
2804 if ( aSign == bSign ) {
2805 return subFloat64Sigs( a, b, aSign );
2807 else {
2808 return addFloat64Sigs( a, b, aSign );
2814 -------------------------------------------------------------------------------
2815 Returns the result of multiplying the double-precision floating-point values
2816 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2817 for Binary Floating-Point Arithmetic.
2818 -------------------------------------------------------------------------------
2820 float64 float64_mul( float64 a, float64 b )
2822 flag aSign, bSign, zSign;
2823 int16 aExp, bExp, zExp;
2824 bits64 aSig, bSig, zSig0, zSig1;
2826 aSig = extractFloat64Frac( a );
2827 aExp = extractFloat64Exp( a );
2828 aSign = extractFloat64Sign( a );
2829 bSig = extractFloat64Frac( b );
2830 bExp = extractFloat64Exp( b );
2831 bSign = extractFloat64Sign( b );
2832 zSign = aSign ^ bSign;
2833 if ( aExp == 0x7FF ) {
2834 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2835 return propagateFloat64NaN( a, b );
2837 if ( ( bExp | bSig ) == 0 ) {
2838 float_raise( float_flag_invalid );
2839 return float64_default_nan;
2841 return packFloat64( zSign, 0x7FF, 0 );
2843 if ( bExp == 0x7FF ) {
2844 if ( bSig ) return propagateFloat64NaN( a, b );
2845 if ( ( aExp | aSig ) == 0 ) {
2846 float_raise( float_flag_invalid );
2847 return float64_default_nan;
2849 return packFloat64( zSign, 0x7FF, 0 );
2851 if ( aExp == 0 ) {
2852 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2853 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2855 if ( bExp == 0 ) {
2856 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2857 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2859 zExp = aExp + bExp - 0x3FF;
2860 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2861 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2862 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2863 zSig0 |= ( zSig1 != 0 );
2864 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2865 zSig0 <<= 1;
2866 --zExp;
2868 return roundAndPackFloat64( zSign, zExp, zSig0 );
2873 -------------------------------------------------------------------------------
2874 Returns the result of dividing the double-precision floating-point value `a'
2875 by the corresponding value `b'. The operation is performed according to
2876 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2877 -------------------------------------------------------------------------------
2879 float64 float64_div( float64 a, float64 b )
2881 flag aSign, bSign, zSign;
2882 int16 aExp, bExp, zExp;
2883 bits64 aSig, bSig, zSig;
2884 bits64 rem0, rem1;
2885 bits64 term0, term1;
2887 aSig = extractFloat64Frac( a );
2888 aExp = extractFloat64Exp( a );
2889 aSign = extractFloat64Sign( a );
2890 bSig = extractFloat64Frac( b );
2891 bExp = extractFloat64Exp( b );
2892 bSign = extractFloat64Sign( b );
2893 zSign = aSign ^ bSign;
2894 if ( aExp == 0x7FF ) {
2895 if ( aSig ) return propagateFloat64NaN( a, b );
2896 if ( bExp == 0x7FF ) {
2897 if ( bSig ) return propagateFloat64NaN( a, b );
2898 float_raise( float_flag_invalid );
2899 return float64_default_nan;
2901 return packFloat64( zSign, 0x7FF, 0 );
2903 if ( bExp == 0x7FF ) {
2904 if ( bSig ) return propagateFloat64NaN( a, b );
2905 return packFloat64( zSign, 0, 0 );
2907 if ( bExp == 0 ) {
2908 if ( bSig == 0 ) {
2909 if ( ( aExp | aSig ) == 0 ) {
2910 float_raise( float_flag_invalid );
2911 return float64_default_nan;
2913 float_raise( float_flag_divbyzero );
2914 return packFloat64( zSign, 0x7FF, 0 );
2916 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2918 if ( aExp == 0 ) {
2919 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2920 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2922 zExp = aExp - bExp + 0x3FD;
2923 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2924 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2925 if ( bSig <= ( aSig + aSig ) ) {
2926 aSig >>= 1;
2927 ++zExp;
2929 zSig = estimateDiv128To64( aSig, 0, bSig );
2930 if ( ( zSig & 0x1FF ) <= 2 ) {
2931 mul64To128( bSig, zSig, &term0, &term1 );
2932 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2933 while ( (sbits64) rem0 < 0 ) {
2934 --zSig;
2935 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2937 zSig |= ( rem1 != 0 );
2939 return roundAndPackFloat64( zSign, zExp, zSig );
2943 #ifndef SOFTFLOAT_FOR_GCC
2945 -------------------------------------------------------------------------------
2946 Returns the remainder of the double-precision floating-point value `a'
2947 with respect to the corresponding value `b'. The operation is performed
2948 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2949 -------------------------------------------------------------------------------
2951 float64 float64_rem( float64 a, float64 b )
2953 flag aSign, bSign, zSign;
2954 int16 aExp, bExp, expDiff;
2955 bits64 aSig, bSig;
2956 bits64 q, alternateASig;
2957 sbits64 sigMean;
2959 aSig = extractFloat64Frac( a );
2960 aExp = extractFloat64Exp( a );
2961 aSign = extractFloat64Sign( a );
2962 bSig = extractFloat64Frac( b );
2963 bExp = extractFloat64Exp( b );
2964 bSign = extractFloat64Sign( b );
2965 if ( aExp == 0x7FF ) {
2966 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2967 return propagateFloat64NaN( a, b );
2969 float_raise( float_flag_invalid );
2970 return float64_default_nan;
2972 if ( bExp == 0x7FF ) {
2973 if ( bSig ) return propagateFloat64NaN( a, b );
2974 return a;
2976 if ( bExp == 0 ) {
2977 if ( bSig == 0 ) {
2978 float_raise( float_flag_invalid );
2979 return float64_default_nan;
2981 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2983 if ( aExp == 0 ) {
2984 if ( aSig == 0 ) return a;
2985 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2987 expDiff = aExp - bExp;
2988 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2989 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2990 if ( expDiff < 0 ) {
2991 if ( expDiff < -1 ) return a;
2992 aSig >>= 1;
2994 q = ( bSig <= aSig );
2995 if ( q ) aSig -= bSig;
2996 expDiff -= 64;
2997 while ( 0 < expDiff ) {
2998 q = estimateDiv128To64( aSig, 0, bSig );
2999 q = ( 2 < q ) ? q - 2 : 0;
3000 aSig = - ( ( bSig>>2 ) * q );
3001 expDiff -= 62;
3003 expDiff += 64;
3004 if ( 0 < expDiff ) {
3005 q = estimateDiv128To64( aSig, 0, bSig );
3006 q = ( 2 < q ) ? q - 2 : 0;
3007 q >>= 64 - expDiff;
3008 bSig >>= 2;
3009 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3011 else {
3012 aSig >>= 2;
3013 bSig >>= 2;
3015 do {
3016 alternateASig = aSig;
3017 ++q;
3018 aSig -= bSig;
3019 } while ( 0 <= (sbits64) aSig );
3020 sigMean = aSig + alternateASig;
3021 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3022 aSig = alternateASig;
3024 zSign = ( (sbits64) aSig < 0 );
3025 if ( zSign ) aSig = - aSig;
3026 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3031 -------------------------------------------------------------------------------
3032 Returns the square root of the double-precision floating-point value `a'.
3033 The operation is performed according to the IEC/IEEE Standard for Binary
3034 Floating-Point Arithmetic.
3035 -------------------------------------------------------------------------------
3037 float64 float64_sqrt( float64 a )
3039 flag aSign;
3040 int16 aExp, zExp;
3041 bits64 aSig, zSig, doubleZSig;
3042 bits64 rem0, rem1, term0, term1;
3044 aSig = extractFloat64Frac( a );
3045 aExp = extractFloat64Exp( a );
3046 aSign = extractFloat64Sign( a );
3047 if ( aExp == 0x7FF ) {
3048 if ( aSig ) return propagateFloat64NaN( a, a );
3049 if ( ! aSign ) return a;
3050 float_raise( float_flag_invalid );
3051 return float64_default_nan;
3053 if ( aSign ) {
3054 if ( ( aExp | aSig ) == 0 ) return a;
3055 float_raise( float_flag_invalid );
3056 return float64_default_nan;
3058 if ( aExp == 0 ) {
3059 if ( aSig == 0 ) return 0;
3060 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3062 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3063 aSig |= LIT64( 0x0010000000000000 );
3064 zSig = estimateSqrt32( aExp, aSig>>21 );
3065 aSig <<= 9 - ( aExp & 1 );
3066 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3067 if ( ( zSig & 0x1FF ) <= 5 ) {
3068 doubleZSig = zSig<<1;
3069 mul64To128( zSig, zSig, &term0, &term1 );
3070 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3071 while ( (sbits64) rem0 < 0 ) {
3072 --zSig;
3073 doubleZSig -= 2;
3074 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3076 zSig |= ( ( rem0 | rem1 ) != 0 );
3078 return roundAndPackFloat64( 0, zExp, zSig );
3081 #endif
3084 -------------------------------------------------------------------------------
3085 Returns 1 if the double-precision floating-point value `a' is equal to the
3086 corresponding value `b', and 0 otherwise. The comparison is performed
3087 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3088 -------------------------------------------------------------------------------
3090 flag float64_eq( float64 a, float64 b )
3093 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3094 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3096 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3097 float_raise( float_flag_invalid );
3099 return 0;
3101 return ( a == b ) ||
3102 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3107 -------------------------------------------------------------------------------
3108 Returns 1 if the double-precision floating-point value `a' is less than or
3109 equal to the corresponding value `b', and 0 otherwise. The comparison is
3110 performed according to the IEC/IEEE Standard for Binary Floating-Point
3111 Arithmetic.
3112 -------------------------------------------------------------------------------
3114 flag float64_le( float64 a, float64 b )
3116 flag aSign, bSign;
3118 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3119 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3121 float_raise( float_flag_invalid );
3122 return 0;
3124 aSign = extractFloat64Sign( a );
3125 bSign = extractFloat64Sign( b );
3126 if ( aSign != bSign )
3127 return aSign ||
3128 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3129 0 );
3130 return ( a == b ) ||
3131 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3136 -------------------------------------------------------------------------------
3137 Returns 1 if the double-precision floating-point value `a' is less than
3138 the corresponding value `b', and 0 otherwise. The comparison is performed
3139 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3140 -------------------------------------------------------------------------------
3142 flag float64_lt( float64 a, float64 b )
3144 flag aSign, bSign;
3146 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3147 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3149 float_raise( float_flag_invalid );
3150 return 0;
3152 aSign = extractFloat64Sign( a );
3153 bSign = extractFloat64Sign( b );
3154 if ( aSign != bSign )
3155 return aSign &&
3156 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3157 0 );
3158 return ( a != b ) &&
3159 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3163 #ifndef SOFTFLOAT_FOR_GCC
3165 -------------------------------------------------------------------------------
3166 Returns 1 if the double-precision floating-point value `a' is equal to the
3167 corresponding value `b', and 0 otherwise. The invalid exception is raised
3168 if either operand is a NaN. Otherwise, the comparison is performed
3169 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3170 -------------------------------------------------------------------------------
3172 flag float64_eq_signaling( float64 a, float64 b )
3175 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3176 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3178 float_raise( float_flag_invalid );
3179 return 0;
3181 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3186 -------------------------------------------------------------------------------
3187 Returns 1 if the double-precision floating-point value `a' is less than or
3188 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3189 cause an exception. Otherwise, the comparison is performed according to the
3190 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3191 -------------------------------------------------------------------------------
3193 flag float64_le_quiet( float64 a, float64 b )
3195 flag aSign, bSign;
3197 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3198 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3200 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3201 float_raise( float_flag_invalid );
3203 return 0;
3205 aSign = extractFloat64Sign( a );
3206 bSign = extractFloat64Sign( b );
3207 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3208 return ( a == b ) || ( aSign ^ ( a < b ) );
3213 -------------------------------------------------------------------------------
3214 Returns 1 if the double-precision floating-point value `a' is less than
3215 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3216 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3217 Standard for Binary Floating-Point Arithmetic.
3218 -------------------------------------------------------------------------------
3220 flag float64_lt_quiet( float64 a, float64 b )
3222 flag aSign, bSign;
3224 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3225 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3227 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3228 float_raise( float_flag_invalid );
3230 return 0;
3232 aSign = extractFloat64Sign( a );
3233 bSign = extractFloat64Sign( b );
3234 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3235 return ( a != b ) && ( aSign ^ ( a < b ) );
3238 #endif
3240 #ifdef FLOATX80
3243 -------------------------------------------------------------------------------
3244 Returns the result of converting the extended double-precision floating-
3245 point value `a' to the 32-bit two's complement integer format. The
3246 conversion is performed according to the IEC/IEEE Standard for Binary
3247 Floating-Point Arithmetic---which means in particular that the conversion
3248 is rounded according to the current rounding mode. If `a' is a NaN, the
3249 largest positive integer is returned. Otherwise, if the conversion
3250 overflows, the largest integer with the same sign as `a' is returned.
3251 -------------------------------------------------------------------------------
3253 int32 floatx80_to_int32( floatx80 a )
3255 flag aSign;
3256 int32 aExp, shiftCount;
3257 bits64 aSig;
3259 aSig = extractFloatx80Frac( a );
3260 aExp = extractFloatx80Exp( a );
3261 aSign = extractFloatx80Sign( a );
3262 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3263 shiftCount = 0x4037 - aExp;
3264 if ( shiftCount <= 0 ) shiftCount = 1;
3265 shift64RightJamming( aSig, shiftCount, &aSig );
3266 return roundAndPackInt32( aSign, aSig );
3271 -------------------------------------------------------------------------------
3272 Returns the result of converting the extended double-precision floating-
3273 point value `a' to the 32-bit two's complement integer format. The
3274 conversion is performed according to the IEC/IEEE Standard for Binary
3275 Floating-Point Arithmetic, except that the conversion is always rounded
3276 toward zero. If `a' is a NaN, the largest positive integer is returned.
3277 Otherwise, if the conversion overflows, the largest integer with the same
3278 sign as `a' is returned.
3279 -------------------------------------------------------------------------------
3281 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3283 flag aSign;
3284 int32 aExp, shiftCount;
3285 bits64 aSig, savedASig;
3286 int32 z;
3288 aSig = extractFloatx80Frac( a );
3289 aExp = extractFloatx80Exp( a );
3290 aSign = extractFloatx80Sign( a );
3291 if ( 0x401E < aExp ) {
3292 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3293 goto invalid;
3295 else if ( aExp < 0x3FFF ) {
3296 if ( aExp || aSig ) float_set_inexact();
3297 return 0;
3299 shiftCount = 0x403E - aExp;
3300 savedASig = aSig;
3301 aSig >>= shiftCount;
3302 z = aSig;
3303 if ( aSign ) z = - z;
3304 if ( ( z < 0 ) ^ aSign ) {
3305 invalid:
3306 float_raise( float_flag_invalid );
3307 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3309 if ( ( aSig<<shiftCount ) != savedASig ) {
3310 float_set_inexact();
3312 return z;
3317 -------------------------------------------------------------------------------
3318 Returns the result of converting the extended double-precision floating-
3319 point value `a' to the 64-bit two's complement integer format. The
3320 conversion is performed according to the IEC/IEEE Standard for Binary
3321 Floating-Point Arithmetic---which means in particular that the conversion
3322 is rounded according to the current rounding mode. If `a' is a NaN,
3323 the largest positive integer is returned. Otherwise, if the conversion
3324 overflows, the largest integer with the same sign as `a' is returned.
3325 -------------------------------------------------------------------------------
3327 int64 floatx80_to_int64( floatx80 a )
3329 flag aSign;
3330 int32 aExp, shiftCount;
3331 bits64 aSig, aSigExtra;
3333 aSig = extractFloatx80Frac( a );
3334 aExp = extractFloatx80Exp( a );
3335 aSign = extractFloatx80Sign( a );
3336 shiftCount = 0x403E - aExp;
3337 if ( shiftCount <= 0 ) {
3338 if ( shiftCount ) {
3339 float_raise( float_flag_invalid );
3340 if ( ! aSign
3341 || ( ( aExp == 0x7FFF )
3342 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3344 return LIT64( 0x7FFFFFFFFFFFFFFF );
3346 return (sbits64) LIT64( 0x8000000000000000 );
3348 aSigExtra = 0;
3350 else {
3351 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3353 return roundAndPackInt64( aSign, aSig, aSigExtra );
3358 -------------------------------------------------------------------------------
3359 Returns the result of converting the extended double-precision floating-
3360 point value `a' to the 64-bit two's complement integer format. The
3361 conversion is performed according to the IEC/IEEE Standard for Binary
3362 Floating-Point Arithmetic, except that the conversion is always rounded
3363 toward zero. If `a' is a NaN, the largest positive integer is returned.
3364 Otherwise, if the conversion overflows, the largest integer with the same
3365 sign as `a' is returned.
3366 -------------------------------------------------------------------------------
3368 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3370 flag aSign;
3371 int32 aExp, shiftCount;
3372 bits64 aSig;
3373 int64 z;
3375 aSig = extractFloatx80Frac( a );
3376 aExp = extractFloatx80Exp( a );
3377 aSign = extractFloatx80Sign( a );
3378 shiftCount = aExp - 0x403E;
3379 if ( 0 <= shiftCount ) {
3380 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3381 if ( ( a.high != 0xC03E ) || aSig ) {
3382 float_raise( float_flag_invalid );
3383 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3384 return LIT64( 0x7FFFFFFFFFFFFFFF );
3387 return (sbits64) LIT64( 0x8000000000000000 );
3389 else if ( aExp < 0x3FFF ) {
3390 if ( aExp | aSig ) float_set_inexact();
3391 return 0;
3393 z = aSig>>( - shiftCount );
3394 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3395 float_set_inexact();
3397 if ( aSign ) z = - z;
3398 return z;
3403 -------------------------------------------------------------------------------
3404 Returns the result of converting the extended double-precision floating-
3405 point value `a' to the single-precision floating-point format. The
3406 conversion is performed according to the IEC/IEEE Standard for Binary
3407 Floating-Point Arithmetic.
3408 -------------------------------------------------------------------------------
3410 float32 floatx80_to_float32( floatx80 a )
3412 flag aSign;
3413 int32 aExp;
3414 bits64 aSig;
3416 aSig = extractFloatx80Frac( a );
3417 aExp = extractFloatx80Exp( a );
3418 aSign = extractFloatx80Sign( a );
3419 if ( aExp == 0x7FFF ) {
3420 if ( (bits64) ( aSig<<1 ) ) {
3421 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3423 return packFloat32( aSign, 0xFF, 0 );
3425 shift64RightJamming( aSig, 33, &aSig );
3426 if ( aExp || aSig ) aExp -= 0x3F81;
3427 return roundAndPackFloat32( aSign, aExp, aSig );
3432 -------------------------------------------------------------------------------
3433 Returns the result of converting the extended double-precision floating-
3434 point value `a' to the double-precision floating-point format. The
3435 conversion is performed according to the IEC/IEEE Standard for Binary
3436 Floating-Point Arithmetic.
3437 -------------------------------------------------------------------------------
3439 float64 floatx80_to_float64( floatx80 a )
3441 flag aSign;
3442 int32 aExp;
3443 bits64 aSig, zSig;
3445 aSig = extractFloatx80Frac( a );
3446 aExp = extractFloatx80Exp( a );
3447 aSign = extractFloatx80Sign( a );
3448 if ( aExp == 0x7FFF ) {
3449 if ( (bits64) ( aSig<<1 ) ) {
3450 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3452 return packFloat64( aSign, 0x7FF, 0 );
3454 shift64RightJamming( aSig, 1, &zSig );
3455 if ( aExp || aSig ) aExp -= 0x3C01;
3456 return roundAndPackFloat64( aSign, aExp, zSig );
3460 #ifdef FLOAT128
3463 -------------------------------------------------------------------------------
3464 Returns the result of converting the extended double-precision floating-
3465 point value `a' to the quadruple-precision floating-point format. The
3466 conversion is performed according to the IEC/IEEE Standard for Binary
3467 Floating-Point Arithmetic.
3468 -------------------------------------------------------------------------------
3470 float128 floatx80_to_float128( floatx80 a )
3472 flag aSign;
3473 int16 aExp;
3474 bits64 aSig, zSig0, zSig1;
3476 aSig = extractFloatx80Frac( a );
3477 aExp = extractFloatx80Exp( a );
3478 aSign = extractFloatx80Sign( a );
3479 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3480 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3482 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3483 return packFloat128( aSign, aExp, zSig0, zSig1 );
3487 #endif
3490 -------------------------------------------------------------------------------
3491 Rounds the extended double-precision floating-point value `a' to an integer,
3492 and returns the result as an extended quadruple-precision floating-point
3493 value. The operation is performed according to the IEC/IEEE Standard for
3494 Binary Floating-Point Arithmetic.
3495 -------------------------------------------------------------------------------
3497 floatx80 floatx80_round_to_int( floatx80 a )
3499 flag aSign;
3500 int32 aExp;
3501 bits64 lastBitMask, roundBitsMask;
3502 int8 roundingMode;
3503 floatx80 z;
3505 aExp = extractFloatx80Exp( a );
3506 if ( 0x403E <= aExp ) {
3507 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3508 return propagateFloatx80NaN( a, a );
3510 return a;
3512 if ( aExp < 0x3FFF ) {
3513 if ( ( aExp == 0 )
3514 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3515 return a;
3517 float_set_inexact();
3518 aSign = extractFloatx80Sign( a );
3519 switch ( float_rounding_mode() ) {
3520 case float_round_nearest_even:
3521 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3523 return
3524 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3526 break;
3527 case float_round_down:
3528 return
3529 aSign ?
3530 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3531 : packFloatx80( 0, 0, 0 );
3532 case float_round_up:
3533 return
3534 aSign ? packFloatx80( 1, 0, 0 )
3535 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3537 return packFloatx80( aSign, 0, 0 );
3539 lastBitMask = 1;
3540 lastBitMask <<= 0x403E - aExp;
3541 roundBitsMask = lastBitMask - 1;
3542 z = a;
3543 roundingMode = float_rounding_mode();
3544 if ( roundingMode == float_round_nearest_even ) {
3545 z.low += lastBitMask>>1;
3546 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3548 else if ( roundingMode != float_round_to_zero ) {
3549 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3550 z.low += roundBitsMask;
3553 z.low &= ~ roundBitsMask;
3554 if ( z.low == 0 ) {
3555 ++z.high;
3556 z.low = LIT64( 0x8000000000000000 );
3558 if ( z.low != a.low ) float_set_inexact();
3559 return z;
3564 -------------------------------------------------------------------------------
3565 Returns the result of adding the absolute values of the extended double-
3566 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3567 negated before being returned. `zSign' is ignored if the result is a NaN.
3568 The addition is performed according to the IEC/IEEE Standard for Binary
3569 Floating-Point Arithmetic.
3570 -------------------------------------------------------------------------------
3572 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3574 int32 aExp, bExp, zExp;
3575 bits64 aSig, bSig, zSig0, zSig1;
3576 int32 expDiff;
3578 aSig = extractFloatx80Frac( a );
3579 aExp = extractFloatx80Exp( a );
3580 bSig = extractFloatx80Frac( b );
3581 bExp = extractFloatx80Exp( b );
3582 expDiff = aExp - bExp;
3583 if ( 0 < expDiff ) {
3584 if ( aExp == 0x7FFF ) {
3585 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3586 return a;
3588 if ( bExp == 0 ) --expDiff;
3589 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3590 zExp = aExp;
3592 else if ( expDiff < 0 ) {
3593 if ( bExp == 0x7FFF ) {
3594 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3595 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3597 if ( aExp == 0 ) ++expDiff;
3598 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3599 zExp = bExp;
3601 else {
3602 if ( aExp == 0x7FFF ) {
3603 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3604 return propagateFloatx80NaN( a, b );
3606 return a;
3608 zSig1 = 0;
3609 zSig0 = aSig + bSig;
3610 if ( aExp == 0 ) {
3611 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3612 goto roundAndPack;
3614 zExp = aExp;
3615 goto shiftRight1;
3617 zSig0 = aSig + bSig;
3618 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3619 shiftRight1:
3620 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3621 zSig0 |= LIT64( 0x8000000000000000 );
3622 ++zExp;
3623 roundAndPack:
3624 return
3625 roundAndPackFloatx80(
3626 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3631 -------------------------------------------------------------------------------
3632 Returns the result of subtracting the absolute values of the extended
3633 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3634 difference is negated before being returned. `zSign' is ignored if the
3635 result is a NaN. The subtraction is performed according to the IEC/IEEE
3636 Standard for Binary Floating-Point Arithmetic.
3637 -------------------------------------------------------------------------------
3639 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3641 int32 aExp, bExp, zExp;
3642 bits64 aSig, bSig, zSig0, zSig1;
3643 int32 expDiff;
3644 floatx80 z;
3646 aSig = extractFloatx80Frac( a );
3647 aExp = extractFloatx80Exp( a );
3648 bSig = extractFloatx80Frac( b );
3649 bExp = extractFloatx80Exp( b );
3650 expDiff = aExp - bExp;
3651 if ( 0 < expDiff ) goto aExpBigger;
3652 if ( expDiff < 0 ) goto bExpBigger;
3653 if ( aExp == 0x7FFF ) {
3654 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3655 return propagateFloatx80NaN( a, b );
3657 float_raise( float_flag_invalid );
3658 z.low = floatx80_default_nan_low;
3659 z.high = floatx80_default_nan_high;
3660 return z;
3662 if ( aExp == 0 ) {
3663 aExp = 1;
3664 bExp = 1;
3666 zSig1 = 0;
3667 if ( bSig < aSig ) goto aBigger;
3668 if ( aSig < bSig ) goto bBigger;
3669 return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3670 bExpBigger:
3671 if ( bExp == 0x7FFF ) {
3672 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3673 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3675 if ( aExp == 0 ) ++expDiff;
3676 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3677 bBigger:
3678 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3679 zExp = bExp;
3680 zSign ^= 1;
3681 goto normalizeRoundAndPack;
3682 aExpBigger:
3683 if ( aExp == 0x7FFF ) {
3684 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3685 return a;
3687 if ( bExp == 0 ) --expDiff;
3688 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3689 aBigger:
3690 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3691 zExp = aExp;
3692 normalizeRoundAndPack:
3693 return
3694 normalizeRoundAndPackFloatx80(
3695 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3700 -------------------------------------------------------------------------------
3701 Returns the result of adding the extended double-precision floating-point
3702 values `a' and `b'. The operation is performed according to the IEC/IEEE
3703 Standard for Binary Floating-Point Arithmetic.
3704 -------------------------------------------------------------------------------
3706 floatx80 floatx80_add( floatx80 a, floatx80 b )
3708 flag aSign, bSign;
3710 aSign = extractFloatx80Sign( a );
3711 bSign = extractFloatx80Sign( b );
3712 if ( aSign == bSign ) {
3713 return addFloatx80Sigs( a, b, aSign );
3715 else {
3716 return subFloatx80Sigs( a, b, aSign );
3722 -------------------------------------------------------------------------------
3723 Returns the result of subtracting the extended double-precision floating-
3724 point values `a' and `b'. The operation is performed according to the
3725 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3726 -------------------------------------------------------------------------------
3728 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3730 flag aSign, bSign;
3732 aSign = extractFloatx80Sign( a );
3733 bSign = extractFloatx80Sign( b );
3734 if ( aSign == bSign ) {
3735 return subFloatx80Sigs( a, b, aSign );
3737 else {
3738 return addFloatx80Sigs( a, b, aSign );
3744 -------------------------------------------------------------------------------
3745 Returns the result of multiplying the extended double-precision floating-
3746 point values `a' and `b'. The operation is performed according to the
3747 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3748 -------------------------------------------------------------------------------
3750 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3752 flag aSign, bSign, zSign;
3753 int32 aExp, bExp, zExp;
3754 bits64 aSig, bSig, zSig0, zSig1;
3755 floatx80 z;
3757 aSig = extractFloatx80Frac( a );
3758 aExp = extractFloatx80Exp( a );
3759 aSign = extractFloatx80Sign( a );
3760 bSig = extractFloatx80Frac( b );
3761 bExp = extractFloatx80Exp( b );
3762 bSign = extractFloatx80Sign( b );
3763 zSign = aSign ^ bSign;
3764 if ( aExp == 0x7FFF ) {
3765 if ( (bits64) ( aSig<<1 )
3766 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3767 return propagateFloatx80NaN( a, b );
3769 if ( ( bExp | bSig ) == 0 ) goto invalid;
3770 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3772 if ( bExp == 0x7FFF ) {
3773 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3774 if ( ( aExp | aSig ) == 0 ) {
3775 invalid:
3776 float_raise( float_flag_invalid );
3777 z.low = floatx80_default_nan_low;
3778 z.high = floatx80_default_nan_high;
3779 return z;
3781 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3783 if ( aExp == 0 ) {
3784 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3785 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3787 if ( bExp == 0 ) {
3788 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3789 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3791 zExp = aExp + bExp - 0x3FFE;
3792 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3793 if ( 0 < (sbits64) zSig0 ) {
3794 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3795 --zExp;
3797 return
3798 roundAndPackFloatx80(
3799 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3804 -------------------------------------------------------------------------------
3805 Returns the result of dividing the extended double-precision floating-point
3806 value `a' by the corresponding value `b'. The operation is performed
3807 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3808 -------------------------------------------------------------------------------
3810 floatx80 floatx80_div( floatx80 a, floatx80 b )
3812 flag aSign, bSign, zSign;
3813 int32 aExp, bExp, zExp;
3814 bits64 aSig, bSig, zSig0, zSig1;
3815 bits64 rem0, rem1, rem2, term0, term1, term2;
3816 floatx80 z;
3818 aSig = extractFloatx80Frac( a );
3819 aExp = extractFloatx80Exp( a );
3820 aSign = extractFloatx80Sign( a );
3821 bSig = extractFloatx80Frac( b );
3822 bExp = extractFloatx80Exp( b );
3823 bSign = extractFloatx80Sign( b );
3824 zSign = aSign ^ bSign;
3825 if ( aExp == 0x7FFF ) {
3826 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3827 if ( bExp == 0x7FFF ) {
3828 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3829 goto invalid;
3831 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3833 if ( bExp == 0x7FFF ) {
3834 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3835 return packFloatx80( zSign, 0, 0 );
3837 if ( bExp == 0 ) {
3838 if ( bSig == 0 ) {
3839 if ( ( aExp | aSig ) == 0 ) {
3840 invalid:
3841 float_raise( float_flag_invalid );
3842 z.low = floatx80_default_nan_low;
3843 z.high = floatx80_default_nan_high;
3844 return z;
3846 float_raise( float_flag_divbyzero );
3847 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3849 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3851 if ( aExp == 0 ) {
3852 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3853 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3855 zExp = aExp - bExp + 0x3FFE;
3856 rem1 = 0;
3857 if ( bSig <= aSig ) {
3858 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3859 ++zExp;
3861 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3862 mul64To128( bSig, zSig0, &term0, &term1 );
3863 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3864 while ( (sbits64) rem0 < 0 ) {
3865 --zSig0;
3866 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3868 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3869 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3870 mul64To128( bSig, zSig1, &term1, &term2 );
3871 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3872 while ( (sbits64) rem1 < 0 ) {
3873 --zSig1;
3874 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3876 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3878 return
3879 roundAndPackFloatx80(
3880 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3885 -------------------------------------------------------------------------------
3886 Returns the remainder of the extended double-precision floating-point value
3887 `a' with respect to the corresponding value `b'. The operation is performed
3888 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3889 -------------------------------------------------------------------------------
3891 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3893 flag aSign, bSign, zSign;
3894 int32 aExp, bExp, expDiff;
3895 bits64 aSig0, aSig1, bSig;
3896 bits64 q, term0, term1, alternateASig0, alternateASig1;
3897 floatx80 z;
3899 aSig0 = extractFloatx80Frac( a );
3900 aExp = extractFloatx80Exp( a );
3901 aSign = extractFloatx80Sign( a );
3902 bSig = extractFloatx80Frac( b );
3903 bExp = extractFloatx80Exp( b );
3904 bSign = extractFloatx80Sign( b );
3905 if ( aExp == 0x7FFF ) {
3906 if ( (bits64) ( aSig0<<1 )
3907 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3908 return propagateFloatx80NaN( a, b );
3910 goto invalid;
3912 if ( bExp == 0x7FFF ) {
3913 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3914 return a;
3916 if ( bExp == 0 ) {
3917 if ( bSig == 0 ) {
3918 invalid:
3919 float_raise( float_flag_invalid );
3920 z.low = floatx80_default_nan_low;
3921 z.high = floatx80_default_nan_high;
3922 return z;
3924 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3926 if ( aExp == 0 ) {
3927 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3928 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3930 bSig |= LIT64( 0x8000000000000000 );
3931 zSign = aSign;
3932 expDiff = aExp - bExp;
3933 aSig1 = 0;
3934 if ( expDiff < 0 ) {
3935 if ( expDiff < -1 ) return a;
3936 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3937 expDiff = 0;
3939 q = ( bSig <= aSig0 );
3940 if ( q ) aSig0 -= bSig;
3941 expDiff -= 64;
3942 while ( 0 < expDiff ) {
3943 q = estimateDiv128To64( aSig0, aSig1, bSig );
3944 q = ( 2 < q ) ? q - 2 : 0;
3945 mul64To128( bSig, q, &term0, &term1 );
3946 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3947 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3948 expDiff -= 62;
3950 expDiff += 64;
3951 if ( 0 < expDiff ) {
3952 q = estimateDiv128To64( aSig0, aSig1, bSig );
3953 q = ( 2 < q ) ? q - 2 : 0;
3954 q >>= 64 - expDiff;
3955 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3956 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3957 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3958 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3959 ++q;
3960 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3963 else {
3964 term1 = 0;
3965 term0 = bSig;
3967 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3968 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3969 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3970 && ( q & 1 ) )
3972 aSig0 = alternateASig0;
3973 aSig1 = alternateASig1;
3974 zSign = ! zSign;
3976 return
3977 normalizeRoundAndPackFloatx80(
3978 80, zSign, bExp + expDiff, aSig0, aSig1 );
3983 -------------------------------------------------------------------------------
3984 Returns the square root of the extended double-precision floating-point
3985 value `a'. The operation is performed according to the IEC/IEEE Standard
3986 for Binary Floating-Point Arithmetic.
3987 -------------------------------------------------------------------------------
3989 floatx80 floatx80_sqrt( floatx80 a )
3991 flag aSign;
3992 int32 aExp, zExp;
3993 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3994 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3995 floatx80 z;
3997 aSig0 = extractFloatx80Frac( a );
3998 aExp = extractFloatx80Exp( a );
3999 aSign = extractFloatx80Sign( a );
4000 if ( aExp == 0x7FFF ) {
4001 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4002 if ( ! aSign ) return a;
4003 goto invalid;
4005 if ( aSign ) {
4006 if ( ( aExp | aSig0 ) == 0 ) return a;
4007 invalid:
4008 float_raise( float_flag_invalid );
4009 z.low = floatx80_default_nan_low;
4010 z.high = floatx80_default_nan_high;
4011 return z;
4013 if ( aExp == 0 ) {
4014 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4015 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4017 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4018 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4019 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4020 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4021 doubleZSig0 = zSig0<<1;
4022 mul64To128( zSig0, zSig0, &term0, &term1 );
4023 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4024 while ( (sbits64) rem0 < 0 ) {
4025 --zSig0;
4026 doubleZSig0 -= 2;
4027 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4029 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4030 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4031 if ( zSig1 == 0 ) zSig1 = 1;
4032 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4033 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4034 mul64To128( zSig1, zSig1, &term2, &term3 );
4035 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4036 while ( (sbits64) rem1 < 0 ) {
4037 --zSig1;
4038 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4039 term3 |= 1;
4040 term2 |= doubleZSig0;
4041 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4043 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4045 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4046 zSig0 |= doubleZSig0;
4047 return
4048 roundAndPackFloatx80(
4049 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4054 -------------------------------------------------------------------------------
4055 Returns 1 if the extended double-precision floating-point value `a' is
4056 equal to the corresponding value `b', and 0 otherwise. The comparison is
4057 performed according to the IEC/IEEE Standard for Binary Floating-Point
4058 Arithmetic.
4059 -------------------------------------------------------------------------------
4061 flag floatx80_eq( floatx80 a, floatx80 b )
4064 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4065 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4066 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4067 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4069 if ( floatx80_is_signaling_nan( a )
4070 || floatx80_is_signaling_nan( b ) ) {
4071 float_raise( float_flag_invalid );
4073 return 0;
4075 return
4076 ( a.low == b.low )
4077 && ( ( a.high == b.high )
4078 || ( ( a.low == 0 )
4079 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4085 -------------------------------------------------------------------------------
4086 Returns 1 if the extended double-precision floating-point value `a' is
4087 less than or equal to the corresponding value `b', and 0 otherwise. The
4088 comparison is performed according to the IEC/IEEE Standard for Binary
4089 Floating-Point Arithmetic.
4090 -------------------------------------------------------------------------------
4092 flag floatx80_le( floatx80 a, floatx80 b )
4094 flag aSign, bSign;
4096 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4097 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4098 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4099 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4101 float_raise( float_flag_invalid );
4102 return 0;
4104 aSign = extractFloatx80Sign( a );
4105 bSign = extractFloatx80Sign( b );
4106 if ( aSign != bSign ) {
4107 return
4108 aSign
4109 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4110 == 0 );
4112 return
4113 aSign ? le128( b.high, b.low, a.high, a.low )
4114 : le128( a.high, a.low, b.high, b.low );
4119 -------------------------------------------------------------------------------
4120 Returns 1 if the extended double-precision floating-point value `a' is
4121 less than the corresponding value `b', and 0 otherwise. The comparison
4122 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4123 Arithmetic.
4124 -------------------------------------------------------------------------------
4126 flag floatx80_lt( floatx80 a, floatx80 b )
4128 flag aSign, bSign;
4130 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4131 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4132 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4133 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4135 float_raise( float_flag_invalid );
4136 return 0;
4138 aSign = extractFloatx80Sign( a );
4139 bSign = extractFloatx80Sign( b );
4140 if ( aSign != bSign ) {
4141 return
4142 aSign
4143 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4144 != 0 );
4146 return
4147 aSign ? lt128( b.high, b.low, a.high, a.low )
4148 : lt128( a.high, a.low, b.high, b.low );
4153 -------------------------------------------------------------------------------
4154 Returns 1 if the extended double-precision floating-point value `a' is equal
4155 to the corresponding value `b', and 0 otherwise. The invalid exception is
4156 raised if either operand is a NaN. Otherwise, the comparison is performed
4157 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4158 -------------------------------------------------------------------------------
4160 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4163 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4164 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4165 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4166 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4168 float_raise( float_flag_invalid );
4169 return 0;
4171 return
4172 ( a.low == b.low )
4173 && ( ( a.high == b.high )
4174 || ( ( a.low == 0 )
4175 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4181 -------------------------------------------------------------------------------
4182 Returns 1 if the extended double-precision floating-point value `a' is less
4183 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4184 do not cause an exception. Otherwise, the comparison is performed according
4185 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4186 -------------------------------------------------------------------------------
4188 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4190 flag aSign, bSign;
4192 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4193 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4194 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4195 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4197 if ( floatx80_is_signaling_nan( a )
4198 || floatx80_is_signaling_nan( b ) ) {
4199 float_raise( float_flag_invalid );
4201 return 0;
4203 aSign = extractFloatx80Sign( a );
4204 bSign = extractFloatx80Sign( b );
4205 if ( aSign != bSign ) {
4206 return
4207 aSign
4208 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4209 == 0 );
4211 return
4212 aSign ? le128( b.high, b.low, a.high, a.low )
4213 : le128( a.high, a.low, b.high, b.low );
4218 -------------------------------------------------------------------------------
4219 Returns 1 if the extended double-precision floating-point value `a' is less
4220 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4221 an exception. Otherwise, the comparison is performed according to the
4222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4223 -------------------------------------------------------------------------------
4225 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4227 flag aSign, bSign;
4229 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4230 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4231 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4232 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4234 if ( floatx80_is_signaling_nan( a )
4235 || floatx80_is_signaling_nan( b ) ) {
4236 float_raise( float_flag_invalid );
4238 return 0;
4240 aSign = extractFloatx80Sign( a );
4241 bSign = extractFloatx80Sign( b );
4242 if ( aSign != bSign ) {
4243 return
4244 aSign
4245 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4246 != 0 );
4248 return
4249 aSign ? lt128( b.high, b.low, a.high, a.low )
4250 : lt128( a.high, a.low, b.high, b.low );
4254 #endif
4256 #ifdef FLOAT128
4259 -------------------------------------------------------------------------------
4260 Returns the result of converting the quadruple-precision floating-point
4261 value `a' to the 32-bit two's complement integer format. The conversion
4262 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4263 Arithmetic---which means in particular that the conversion is rounded
4264 according to the current rounding mode. If `a' is a NaN, the largest
4265 positive integer is returned. Otherwise, if the conversion overflows, the
4266 largest integer with the same sign as `a' is returned.
4267 -------------------------------------------------------------------------------
4269 int32 float128_to_int32( float128 a )
4271 flag aSign;
4272 int32 aExp, shiftCount;
4273 bits64 aSig0, aSig1;
4275 aSig1 = extractFloat128Frac1( a );
4276 aSig0 = extractFloat128Frac0( a );
4277 aExp = extractFloat128Exp( a );
4278 aSign = extractFloat128Sign( a );
4279 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4280 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4281 aSig0 |= ( aSig1 != 0 );
4282 shiftCount = 0x4028 - aExp;
4283 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4284 return roundAndPackInt32( aSign, aSig0 );
4289 -------------------------------------------------------------------------------
4290 Returns the result of converting the quadruple-precision floating-point
4291 value `a' to the 32-bit two's complement integer format. The conversion
4292 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 Arithmetic, except that the conversion is always rounded toward zero. If
4294 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4295 conversion overflows, the largest integer with the same sign as `a' is
4296 returned.
4297 -------------------------------------------------------------------------------
4299 int32 float128_to_int32_round_to_zero( float128 a )
4301 flag aSign;
4302 int32 aExp, shiftCount;
4303 bits64 aSig0, aSig1, savedASig;
4304 int32 z;
4306 aSig1 = extractFloat128Frac1( a );
4307 aSig0 = extractFloat128Frac0( a );
4308 aExp = extractFloat128Exp( a );
4309 aSign = extractFloat128Sign( a );
4310 aSig0 |= ( aSig1 != 0 );
4311 if ( 0x401E < aExp ) {
4312 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4313 goto invalid;
4315 else if ( aExp < 0x3FFF ) {
4316 if ( aExp || aSig0 ) float_set_inexact();
4317 return 0;
4319 aSig0 |= LIT64( 0x0001000000000000 );
4320 shiftCount = 0x402F - aExp;
4321 savedASig = aSig0;
4322 aSig0 >>= shiftCount;
4323 z = aSig0;
4324 if ( aSign ) z = - z;
4325 if ( ( z < 0 ) ^ aSign ) {
4326 invalid:
4327 float_raise( float_flag_invalid );
4328 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4330 if ( ( aSig0<<shiftCount ) != savedASig ) {
4331 float_set_inexact();
4333 return z;
4338 -------------------------------------------------------------------------------
4339 Returns the result of converting the quadruple-precision floating-point
4340 value `a' to the 64-bit two's complement integer format. The conversion
4341 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4342 Arithmetic---which means in particular that the conversion is rounded
4343 according to the current rounding mode. If `a' is a NaN, the largest
4344 positive integer is returned. Otherwise, if the conversion overflows, the
4345 largest integer with the same sign as `a' is returned.
4346 -------------------------------------------------------------------------------
4348 int64 float128_to_int64( float128 a )
4350 flag aSign;
4351 int32 aExp, shiftCount;
4352 bits64 aSig0, aSig1;
4354 aSig1 = extractFloat128Frac1( a );
4355 aSig0 = extractFloat128Frac0( a );
4356 aExp = extractFloat128Exp( a );
4357 aSign = extractFloat128Sign( a );
4358 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4359 shiftCount = 0x402F - aExp;
4360 if ( shiftCount <= 0 ) {
4361 if ( 0x403E < aExp ) {
4362 float_raise( float_flag_invalid );
4363 if ( ! aSign
4364 || ( ( aExp == 0x7FFF )
4365 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4368 return LIT64( 0x7FFFFFFFFFFFFFFF );
4370 return (sbits64) LIT64( 0x8000000000000000 );
4372 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4374 else {
4375 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4377 return roundAndPackInt64( aSign, aSig0, aSig1 );
4382 -------------------------------------------------------------------------------
4383 Returns the result of converting the quadruple-precision floating-point
4384 value `a' to the 64-bit two's complement integer format. The conversion
4385 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4386 Arithmetic, except that the conversion is always rounded toward zero.
4387 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4388 the conversion overflows, the largest integer with the same sign as `a' is
4389 returned.
4390 -------------------------------------------------------------------------------
4392 int64 float128_to_int64_round_to_zero( float128 a )
4394 flag aSign;
4395 int32 aExp, shiftCount;
4396 bits64 aSig0, aSig1;
4397 int64 z;
4399 aSig1 = extractFloat128Frac1( a );
4400 aSig0 = extractFloat128Frac0( a );
4401 aExp = extractFloat128Exp( a );
4402 aSign = extractFloat128Sign( a );
4403 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4404 shiftCount = aExp - 0x402F;
4405 if ( 0 < shiftCount ) {
4406 if ( 0x403E <= aExp ) {
4407 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4408 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4409 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4410 if ( aSig1 ) float_set_inexact();
4412 else {
4413 float_raise( float_flag_invalid );
4414 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4415 return LIT64( 0x7FFFFFFFFFFFFFFF );
4418 return (sbits64) LIT64( 0x8000000000000000 );
4420 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4421 if ( (bits64) ( aSig1<<shiftCount ) ) {
4422 float_set_inexact();
4425 else {
4426 if ( aExp < 0x3FFF ) {
4427 if ( aExp | aSig0 | aSig1 ) {
4428 float_set_inexact();
4430 return 0;
4432 z = aSig0>>( - shiftCount );
4433 if ( aSig1
4434 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4435 float_set_inexact();
4438 if ( aSign ) z = - z;
4439 return z;
4444 -------------------------------------------------------------------------------
4445 Returns the result of converting the quadruple-precision floating-point
4446 value `a' to the single-precision floating-point format. The conversion
4447 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4448 Arithmetic.
4449 -------------------------------------------------------------------------------
4451 float32 float128_to_float32( float128 a )
4453 flag aSign;
4454 int32 aExp;
4455 bits64 aSig0, aSig1;
4456 bits32 zSig;
4458 aSig1 = extractFloat128Frac1( a );
4459 aSig0 = extractFloat128Frac0( a );
4460 aExp = extractFloat128Exp( a );
4461 aSign = extractFloat128Sign( a );
4462 if ( aExp == 0x7FFF ) {
4463 if ( aSig0 | aSig1 ) {
4464 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4466 return packFloat32( aSign, 0xFF, 0 );
4468 aSig0 |= ( aSig1 != 0 );
4469 shift64RightJamming( aSig0, 18, &aSig0 );
4470 zSig = aSig0;
4471 if ( aExp || zSig ) {
4472 zSig |= 0x40000000;
4473 aExp -= 0x3F81;
4475 return roundAndPackFloat32( aSign, aExp, zSig );
4480 -------------------------------------------------------------------------------
4481 Returns the result of converting the quadruple-precision floating-point
4482 value `a' to the double-precision floating-point format. The conversion
4483 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4484 Arithmetic.
4485 -------------------------------------------------------------------------------
4487 float64 float128_to_float64( float128 a )
4489 flag aSign;
4490 int32 aExp;
4491 bits64 aSig0, aSig1;
4493 aSig1 = extractFloat128Frac1( a );
4494 aSig0 = extractFloat128Frac0( a );
4495 aExp = extractFloat128Exp( a );
4496 aSign = extractFloat128Sign( a );
4497 if ( aExp == 0x7FFF ) {
4498 if ( aSig0 | aSig1 ) {
4499 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4501 return packFloat64( aSign, 0x7FF, 0 );
4503 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4504 aSig0 |= ( aSig1 != 0 );
4505 if ( aExp || aSig0 ) {
4506 aSig0 |= LIT64( 0x4000000000000000 );
4507 aExp -= 0x3C01;
4509 return roundAndPackFloat64( aSign, aExp, aSig0 );
4513 #ifdef FLOATX80
4516 -------------------------------------------------------------------------------
4517 Returns the result of converting the quadruple-precision floating-point
4518 value `a' to the extended double-precision floating-point format. The
4519 conversion is performed according to the IEC/IEEE Standard for Binary
4520 Floating-Point Arithmetic.
4521 -------------------------------------------------------------------------------
4523 floatx80 float128_to_floatx80( float128 a )
4525 flag aSign;
4526 int32 aExp;
4527 bits64 aSig0, aSig1;
4529 aSig1 = extractFloat128Frac1( a );
4530 aSig0 = extractFloat128Frac0( a );
4531 aExp = extractFloat128Exp( a );
4532 aSign = extractFloat128Sign( a );
4533 if ( aExp == 0x7FFF ) {
4534 if ( aSig0 | aSig1 ) {
4535 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4537 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4539 if ( aExp == 0 ) {
4540 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4541 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4543 else {
4544 aSig0 |= LIT64( 0x0001000000000000 );
4546 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4547 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4551 #endif
4554 -------------------------------------------------------------------------------
4555 Rounds the quadruple-precision floating-point value `a' to an integer, and
4556 returns the result as a quadruple-precision floating-point value. The
4557 operation is performed according to the IEC/IEEE Standard for Binary
4558 Floating-Point Arithmetic.
4559 -------------------------------------------------------------------------------
4561 float128 float128_round_to_int( float128 a )
4563 flag aSign;
4564 int32 aExp;
4565 bits64 lastBitMask, roundBitsMask;
4566 int8 roundingMode;
4567 float128 z;
4569 aExp = extractFloat128Exp( a );
4570 if ( 0x402F <= aExp ) {
4571 if ( 0x406F <= aExp ) {
4572 if ( ( aExp == 0x7FFF )
4573 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4575 return propagateFloat128NaN( a, a );
4577 return a;
4579 lastBitMask = 1;
4580 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4581 roundBitsMask = lastBitMask - 1;
4582 z = a;
4583 roundingMode = float_rounding_mode();
4584 if ( roundingMode == float_round_nearest_even ) {
4585 if ( lastBitMask ) {
4586 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4587 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4589 else {
4590 if ( (sbits64) z.low < 0 ) {
4591 ++z.high;
4592 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4596 else if ( roundingMode != float_round_to_zero ) {
4597 if ( extractFloat128Sign( z )
4598 ^ ( roundingMode == float_round_up ) ) {
4599 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4602 z.low &= ~ roundBitsMask;
4604 else {
4605 if ( aExp < 0x3FFF ) {
4606 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4607 float_set_inexact();
4608 aSign = extractFloat128Sign( a );
4609 switch ( float_rounding_mode() ) {
4610 case float_round_nearest_even:
4611 if ( ( aExp == 0x3FFE )
4612 && ( extractFloat128Frac0( a )
4613 | extractFloat128Frac1( a ) )
4615 return packFloat128( aSign, 0x3FFF, 0, 0 );
4617 break;
4618 case float_round_down:
4619 return
4620 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4621 : packFloat128( 0, 0, 0, 0 );
4622 case float_round_up:
4623 return
4624 aSign ? packFloat128( 1, 0, 0, 0 )
4625 : packFloat128( 0, 0x3FFF, 0, 0 );
4627 return packFloat128( aSign, 0, 0, 0 );
4629 lastBitMask = 1;
4630 lastBitMask <<= 0x402F - aExp;
4631 roundBitsMask = lastBitMask - 1;
4632 z.low = 0;
4633 z.high = a.high;
4634 roundingMode = float_rounding_mode();
4635 if ( roundingMode == float_round_nearest_even ) {
4636 z.high += lastBitMask>>1;
4637 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4638 z.high &= ~ lastBitMask;
4641 else if ( roundingMode != float_round_to_zero ) {
4642 if ( extractFloat128Sign( z )
4643 ^ ( roundingMode == float_round_up ) ) {
4644 z.high |= ( a.low != 0 );
4645 z.high += roundBitsMask;
4648 z.high &= ~ roundBitsMask;
4650 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4651 float_set_inexact();
4653 return z;
4658 -------------------------------------------------------------------------------
4659 Returns the result of adding the absolute values of the quadruple-precision
4660 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4661 before being returned. `zSign' is ignored if the result is a NaN.
4662 The addition is performed according to the IEC/IEEE Standard for Binary
4663 Floating-Point Arithmetic.
4664 -------------------------------------------------------------------------------
4666 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4668 int32 aExp, bExp, zExp;
4669 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4670 int32 expDiff;
4672 aSig1 = extractFloat128Frac1( a );
4673 aSig0 = extractFloat128Frac0( a );
4674 aExp = extractFloat128Exp( a );
4675 bSig1 = extractFloat128Frac1( b );
4676 bSig0 = extractFloat128Frac0( b );
4677 bExp = extractFloat128Exp( b );
4678 expDiff = aExp - bExp;
4679 if ( 0 < expDiff ) {
4680 if ( aExp == 0x7FFF ) {
4681 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4682 return a;
4684 if ( bExp == 0 ) {
4685 --expDiff;
4687 else {
4688 bSig0 |= LIT64( 0x0001000000000000 );
4690 shift128ExtraRightJamming(
4691 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4692 zExp = aExp;
4694 else if ( expDiff < 0 ) {
4695 if ( bExp == 0x7FFF ) {
4696 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4697 return packFloat128( zSign, 0x7FFF, 0, 0 );
4699 if ( aExp == 0 ) {
4700 ++expDiff;
4702 else {
4703 aSig0 |= LIT64( 0x0001000000000000 );
4705 shift128ExtraRightJamming(
4706 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4707 zExp = bExp;
4709 else {
4710 if ( aExp == 0x7FFF ) {
4711 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4712 return propagateFloat128NaN( a, b );
4714 return a;
4716 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4717 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4718 zSig2 = 0;
4719 zSig0 |= LIT64( 0x0002000000000000 );
4720 zExp = aExp;
4721 goto shiftRight1;
4723 aSig0 |= LIT64( 0x0001000000000000 );
4724 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4725 --zExp;
4726 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4727 ++zExp;
4728 shiftRight1:
4729 shift128ExtraRightJamming(
4730 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4731 roundAndPack:
4732 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4737 -------------------------------------------------------------------------------
4738 Returns the result of subtracting the absolute values of the quadruple-
4739 precision floating-point values `a' and `b'. If `zSign' is 1, the
4740 difference is negated before being returned. `zSign' is ignored if the
4741 result is a NaN. The subtraction is performed according to the IEC/IEEE
4742 Standard for Binary Floating-Point Arithmetic.
4743 -------------------------------------------------------------------------------
4745 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4747 int32 aExp, bExp, zExp;
4748 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4749 int32 expDiff;
4750 float128 z;
4752 aSig1 = extractFloat128Frac1( a );
4753 aSig0 = extractFloat128Frac0( a );
4754 aExp = extractFloat128Exp( a );
4755 bSig1 = extractFloat128Frac1( b );
4756 bSig0 = extractFloat128Frac0( b );
4757 bExp = extractFloat128Exp( b );
4758 expDiff = aExp - bExp;
4759 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4760 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4761 if ( 0 < expDiff ) goto aExpBigger;
4762 if ( expDiff < 0 ) goto bExpBigger;
4763 if ( aExp == 0x7FFF ) {
4764 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4765 return propagateFloat128NaN( a, b );
4767 float_raise( float_flag_invalid );
4768 z.low = float128_default_nan_low;
4769 z.high = float128_default_nan_high;
4770 return z;
4772 if ( aExp == 0 ) {
4773 aExp = 1;
4774 bExp = 1;
4776 if ( bSig0 < aSig0 ) goto aBigger;
4777 if ( aSig0 < bSig0 ) goto bBigger;
4778 if ( bSig1 < aSig1 ) goto aBigger;
4779 if ( aSig1 < bSig1 ) goto bBigger;
4780 return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4781 bExpBigger:
4782 if ( bExp == 0x7FFF ) {
4783 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4784 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4786 if ( aExp == 0 ) {
4787 ++expDiff;
4789 else {
4790 aSig0 |= LIT64( 0x4000000000000000 );
4792 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4793 bSig0 |= LIT64( 0x4000000000000000 );
4794 bBigger:
4795 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4796 zExp = bExp;
4797 zSign ^= 1;
4798 goto normalizeRoundAndPack;
4799 aExpBigger:
4800 if ( aExp == 0x7FFF ) {
4801 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4802 return a;
4804 if ( bExp == 0 ) {
4805 --expDiff;
4807 else {
4808 bSig0 |= LIT64( 0x4000000000000000 );
4810 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4811 aSig0 |= LIT64( 0x4000000000000000 );
4812 aBigger:
4813 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4814 zExp = aExp;
4815 normalizeRoundAndPack:
4816 --zExp;
4817 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4822 -------------------------------------------------------------------------------
4823 Returns the result of adding the quadruple-precision floating-point values
4824 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4825 for Binary Floating-Point Arithmetic.
4826 -------------------------------------------------------------------------------
4828 float128 float128_add( float128 a, float128 b )
4830 flag aSign, bSign;
4832 aSign = extractFloat128Sign( a );
4833 bSign = extractFloat128Sign( b );
4834 if ( aSign == bSign ) {
4835 return addFloat128Sigs( a, b, aSign );
4837 else {
4838 return subFloat128Sigs( a, b, aSign );
4844 -------------------------------------------------------------------------------
4845 Returns the result of subtracting the quadruple-precision floating-point
4846 values `a' and `b'. The operation is performed according to the IEC/IEEE
4847 Standard for Binary Floating-Point Arithmetic.
4848 -------------------------------------------------------------------------------
4850 float128 float128_sub( float128 a, float128 b )
4852 flag aSign, bSign;
4854 aSign = extractFloat128Sign( a );
4855 bSign = extractFloat128Sign( b );
4856 if ( aSign == bSign ) {
4857 return subFloat128Sigs( a, b, aSign );
4859 else {
4860 return addFloat128Sigs( a, b, aSign );
4866 -------------------------------------------------------------------------------
4867 Returns the result of multiplying the quadruple-precision floating-point
4868 values `a' and `b'. The operation is performed according to the IEC/IEEE
4869 Standard for Binary Floating-Point Arithmetic.
4870 -------------------------------------------------------------------------------
4872 float128 float128_mul( float128 a, float128 b )
4874 flag aSign, bSign, zSign;
4875 int32 aExp, bExp, zExp;
4876 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4877 float128 z;
4879 aSig1 = extractFloat128Frac1( a );
4880 aSig0 = extractFloat128Frac0( a );
4881 aExp = extractFloat128Exp( a );
4882 aSign = extractFloat128Sign( a );
4883 bSig1 = extractFloat128Frac1( b );
4884 bSig0 = extractFloat128Frac0( b );
4885 bExp = extractFloat128Exp( b );
4886 bSign = extractFloat128Sign( b );
4887 zSign = aSign ^ bSign;
4888 if ( aExp == 0x7FFF ) {
4889 if ( ( aSig0 | aSig1 )
4890 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4891 return propagateFloat128NaN( a, b );
4893 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4894 return packFloat128( zSign, 0x7FFF, 0, 0 );
4896 if ( bExp == 0x7FFF ) {
4897 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4898 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4899 invalid:
4900 float_raise( float_flag_invalid );
4901 z.low = float128_default_nan_low;
4902 z.high = float128_default_nan_high;
4903 return z;
4905 return packFloat128( zSign, 0x7FFF, 0, 0 );
4907 if ( aExp == 0 ) {
4908 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4909 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4911 if ( bExp == 0 ) {
4912 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4913 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4915 zExp = aExp + bExp - 0x4000;
4916 aSig0 |= LIT64( 0x0001000000000000 );
4917 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4918 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4919 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4920 zSig2 |= ( zSig3 != 0 );
4921 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4922 shift128ExtraRightJamming(
4923 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4924 ++zExp;
4926 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4931 -------------------------------------------------------------------------------
4932 Returns the result of dividing the quadruple-precision floating-point value
4933 `a' by the corresponding value `b'. The operation is performed according to
4934 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4935 -------------------------------------------------------------------------------
4937 float128 float128_div( float128 a, float128 b )
4939 flag aSign, bSign, zSign;
4940 int32 aExp, bExp, zExp;
4941 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4942 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4943 float128 z;
4945 aSig1 = extractFloat128Frac1( a );
4946 aSig0 = extractFloat128Frac0( a );
4947 aExp = extractFloat128Exp( a );
4948 aSign = extractFloat128Sign( a );
4949 bSig1 = extractFloat128Frac1( b );
4950 bSig0 = extractFloat128Frac0( b );
4951 bExp = extractFloat128Exp( b );
4952 bSign = extractFloat128Sign( b );
4953 zSign = aSign ^ bSign;
4954 if ( aExp == 0x7FFF ) {
4955 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4956 if ( bExp == 0x7FFF ) {
4957 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4958 goto invalid;
4960 return packFloat128( zSign, 0x7FFF, 0, 0 );
4962 if ( bExp == 0x7FFF ) {
4963 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4964 return packFloat128( zSign, 0, 0, 0 );
4966 if ( bExp == 0 ) {
4967 if ( ( bSig0 | bSig1 ) == 0 ) {
4968 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4969 invalid:
4970 float_raise( float_flag_invalid );
4971 z.low = float128_default_nan_low;
4972 z.high = float128_default_nan_high;
4973 return z;
4975 float_raise( float_flag_divbyzero );
4976 return packFloat128( zSign, 0x7FFF, 0, 0 );
4978 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4980 if ( aExp == 0 ) {
4981 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4982 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4984 zExp = aExp - bExp + 0x3FFD;
4985 shortShift128Left(
4986 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4987 shortShift128Left(
4988 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4989 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4990 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4991 ++zExp;
4993 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4994 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4995 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4996 while ( (sbits64) rem0 < 0 ) {
4997 --zSig0;
4998 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5000 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5001 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5002 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5003 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5004 while ( (sbits64) rem1 < 0 ) {
5005 --zSig1;
5006 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5008 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5010 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5011 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5016 -------------------------------------------------------------------------------
5017 Returns the remainder of the quadruple-precision floating-point value `a'
5018 with respect to the corresponding value `b'. The operation is performed
5019 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5020 -------------------------------------------------------------------------------
5022 float128 float128_rem( float128 a, float128 b )
5024 flag aSign, bSign, zSign;
5025 int32 aExp, bExp, expDiff;
5026 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5027 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5028 sbits64 sigMean0;
5029 float128 z;
5031 aSig1 = extractFloat128Frac1( a );
5032 aSig0 = extractFloat128Frac0( a );
5033 aExp = extractFloat128Exp( a );
5034 aSign = extractFloat128Sign( a );
5035 bSig1 = extractFloat128Frac1( b );
5036 bSig0 = extractFloat128Frac0( b );
5037 bExp = extractFloat128Exp( b );
5038 bSign = extractFloat128Sign( b );
5039 if ( aExp == 0x7FFF ) {
5040 if ( ( aSig0 | aSig1 )
5041 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5042 return propagateFloat128NaN( a, b );
5044 goto invalid;
5046 if ( bExp == 0x7FFF ) {
5047 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5048 return a;
5050 if ( bExp == 0 ) {
5051 if ( ( bSig0 | bSig1 ) == 0 ) {
5052 invalid:
5053 float_raise( float_flag_invalid );
5054 z.low = float128_default_nan_low;
5055 z.high = float128_default_nan_high;
5056 return z;
5058 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5060 if ( aExp == 0 ) {
5061 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5062 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5064 expDiff = aExp - bExp;
5065 if ( expDiff < -1 ) return a;
5066 shortShift128Left(
5067 aSig0 | LIT64( 0x0001000000000000 ),
5068 aSig1,
5069 15 - ( expDiff < 0 ),
5070 &aSig0,
5071 &aSig1
5073 shortShift128Left(
5074 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5075 q = le128( bSig0, bSig1, aSig0, aSig1 );
5076 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5077 expDiff -= 64;
5078 while ( 0 < expDiff ) {
5079 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5080 q = ( 4 < q ) ? q - 4 : 0;
5081 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5082 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5083 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5084 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5085 expDiff -= 61;
5087 if ( -64 < expDiff ) {
5088 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089 q = ( 4 < q ) ? q - 4 : 0;
5090 q >>= - expDiff;
5091 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5092 expDiff += 52;
5093 if ( expDiff < 0 ) {
5094 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5096 else {
5097 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5099 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5100 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5102 else {
5103 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5104 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5106 do {
5107 alternateASig0 = aSig0;
5108 alternateASig1 = aSig1;
5109 ++q;
5110 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5111 } while ( 0 <= (sbits64) aSig0 );
5112 add128(
5113 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5114 if ( ( sigMean0 < 0 )
5115 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5116 aSig0 = alternateASig0;
5117 aSig1 = alternateASig1;
5119 zSign = ( (sbits64) aSig0 < 0 );
5120 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5121 return
5122 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5127 -------------------------------------------------------------------------------
5128 Returns the square root of the quadruple-precision floating-point value `a'.
5129 The operation is performed according to the IEC/IEEE Standard for Binary
5130 Floating-Point Arithmetic.
5131 -------------------------------------------------------------------------------
5133 float128 float128_sqrt( float128 a )
5135 flag aSign;
5136 int32 aExp, zExp;
5137 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5138 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5139 float128 z;
5141 aSig1 = extractFloat128Frac1( a );
5142 aSig0 = extractFloat128Frac0( a );
5143 aExp = extractFloat128Exp( a );
5144 aSign = extractFloat128Sign( a );
5145 if ( aExp == 0x7FFF ) {
5146 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5147 if ( ! aSign ) return a;
5148 goto invalid;
5150 if ( aSign ) {
5151 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5152 invalid:
5153 float_raise( float_flag_invalid );
5154 z.low = float128_default_nan_low;
5155 z.high = float128_default_nan_high;
5156 return z;
5158 if ( aExp == 0 ) {
5159 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5160 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5162 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5163 aSig0 |= LIT64( 0x0001000000000000 );
5164 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5165 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5166 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5167 doubleZSig0 = zSig0<<1;
5168 mul64To128( zSig0, zSig0, &term0, &term1 );
5169 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5170 while ( (sbits64) rem0 < 0 ) {
5171 --zSig0;
5172 doubleZSig0 -= 2;
5173 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5175 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5176 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5177 if ( zSig1 == 0 ) zSig1 = 1;
5178 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5179 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5180 mul64To128( zSig1, zSig1, &term2, &term3 );
5181 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5182 while ( (sbits64) rem1 < 0 ) {
5183 --zSig1;
5184 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5185 term3 |= 1;
5186 term2 |= doubleZSig0;
5187 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5189 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5191 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5192 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5197 -------------------------------------------------------------------------------
5198 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5199 the corresponding value `b', and 0 otherwise. The comparison is performed
5200 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5201 -------------------------------------------------------------------------------
5203 flag float128_eq( float128 a, float128 b )
5206 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5207 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5208 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5209 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5211 if ( float128_is_signaling_nan( a )
5212 || float128_is_signaling_nan( b ) ) {
5213 float_raise( float_flag_invalid );
5215 return 0;
5217 return
5218 ( a.low == b.low )
5219 && ( ( a.high == b.high )
5220 || ( ( a.low == 0 )
5221 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5227 -------------------------------------------------------------------------------
5228 Returns 1 if the quadruple-precision floating-point value `a' is less than
5229 or equal to the corresponding value `b', and 0 otherwise. The comparison
5230 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5231 Arithmetic.
5232 -------------------------------------------------------------------------------
5234 flag float128_le( float128 a, float128 b )
5236 flag aSign, bSign;
5238 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5239 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5240 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5241 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5243 float_raise( float_flag_invalid );
5244 return 0;
5246 aSign = extractFloat128Sign( a );
5247 bSign = extractFloat128Sign( b );
5248 if ( aSign != bSign ) {
5249 return
5250 aSign
5251 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5252 == 0 );
5254 return
5255 aSign ? le128( b.high, b.low, a.high, a.low )
5256 : le128( a.high, a.low, b.high, b.low );
5261 -------------------------------------------------------------------------------
5262 Returns 1 if the quadruple-precision floating-point value `a' is less than
5263 the corresponding value `b', and 0 otherwise. The comparison is performed
5264 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 -------------------------------------------------------------------------------
5267 flag float128_lt( float128 a, float128 b )
5269 flag aSign, bSign;
5271 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5272 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5273 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5274 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5276 float_raise( float_flag_invalid );
5277 return 0;
5279 aSign = extractFloat128Sign( a );
5280 bSign = extractFloat128Sign( b );
5281 if ( aSign != bSign ) {
5282 return
5283 aSign
5284 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5285 != 0 );
5287 return
5288 aSign ? lt128( b.high, b.low, a.high, a.low )
5289 : lt128( a.high, a.low, b.high, b.low );
5294 -------------------------------------------------------------------------------
5295 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5296 the corresponding value `b', and 0 otherwise. The invalid exception is
5297 raised if either operand is a NaN. Otherwise, the comparison is performed
5298 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5299 -------------------------------------------------------------------------------
5301 flag float128_eq_signaling( float128 a, float128 b )
5304 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5305 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5306 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5307 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5309 float_raise( float_flag_invalid );
5310 return 0;
5312 return
5313 ( a.low == b.low )
5314 && ( ( a.high == b.high )
5315 || ( ( a.low == 0 )
5316 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5322 -------------------------------------------------------------------------------
5323 Returns 1 if the quadruple-precision floating-point value `a' is less than
5324 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5325 cause an exception. Otherwise, the comparison is performed according to the
5326 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5327 -------------------------------------------------------------------------------
5329 flag float128_le_quiet( float128 a, float128 b )
5331 flag aSign, bSign;
5333 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5334 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5336 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5338 if ( float128_is_signaling_nan( a )
5339 || float128_is_signaling_nan( b ) ) {
5340 float_raise( float_flag_invalid );
5342 return 0;
5344 aSign = extractFloat128Sign( a );
5345 bSign = extractFloat128Sign( b );
5346 if ( aSign != bSign ) {
5347 return
5348 aSign
5349 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5350 == 0 );
5352 return
5353 aSign ? le128( b.high, b.low, a.high, a.low )
5354 : le128( a.high, a.low, b.high, b.low );
5359 -------------------------------------------------------------------------------
5360 Returns 1 if the quadruple-precision floating-point value `a' is less than
5361 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5362 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5363 Standard for Binary Floating-Point Arithmetic.
5364 -------------------------------------------------------------------------------
5366 flag float128_lt_quiet( float128 a, float128 b )
5368 flag aSign, bSign;
5370 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5371 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5372 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5373 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5375 if ( float128_is_signaling_nan( a )
5376 || float128_is_signaling_nan( b ) ) {
5377 float_raise( float_flag_invalid );
5379 return 0;
5381 aSign = extractFloat128Sign( a );
5382 bSign = extractFloat128Sign( b );
5383 if ( aSign != bSign ) {
5384 return
5385 aSign
5386 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5387 != 0 );
5389 return
5390 aSign ? lt128( b.high, b.low, a.high, a.low )
5391 : lt128( a.high, a.low, b.high, b.low );
5395 #endif
5398 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5401 * These two routines are not part of the original softfloat distribution.
5403 * They are based on the corresponding conversions to integer but return
5404 * unsigned numbers instead since these functions are required by GCC.
5406 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5408 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5412 -------------------------------------------------------------------------------
5413 Returns the result of converting the double-precision floating-point value
5414 `a' to the 32-bit unsigned integer format. The conversion is
5415 performed according to the IEC/IEEE Standard for Binary Floating-point
5416 Arithmetic, except that the conversion is always rounded toward zero. If
5417 `a' is a NaN, the largest positive integer is returned. If the conversion
5418 overflows, the largest integer positive is returned.
5419 -------------------------------------------------------------------------------
5421 uint32 float64_to_uint32_round_to_zero( float64 a )
5423 flag aSign;
5424 int16 aExp, shiftCount;
5425 bits64 aSig, savedASig;
5426 uint32 z;
5428 aSig = extractFloat64Frac( a );
5429 aExp = extractFloat64Exp( a );
5430 aSign = extractFloat64Sign( a );
5432 if (aSign) {
5433 float_raise( float_flag_invalid );
5434 return(0);
5437 if ( 0x41E < aExp ) {
5438 float_raise( float_flag_invalid );
5439 return 0xffffffff;
5441 else if ( aExp < 0x3FF ) {
5442 if ( aExp || aSig ) float_set_inexact();
5443 return 0;
5445 aSig |= LIT64( 0x0010000000000000 );
5446 shiftCount = 0x433 - aExp;
5447 savedASig = aSig;
5448 aSig >>= shiftCount;
5449 z = aSig;
5450 if ( ( aSig<<shiftCount ) != savedASig ) {
5451 float_set_inexact();
5453 return z;
5458 -------------------------------------------------------------------------------
5459 Returns the result of converting the single-precision floating-point value
5460 `a' to the 32-bit unsigned integer format. The conversion is
5461 performed according to the IEC/IEEE Standard for Binary Floating-point
5462 Arithmetic, except that the conversion is always rounded toward zero. If
5463 `a' is a NaN, the largest positive integer is returned. If the conversion
5464 overflows, the largest positive integer is returned.
5465 -------------------------------------------------------------------------------
5467 uint32 float32_to_uint32_round_to_zero( float32 a )
5469 flag aSign;
5470 int16 aExp, shiftCount;
5471 bits32 aSig;
5472 uint32 z;
5474 aSig = extractFloat32Frac( a );
5475 aExp = extractFloat32Exp( a );
5476 aSign = extractFloat32Sign( a );
5477 shiftCount = aExp - 0x9E;
5479 if (aSign) {
5480 float_raise( float_flag_invalid );
5481 return(0);
5483 if ( 0 < shiftCount ) {
5484 float_raise( float_flag_invalid );
5485 return 0xFFFFFFFF;
5487 else if ( aExp <= 0x7E ) {
5488 if ( aExp | aSig ) float_set_inexact();
5489 return 0;
5491 aSig = ( aSig | 0x800000 )<<8;
5492 z = aSig>>( - shiftCount );
5493 if ( aSig<<( shiftCount & 31 ) ) {
5494 float_set_inexact();
5496 return z;
5500 #endif
5502 #endif /* _STANDALONE */