Sync usage with man page.
[netbsd-mini2440.git] / external / bsd / pcc / dist / pcc-libs / libsoftfloat / bits64 / softfloat.c
blob1e5bff85446bf310fadf5d8b1cb3152ad0a8e22c
1 /* $NetBSD: softfloat.c,v 1.5 2007/11/08 21:31:04 martin Exp $ */
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
9 /*
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
18 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
47 #ifdef SOFTFLOAT_FOR_GCC
48 #include "softfloat-for-gcc.h"
49 #endif
51 #include "milieu.h"
52 #include "softfloat.h"
55 * Conversions between floats as stored in memory and floats as
56 * SoftFloat uses them
58 #ifndef FLOAT64_DEMANGLE
59 #define FLOAT64_DEMANGLE(a) (a)
60 #endif
61 #ifndef FLOAT64_MANGLE
62 #define FLOAT64_MANGLE(a) (a)
63 #endif
66 -------------------------------------------------------------------------------
67 Floating-point rounding mode, extended double-precision rounding precision,
68 and exception flags.
69 -------------------------------------------------------------------------------
71 fp_rnd float_rounding_mode = float_round_nearest_even;
72 fp_except float_exception_flags = 0;
73 #ifdef FLOATX80
74 int8 floatx80_rounding_precision = 80;
75 #endif
78 -------------------------------------------------------------------------------
79 Primitive arithmetic functions, including multi-word arithmetic, and
80 division and square root approximations. (Can be specialized to target if
81 desired.)
82 -------------------------------------------------------------------------------
84 #include "softfloat-macros"
87 -------------------------------------------------------------------------------
88 Functions and definitions to determine: (1) whether tininess for underflow
89 is detected before or after rounding by default, (2) what (if anything)
90 happens when exceptions are raised, (3) how signaling NaNs are distinguished
91 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
92 are propagated from function inputs to output. These details are target-
93 specific.
94 -------------------------------------------------------------------------------
96 #include "softfloat-specialize"
98 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
100 -------------------------------------------------------------------------------
101 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
102 and 7, and returns the properly rounded 32-bit integer corresponding to the
103 input. If `zSign' is 1, the input is negated before being converted to an
104 integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
105 is simply rounded to an integer, with the inexact exception raised if the
106 input cannot be represented exactly as an integer. However, if the fixed-
107 point input is too large, the invalid exception is raised and the largest
108 positive or negative integer is returned.
109 -------------------------------------------------------------------------------
111 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
113 int8 roundingMode;
114 flag roundNearestEven;
115 int8 roundIncrement, roundBits;
116 int32 z;
118 roundingMode = float_rounding_mode;
119 roundNearestEven = ( roundingMode == float_round_nearest_even );
120 roundIncrement = 0x40;
121 if ( ! roundNearestEven ) {
122 if ( roundingMode == float_round_to_zero ) {
123 roundIncrement = 0;
125 else {
126 roundIncrement = 0x7F;
127 if ( zSign ) {
128 if ( roundingMode == float_round_up ) roundIncrement = 0;
130 else {
131 if ( roundingMode == float_round_down ) roundIncrement = 0;
135 roundBits = absZ & 0x7F;
136 absZ = ( absZ + roundIncrement )>>7;
137 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
138 z = absZ;
139 if ( zSign ) z = - z;
140 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
141 float_raise( float_flag_invalid );
142 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
144 if ( roundBits ) float_exception_flags |= float_flag_inexact;
145 return z;
150 -------------------------------------------------------------------------------
151 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 `absZ1', with binary point between bits 63 and 64 (between the input words),
153 and returns the properly rounded 64-bit integer corresponding to the input.
154 If `zSign' is 1, the input is negated before being converted to an integer.
155 Ordinarily, the fixed-point input is simply rounded to an integer, with
156 the inexact exception raised if the input cannot be represented exactly as
157 an integer. However, if the fixed-point input is too large, the invalid
158 exception is raised and the largest positive or negative integer is
159 returned.
160 -------------------------------------------------------------------------------
162 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
164 int8 roundingMode;
165 flag roundNearestEven, increment;
166 int64 z;
168 roundingMode = float_rounding_mode;
169 roundNearestEven = ( roundingMode == float_round_nearest_even );
170 increment = ( (sbits64) absZ1 < 0 );
171 if ( ! roundNearestEven ) {
172 if ( roundingMode == float_round_to_zero ) {
173 increment = 0;
175 else {
176 if ( zSign ) {
177 increment = ( roundingMode == float_round_down ) && absZ1;
179 else {
180 increment = ( roundingMode == float_round_up ) && absZ1;
184 if ( increment ) {
185 ++absZ0;
186 if ( absZ0 == 0 ) goto overflow;
187 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
189 z = absZ0;
190 if ( zSign ) z = - z;
191 if ( z && ( ( z < 0 ) ^ zSign ) ) {
192 overflow:
193 float_raise( float_flag_invalid );
194 return
195 zSign ? (sbits64) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
198 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
199 return z;
202 #endif
205 -------------------------------------------------------------------------------
206 Returns the fraction bits of the single-precision floating-point value `a'.
207 -------------------------------------------------------------------------------
209 INLINE bits32 extractFloat32Frac( float32 a )
212 return a & 0x007FFFFF;
217 -------------------------------------------------------------------------------
218 Returns the exponent bits of the single-precision floating-point value `a'.
219 -------------------------------------------------------------------------------
221 INLINE int16 extractFloat32Exp( float32 a )
224 return ( a>>23 ) & 0xFF;
229 -------------------------------------------------------------------------------
230 Returns the sign bit of the single-precision floating-point value `a'.
231 -------------------------------------------------------------------------------
233 INLINE flag extractFloat32Sign( float32 a )
236 return a>>31;
241 -------------------------------------------------------------------------------
242 Normalizes the subnormal single-precision floating-point value represented
243 by the denormalized significand `aSig'. The normalized exponent and
244 significand are stored at the locations pointed to by `zExpPtr' and
245 `zSigPtr', respectively.
246 -------------------------------------------------------------------------------
248 static void
249 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
251 int8 shiftCount;
253 shiftCount = countLeadingZeros32( aSig ) - 8;
254 *zSigPtr = aSig<<shiftCount;
255 *zExpPtr = 1 - shiftCount;
260 -------------------------------------------------------------------------------
261 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
262 single-precision floating-point value, returning the result. After being
263 shifted into the proper positions, the three fields are simply added
264 together to form the result. This means that any integer portion of `zSig'
265 will be added into the exponent. Since a properly normalized significand
266 will have an integer portion equal to 1, the `zExp' input should be 1 less
267 than the desired result exponent whenever `zSig' is a complete, normalized
268 significand.
269 -------------------------------------------------------------------------------
271 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
274 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
279 -------------------------------------------------------------------------------
280 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
281 and significand `zSig', and returns the proper single-precision floating-
282 point value corresponding to the abstract input. Ordinarily, the abstract
283 value is simply rounded and packed into the single-precision format, with
284 the inexact exception raised if the abstract input cannot be represented
285 exactly. However, if the abstract value is too large, the overflow and
286 inexact exceptions are raised and an infinity or maximal finite value is
287 returned. If the abstract value is too small, the input value is rounded to
288 a subnormal number, and the underflow and inexact exceptions are raised if
289 the abstract input cannot be represented exactly as a subnormal single-
290 precision floating-point number.
291 The input significand `zSig' has its binary point between bits 30
292 and 29, which is 7 bits to the left of the usual location. This shifted
293 significand must be normalized or smaller. If `zSig' is not normalized,
294 `zExp' must be 0; in that case, the result returned is a subnormal number,
295 and it must not require rounding. In the usual case that `zSig' is
296 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
297 The handling of underflow and overflow follows the IEC/IEEE Standard for
298 Binary Floating-Point Arithmetic.
299 -------------------------------------------------------------------------------
301 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
303 int8 roundingMode;
304 flag roundNearestEven;
305 int8 roundIncrement, roundBits;
306 flag isTiny;
308 roundingMode = float_rounding_mode;
309 roundNearestEven = ( roundingMode == float_round_nearest_even );
310 roundIncrement = 0x40;
311 if ( ! roundNearestEven ) {
312 if ( roundingMode == float_round_to_zero ) {
313 roundIncrement = 0;
315 else {
316 roundIncrement = 0x7F;
317 if ( zSign ) {
318 if ( roundingMode == float_round_up ) roundIncrement = 0;
320 else {
321 if ( roundingMode == float_round_down ) roundIncrement = 0;
325 roundBits = zSig & 0x7F;
326 if ( 0xFD <= (bits16) zExp ) {
327 if ( ( 0xFD < zExp )
328 || ( ( zExp == 0xFD )
329 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
331 float_raise( float_flag_overflow | float_flag_inexact );
332 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
334 if ( zExp < 0 ) {
335 isTiny =
336 ( float_detect_tininess == float_tininess_before_rounding )
337 || ( zExp < -1 )
338 || ( zSig + roundIncrement < 0x80000000 );
339 shift32RightJamming( zSig, - zExp, &zSig );
340 zExp = 0;
341 roundBits = zSig & 0x7F;
342 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
345 if ( roundBits ) float_exception_flags |= float_flag_inexact;
346 zSig = ( zSig + roundIncrement )>>7;
347 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
348 if ( zSig == 0 ) zExp = 0;
349 return packFloat32( zSign, zExp, zSig );
354 -------------------------------------------------------------------------------
355 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
356 and significand `zSig', and returns the proper single-precision floating-
357 point value corresponding to the abstract input. This routine is just like
358 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
359 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
360 floating-point exponent.
361 -------------------------------------------------------------------------------
363 static float32
364 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
366 int8 shiftCount;
368 shiftCount = countLeadingZeros32( zSig ) - 1;
369 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
374 -------------------------------------------------------------------------------
375 Returns the fraction bits of the double-precision floating-point value `a'.
376 -------------------------------------------------------------------------------
378 INLINE bits64 extractFloat64Frac( float64 a )
381 return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
386 -------------------------------------------------------------------------------
387 Returns the exponent bits of the double-precision floating-point value `a'.
388 -------------------------------------------------------------------------------
390 INLINE int16 extractFloat64Exp( float64 a )
393 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
398 -------------------------------------------------------------------------------
399 Returns the sign bit of the double-precision floating-point value `a'.
400 -------------------------------------------------------------------------------
402 INLINE flag extractFloat64Sign( float64 a )
405 return FLOAT64_DEMANGLE(a)>>63;
410 -------------------------------------------------------------------------------
411 Normalizes the subnormal double-precision floating-point value represented
412 by the denormalized significand `aSig'. The normalized exponent and
413 significand are stored at the locations pointed to by `zExpPtr' and
414 `zSigPtr', respectively.
415 -------------------------------------------------------------------------------
417 static void
418 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
420 int8 shiftCount;
422 shiftCount = countLeadingZeros64( aSig ) - 11;
423 *zSigPtr = aSig<<shiftCount;
424 *zExpPtr = 1 - shiftCount;
429 -------------------------------------------------------------------------------
430 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
431 double-precision floating-point value, returning the result. After being
432 shifted into the proper positions, the three fields are simply added
433 together to form the result. This means that any integer portion of `zSig'
434 will be added into the exponent. Since a properly normalized significand
435 will have an integer portion equal to 1, the `zExp' input should be 1 less
436 than the desired result exponent whenever `zSig' is a complete, normalized
437 significand.
438 -------------------------------------------------------------------------------
440 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
443 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
444 ( ( (bits64) zExp )<<52 ) + zSig );
449 -------------------------------------------------------------------------------
450 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
451 and significand `zSig', and returns the proper double-precision floating-
452 point value corresponding to the abstract input. Ordinarily, the abstract
453 value is simply rounded and packed into the double-precision format, with
454 the inexact exception raised if the abstract input cannot be represented
455 exactly. However, if the abstract value is too large, the overflow and
456 inexact exceptions are raised and an infinity or maximal finite value is
457 returned. If the abstract value is too small, the input value is rounded to
458 a subnormal number, and the underflow and inexact exceptions are raised if
459 the abstract input cannot be represented exactly as a subnormal double-
460 precision floating-point number.
461 The input significand `zSig' has its binary point between bits 62
462 and 61, which is 10 bits to the left of the usual location. This shifted
463 significand must be normalized or smaller. If `zSig' is not normalized,
464 `zExp' must be 0; in that case, the result returned is a subnormal number,
465 and it must not require rounding. In the usual case that `zSig' is
466 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
467 The handling of underflow and overflow follows the IEC/IEEE Standard for
468 Binary Floating-Point Arithmetic.
469 -------------------------------------------------------------------------------
471 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
473 int8 roundingMode;
474 flag roundNearestEven;
475 int16 roundIncrement, roundBits;
476 flag isTiny;
478 roundingMode = float_rounding_mode;
479 roundNearestEven = ( roundingMode == float_round_nearest_even );
480 roundIncrement = 0x200;
481 if ( ! roundNearestEven ) {
482 if ( roundingMode == float_round_to_zero ) {
483 roundIncrement = 0;
485 else {
486 roundIncrement = 0x3FF;
487 if ( zSign ) {
488 if ( roundingMode == float_round_up ) roundIncrement = 0;
490 else {
491 if ( roundingMode == float_round_down ) roundIncrement = 0;
495 roundBits = zSig & 0x3FF;
496 if ( 0x7FD <= (bits16) zExp ) {
497 if ( ( 0x7FD < zExp )
498 || ( ( zExp == 0x7FD )
499 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
501 float_raise( float_flag_overflow | float_flag_inexact );
502 return FLOAT64_MANGLE(
503 FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
504 ( roundIncrement == 0 ));
506 if ( zExp < 0 ) {
507 isTiny =
508 ( float_detect_tininess == float_tininess_before_rounding )
509 || ( zExp < -1 )
510 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
511 shift64RightJamming( zSig, - zExp, &zSig );
512 zExp = 0;
513 roundBits = zSig & 0x3FF;
514 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
517 if ( roundBits ) float_exception_flags |= float_flag_inexact;
518 zSig = ( zSig + roundIncrement )>>10;
519 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
520 if ( zSig == 0 ) zExp = 0;
521 return packFloat64( zSign, zExp, zSig );
526 -------------------------------------------------------------------------------
527 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
528 and significand `zSig', and returns the proper double-precision floating-
529 point value corresponding to the abstract input. This routine is just like
530 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
531 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
532 floating-point exponent.
533 -------------------------------------------------------------------------------
535 static float64
536 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
538 int8 shiftCount;
540 shiftCount = countLeadingZeros64( zSig ) - 1;
541 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
545 #ifdef FLOATX80
548 -------------------------------------------------------------------------------
549 Returns the fraction bits of the extended double-precision floating-point
550 value `a'.
551 -------------------------------------------------------------------------------
553 INLINE bits64 extractFloatx80Frac( floatx80 a )
556 return a.low;
561 -------------------------------------------------------------------------------
562 Returns the exponent bits of the extended double-precision floating-point
563 value `a'.
564 -------------------------------------------------------------------------------
566 INLINE int32 extractFloatx80Exp( floatx80 a )
569 return a.high & 0x7FFF;
574 -------------------------------------------------------------------------------
575 Returns the sign bit of the extended double-precision floating-point value
576 `a'.
577 -------------------------------------------------------------------------------
579 INLINE flag extractFloatx80Sign( floatx80 a )
582 return a.high>>15;
587 -------------------------------------------------------------------------------
588 Normalizes the subnormal extended double-precision floating-point value
589 represented by the denormalized significand `aSig'. The normalized exponent
590 and significand are stored at the locations pointed to by `zExpPtr' and
591 `zSigPtr', respectively.
592 -------------------------------------------------------------------------------
594 static void
595 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
597 int8 shiftCount;
599 shiftCount = countLeadingZeros64( aSig );
600 *zSigPtr = aSig<<shiftCount;
601 *zExpPtr = 1 - shiftCount;
606 -------------------------------------------------------------------------------
607 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
608 extended double-precision floating-point value, returning the result.
609 -------------------------------------------------------------------------------
611 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
613 floatx80 z;
615 z.low = zSig;
616 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
617 return z;
622 -------------------------------------------------------------------------------
623 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
624 and extended significand formed by the concatenation of `zSig0' and `zSig1',
625 and returns the proper extended double-precision floating-point value
626 corresponding to the abstract input. Ordinarily, the abstract value is
627 rounded and packed into the extended double-precision format, with the
628 inexact exception raised if the abstract input cannot be represented
629 exactly. However, if the abstract value is too large, the overflow and
630 inexact exceptions are raised and an infinity or maximal finite value is
631 returned. If the abstract value is too small, the input value is rounded to
632 a subnormal number, and the underflow and inexact exceptions are raised if
633 the abstract input cannot be represented exactly as a subnormal extended
634 double-precision floating-point number.
635 If `roundingPrecision' is 32 or 64, the result is rounded to the same
636 number of bits as single or double precision, respectively. Otherwise, the
637 result is rounded to the full precision of the extended double-precision
638 format.
639 The input significand must be normalized or smaller. If the input
640 significand is not normalized, `zExp' must be 0; in that case, the result
641 returned is a subnormal number, and it must not require rounding. The
642 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
643 Floating-Point Arithmetic.
644 -------------------------------------------------------------------------------
646 static floatx80
647 roundAndPackFloatx80(
648 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
651 int8 roundingMode;
652 flag roundNearestEven, increment, isTiny;
653 int64 roundIncrement, roundMask, roundBits;
655 roundingMode = float_rounding_mode;
656 roundNearestEven = ( roundingMode == float_round_nearest_even );
657 if ( roundingPrecision == 80 ) goto precision80;
658 if ( roundingPrecision == 64 ) {
659 roundIncrement = LIT64( 0x0000000000000400 );
660 roundMask = LIT64( 0x00000000000007FF );
662 else if ( roundingPrecision == 32 ) {
663 roundIncrement = LIT64( 0x0000008000000000 );
664 roundMask = LIT64( 0x000000FFFFFFFFFF );
666 else {
667 goto precision80;
669 zSig0 |= ( zSig1 != 0 );
670 if ( ! roundNearestEven ) {
671 if ( roundingMode == float_round_to_zero ) {
672 roundIncrement = 0;
674 else {
675 roundIncrement = roundMask;
676 if ( zSign ) {
677 if ( roundingMode == float_round_up ) roundIncrement = 0;
679 else {
680 if ( roundingMode == float_round_down ) roundIncrement = 0;
684 roundBits = zSig0 & roundMask;
685 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
686 if ( ( 0x7FFE < zExp )
687 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
689 goto overflow;
691 if ( zExp <= 0 ) {
692 isTiny =
693 ( float_detect_tininess == float_tininess_before_rounding )
694 || ( zExp < 0 )
695 || ( zSig0 <= zSig0 + roundIncrement );
696 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
697 zExp = 0;
698 roundBits = zSig0 & roundMask;
699 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
700 if ( roundBits ) float_exception_flags |= float_flag_inexact;
701 zSig0 += roundIncrement;
702 if ( (sbits64) zSig0 < 0 ) zExp = 1;
703 roundIncrement = roundMask + 1;
704 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
705 roundMask |= roundIncrement;
707 zSig0 &= ~ roundMask;
708 return packFloatx80( zSign, zExp, zSig0 );
711 if ( roundBits ) float_exception_flags |= float_flag_inexact;
712 zSig0 += roundIncrement;
713 if ( zSig0 < roundIncrement ) {
714 ++zExp;
715 zSig0 = LIT64( 0x8000000000000000 );
717 roundIncrement = roundMask + 1;
718 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
719 roundMask |= roundIncrement;
721 zSig0 &= ~ roundMask;
722 if ( zSig0 == 0 ) zExp = 0;
723 return packFloatx80( zSign, zExp, zSig0 );
724 precision80:
725 increment = ( (sbits64) zSig1 < 0 );
726 if ( ! roundNearestEven ) {
727 if ( roundingMode == float_round_to_zero ) {
728 increment = 0;
730 else {
731 if ( zSign ) {
732 increment = ( roundingMode == float_round_down ) && zSig1;
734 else {
735 increment = ( roundingMode == float_round_up ) && zSig1;
739 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
740 if ( ( 0x7FFE < zExp )
741 || ( ( zExp == 0x7FFE )
742 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
743 && increment
746 roundMask = 0;
747 overflow:
748 float_raise( float_flag_overflow | float_flag_inexact );
749 if ( ( roundingMode == float_round_to_zero )
750 || ( zSign && ( roundingMode == float_round_up ) )
751 || ( ! zSign && ( roundingMode == float_round_down ) )
753 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
755 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
757 if ( zExp <= 0 ) {
758 isTiny =
759 ( float_detect_tininess == float_tininess_before_rounding )
760 || ( zExp < 0 )
761 || ! increment
762 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
763 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
764 zExp = 0;
765 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
766 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
767 if ( roundNearestEven ) {
768 increment = ( (sbits64) zSig1 < 0 );
770 else {
771 if ( zSign ) {
772 increment = ( roundingMode == float_round_down ) && zSig1;
774 else {
775 increment = ( roundingMode == float_round_up ) && zSig1;
778 if ( increment ) {
779 ++zSig0;
780 zSig0 &=
781 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
782 if ( (sbits64) zSig0 < 0 ) zExp = 1;
784 return packFloatx80( zSign, zExp, zSig0 );
787 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
788 if ( increment ) {
789 ++zSig0;
790 if ( zSig0 == 0 ) {
791 ++zExp;
792 zSig0 = LIT64( 0x8000000000000000 );
794 else {
795 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
798 else {
799 if ( zSig0 == 0 ) zExp = 0;
801 return packFloatx80( zSign, zExp, zSig0 );
806 -------------------------------------------------------------------------------
807 Takes an abstract floating-point value having sign `zSign', exponent
808 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
809 and returns the proper extended double-precision floating-point value
810 corresponding to the abstract input. This routine is just like
811 `roundAndPackFloatx80' except that the input significand does not have to be
812 normalized.
813 -------------------------------------------------------------------------------
815 static floatx80
816 normalizeRoundAndPackFloatx80(
817 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
820 int8 shiftCount;
822 if ( zSig0 == 0 ) {
823 zSig0 = zSig1;
824 zSig1 = 0;
825 zExp -= 64;
827 shiftCount = countLeadingZeros64( zSig0 );
828 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
829 zExp -= shiftCount;
830 return
831 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
835 #endif
837 #ifdef FLOAT128
840 -------------------------------------------------------------------------------
841 Returns the least-significant 64 fraction bits of the quadruple-precision
842 floating-point value `a'.
843 -------------------------------------------------------------------------------
845 INLINE bits64 extractFloat128Frac1( float128 a )
848 return a.low;
853 -------------------------------------------------------------------------------
854 Returns the most-significant 48 fraction bits of the quadruple-precision
855 floating-point value `a'.
856 -------------------------------------------------------------------------------
858 INLINE bits64 extractFloat128Frac0( float128 a )
861 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
866 -------------------------------------------------------------------------------
867 Returns the exponent bits of the quadruple-precision floating-point value
868 `a'.
869 -------------------------------------------------------------------------------
871 INLINE int32 extractFloat128Exp( float128 a )
874 return ( a.high>>48 ) & 0x7FFF;
879 -------------------------------------------------------------------------------
880 Returns the sign bit of the quadruple-precision floating-point value `a'.
881 -------------------------------------------------------------------------------
883 INLINE flag extractFloat128Sign( float128 a )
886 return a.high>>63;
891 -------------------------------------------------------------------------------
892 Normalizes the subnormal quadruple-precision floating-point value
893 represented by the denormalized significand formed by the concatenation of
894 `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 significand are stored at the location pointed to by `zSig0Ptr', and the
897 least significant 64 bits of the normalized significand are stored at the
898 location pointed to by `zSig1Ptr'.
899 -------------------------------------------------------------------------------
901 static void
902 normalizeFloat128Subnormal(
903 bits64 aSig0,
904 bits64 aSig1,
905 int32 *zExpPtr,
906 bits64 *zSig0Ptr,
907 bits64 *zSig1Ptr
910 int8 shiftCount;
912 if ( aSig0 == 0 ) {
913 shiftCount = countLeadingZeros64( aSig1 ) - 15;
914 if ( shiftCount < 0 ) {
915 *zSig0Ptr = aSig1>>( - shiftCount );
916 *zSig1Ptr = aSig1<<( shiftCount & 63 );
918 else {
919 *zSig0Ptr = aSig1<<shiftCount;
920 *zSig1Ptr = 0;
922 *zExpPtr = - shiftCount - 63;
924 else {
925 shiftCount = countLeadingZeros64( aSig0 ) - 15;
926 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
927 *zExpPtr = 1 - shiftCount;
933 -------------------------------------------------------------------------------
934 Packs the sign `zSign', the exponent `zExp', and the significand formed
935 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
936 floating-point value, returning the result. After being shifted into the
937 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
938 added together to form the most significant 32 bits of the result. This
939 means that any integer portion of `zSig0' will be added into the exponent.
940 Since a properly normalized significand will have an integer portion equal
941 to 1, the `zExp' input should be 1 less than the desired result exponent
942 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943 significand.
944 -------------------------------------------------------------------------------
946 INLINE float128
947 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
949 float128 z;
951 z.low = zSig1;
952 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
953 return z;
958 -------------------------------------------------------------------------------
959 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
960 and extended significand formed by the concatenation of `zSig0', `zSig1',
961 and `zSig2', and returns the proper quadruple-precision floating-point value
962 corresponding to the abstract input. Ordinarily, the abstract value is
963 simply rounded and packed into the quadruple-precision format, with the
964 inexact exception raised if the abstract input cannot be represented
965 exactly. However, if the abstract value is too large, the overflow and
966 inexact exceptions are raised and an infinity or maximal finite value is
967 returned. If the abstract value is too small, the input value is rounded to
968 a subnormal number, and the underflow and inexact exceptions are raised if
969 the abstract input cannot be represented exactly as a subnormal quadruple-
970 precision floating-point number.
971 The input significand must be normalized or smaller. If the input
972 significand is not normalized, `zExp' must be 0; in that case, the result
973 returned is a subnormal number, and it must not require rounding. In the
974 usual case that the input significand is normalized, `zExp' must be 1 less
975 than the ``true'' floating-point exponent. The handling of underflow and
976 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
977 -------------------------------------------------------------------------------
979 static float128
980 roundAndPackFloat128(
981 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
983 int8 roundingMode;
984 flag roundNearestEven, increment, isTiny;
986 roundingMode = float_rounding_mode;
987 roundNearestEven = ( roundingMode == float_round_nearest_even );
988 increment = ( (sbits64) zSig2 < 0 );
989 if ( ! roundNearestEven ) {
990 if ( roundingMode == float_round_to_zero ) {
991 increment = 0;
993 else {
994 if ( zSign ) {
995 increment = ( roundingMode == float_round_down ) && zSig2;
997 else {
998 increment = ( roundingMode == float_round_up ) && zSig2;
1002 if ( 0x7FFD <= (bits32) zExp ) {
1003 if ( ( 0x7FFD < zExp )
1004 || ( ( zExp == 0x7FFD )
1005 && eq128(
1006 LIT64( 0x0001FFFFFFFFFFFF ),
1007 LIT64( 0xFFFFFFFFFFFFFFFF ),
1008 zSig0,
1009 zSig1
1011 && increment
1014 float_raise( float_flag_overflow | float_flag_inexact );
1015 if ( ( roundingMode == float_round_to_zero )
1016 || ( zSign && ( roundingMode == float_round_up ) )
1017 || ( ! zSign && ( roundingMode == float_round_down ) )
1019 return
1020 packFloat128(
1021 zSign,
1022 0x7FFE,
1023 LIT64( 0x0000FFFFFFFFFFFF ),
1024 LIT64( 0xFFFFFFFFFFFFFFFF )
1027 return packFloat128( zSign, 0x7FFF, 0, 0 );
1029 if ( zExp < 0 ) {
1030 isTiny =
1031 ( float_detect_tininess == float_tininess_before_rounding )
1032 || ( zExp < -1 )
1033 || ! increment
1034 || lt128(
1035 zSig0,
1036 zSig1,
1037 LIT64( 0x0001FFFFFFFFFFFF ),
1038 LIT64( 0xFFFFFFFFFFFFFFFF )
1040 shift128ExtraRightJamming(
1041 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1042 zExp = 0;
1043 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1044 if ( roundNearestEven ) {
1045 increment = ( (sbits64) zSig2 < 0 );
1047 else {
1048 if ( zSign ) {
1049 increment = ( roundingMode == float_round_down ) && zSig2;
1051 else {
1052 increment = ( roundingMode == float_round_up ) && zSig2;
1057 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1058 if ( increment ) {
1059 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1060 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1062 else {
1063 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1065 return packFloat128( zSign, zExp, zSig0, zSig1 );
1070 -------------------------------------------------------------------------------
1071 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1072 and significand formed by the concatenation of `zSig0' and `zSig1', and
1073 returns the proper quadruple-precision floating-point value corresponding
1074 to the abstract input. This routine is just like `roundAndPackFloat128'
1075 except that the input significand has fewer bits and does not have to be
1076 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1077 point exponent.
1078 -------------------------------------------------------------------------------
1080 static float128
1081 normalizeRoundAndPackFloat128(
1082 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1084 int8 shiftCount;
1085 bits64 zSig2;
1087 if ( zSig0 == 0 ) {
1088 zSig0 = zSig1;
1089 zSig1 = 0;
1090 zExp -= 64;
1092 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1093 if ( 0 <= shiftCount ) {
1094 zSig2 = 0;
1095 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1097 else {
1098 shift128ExtraRightJamming(
1099 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1101 zExp -= shiftCount;
1102 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1106 #endif
1109 -------------------------------------------------------------------------------
1110 Returns the result of converting the 32-bit two's complement integer `a'
1111 to the single-precision floating-point format. The conversion is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1115 float32 int32_to_float32( int32 a )
1117 flag zSign;
1119 if ( a == 0 ) return 0;
1120 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1121 zSign = ( a < 0 );
1122 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1127 -------------------------------------------------------------------------------
1128 Returns the result of converting the 32-bit two's complement integer `a'
1129 to the double-precision floating-point format. The conversion is performed
1130 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1131 -------------------------------------------------------------------------------
1133 float64 int32_to_float64( int32 a )
1135 flag zSign;
1136 uint32 absA;
1137 int8 shiftCount;
1138 bits64 zSig;
1140 if ( a == 0 ) return 0;
1141 zSign = ( a < 0 );
1142 absA = zSign ? - a : a;
1143 shiftCount = countLeadingZeros32( absA ) + 21;
1144 zSig = absA;
1145 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1149 #ifdef FLOATX80
1152 -------------------------------------------------------------------------------
1153 Returns the result of converting the 32-bit two's complement integer `a'
1154 to the extended double-precision floating-point format. The conversion
1155 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 Arithmetic.
1157 -------------------------------------------------------------------------------
1159 floatx80 int32_to_floatx80( int32 a )
1161 flag zSign;
1162 uint32 absA;
1163 int8 shiftCount;
1164 bits64 zSig;
1166 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167 zSign = ( a < 0 );
1168 absA = zSign ? - a : a;
1169 shiftCount = countLeadingZeros32( absA ) + 32;
1170 zSig = absA;
1171 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1175 #endif
1177 #ifdef FLOAT128
1180 -------------------------------------------------------------------------------
1181 Returns the result of converting the 32-bit two's complement integer `a' to
1182 the quadruple-precision floating-point format. The conversion is performed
1183 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1184 -------------------------------------------------------------------------------
1186 float128 int32_to_float128( int32 a )
1188 flag zSign;
1189 uint32 absA;
1190 int8 shiftCount;
1191 bits64 zSig0;
1193 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1194 zSign = ( a < 0 );
1195 absA = zSign ? - a : a;
1196 shiftCount = countLeadingZeros32( absA ) + 17;
1197 zSig0 = absA;
1198 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1202 #endif
1204 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1206 -------------------------------------------------------------------------------
1207 Returns the result of converting the 64-bit two's complement integer `a'
1208 to the single-precision floating-point format. The conversion is performed
1209 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1210 -------------------------------------------------------------------------------
1212 float32 int64_to_float32( int64 a )
1214 flag zSign;
1215 uint64 absA;
1216 int8 shiftCount;
1218 if ( a == 0 ) return 0;
1219 zSign = ( a < 0 );
1220 absA = zSign ? - a : a;
1221 shiftCount = countLeadingZeros64( absA ) - 40;
1222 if ( 0 <= shiftCount ) {
1223 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1225 else {
1226 shiftCount += 7;
1227 if ( shiftCount < 0 ) {
1228 shift64RightJamming( absA, - shiftCount, &absA );
1230 else {
1231 absA <<= shiftCount;
1233 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1239 -------------------------------------------------------------------------------
1240 Returns the result of converting the 64-bit two's complement integer `a'
1241 to the double-precision floating-point format. The conversion is performed
1242 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1243 -------------------------------------------------------------------------------
1245 float64 int64_to_float64( int64 a )
1247 flag zSign;
1249 if ( a == 0 ) return 0;
1250 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1251 return packFloat64( 1, 0x43E, 0 );
1253 zSign = ( a < 0 );
1254 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1258 #ifdef FLOATX80
1261 -------------------------------------------------------------------------------
1262 Returns the result of converting the 64-bit two's complement integer `a'
1263 to the extended double-precision floating-point format. The conversion
1264 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1265 Arithmetic.
1266 -------------------------------------------------------------------------------
1268 floatx80 int64_to_floatx80( int64 a )
1270 flag zSign;
1271 uint64 absA;
1272 int8 shiftCount;
1274 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1275 zSign = ( a < 0 );
1276 absA = zSign ? - a : a;
1277 shiftCount = countLeadingZeros64( absA );
1278 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1282 #endif
1284 #endif /* !SOFTFLOAT_FOR_GCC */
1286 #ifdef FLOAT128
1289 -------------------------------------------------------------------------------
1290 Returns the result of converting the 64-bit two's complement integer `a' to
1291 the quadruple-precision floating-point format. The conversion is performed
1292 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1293 -------------------------------------------------------------------------------
1295 float128 int64_to_float128( int64 a )
1297 flag zSign;
1298 uint64 absA;
1299 int8 shiftCount;
1300 int32 zExp;
1301 bits64 zSig0, zSig1;
1303 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1304 zSign = ( a < 0 );
1305 absA = zSign ? - a : a;
1306 shiftCount = countLeadingZeros64( absA ) + 49;
1307 zExp = 0x406E - shiftCount;
1308 if ( 64 <= shiftCount ) {
1309 zSig1 = 0;
1310 zSig0 = absA;
1311 shiftCount -= 64;
1313 else {
1314 zSig1 = absA;
1315 zSig0 = 0;
1317 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1318 return packFloat128( zSign, zExp, zSig0, zSig1 );
1322 #endif
1324 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1326 -------------------------------------------------------------------------------
1327 Returns the result of converting the single-precision floating-point value
1328 `a' to the 32-bit two's complement integer format. The conversion is
1329 performed according to the IEC/IEEE Standard for Binary Floating-Point
1330 Arithmetic---which means in particular that the conversion is rounded
1331 according to the current rounding mode. If `a' is a NaN, the largest
1332 positive integer is returned. Otherwise, if the conversion overflows, the
1333 largest integer with the same sign as `a' is returned.
1334 -------------------------------------------------------------------------------
1336 int32 float32_to_int32( float32 a )
1338 flag aSign;
1339 int16 aExp, shiftCount;
1340 bits32 aSig;
1341 bits64 aSig64;
1343 aSig = extractFloat32Frac( a );
1344 aExp = extractFloat32Exp( a );
1345 aSign = extractFloat32Sign( a );
1346 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1347 if ( aExp ) aSig |= 0x00800000;
1348 shiftCount = 0xAF - aExp;
1349 aSig64 = aSig;
1350 aSig64 <<= 32;
1351 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1352 return roundAndPackInt32( aSign, aSig64 );
1355 #endif /* !SOFTFLOAT_FOR_GCC */
1358 -------------------------------------------------------------------------------
1359 Returns the result of converting the single-precision floating-point value
1360 `a' to the 32-bit two's complement integer format. The conversion is
1361 performed according to the IEC/IEEE Standard for Binary Floating-Point
1362 Arithmetic, except that the conversion is always rounded toward zero.
1363 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1364 the conversion overflows, the largest integer with the same sign as `a' is
1365 returned.
1366 -------------------------------------------------------------------------------
1368 int32 float32_to_int32_round_to_zero( float32 a )
1370 flag aSign;
1371 int16 aExp, shiftCount;
1372 bits32 aSig;
1373 int32 z;
1375 aSig = extractFloat32Frac( a );
1376 aExp = extractFloat32Exp( a );
1377 aSign = extractFloat32Sign( a );
1378 shiftCount = aExp - 0x9E;
1379 if ( 0 <= shiftCount ) {
1380 if ( a != 0xCF000000 ) {
1381 float_raise( float_flag_invalid );
1382 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1384 return (sbits32) 0x80000000;
1386 else if ( aExp <= 0x7E ) {
1387 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1388 return 0;
1390 aSig = ( aSig | 0x00800000 )<<8;
1391 z = aSig>>( - shiftCount );
1392 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1393 float_exception_flags |= float_flag_inexact;
1395 if ( aSign ) z = - z;
1396 return z;
1400 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1402 -------------------------------------------------------------------------------
1403 Returns the result of converting the single-precision floating-point value
1404 `a' to the 64-bit two's complement integer format. The conversion is
1405 performed according to the IEC/IEEE Standard for Binary Floating-Point
1406 Arithmetic---which means in particular that the conversion is rounded
1407 according to the current rounding mode. If `a' is a NaN, the largest
1408 positive integer is returned. Otherwise, if the conversion overflows, the
1409 largest integer with the same sign as `a' is returned.
1410 -------------------------------------------------------------------------------
1412 int64 float32_to_int64( float32 a )
1414 flag aSign;
1415 int16 aExp, shiftCount;
1416 bits32 aSig;
1417 bits64 aSig64, aSigExtra;
1419 aSig = extractFloat32Frac( a );
1420 aExp = extractFloat32Exp( a );
1421 aSign = extractFloat32Sign( a );
1422 shiftCount = 0xBE - aExp;
1423 if ( shiftCount < 0 ) {
1424 float_raise( float_flag_invalid );
1425 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1426 return LIT64( 0x7FFFFFFFFFFFFFFF );
1428 return (sbits64) LIT64( 0x8000000000000000 );
1430 if ( aExp ) aSig |= 0x00800000;
1431 aSig64 = aSig;
1432 aSig64 <<= 40;
1433 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1434 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1439 -------------------------------------------------------------------------------
1440 Returns the result of converting the single-precision floating-point value
1441 `a' to the 64-bit two's complement integer format. The conversion is
1442 performed according to the IEC/IEEE Standard for Binary Floating-Point
1443 Arithmetic, except that the conversion is always rounded toward zero. If
1444 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1445 conversion overflows, the largest integer with the same sign as `a' is
1446 returned.
1447 -------------------------------------------------------------------------------
1449 int64 float32_to_int64_round_to_zero( float32 a )
1451 flag aSign;
1452 int16 aExp, shiftCount;
1453 bits32 aSig;
1454 bits64 aSig64;
1455 int64 z;
1457 aSig = extractFloat32Frac( a );
1458 aExp = extractFloat32Exp( a );
1459 aSign = extractFloat32Sign( a );
1460 shiftCount = aExp - 0xBE;
1461 if ( 0 <= shiftCount ) {
1462 if ( a != 0xDF000000 ) {
1463 float_raise( float_flag_invalid );
1464 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1465 return LIT64( 0x7FFFFFFFFFFFFFFF );
1468 return (sbits64) LIT64( 0x8000000000000000 );
1470 else if ( aExp <= 0x7E ) {
1471 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1472 return 0;
1474 aSig64 = aSig | 0x00800000;
1475 aSig64 <<= 40;
1476 z = aSig64>>( - shiftCount );
1477 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1478 float_exception_flags |= float_flag_inexact;
1480 if ( aSign ) z = - z;
1481 return z;
1484 #endif /* !SOFTFLOAT_FOR_GCC */
1487 -------------------------------------------------------------------------------
1488 Returns the result of converting the single-precision floating-point value
1489 `a' to the double-precision floating-point format. The conversion is
1490 performed according to the IEC/IEEE Standard for Binary Floating-Point
1491 Arithmetic.
1492 -------------------------------------------------------------------------------
1494 float64 float32_to_float64( float32 a )
1496 flag aSign;
1497 int16 aExp;
1498 bits32 aSig;
1500 aSig = extractFloat32Frac( a );
1501 aExp = extractFloat32Exp( a );
1502 aSign = extractFloat32Sign( a );
1503 if ( aExp == 0xFF ) {
1504 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1505 return packFloat64( aSign, 0x7FF, 0 );
1507 if ( aExp == 0 ) {
1508 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1509 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1510 --aExp;
1512 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1516 #ifdef FLOATX80
1519 -------------------------------------------------------------------------------
1520 Returns the result of converting the single-precision floating-point value
1521 `a' to the extended double-precision floating-point format. The conversion
1522 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1523 Arithmetic.
1524 -------------------------------------------------------------------------------
1526 floatx80 float32_to_floatx80( float32 a )
1528 flag aSign;
1529 int16 aExp;
1530 bits32 aSig;
1532 aSig = extractFloat32Frac( a );
1533 aExp = extractFloat32Exp( a );
1534 aSign = extractFloat32Sign( a );
1535 if ( aExp == 0xFF ) {
1536 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1537 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1539 if ( aExp == 0 ) {
1540 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1541 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1543 aSig |= 0x00800000;
1544 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1548 #endif
1550 #ifdef FLOAT128
1553 -------------------------------------------------------------------------------
1554 Returns the result of converting the single-precision floating-point value
1555 `a' to the double-precision floating-point format. The conversion is
1556 performed according to the IEC/IEEE Standard for Binary Floating-Point
1557 Arithmetic.
1558 -------------------------------------------------------------------------------
1560 float128 float32_to_float128( float32 a )
1562 flag aSign;
1563 int16 aExp;
1564 bits32 aSig;
1566 aSig = extractFloat32Frac( a );
1567 aExp = extractFloat32Exp( a );
1568 aSign = extractFloat32Sign( a );
1569 if ( aExp == 0xFF ) {
1570 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1571 return packFloat128( aSign, 0x7FFF, 0, 0 );
1573 if ( aExp == 0 ) {
1574 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1575 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1576 --aExp;
1578 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1582 #endif
1584 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1586 -------------------------------------------------------------------------------
1587 Rounds the single-precision floating-point value `a' to an integer, and
1588 returns the result as a single-precision floating-point value. The
1589 operation is performed according to the IEC/IEEE Standard for Binary
1590 Floating-Point Arithmetic.
1591 -------------------------------------------------------------------------------
1593 float32 float32_round_to_int( float32 a )
1595 flag aSign;
1596 int16 aExp;
1597 bits32 lastBitMask, roundBitsMask;
1598 int8 roundingMode;
1599 float32 z;
1601 aExp = extractFloat32Exp( a );
1602 if ( 0x96 <= aExp ) {
1603 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1604 return propagateFloat32NaN( a, a );
1606 return a;
1608 if ( aExp <= 0x7E ) {
1609 if ( (bits32) ( a<<1 ) == 0 ) return a;
1610 float_exception_flags |= float_flag_inexact;
1611 aSign = extractFloat32Sign( a );
1612 switch ( float_rounding_mode ) {
1613 case float_round_nearest_even:
1614 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1615 return packFloat32( aSign, 0x7F, 0 );
1617 break;
1618 case float_round_to_zero:
1619 break;
1620 case float_round_down:
1621 return aSign ? 0xBF800000 : 0;
1622 case float_round_up:
1623 return aSign ? 0x80000000 : 0x3F800000;
1625 return packFloat32( aSign, 0, 0 );
1627 lastBitMask = 1;
1628 lastBitMask <<= 0x96 - aExp;
1629 roundBitsMask = lastBitMask - 1;
1630 z = a;
1631 roundingMode = float_rounding_mode;
1632 if ( roundingMode == float_round_nearest_even ) {
1633 z += lastBitMask>>1;
1634 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1636 else if ( roundingMode != float_round_to_zero ) {
1637 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1638 z += roundBitsMask;
1641 z &= ~ roundBitsMask;
1642 if ( z != a ) float_exception_flags |= float_flag_inexact;
1643 return z;
1646 #endif /* !SOFTFLOAT_FOR_GCC */
1649 -------------------------------------------------------------------------------
1650 Returns the result of adding the absolute values of the single-precision
1651 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1652 before being returned. `zSign' is ignored if the result is a NaN.
1653 The addition is performed according to the IEC/IEEE Standard for Binary
1654 Floating-Point Arithmetic.
1655 -------------------------------------------------------------------------------
1657 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1659 int16 aExp, bExp, zExp;
1660 bits32 aSig, bSig, zSig;
1661 int16 expDiff;
1663 aSig = extractFloat32Frac( a );
1664 aExp = extractFloat32Exp( a );
1665 bSig = extractFloat32Frac( b );
1666 bExp = extractFloat32Exp( b );
1667 expDiff = aExp - bExp;
1668 aSig <<= 6;
1669 bSig <<= 6;
1670 if ( 0 < expDiff ) {
1671 if ( aExp == 0xFF ) {
1672 if ( aSig ) return propagateFloat32NaN( a, b );
1673 return a;
1675 if ( bExp == 0 ) {
1676 --expDiff;
1678 else {
1679 bSig |= 0x20000000;
1681 shift32RightJamming( bSig, expDiff, &bSig );
1682 zExp = aExp;
1684 else if ( expDiff < 0 ) {
1685 if ( bExp == 0xFF ) {
1686 if ( bSig ) return propagateFloat32NaN( a, b );
1687 return packFloat32( zSign, 0xFF, 0 );
1689 if ( aExp == 0 ) {
1690 ++expDiff;
1692 else {
1693 aSig |= 0x20000000;
1695 shift32RightJamming( aSig, - expDiff, &aSig );
1696 zExp = bExp;
1698 else {
1699 if ( aExp == 0xFF ) {
1700 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1701 return a;
1703 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1704 zSig = 0x40000000 + aSig + bSig;
1705 zExp = aExp;
1706 goto roundAndPack;
1708 aSig |= 0x20000000;
1709 zSig = ( aSig + bSig )<<1;
1710 --zExp;
1711 if ( (sbits32) zSig < 0 ) {
1712 zSig = aSig + bSig;
1713 ++zExp;
1715 roundAndPack:
1716 return roundAndPackFloat32( zSign, zExp, zSig );
1721 -------------------------------------------------------------------------------
1722 Returns the result of subtracting the absolute values of the single-
1723 precision floating-point values `a' and `b'. If `zSign' is 1, the
1724 difference is negated before being returned. `zSign' is ignored if the
1725 result is a NaN. The subtraction is performed according to the IEC/IEEE
1726 Standard for Binary Floating-Point Arithmetic.
1727 -------------------------------------------------------------------------------
1729 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1731 int16 aExp, bExp, zExp;
1732 bits32 aSig, bSig, zSig;
1733 int16 expDiff;
1735 aSig = extractFloat32Frac( a );
1736 aExp = extractFloat32Exp( a );
1737 bSig = extractFloat32Frac( b );
1738 bExp = extractFloat32Exp( b );
1739 expDiff = aExp - bExp;
1740 aSig <<= 7;
1741 bSig <<= 7;
1742 if ( 0 < expDiff ) goto aExpBigger;
1743 if ( expDiff < 0 ) goto bExpBigger;
1744 if ( aExp == 0xFF ) {
1745 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1746 float_raise( float_flag_invalid );
1747 return float32_default_nan;
1749 if ( aExp == 0 ) {
1750 aExp = 1;
1751 bExp = 1;
1753 if ( bSig < aSig ) goto aBigger;
1754 if ( aSig < bSig ) goto bBigger;
1755 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1756 bExpBigger:
1757 if ( bExp == 0xFF ) {
1758 if ( bSig ) return propagateFloat32NaN( a, b );
1759 return packFloat32( zSign ^ 1, 0xFF, 0 );
1761 if ( aExp == 0 ) {
1762 ++expDiff;
1764 else {
1765 aSig |= 0x40000000;
1767 shift32RightJamming( aSig, - expDiff, &aSig );
1768 bSig |= 0x40000000;
1769 bBigger:
1770 zSig = bSig - aSig;
1771 zExp = bExp;
1772 zSign ^= 1;
1773 goto normalizeRoundAndPack;
1774 aExpBigger:
1775 if ( aExp == 0xFF ) {
1776 if ( aSig ) return propagateFloat32NaN( a, b );
1777 return a;
1779 if ( bExp == 0 ) {
1780 --expDiff;
1782 else {
1783 bSig |= 0x40000000;
1785 shift32RightJamming( bSig, expDiff, &bSig );
1786 aSig |= 0x40000000;
1787 aBigger:
1788 zSig = aSig - bSig;
1789 zExp = aExp;
1790 normalizeRoundAndPack:
1791 --zExp;
1792 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1797 -------------------------------------------------------------------------------
1798 Returns the result of adding the single-precision floating-point values `a'
1799 and `b'. The operation is performed according to the IEC/IEEE Standard for
1800 Binary Floating-Point Arithmetic.
1801 -------------------------------------------------------------------------------
1803 float32 float32_add( float32 a, float32 b )
1805 flag aSign, bSign;
1807 aSign = extractFloat32Sign( a );
1808 bSign = extractFloat32Sign( b );
1809 if ( aSign == bSign ) {
1810 return addFloat32Sigs( a, b, aSign );
1812 else {
1813 return subFloat32Sigs( a, b, aSign );
1819 -------------------------------------------------------------------------------
1820 Returns the result of subtracting the single-precision floating-point values
1821 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1822 for Binary Floating-Point Arithmetic.
1823 -------------------------------------------------------------------------------
1825 float32 float32_sub( float32 a, float32 b )
1827 flag aSign, bSign;
1829 aSign = extractFloat32Sign( a );
1830 bSign = extractFloat32Sign( b );
1831 if ( aSign == bSign ) {
1832 return subFloat32Sigs( a, b, aSign );
1834 else {
1835 return addFloat32Sigs( a, b, aSign );
1841 -------------------------------------------------------------------------------
1842 Returns the result of multiplying the single-precision floating-point values
1843 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1844 for Binary Floating-Point Arithmetic.
1845 -------------------------------------------------------------------------------
1847 float32 float32_mul( float32 a, float32 b )
1849 flag aSign, bSign, zSign;
1850 int16 aExp, bExp, zExp;
1851 bits32 aSig, bSig;
1852 bits64 zSig64;
1853 bits32 zSig;
1855 aSig = extractFloat32Frac( a );
1856 aExp = extractFloat32Exp( a );
1857 aSign = extractFloat32Sign( a );
1858 bSig = extractFloat32Frac( b );
1859 bExp = extractFloat32Exp( b );
1860 bSign = extractFloat32Sign( b );
1861 zSign = aSign ^ bSign;
1862 if ( aExp == 0xFF ) {
1863 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1864 return propagateFloat32NaN( a, b );
1866 if ( ( bExp | bSig ) == 0 ) {
1867 float_raise( float_flag_invalid );
1868 return float32_default_nan;
1870 return packFloat32( zSign, 0xFF, 0 );
1872 if ( bExp == 0xFF ) {
1873 if ( bSig ) return propagateFloat32NaN( a, b );
1874 if ( ( aExp | aSig ) == 0 ) {
1875 float_raise( float_flag_invalid );
1876 return float32_default_nan;
1878 return packFloat32( zSign, 0xFF, 0 );
1880 if ( aExp == 0 ) {
1881 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1882 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1884 if ( bExp == 0 ) {
1885 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1886 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1888 zExp = aExp + bExp - 0x7F;
1889 aSig = ( aSig | 0x00800000 )<<7;
1890 bSig = ( bSig | 0x00800000 )<<8;
1891 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1892 zSig = zSig64;
1893 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1894 zSig <<= 1;
1895 --zExp;
1897 return roundAndPackFloat32( zSign, zExp, zSig );
1902 -------------------------------------------------------------------------------
1903 Returns the result of dividing the single-precision floating-point value `a'
1904 by the corresponding value `b'. The operation is performed according to the
1905 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1906 -------------------------------------------------------------------------------
1908 float32 float32_div( float32 a, float32 b )
1910 flag aSign, bSign, zSign;
1911 int16 aExp, bExp, zExp;
1912 bits32 aSig, bSig, zSig;
1914 aSig = extractFloat32Frac( a );
1915 aExp = extractFloat32Exp( a );
1916 aSign = extractFloat32Sign( a );
1917 bSig = extractFloat32Frac( b );
1918 bExp = extractFloat32Exp( b );
1919 bSign = extractFloat32Sign( b );
1920 zSign = aSign ^ bSign;
1921 if ( aExp == 0xFF ) {
1922 if ( aSig ) return propagateFloat32NaN( a, b );
1923 if ( bExp == 0xFF ) {
1924 if ( bSig ) return propagateFloat32NaN( a, b );
1925 float_raise( float_flag_invalid );
1926 return float32_default_nan;
1928 return packFloat32( zSign, 0xFF, 0 );
1930 if ( bExp == 0xFF ) {
1931 if ( bSig ) return propagateFloat32NaN( a, b );
1932 return packFloat32( zSign, 0, 0 );
1934 if ( bExp == 0 ) {
1935 if ( bSig == 0 ) {
1936 if ( ( aExp | aSig ) == 0 ) {
1937 float_raise( float_flag_invalid );
1938 return float32_default_nan;
1940 float_raise( float_flag_divbyzero );
1941 return packFloat32( zSign, 0xFF, 0 );
1943 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1945 if ( aExp == 0 ) {
1946 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1947 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949 zExp = aExp - bExp + 0x7D;
1950 aSig = ( aSig | 0x00800000 )<<7;
1951 bSig = ( bSig | 0x00800000 )<<8;
1952 if ( bSig <= ( aSig + aSig ) ) {
1953 aSig >>= 1;
1954 ++zExp;
1956 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1957 if ( ( zSig & 0x3F ) == 0 ) {
1958 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1960 return roundAndPackFloat32( zSign, zExp, zSig );
1964 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1966 -------------------------------------------------------------------------------
1967 Returns the remainder of the single-precision floating-point value `a'
1968 with respect to the corresponding value `b'. The operation is performed
1969 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1970 -------------------------------------------------------------------------------
1972 float32 float32_rem( float32 a, float32 b )
1974 flag aSign, bSign, zSign;
1975 int16 aExp, bExp, expDiff;
1976 bits32 aSig, bSig;
1977 bits32 q;
1978 bits64 aSig64, bSig64, q64;
1979 bits32 alternateASig;
1980 sbits32 sigMean;
1982 aSig = extractFloat32Frac( a );
1983 aExp = extractFloat32Exp( a );
1984 aSign = extractFloat32Sign( a );
1985 bSig = extractFloat32Frac( b );
1986 bExp = extractFloat32Exp( b );
1987 bSign = extractFloat32Sign( b );
1988 if ( aExp == 0xFF ) {
1989 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1990 return propagateFloat32NaN( a, b );
1992 float_raise( float_flag_invalid );
1993 return float32_default_nan;
1995 if ( bExp == 0xFF ) {
1996 if ( bSig ) return propagateFloat32NaN( a, b );
1997 return a;
1999 if ( bExp == 0 ) {
2000 if ( bSig == 0 ) {
2001 float_raise( float_flag_invalid );
2002 return float32_default_nan;
2004 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2006 if ( aExp == 0 ) {
2007 if ( aSig == 0 ) return a;
2008 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2010 expDiff = aExp - bExp;
2011 aSig |= 0x00800000;
2012 bSig |= 0x00800000;
2013 if ( expDiff < 32 ) {
2014 aSig <<= 8;
2015 bSig <<= 8;
2016 if ( expDiff < 0 ) {
2017 if ( expDiff < -1 ) return a;
2018 aSig >>= 1;
2020 q = ( bSig <= aSig );
2021 if ( q ) aSig -= bSig;
2022 if ( 0 < expDiff ) {
2023 q = ( ( (bits64) aSig )<<32 ) / bSig;
2024 q >>= 32 - expDiff;
2025 bSig >>= 2;
2026 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2028 else {
2029 aSig >>= 2;
2030 bSig >>= 2;
2033 else {
2034 if ( bSig <= aSig ) aSig -= bSig;
2035 aSig64 = ( (bits64) aSig )<<40;
2036 bSig64 = ( (bits64) bSig )<<40;
2037 expDiff -= 64;
2038 while ( 0 < expDiff ) {
2039 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2040 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2041 aSig64 = - ( ( bSig * q64 )<<38 );
2042 expDiff -= 62;
2044 expDiff += 64;
2045 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2046 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2047 q = q64>>( 64 - expDiff );
2048 bSig <<= 6;
2049 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2051 do {
2052 alternateASig = aSig;
2053 ++q;
2054 aSig -= bSig;
2055 } while ( 0 <= (sbits32) aSig );
2056 sigMean = aSig + alternateASig;
2057 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2058 aSig = alternateASig;
2060 zSign = ( (sbits32) aSig < 0 );
2061 if ( zSign ) aSig = - aSig;
2062 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2065 #endif /* !SOFTFLOAT_FOR_GCC */
2067 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2069 -------------------------------------------------------------------------------
2070 Returns the square root of the single-precision floating-point value `a'.
2071 The operation is performed according to the IEC/IEEE Standard for Binary
2072 Floating-Point Arithmetic.
2073 -------------------------------------------------------------------------------
2075 float32 float32_sqrt( float32 a )
2077 flag aSign;
2078 int16 aExp, zExp;
2079 bits32 aSig, zSig;
2080 bits64 rem, term;
2082 aSig = extractFloat32Frac( a );
2083 aExp = extractFloat32Exp( a );
2084 aSign = extractFloat32Sign( a );
2085 if ( aExp == 0xFF ) {
2086 if ( aSig ) return propagateFloat32NaN( a, 0 );
2087 if ( ! aSign ) return a;
2088 float_raise( float_flag_invalid );
2089 return float32_default_nan;
2091 if ( aSign ) {
2092 if ( ( aExp | aSig ) == 0 ) return a;
2093 float_raise( float_flag_invalid );
2094 return float32_default_nan;
2096 if ( aExp == 0 ) {
2097 if ( aSig == 0 ) return 0;
2098 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2100 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2101 aSig = ( aSig | 0x00800000 )<<8;
2102 zSig = estimateSqrt32( aExp, aSig ) + 2;
2103 if ( ( zSig & 0x7F ) <= 5 ) {
2104 if ( zSig < 2 ) {
2105 zSig = 0x7FFFFFFF;
2106 goto roundAndPack;
2108 aSig >>= aExp & 1;
2109 term = ( (bits64) zSig ) * zSig;
2110 rem = ( ( (bits64) aSig )<<32 ) - term;
2111 while ( (sbits64) rem < 0 ) {
2112 --zSig;
2113 rem += ( ( (bits64) zSig )<<1 ) | 1;
2115 zSig |= ( rem != 0 );
2117 shift32RightJamming( zSig, 1, &zSig );
2118 roundAndPack:
2119 return roundAndPackFloat32( 0, zExp, zSig );
2122 #endif /* !SOFTFLOAT_FOR_GCC */
2125 -------------------------------------------------------------------------------
2126 Returns 1 if the single-precision floating-point value `a' is equal to
2127 the corresponding value `b', and 0 otherwise. The comparison is performed
2128 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2129 -------------------------------------------------------------------------------
2131 flag float32_eq( float32 a, float32 b )
2134 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2135 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2137 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2138 float_raise( float_flag_invalid );
2140 return 0;
2142 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2147 -------------------------------------------------------------------------------
2148 Returns 1 if the single-precision floating-point value `a' is less than
2149 or equal to the corresponding value `b', and 0 otherwise. The comparison
2150 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2151 Arithmetic.
2152 -------------------------------------------------------------------------------
2154 flag float32_le( float32 a, float32 b )
2156 flag aSign, bSign;
2158 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2159 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2161 float_raise( float_flag_invalid );
2162 return 0;
2164 aSign = extractFloat32Sign( a );
2165 bSign = extractFloat32Sign( b );
2166 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2167 return ( a == b ) || ( aSign ^ ( a < b ) );
2172 -------------------------------------------------------------------------------
2173 Returns 1 if the single-precision floating-point value `a' is less than
2174 the corresponding value `b', and 0 otherwise. The comparison is performed
2175 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 -------------------------------------------------------------------------------
2178 flag float32_lt( float32 a, float32 b )
2180 flag aSign, bSign;
2182 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2183 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2185 float_raise( float_flag_invalid );
2186 return 0;
2188 aSign = extractFloat32Sign( a );
2189 bSign = extractFloat32Sign( b );
2190 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2191 return ( a != b ) && ( aSign ^ ( a < b ) );
2195 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2197 -------------------------------------------------------------------------------
2198 Returns 1 if the single-precision floating-point value `a' is equal to
2199 the corresponding value `b', and 0 otherwise. The invalid exception is
2200 raised if either operand is a NaN. Otherwise, the comparison is performed
2201 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2202 -------------------------------------------------------------------------------
2204 flag float32_eq_signaling( float32 a, float32 b )
2207 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2208 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2210 float_raise( float_flag_invalid );
2211 return 0;
2213 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2218 -------------------------------------------------------------------------------
2219 Returns 1 if the single-precision floating-point value `a' is less than or
2220 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2221 cause an exception. Otherwise, the comparison is performed according to the
2222 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2223 -------------------------------------------------------------------------------
2225 flag float32_le_quiet( float32 a, float32 b )
2227 flag aSign, bSign;
2229 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2230 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2232 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2233 float_raise( float_flag_invalid );
2235 return 0;
2237 aSign = extractFloat32Sign( a );
2238 bSign = extractFloat32Sign( b );
2239 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2240 return ( a == b ) || ( aSign ^ ( a < b ) );
2245 -------------------------------------------------------------------------------
2246 Returns 1 if the single-precision floating-point value `a' is less than
2247 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2248 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2249 Standard for Binary Floating-Point Arithmetic.
2250 -------------------------------------------------------------------------------
2252 flag float32_lt_quiet( float32 a, float32 b )
2254 flag aSign, bSign;
2256 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2257 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2259 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2260 float_raise( float_flag_invalid );
2262 return 0;
2264 aSign = extractFloat32Sign( a );
2265 bSign = extractFloat32Sign( b );
2266 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2267 return ( a != b ) && ( aSign ^ ( a < b ) );
2270 #endif /* !SOFTFLOAT_FOR_GCC */
2272 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2274 -------------------------------------------------------------------------------
2275 Returns the result of converting the double-precision floating-point value
2276 `a' to the 32-bit two's complement integer format. The conversion is
2277 performed according to the IEC/IEEE Standard for Binary Floating-Point
2278 Arithmetic---which means in particular that the conversion is rounded
2279 according to the current rounding mode. If `a' is a NaN, the largest
2280 positive integer is returned. Otherwise, if the conversion overflows, the
2281 largest integer with the same sign as `a' is returned.
2282 -------------------------------------------------------------------------------
2284 int32 float64_to_int32( float64 a )
2286 flag aSign;
2287 int16 aExp, shiftCount;
2288 bits64 aSig;
2290 aSig = extractFloat64Frac( a );
2291 aExp = extractFloat64Exp( a );
2292 aSign = extractFloat64Sign( a );
2293 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2294 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2295 shiftCount = 0x42C - aExp;
2296 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2297 return roundAndPackInt32( aSign, aSig );
2300 #endif /* !SOFTFLOAT_FOR_GCC */
2303 -------------------------------------------------------------------------------
2304 Returns the result of converting the double-precision floating-point value
2305 `a' to the 32-bit two's complement integer format. The conversion is
2306 performed according to the IEC/IEEE Standard for Binary Floating-Point
2307 Arithmetic, except that the conversion is always rounded toward zero.
2308 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2309 the conversion overflows, the largest integer with the same sign as `a' is
2310 returned.
2311 -------------------------------------------------------------------------------
2313 int32 float64_to_int32_round_to_zero( float64 a )
2315 flag aSign;
2316 int16 aExp, shiftCount;
2317 bits64 aSig, savedASig;
2318 int32 z;
2320 aSig = extractFloat64Frac( a );
2321 aExp = extractFloat64Exp( a );
2322 aSign = extractFloat64Sign( a );
2323 if ( 0x41E < aExp ) {
2324 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2325 goto invalid;
2327 else if ( aExp < 0x3FF ) {
2328 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2329 return 0;
2331 aSig |= LIT64( 0x0010000000000000 );
2332 shiftCount = 0x433 - aExp;
2333 savedASig = aSig;
2334 aSig >>= shiftCount;
2335 z = aSig;
2336 if ( aSign ) z = - z;
2337 if ( ( z < 0 ) ^ aSign ) {
2338 invalid:
2339 float_raise( float_flag_invalid );
2340 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2342 if ( ( aSig<<shiftCount ) != savedASig ) {
2343 float_exception_flags |= float_flag_inexact;
2345 return z;
2349 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2351 -------------------------------------------------------------------------------
2352 Returns the result of converting the double-precision floating-point value
2353 `a' to the 64-bit two's complement integer format. The conversion is
2354 performed according to the IEC/IEEE Standard for Binary Floating-Point
2355 Arithmetic---which means in particular that the conversion is rounded
2356 according to the current rounding mode. If `a' is a NaN, the largest
2357 positive integer is returned. Otherwise, if the conversion overflows, the
2358 largest integer with the same sign as `a' is returned.
2359 -------------------------------------------------------------------------------
2361 int64 float64_to_int64( float64 a )
2363 flag aSign;
2364 int16 aExp, shiftCount;
2365 bits64 aSig, aSigExtra;
2367 aSig = extractFloat64Frac( a );
2368 aExp = extractFloat64Exp( a );
2369 aSign = extractFloat64Sign( a );
2370 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2371 shiftCount = 0x433 - aExp;
2372 if ( shiftCount <= 0 ) {
2373 if ( 0x43E < aExp ) {
2374 float_raise( float_flag_invalid );
2375 if ( ! aSign
2376 || ( ( aExp == 0x7FF )
2377 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2379 return LIT64( 0x7FFFFFFFFFFFFFFF );
2381 return (sbits64) LIT64( 0x8000000000000000 );
2383 aSigExtra = 0;
2384 aSig <<= - shiftCount;
2386 else {
2387 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2389 return roundAndPackInt64( aSign, aSig, aSigExtra );
2394 -------------------------------------------------------------------------------
2395 Returns the result of converting the double-precision floating-point value
2396 `a' to the 64-bit two's complement integer format. The conversion is
2397 performed according to the IEC/IEEE Standard for Binary Floating-Point
2398 Arithmetic, except that the conversion is always rounded toward zero.
2399 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2400 the conversion overflows, the largest integer with the same sign as `a' is
2401 returned.
2402 -------------------------------------------------------------------------------
2404 int64 float64_to_int64_round_to_zero( float64 a )
2406 flag aSign;
2407 int16 aExp, shiftCount;
2408 bits64 aSig;
2409 int64 z;
2411 aSig = extractFloat64Frac( a );
2412 aExp = extractFloat64Exp( a );
2413 aSign = extractFloat64Sign( a );
2414 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2415 shiftCount = aExp - 0x433;
2416 if ( 0 <= shiftCount ) {
2417 if ( 0x43E <= aExp ) {
2418 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2419 float_raise( float_flag_invalid );
2420 if ( ! aSign
2421 || ( ( aExp == 0x7FF )
2422 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2424 return LIT64( 0x7FFFFFFFFFFFFFFF );
2427 return (sbits64) LIT64( 0x8000000000000000 );
2429 z = aSig<<shiftCount;
2431 else {
2432 if ( aExp < 0x3FE ) {
2433 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2434 return 0;
2436 z = aSig>>( - shiftCount );
2437 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2438 float_exception_flags |= float_flag_inexact;
2441 if ( aSign ) z = - z;
2442 return z;
2445 #endif /* !SOFTFLOAT_FOR_GCC */
2448 -------------------------------------------------------------------------------
2449 Returns the result of converting the double-precision floating-point value
2450 `a' to the single-precision floating-point format. The conversion is
2451 performed according to the IEC/IEEE Standard for Binary Floating-Point
2452 Arithmetic.
2453 -------------------------------------------------------------------------------
2455 float32 float64_to_float32( float64 a )
2457 flag aSign;
2458 int16 aExp;
2459 bits64 aSig;
2460 bits32 zSig;
2462 aSig = extractFloat64Frac( a );
2463 aExp = extractFloat64Exp( a );
2464 aSign = extractFloat64Sign( a );
2465 if ( aExp == 0x7FF ) {
2466 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2467 return packFloat32( aSign, 0xFF, 0 );
2469 shift64RightJamming( aSig, 22, &aSig );
2470 zSig = aSig;
2471 if ( aExp || zSig ) {
2472 zSig |= 0x40000000;
2473 aExp -= 0x381;
2475 return roundAndPackFloat32( aSign, aExp, zSig );
2479 #ifdef FLOATX80
2482 -------------------------------------------------------------------------------
2483 Returns the result of converting the double-precision floating-point value
2484 `a' to the extended double-precision floating-point format. The conversion
2485 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2486 Arithmetic.
2487 -------------------------------------------------------------------------------
2489 floatx80 float64_to_floatx80( float64 a )
2491 flag aSign;
2492 int16 aExp;
2493 bits64 aSig;
2495 aSig = extractFloat64Frac( a );
2496 aExp = extractFloat64Exp( a );
2497 aSign = extractFloat64Sign( a );
2498 if ( aExp == 0x7FF ) {
2499 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2500 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2502 if ( aExp == 0 ) {
2503 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2504 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2506 return
2507 packFloatx80(
2508 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2512 #endif
2514 #ifdef FLOAT128
2517 -------------------------------------------------------------------------------
2518 Returns the result of converting the double-precision floating-point value
2519 `a' to the quadruple-precision floating-point format. The conversion is
2520 performed according to the IEC/IEEE Standard for Binary Floating-Point
2521 Arithmetic.
2522 -------------------------------------------------------------------------------
2524 float128 float64_to_float128( float64 a )
2526 flag aSign;
2527 int16 aExp;
2528 bits64 aSig, zSig0, zSig1;
2530 aSig = extractFloat64Frac( a );
2531 aExp = extractFloat64Exp( a );
2532 aSign = extractFloat64Sign( a );
2533 if ( aExp == 0x7FF ) {
2534 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2535 return packFloat128( aSign, 0x7FFF, 0, 0 );
2537 if ( aExp == 0 ) {
2538 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2539 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2540 --aExp;
2542 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2543 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2547 #endif
2549 #ifndef SOFTFLOAT_FOR_GCC
2551 -------------------------------------------------------------------------------
2552 Rounds the double-precision floating-point value `a' to an integer, and
2553 returns the result as a double-precision floating-point value. The
2554 operation is performed according to the IEC/IEEE Standard for Binary
2555 Floating-Point Arithmetic.
2556 -------------------------------------------------------------------------------
2558 float64 float64_round_to_int( float64 a )
2560 flag aSign;
2561 int16 aExp;
2562 bits64 lastBitMask, roundBitsMask;
2563 int8 roundingMode;
2564 float64 z;
2566 aExp = extractFloat64Exp( a );
2567 if ( 0x433 <= aExp ) {
2568 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2569 return propagateFloat64NaN( a, a );
2571 return a;
2573 if ( aExp < 0x3FF ) {
2574 if ( (bits64) ( a<<1 ) == 0 ) return a;
2575 float_exception_flags |= float_flag_inexact;
2576 aSign = extractFloat64Sign( a );
2577 switch ( float_rounding_mode ) {
2578 case float_round_nearest_even:
2579 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2580 return packFloat64( aSign, 0x3FF, 0 );
2582 break;
2583 case float_round_to_zero:
2584 break;
2585 case float_round_down:
2586 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2587 case float_round_up:
2588 return
2589 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2591 return packFloat64( aSign, 0, 0 );
2593 lastBitMask = 1;
2594 lastBitMask <<= 0x433 - aExp;
2595 roundBitsMask = lastBitMask - 1;
2596 z = a;
2597 roundingMode = float_rounding_mode;
2598 if ( roundingMode == float_round_nearest_even ) {
2599 z += lastBitMask>>1;
2600 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2602 else if ( roundingMode != float_round_to_zero ) {
2603 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2604 z += roundBitsMask;
2607 z &= ~ roundBitsMask;
2608 if ( z != a ) float_exception_flags |= float_flag_inexact;
2609 return z;
2612 #endif
2615 -------------------------------------------------------------------------------
2616 Returns the result of adding the absolute values of the double-precision
2617 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2618 before being returned. `zSign' is ignored if the result is a NaN.
2619 The addition is performed according to the IEC/IEEE Standard for Binary
2620 Floating-Point Arithmetic.
2621 -------------------------------------------------------------------------------
2623 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2625 int16 aExp, bExp, zExp;
2626 bits64 aSig, bSig, zSig;
2627 int16 expDiff;
2629 aSig = extractFloat64Frac( a );
2630 aExp = extractFloat64Exp( a );
2631 bSig = extractFloat64Frac( b );
2632 bExp = extractFloat64Exp( b );
2633 expDiff = aExp - bExp;
2634 aSig <<= 9;
2635 bSig <<= 9;
2636 if ( 0 < expDiff ) {
2637 if ( aExp == 0x7FF ) {
2638 if ( aSig ) return propagateFloat64NaN( a, b );
2639 return a;
2641 if ( bExp == 0 ) {
2642 --expDiff;
2644 else {
2645 bSig |= LIT64( 0x2000000000000000 );
2647 shift64RightJamming( bSig, expDiff, &bSig );
2648 zExp = aExp;
2650 else if ( expDiff < 0 ) {
2651 if ( bExp == 0x7FF ) {
2652 if ( bSig ) return propagateFloat64NaN( a, b );
2653 return packFloat64( zSign, 0x7FF, 0 );
2655 if ( aExp == 0 ) {
2656 ++expDiff;
2658 else {
2659 aSig |= LIT64( 0x2000000000000000 );
2661 shift64RightJamming( aSig, - expDiff, &aSig );
2662 zExp = bExp;
2664 else {
2665 if ( aExp == 0x7FF ) {
2666 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2667 return a;
2669 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2670 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2671 zExp = aExp;
2672 goto roundAndPack;
2674 aSig |= LIT64( 0x2000000000000000 );
2675 zSig = ( aSig + bSig )<<1;
2676 --zExp;
2677 if ( (sbits64) zSig < 0 ) {
2678 zSig = aSig + bSig;
2679 ++zExp;
2681 roundAndPack:
2682 return roundAndPackFloat64( zSign, zExp, zSig );
2687 -------------------------------------------------------------------------------
2688 Returns the result of subtracting the absolute values of the double-
2689 precision floating-point values `a' and `b'. If `zSign' is 1, the
2690 difference is negated before being returned. `zSign' is ignored if the
2691 result is a NaN. The subtraction is performed according to the IEC/IEEE
2692 Standard for Binary Floating-Point Arithmetic.
2693 -------------------------------------------------------------------------------
2695 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2697 int16 aExp, bExp, zExp;
2698 bits64 aSig, bSig, zSig;
2699 int16 expDiff;
2701 aSig = extractFloat64Frac( a );
2702 aExp = extractFloat64Exp( a );
2703 bSig = extractFloat64Frac( b );
2704 bExp = extractFloat64Exp( b );
2705 expDiff = aExp - bExp;
2706 aSig <<= 10;
2707 bSig <<= 10;
2708 if ( 0 < expDiff ) goto aExpBigger;
2709 if ( expDiff < 0 ) goto bExpBigger;
2710 if ( aExp == 0x7FF ) {
2711 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2712 float_raise( float_flag_invalid );
2713 return float64_default_nan;
2715 if ( aExp == 0 ) {
2716 aExp = 1;
2717 bExp = 1;
2719 if ( bSig < aSig ) goto aBigger;
2720 if ( aSig < bSig ) goto bBigger;
2721 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2722 bExpBigger:
2723 if ( bExp == 0x7FF ) {
2724 if ( bSig ) return propagateFloat64NaN( a, b );
2725 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2727 if ( aExp == 0 ) {
2728 ++expDiff;
2730 else {
2731 aSig |= LIT64( 0x4000000000000000 );
2733 shift64RightJamming( aSig, - expDiff, &aSig );
2734 bSig |= LIT64( 0x4000000000000000 );
2735 bBigger:
2736 zSig = bSig - aSig;
2737 zExp = bExp;
2738 zSign ^= 1;
2739 goto normalizeRoundAndPack;
2740 aExpBigger:
2741 if ( aExp == 0x7FF ) {
2742 if ( aSig ) return propagateFloat64NaN( a, b );
2743 return a;
2745 if ( bExp == 0 ) {
2746 --expDiff;
2748 else {
2749 bSig |= LIT64( 0x4000000000000000 );
2751 shift64RightJamming( bSig, expDiff, &bSig );
2752 aSig |= LIT64( 0x4000000000000000 );
2753 aBigger:
2754 zSig = aSig - bSig;
2755 zExp = aExp;
2756 normalizeRoundAndPack:
2757 --zExp;
2758 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2763 -------------------------------------------------------------------------------
2764 Returns the result of adding the double-precision floating-point values `a'
2765 and `b'. The operation is performed according to the IEC/IEEE Standard for
2766 Binary Floating-Point Arithmetic.
2767 -------------------------------------------------------------------------------
2769 float64 float64_add( float64 a, float64 b )
2771 flag aSign, bSign;
2773 aSign = extractFloat64Sign( a );
2774 bSign = extractFloat64Sign( b );
2775 if ( aSign == bSign ) {
2776 return addFloat64Sigs( a, b, aSign );
2778 else {
2779 return subFloat64Sigs( a, b, aSign );
2785 -------------------------------------------------------------------------------
2786 Returns the result of subtracting the double-precision floating-point values
2787 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2788 for Binary Floating-Point Arithmetic.
2789 -------------------------------------------------------------------------------
2791 float64 float64_sub( float64 a, float64 b )
2793 flag aSign, bSign;
2795 aSign = extractFloat64Sign( a );
2796 bSign = extractFloat64Sign( b );
2797 if ( aSign == bSign ) {
2798 return subFloat64Sigs( a, b, aSign );
2800 else {
2801 return addFloat64Sigs( a, b, aSign );
2807 -------------------------------------------------------------------------------
2808 Returns the result of multiplying the double-precision floating-point values
2809 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2810 for Binary Floating-Point Arithmetic.
2811 -------------------------------------------------------------------------------
2813 float64 float64_mul( float64 a, float64 b )
2815 flag aSign, bSign, zSign;
2816 int16 aExp, bExp, zExp;
2817 bits64 aSig, bSig, zSig0, zSig1;
2819 aSig = extractFloat64Frac( a );
2820 aExp = extractFloat64Exp( a );
2821 aSign = extractFloat64Sign( a );
2822 bSig = extractFloat64Frac( b );
2823 bExp = extractFloat64Exp( b );
2824 bSign = extractFloat64Sign( b );
2825 zSign = aSign ^ bSign;
2826 if ( aExp == 0x7FF ) {
2827 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2828 return propagateFloat64NaN( a, b );
2830 if ( ( bExp | bSig ) == 0 ) {
2831 float_raise( float_flag_invalid );
2832 return float64_default_nan;
2834 return packFloat64( zSign, 0x7FF, 0 );
2836 if ( bExp == 0x7FF ) {
2837 if ( bSig ) return propagateFloat64NaN( a, b );
2838 if ( ( aExp | aSig ) == 0 ) {
2839 float_raise( float_flag_invalid );
2840 return float64_default_nan;
2842 return packFloat64( zSign, 0x7FF, 0 );
2844 if ( aExp == 0 ) {
2845 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2846 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2848 if ( bExp == 0 ) {
2849 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2850 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2852 zExp = aExp + bExp - 0x3FF;
2853 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2854 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2855 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2856 zSig0 |= ( zSig1 != 0 );
2857 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2858 zSig0 <<= 1;
2859 --zExp;
2861 return roundAndPackFloat64( zSign, zExp, zSig0 );
2866 -------------------------------------------------------------------------------
2867 Returns the result of dividing the double-precision floating-point value `a'
2868 by the corresponding value `b'. The operation is performed according to
2869 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2870 -------------------------------------------------------------------------------
2872 float64 float64_div( float64 a, float64 b )
2874 flag aSign, bSign, zSign;
2875 int16 aExp, bExp, zExp;
2876 bits64 aSig, bSig, zSig;
2877 bits64 rem0, rem1;
2878 bits64 term0, term1;
2880 aSig = extractFloat64Frac( a );
2881 aExp = extractFloat64Exp( a );
2882 aSign = extractFloat64Sign( a );
2883 bSig = extractFloat64Frac( b );
2884 bExp = extractFloat64Exp( b );
2885 bSign = extractFloat64Sign( b );
2886 zSign = aSign ^ bSign;
2887 if ( aExp == 0x7FF ) {
2888 if ( aSig ) return propagateFloat64NaN( a, b );
2889 if ( bExp == 0x7FF ) {
2890 if ( bSig ) return propagateFloat64NaN( a, b );
2891 float_raise( float_flag_invalid );
2892 return float64_default_nan;
2894 return packFloat64( zSign, 0x7FF, 0 );
2896 if ( bExp == 0x7FF ) {
2897 if ( bSig ) return propagateFloat64NaN( a, b );
2898 return packFloat64( zSign, 0, 0 );
2900 if ( bExp == 0 ) {
2901 if ( bSig == 0 ) {
2902 if ( ( aExp | aSig ) == 0 ) {
2903 float_raise( float_flag_invalid );
2904 return float64_default_nan;
2906 float_raise( float_flag_divbyzero );
2907 return packFloat64( zSign, 0x7FF, 0 );
2909 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2911 if ( aExp == 0 ) {
2912 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2913 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2915 zExp = aExp - bExp + 0x3FD;
2916 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2917 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2918 if ( bSig <= ( aSig + aSig ) ) {
2919 aSig >>= 1;
2920 ++zExp;
2922 zSig = estimateDiv128To64( aSig, 0, bSig );
2923 if ( ( zSig & 0x1FF ) <= 2 ) {
2924 mul64To128( bSig, zSig, &term0, &term1 );
2925 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2926 while ( (sbits64) rem0 < 0 ) {
2927 --zSig;
2928 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2930 zSig |= ( rem1 != 0 );
2932 return roundAndPackFloat64( zSign, zExp, zSig );
2936 #ifndef SOFTFLOAT_FOR_GCC
2938 -------------------------------------------------------------------------------
2939 Returns the remainder of the double-precision floating-point value `a'
2940 with respect to the corresponding value `b'. The operation is performed
2941 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2942 -------------------------------------------------------------------------------
2944 float64 float64_rem( float64 a, float64 b )
2946 flag aSign, bSign, zSign;
2947 int16 aExp, bExp, expDiff;
2948 bits64 aSig, bSig;
2949 bits64 q, alternateASig;
2950 sbits64 sigMean;
2952 aSig = extractFloat64Frac( a );
2953 aExp = extractFloat64Exp( a );
2954 aSign = extractFloat64Sign( a );
2955 bSig = extractFloat64Frac( b );
2956 bExp = extractFloat64Exp( b );
2957 bSign = extractFloat64Sign( b );
2958 if ( aExp == 0x7FF ) {
2959 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2960 return propagateFloat64NaN( a, b );
2962 float_raise( float_flag_invalid );
2963 return float64_default_nan;
2965 if ( bExp == 0x7FF ) {
2966 if ( bSig ) return propagateFloat64NaN( a, b );
2967 return a;
2969 if ( bExp == 0 ) {
2970 if ( bSig == 0 ) {
2971 float_raise( float_flag_invalid );
2972 return float64_default_nan;
2974 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2976 if ( aExp == 0 ) {
2977 if ( aSig == 0 ) return a;
2978 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2980 expDiff = aExp - bExp;
2981 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2982 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2983 if ( expDiff < 0 ) {
2984 if ( expDiff < -1 ) return a;
2985 aSig >>= 1;
2987 q = ( bSig <= aSig );
2988 if ( q ) aSig -= bSig;
2989 expDiff -= 64;
2990 while ( 0 < expDiff ) {
2991 q = estimateDiv128To64( aSig, 0, bSig );
2992 q = ( 2 < q ) ? q - 2 : 0;
2993 aSig = - ( ( bSig>>2 ) * q );
2994 expDiff -= 62;
2996 expDiff += 64;
2997 if ( 0 < expDiff ) {
2998 q = estimateDiv128To64( aSig, 0, bSig );
2999 q = ( 2 < q ) ? q - 2 : 0;
3000 q >>= 64 - expDiff;
3001 bSig >>= 2;
3002 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3004 else {
3005 aSig >>= 2;
3006 bSig >>= 2;
3008 do {
3009 alternateASig = aSig;
3010 ++q;
3011 aSig -= bSig;
3012 } while ( 0 <= (sbits64) aSig );
3013 sigMean = aSig + alternateASig;
3014 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3015 aSig = alternateASig;
3017 zSign = ( (sbits64) aSig < 0 );
3018 if ( zSign ) aSig = - aSig;
3019 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3024 -------------------------------------------------------------------------------
3025 Returns the square root of the double-precision floating-point value `a'.
3026 The operation is performed according to the IEC/IEEE Standard for Binary
3027 Floating-Point Arithmetic.
3028 -------------------------------------------------------------------------------
3030 float64 float64_sqrt( float64 a )
3032 flag aSign;
3033 int16 aExp, zExp;
3034 bits64 aSig, zSig, doubleZSig;
3035 bits64 rem0, rem1, term0, term1;
3037 aSig = extractFloat64Frac( a );
3038 aExp = extractFloat64Exp( a );
3039 aSign = extractFloat64Sign( a );
3040 if ( aExp == 0x7FF ) {
3041 if ( aSig ) return propagateFloat64NaN( a, a );
3042 if ( ! aSign ) return a;
3043 float_raise( float_flag_invalid );
3044 return float64_default_nan;
3046 if ( aSign ) {
3047 if ( ( aExp | aSig ) == 0 ) return a;
3048 float_raise( float_flag_invalid );
3049 return float64_default_nan;
3051 if ( aExp == 0 ) {
3052 if ( aSig == 0 ) return 0;
3053 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3055 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3056 aSig |= LIT64( 0x0010000000000000 );
3057 zSig = estimateSqrt32( aExp, aSig>>21 );
3058 aSig <<= 9 - ( aExp & 1 );
3059 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3060 if ( ( zSig & 0x1FF ) <= 5 ) {
3061 doubleZSig = zSig<<1;
3062 mul64To128( zSig, zSig, &term0, &term1 );
3063 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3064 while ( (sbits64) rem0 < 0 ) {
3065 --zSig;
3066 doubleZSig -= 2;
3067 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3069 zSig |= ( ( rem0 | rem1 ) != 0 );
3071 return roundAndPackFloat64( 0, zExp, zSig );
3074 #endif
3077 -------------------------------------------------------------------------------
3078 Returns 1 if the double-precision floating-point value `a' is equal to the
3079 corresponding value `b', and 0 otherwise. The comparison is performed
3080 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3081 -------------------------------------------------------------------------------
3083 flag float64_eq( float64 a, float64 b )
3086 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3087 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3089 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3090 float_raise( float_flag_invalid );
3092 return 0;
3094 return ( a == b ) ||
3095 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3100 -------------------------------------------------------------------------------
3101 Returns 1 if the double-precision floating-point value `a' is less than or
3102 equal to the corresponding value `b', and 0 otherwise. The comparison is
3103 performed according to the IEC/IEEE Standard for Binary Floating-Point
3104 Arithmetic.
3105 -------------------------------------------------------------------------------
3107 flag float64_le( float64 a, float64 b )
3109 flag aSign, bSign;
3111 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3112 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3114 float_raise( float_flag_invalid );
3115 return 0;
3117 aSign = extractFloat64Sign( a );
3118 bSign = extractFloat64Sign( b );
3119 if ( aSign != bSign )
3120 return aSign ||
3121 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3122 0 );
3123 return ( a == b ) ||
3124 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3129 -------------------------------------------------------------------------------
3130 Returns 1 if the double-precision floating-point value `a' is less than
3131 the corresponding value `b', and 0 otherwise. The comparison is performed
3132 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3133 -------------------------------------------------------------------------------
3135 flag float64_lt( float64 a, float64 b )
3137 flag aSign, bSign;
3139 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3140 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3142 float_raise( float_flag_invalid );
3143 return 0;
3145 aSign = extractFloat64Sign( a );
3146 bSign = extractFloat64Sign( b );
3147 if ( aSign != bSign )
3148 return aSign &&
3149 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3150 0 );
3151 return ( a != b ) &&
3152 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3156 #ifndef SOFTFLOAT_FOR_GCC
3158 -------------------------------------------------------------------------------
3159 Returns 1 if the double-precision floating-point value `a' is equal to the
3160 corresponding value `b', and 0 otherwise. The invalid exception is raised
3161 if either operand is a NaN. Otherwise, the comparison is performed
3162 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3163 -------------------------------------------------------------------------------
3165 flag float64_eq_signaling( float64 a, float64 b )
3168 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3169 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3171 float_raise( float_flag_invalid );
3172 return 0;
3174 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3179 -------------------------------------------------------------------------------
3180 Returns 1 if the double-precision floating-point value `a' is less than or
3181 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3182 cause an exception. Otherwise, the comparison is performed according to the
3183 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3184 -------------------------------------------------------------------------------
3186 flag float64_le_quiet( float64 a, float64 b )
3188 flag aSign, bSign;
3190 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3191 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3193 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3194 float_raise( float_flag_invalid );
3196 return 0;
3198 aSign = extractFloat64Sign( a );
3199 bSign = extractFloat64Sign( b );
3200 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3201 return ( a == b ) || ( aSign ^ ( a < b ) );
3206 -------------------------------------------------------------------------------
3207 Returns 1 if the double-precision floating-point value `a' is less than
3208 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3209 exception. Otherwise, the comparison is performed according to the IEC/IEEE
3210 Standard for Binary Floating-Point Arithmetic.
3211 -------------------------------------------------------------------------------
3213 flag float64_lt_quiet( float64 a, float64 b )
3215 flag aSign, bSign;
3217 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3218 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3220 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3221 float_raise( float_flag_invalid );
3223 return 0;
3225 aSign = extractFloat64Sign( a );
3226 bSign = extractFloat64Sign( b );
3227 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3228 return ( a != b ) && ( aSign ^ ( a < b ) );
3231 #endif
3233 #ifdef FLOATX80
3236 -------------------------------------------------------------------------------
3237 Returns the result of converting the extended double-precision floating-
3238 point value `a' to the 32-bit two's complement integer format. The
3239 conversion is performed according to the IEC/IEEE Standard for Binary
3240 Floating-Point Arithmetic---which means in particular that the conversion
3241 is rounded according to the current rounding mode. If `a' is a NaN, the
3242 largest positive integer is returned. Otherwise, if the conversion
3243 overflows, the largest integer with the same sign as `a' is returned.
3244 -------------------------------------------------------------------------------
3246 int32 floatx80_to_int32( floatx80 a )
3248 flag aSign;
3249 int32 aExp, shiftCount;
3250 bits64 aSig;
3252 aSig = extractFloatx80Frac( a );
3253 aExp = extractFloatx80Exp( a );
3254 aSign = extractFloatx80Sign( a );
3255 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3256 shiftCount = 0x4037 - aExp;
3257 if ( shiftCount <= 0 ) shiftCount = 1;
3258 shift64RightJamming( aSig, shiftCount, &aSig );
3259 return roundAndPackInt32( aSign, aSig );
3264 -------------------------------------------------------------------------------
3265 Returns the result of converting the extended double-precision floating-
3266 point value `a' to the 32-bit two's complement integer format. The
3267 conversion is performed according to the IEC/IEEE Standard for Binary
3268 Floating-Point Arithmetic, except that the conversion is always rounded
3269 toward zero. If `a' is a NaN, the largest positive integer is returned.
3270 Otherwise, if the conversion overflows, the largest integer with the same
3271 sign as `a' is returned.
3272 -------------------------------------------------------------------------------
3274 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3276 flag aSign;
3277 int32 aExp, shiftCount;
3278 bits64 aSig, savedASig;
3279 int32 z;
3281 aSig = extractFloatx80Frac( a );
3282 aExp = extractFloatx80Exp( a );
3283 aSign = extractFloatx80Sign( a );
3284 if ( 0x401E < aExp ) {
3285 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3286 goto invalid;
3288 else if ( aExp < 0x3FFF ) {
3289 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3290 return 0;
3292 shiftCount = 0x403E - aExp;
3293 savedASig = aSig;
3294 aSig >>= shiftCount;
3295 z = aSig;
3296 if ( aSign ) z = - z;
3297 if ( ( z < 0 ) ^ aSign ) {
3298 invalid:
3299 float_raise( float_flag_invalid );
3300 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3302 if ( ( aSig<<shiftCount ) != savedASig ) {
3303 float_exception_flags |= float_flag_inexact;
3305 return z;
3310 -------------------------------------------------------------------------------
3311 Returns the result of converting the extended double-precision floating-
3312 point value `a' to the 64-bit two's complement integer format. The
3313 conversion is performed according to the IEC/IEEE Standard for Binary
3314 Floating-Point Arithmetic---which means in particular that the conversion
3315 is rounded according to the current rounding mode. If `a' is a NaN,
3316 the largest positive integer is returned. Otherwise, if the conversion
3317 overflows, the largest integer with the same sign as `a' is returned.
3318 -------------------------------------------------------------------------------
3320 int64 floatx80_to_int64( floatx80 a )
3322 flag aSign;
3323 int32 aExp, shiftCount;
3324 bits64 aSig, aSigExtra;
3326 aSig = extractFloatx80Frac( a );
3327 aExp = extractFloatx80Exp( a );
3328 aSign = extractFloatx80Sign( a );
3329 shiftCount = 0x403E - aExp;
3330 if ( shiftCount <= 0 ) {
3331 if ( shiftCount ) {
3332 float_raise( float_flag_invalid );
3333 if ( ! aSign
3334 || ( ( aExp == 0x7FFF )
3335 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3337 return LIT64( 0x7FFFFFFFFFFFFFFF );
3339 return (sbits64) LIT64( 0x8000000000000000 );
3341 aSigExtra = 0;
3343 else {
3344 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3346 return roundAndPackInt64( aSign, aSig, aSigExtra );
3351 -------------------------------------------------------------------------------
3352 Returns the result of converting the extended double-precision floating-
3353 point value `a' to the 64-bit two's complement integer format. The
3354 conversion is performed according to the IEC/IEEE Standard for Binary
3355 Floating-Point Arithmetic, except that the conversion is always rounded
3356 toward zero. If `a' is a NaN, the largest positive integer is returned.
3357 Otherwise, if the conversion overflows, the largest integer with the same
3358 sign as `a' is returned.
3359 -------------------------------------------------------------------------------
3361 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3363 flag aSign;
3364 int32 aExp, shiftCount;
3365 bits64 aSig;
3366 int64 z;
3368 aSig = extractFloatx80Frac( a );
3369 aExp = extractFloatx80Exp( a );
3370 aSign = extractFloatx80Sign( a );
3371 shiftCount = aExp - 0x403E;
3372 if ( 0 <= shiftCount ) {
3373 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3374 if ( ( a.high != 0xC03E ) || aSig ) {
3375 float_raise( float_flag_invalid );
3376 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3377 return LIT64( 0x7FFFFFFFFFFFFFFF );
3380 return (sbits64) LIT64( 0x8000000000000000 );
3382 else if ( aExp < 0x3FFF ) {
3383 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3384 return 0;
3386 z = aSig>>( - shiftCount );
3387 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3388 float_exception_flags |= float_flag_inexact;
3390 if ( aSign ) z = - z;
3391 return z;
3396 -------------------------------------------------------------------------------
3397 Returns the result of converting the extended double-precision floating-
3398 point value `a' to the single-precision floating-point format. The
3399 conversion is performed according to the IEC/IEEE Standard for Binary
3400 Floating-Point Arithmetic.
3401 -------------------------------------------------------------------------------
3403 float32 floatx80_to_float32( floatx80 a )
3405 flag aSign;
3406 int32 aExp;
3407 bits64 aSig;
3409 aSig = extractFloatx80Frac( a );
3410 aExp = extractFloatx80Exp( a );
3411 aSign = extractFloatx80Sign( a );
3412 if ( aExp == 0x7FFF ) {
3413 if ( (bits64) ( aSig<<1 ) ) {
3414 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3416 return packFloat32( aSign, 0xFF, 0 );
3418 shift64RightJamming( aSig, 33, &aSig );
3419 if ( aExp || aSig ) aExp -= 0x3F81;
3420 return roundAndPackFloat32( aSign, aExp, aSig );
3425 -------------------------------------------------------------------------------
3426 Returns the result of converting the extended double-precision floating-
3427 point value `a' to the double-precision floating-point format. The
3428 conversion is performed according to the IEC/IEEE Standard for Binary
3429 Floating-Point Arithmetic.
3430 -------------------------------------------------------------------------------
3432 float64 floatx80_to_float64( floatx80 a )
3434 flag aSign;
3435 int32 aExp;
3436 bits64 aSig, zSig;
3438 aSig = extractFloatx80Frac( a );
3439 aExp = extractFloatx80Exp( a );
3440 aSign = extractFloatx80Sign( a );
3441 if ( aExp == 0x7FFF ) {
3442 if ( (bits64) ( aSig<<1 ) ) {
3443 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3445 return packFloat64( aSign, 0x7FF, 0 );
3447 shift64RightJamming( aSig, 1, &zSig );
3448 if ( aExp || aSig ) aExp -= 0x3C01;
3449 return roundAndPackFloat64( aSign, aExp, zSig );
3453 #ifdef FLOAT128
3456 -------------------------------------------------------------------------------
3457 Returns the result of converting the extended double-precision floating-
3458 point value `a' to the quadruple-precision floating-point format. The
3459 conversion is performed according to the IEC/IEEE Standard for Binary
3460 Floating-Point Arithmetic.
3461 -------------------------------------------------------------------------------
3463 float128 floatx80_to_float128( floatx80 a )
3465 flag aSign;
3466 int16 aExp;
3467 bits64 aSig, zSig0, zSig1;
3469 aSig = extractFloatx80Frac( a );
3470 aExp = extractFloatx80Exp( a );
3471 aSign = extractFloatx80Sign( a );
3472 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3473 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3475 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3476 return packFloat128( aSign, aExp, zSig0, zSig1 );
3480 #endif
3483 -------------------------------------------------------------------------------
3484 Rounds the extended double-precision floating-point value `a' to an integer,
3485 and returns the result as an extended quadruple-precision floating-point
3486 value. The operation is performed according to the IEC/IEEE Standard for
3487 Binary Floating-Point Arithmetic.
3488 -------------------------------------------------------------------------------
3490 floatx80 floatx80_round_to_int( floatx80 a )
3492 flag aSign;
3493 int32 aExp;
3494 bits64 lastBitMask, roundBitsMask;
3495 int8 roundingMode;
3496 floatx80 z;
3498 aExp = extractFloatx80Exp( a );
3499 if ( 0x403E <= aExp ) {
3500 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3501 return propagateFloatx80NaN( a, a );
3503 return a;
3505 if ( aExp < 0x3FFF ) {
3506 if ( ( aExp == 0 )
3507 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3508 return a;
3510 float_exception_flags |= float_flag_inexact;
3511 aSign = extractFloatx80Sign( a );
3512 switch ( float_rounding_mode ) {
3513 case float_round_nearest_even:
3514 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3516 return
3517 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3519 break;
3520 case float_round_to_zero:
3521 break;
3522 case float_round_down:
3523 return
3524 aSign ?
3525 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3526 : packFloatx80( 0, 0, 0 );
3527 case float_round_up:
3528 return
3529 aSign ? packFloatx80( 1, 0, 0 )
3530 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3532 return packFloatx80( aSign, 0, 0 );
3534 lastBitMask = 1;
3535 lastBitMask <<= 0x403E - aExp;
3536 roundBitsMask = lastBitMask - 1;
3537 z = a;
3538 roundingMode = float_rounding_mode;
3539 if ( roundingMode == float_round_nearest_even ) {
3540 z.low += lastBitMask>>1;
3541 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3543 else if ( roundingMode != float_round_to_zero ) {
3544 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3545 z.low += roundBitsMask;
3548 z.low &= ~ roundBitsMask;
3549 if ( z.low == 0 ) {
3550 ++z.high;
3551 z.low = LIT64( 0x8000000000000000 );
3553 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3554 return z;
3559 -------------------------------------------------------------------------------
3560 Returns the result of adding the absolute values of the extended double-
3561 precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3562 negated before being returned. `zSign' is ignored if the result is a NaN.
3563 The addition is performed according to the IEC/IEEE Standard for Binary
3564 Floating-Point Arithmetic.
3565 -------------------------------------------------------------------------------
3567 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3569 int32 aExp, bExp, zExp;
3570 bits64 aSig, bSig, zSig0, zSig1;
3571 int32 expDiff;
3573 aSig = extractFloatx80Frac( a );
3574 aExp = extractFloatx80Exp( a );
3575 bSig = extractFloatx80Frac( b );
3576 bExp = extractFloatx80Exp( b );
3577 expDiff = aExp - bExp;
3578 if ( 0 < expDiff ) {
3579 if ( aExp == 0x7FFF ) {
3580 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3581 return a;
3583 if ( bExp == 0 ) --expDiff;
3584 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3585 zExp = aExp;
3587 else if ( expDiff < 0 ) {
3588 if ( bExp == 0x7FFF ) {
3589 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3590 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3592 if ( aExp == 0 ) ++expDiff;
3593 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3594 zExp = bExp;
3596 else {
3597 if ( aExp == 0x7FFF ) {
3598 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3599 return propagateFloatx80NaN( a, b );
3601 return a;
3603 zSig1 = 0;
3604 zSig0 = aSig + bSig;
3605 if ( aExp == 0 ) {
3606 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3607 goto roundAndPack;
3609 zExp = aExp;
3610 goto shiftRight1;
3612 zSig0 = aSig + bSig;
3613 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3614 shiftRight1:
3615 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3616 zSig0 |= LIT64( 0x8000000000000000 );
3617 ++zExp;
3618 roundAndPack:
3619 return
3620 roundAndPackFloatx80(
3621 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3626 -------------------------------------------------------------------------------
3627 Returns the result of subtracting the absolute values of the extended
3628 double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3629 difference is negated before being returned. `zSign' is ignored if the
3630 result is a NaN. The subtraction is performed according to the IEC/IEEE
3631 Standard for Binary Floating-Point Arithmetic.
3632 -------------------------------------------------------------------------------
3634 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3636 int32 aExp, bExp, zExp;
3637 bits64 aSig, bSig, zSig0, zSig1;
3638 int32 expDiff;
3639 floatx80 z;
3641 aSig = extractFloatx80Frac( a );
3642 aExp = extractFloatx80Exp( a );
3643 bSig = extractFloatx80Frac( b );
3644 bExp = extractFloatx80Exp( b );
3645 expDiff = aExp - bExp;
3646 if ( 0 < expDiff ) goto aExpBigger;
3647 if ( expDiff < 0 ) goto bExpBigger;
3648 if ( aExp == 0x7FFF ) {
3649 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3650 return propagateFloatx80NaN( a, b );
3652 float_raise( float_flag_invalid );
3653 z.low = floatx80_default_nan_low;
3654 z.high = floatx80_default_nan_high;
3655 return z;
3657 if ( aExp == 0 ) {
3658 aExp = 1;
3659 bExp = 1;
3661 zSig1 = 0;
3662 if ( bSig < aSig ) goto aBigger;
3663 if ( aSig < bSig ) goto bBigger;
3664 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3665 bExpBigger:
3666 if ( bExp == 0x7FFF ) {
3667 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3668 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670 if ( aExp == 0 ) ++expDiff;
3671 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3672 bBigger:
3673 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3674 zExp = bExp;
3675 zSign ^= 1;
3676 goto normalizeRoundAndPack;
3677 aExpBigger:
3678 if ( aExp == 0x7FFF ) {
3679 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3680 return a;
3682 if ( bExp == 0 ) --expDiff;
3683 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3684 aBigger:
3685 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3686 zExp = aExp;
3687 normalizeRoundAndPack:
3688 return
3689 normalizeRoundAndPackFloatx80(
3690 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3695 -------------------------------------------------------------------------------
3696 Returns the result of adding the extended double-precision floating-point
3697 values `a' and `b'. The operation is performed according to the IEC/IEEE
3698 Standard for Binary Floating-Point Arithmetic.
3699 -------------------------------------------------------------------------------
3701 floatx80 floatx80_add( floatx80 a, floatx80 b )
3703 flag aSign, bSign;
3705 aSign = extractFloatx80Sign( a );
3706 bSign = extractFloatx80Sign( b );
3707 if ( aSign == bSign ) {
3708 return addFloatx80Sigs( a, b, aSign );
3710 else {
3711 return subFloatx80Sigs( a, b, aSign );
3717 -------------------------------------------------------------------------------
3718 Returns the result of subtracting the extended double-precision floating-
3719 point values `a' and `b'. The operation is performed according to the
3720 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3721 -------------------------------------------------------------------------------
3723 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3725 flag aSign, bSign;
3727 aSign = extractFloatx80Sign( a );
3728 bSign = extractFloatx80Sign( b );
3729 if ( aSign == bSign ) {
3730 return subFloatx80Sigs( a, b, aSign );
3732 else {
3733 return addFloatx80Sigs( a, b, aSign );
3739 -------------------------------------------------------------------------------
3740 Returns the result of multiplying the extended double-precision floating-
3741 point values `a' and `b'. The operation is performed according to the
3742 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3743 -------------------------------------------------------------------------------
3745 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3747 flag aSign, bSign, zSign;
3748 int32 aExp, bExp, zExp;
3749 bits64 aSig, bSig, zSig0, zSig1;
3750 floatx80 z;
3752 aSig = extractFloatx80Frac( a );
3753 aExp = extractFloatx80Exp( a );
3754 aSign = extractFloatx80Sign( a );
3755 bSig = extractFloatx80Frac( b );
3756 bExp = extractFloatx80Exp( b );
3757 bSign = extractFloatx80Sign( b );
3758 zSign = aSign ^ bSign;
3759 if ( aExp == 0x7FFF ) {
3760 if ( (bits64) ( aSig<<1 )
3761 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3762 return propagateFloatx80NaN( a, b );
3764 if ( ( bExp | bSig ) == 0 ) goto invalid;
3765 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3767 if ( bExp == 0x7FFF ) {
3768 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3769 if ( ( aExp | aSig ) == 0 ) {
3770 invalid:
3771 float_raise( float_flag_invalid );
3772 z.low = floatx80_default_nan_low;
3773 z.high = floatx80_default_nan_high;
3774 return z;
3776 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3778 if ( aExp == 0 ) {
3779 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3780 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3782 if ( bExp == 0 ) {
3783 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3784 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3786 zExp = aExp + bExp - 0x3FFE;
3787 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3788 if ( 0 < (sbits64) zSig0 ) {
3789 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3790 --zExp;
3792 return
3793 roundAndPackFloatx80(
3794 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3799 -------------------------------------------------------------------------------
3800 Returns the result of dividing the extended double-precision floating-point
3801 value `a' by the corresponding value `b'. The operation is performed
3802 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3803 -------------------------------------------------------------------------------
3805 floatx80 floatx80_div( floatx80 a, floatx80 b )
3807 flag aSign, bSign, zSign;
3808 int32 aExp, bExp, zExp;
3809 bits64 aSig, bSig, zSig0, zSig1;
3810 bits64 rem0, rem1, rem2, term0, term1, term2;
3811 floatx80 z;
3813 aSig = extractFloatx80Frac( a );
3814 aExp = extractFloatx80Exp( a );
3815 aSign = extractFloatx80Sign( a );
3816 bSig = extractFloatx80Frac( b );
3817 bExp = extractFloatx80Exp( b );
3818 bSign = extractFloatx80Sign( b );
3819 zSign = aSign ^ bSign;
3820 if ( aExp == 0x7FFF ) {
3821 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3822 if ( bExp == 0x7FFF ) {
3823 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3824 goto invalid;
3826 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3828 if ( bExp == 0x7FFF ) {
3829 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3830 return packFloatx80( zSign, 0, 0 );
3832 if ( bExp == 0 ) {
3833 if ( bSig == 0 ) {
3834 if ( ( aExp | aSig ) == 0 ) {
3835 invalid:
3836 float_raise( float_flag_invalid );
3837 z.low = floatx80_default_nan_low;
3838 z.high = floatx80_default_nan_high;
3839 return z;
3841 float_raise( float_flag_divbyzero );
3842 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3844 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3846 if ( aExp == 0 ) {
3847 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3848 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3850 zExp = aExp - bExp + 0x3FFE;
3851 rem1 = 0;
3852 if ( bSig <= aSig ) {
3853 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3854 ++zExp;
3856 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3857 mul64To128( bSig, zSig0, &term0, &term1 );
3858 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3859 while ( (sbits64) rem0 < 0 ) {
3860 --zSig0;
3861 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3863 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3864 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3865 mul64To128( bSig, zSig1, &term1, &term2 );
3866 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3867 while ( (sbits64) rem1 < 0 ) {
3868 --zSig1;
3869 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3871 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3873 return
3874 roundAndPackFloatx80(
3875 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3880 -------------------------------------------------------------------------------
3881 Returns the remainder of the extended double-precision floating-point value
3882 `a' with respect to the corresponding value `b'. The operation is performed
3883 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3884 -------------------------------------------------------------------------------
3886 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3888 flag aSign, bSign, zSign;
3889 int32 aExp, bExp, expDiff;
3890 bits64 aSig0, aSig1, bSig;
3891 bits64 q, term0, term1, alternateASig0, alternateASig1;
3892 floatx80 z;
3894 aSig0 = extractFloatx80Frac( a );
3895 aExp = extractFloatx80Exp( a );
3896 aSign = extractFloatx80Sign( a );
3897 bSig = extractFloatx80Frac( b );
3898 bExp = extractFloatx80Exp( b );
3899 bSign = extractFloatx80Sign( b );
3900 if ( aExp == 0x7FFF ) {
3901 if ( (bits64) ( aSig0<<1 )
3902 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3903 return propagateFloatx80NaN( a, b );
3905 goto invalid;
3907 if ( bExp == 0x7FFF ) {
3908 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3909 return a;
3911 if ( bExp == 0 ) {
3912 if ( bSig == 0 ) {
3913 invalid:
3914 float_raise( float_flag_invalid );
3915 z.low = floatx80_default_nan_low;
3916 z.high = floatx80_default_nan_high;
3917 return z;
3919 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3921 if ( aExp == 0 ) {
3922 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3923 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3925 bSig |= LIT64( 0x8000000000000000 );
3926 zSign = aSign;
3927 expDiff = aExp - bExp;
3928 aSig1 = 0;
3929 if ( expDiff < 0 ) {
3930 if ( expDiff < -1 ) return a;
3931 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3932 expDiff = 0;
3934 q = ( bSig <= aSig0 );
3935 if ( q ) aSig0 -= bSig;
3936 expDiff -= 64;
3937 while ( 0 < expDiff ) {
3938 q = estimateDiv128To64( aSig0, aSig1, bSig );
3939 q = ( 2 < q ) ? q - 2 : 0;
3940 mul64To128( bSig, q, &term0, &term1 );
3941 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3942 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3943 expDiff -= 62;
3945 expDiff += 64;
3946 if ( 0 < expDiff ) {
3947 q = estimateDiv128To64( aSig0, aSig1, bSig );
3948 q = ( 2 < q ) ? q - 2 : 0;
3949 q >>= 64 - expDiff;
3950 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3951 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3952 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3953 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3954 ++q;
3955 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3958 else {
3959 term1 = 0;
3960 term0 = bSig;
3962 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3963 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3964 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3965 && ( q & 1 ) )
3967 aSig0 = alternateASig0;
3968 aSig1 = alternateASig1;
3969 zSign = ! zSign;
3971 return
3972 normalizeRoundAndPackFloatx80(
3973 80, zSign, bExp + expDiff, aSig0, aSig1 );
3978 -------------------------------------------------------------------------------
3979 Returns the square root of the extended double-precision floating-point
3980 value `a'. The operation is performed according to the IEC/IEEE Standard
3981 for Binary Floating-Point Arithmetic.
3982 -------------------------------------------------------------------------------
3984 floatx80 floatx80_sqrt( floatx80 a )
3986 flag aSign;
3987 int32 aExp, zExp;
3988 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3989 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3990 floatx80 z;
3992 aSig0 = extractFloatx80Frac( a );
3993 aExp = extractFloatx80Exp( a );
3994 aSign = extractFloatx80Sign( a );
3995 if ( aExp == 0x7FFF ) {
3996 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3997 if ( ! aSign ) return a;
3998 goto invalid;
4000 if ( aSign ) {
4001 if ( ( aExp | aSig0 ) == 0 ) return a;
4002 invalid:
4003 float_raise( float_flag_invalid );
4004 z.low = floatx80_default_nan_low;
4005 z.high = floatx80_default_nan_high;
4006 return z;
4008 if ( aExp == 0 ) {
4009 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4010 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4012 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4013 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4014 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4015 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4016 doubleZSig0 = zSig0<<1;
4017 mul64To128( zSig0, zSig0, &term0, &term1 );
4018 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4019 while ( (sbits64) rem0 < 0 ) {
4020 --zSig0;
4021 doubleZSig0 -= 2;
4022 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4024 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4025 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4026 if ( zSig1 == 0 ) zSig1 = 1;
4027 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4028 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4029 mul64To128( zSig1, zSig1, &term2, &term3 );
4030 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4031 while ( (sbits64) rem1 < 0 ) {
4032 --zSig1;
4033 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4034 term3 |= 1;
4035 term2 |= doubleZSig0;
4036 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4038 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4040 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4041 zSig0 |= doubleZSig0;
4042 return
4043 roundAndPackFloatx80(
4044 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4049 -------------------------------------------------------------------------------
4050 Returns 1 if the extended double-precision floating-point value `a' is
4051 equal to the corresponding value `b', and 0 otherwise. The comparison is
4052 performed according to the IEC/IEEE Standard for Binary Floating-Point
4053 Arithmetic.
4054 -------------------------------------------------------------------------------
4056 flag floatx80_eq( floatx80 a, floatx80 b )
4059 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4060 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4061 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4062 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4064 if ( floatx80_is_signaling_nan( a )
4065 || floatx80_is_signaling_nan( b ) ) {
4066 float_raise( float_flag_invalid );
4068 return 0;
4070 return
4071 ( a.low == b.low )
4072 && ( ( a.high == b.high )
4073 || ( ( a.low == 0 )
4074 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4080 -------------------------------------------------------------------------------
4081 Returns 1 if the extended double-precision floating-point value `a' is
4082 less than or equal to the corresponding value `b', and 0 otherwise. The
4083 comparison is performed according to the IEC/IEEE Standard for Binary
4084 Floating-Point Arithmetic.
4085 -------------------------------------------------------------------------------
4087 flag floatx80_le( floatx80 a, floatx80 b )
4089 flag aSign, bSign;
4091 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4092 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4093 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4094 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4096 float_raise( float_flag_invalid );
4097 return 0;
4099 aSign = extractFloatx80Sign( a );
4100 bSign = extractFloatx80Sign( b );
4101 if ( aSign != bSign ) {
4102 return
4103 aSign
4104 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4105 == 0 );
4107 return
4108 aSign ? le128( b.high, b.low, a.high, a.low )
4109 : le128( a.high, a.low, b.high, b.low );
4114 -------------------------------------------------------------------------------
4115 Returns 1 if the extended double-precision floating-point value `a' is
4116 less than the corresponding value `b', and 0 otherwise. The comparison
4117 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4118 Arithmetic.
4119 -------------------------------------------------------------------------------
4121 flag floatx80_lt( floatx80 a, floatx80 b )
4123 flag aSign, bSign;
4125 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4126 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4127 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4128 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4130 float_raise( float_flag_invalid );
4131 return 0;
4133 aSign = extractFloatx80Sign( a );
4134 bSign = extractFloatx80Sign( b );
4135 if ( aSign != bSign ) {
4136 return
4137 aSign
4138 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4139 != 0 );
4141 return
4142 aSign ? lt128( b.high, b.low, a.high, a.low )
4143 : lt128( a.high, a.low, b.high, b.low );
4148 -------------------------------------------------------------------------------
4149 Returns 1 if the extended double-precision floating-point value `a' is equal
4150 to the corresponding value `b', and 0 otherwise. The invalid exception is
4151 raised if either operand is a NaN. Otherwise, the comparison is performed
4152 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4153 -------------------------------------------------------------------------------
4155 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4158 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4159 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4160 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4161 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4163 float_raise( float_flag_invalid );
4164 return 0;
4166 return
4167 ( a.low == b.low )
4168 && ( ( a.high == b.high )
4169 || ( ( a.low == 0 )
4170 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4176 -------------------------------------------------------------------------------
4177 Returns 1 if the extended double-precision floating-point value `a' is less
4178 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4179 do not cause an exception. Otherwise, the comparison is performed according
4180 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4181 -------------------------------------------------------------------------------
4183 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4185 flag aSign, bSign;
4187 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4188 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4189 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4190 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4192 if ( floatx80_is_signaling_nan( a )
4193 || floatx80_is_signaling_nan( b ) ) {
4194 float_raise( float_flag_invalid );
4196 return 0;
4198 aSign = extractFloatx80Sign( a );
4199 bSign = extractFloatx80Sign( b );
4200 if ( aSign != bSign ) {
4201 return
4202 aSign
4203 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4204 == 0 );
4206 return
4207 aSign ? le128( b.high, b.low, a.high, a.low )
4208 : le128( a.high, a.low, b.high, b.low );
4213 -------------------------------------------------------------------------------
4214 Returns 1 if the extended double-precision floating-point value `a' is less
4215 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4216 an exception. Otherwise, the comparison is performed according to the
4217 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4218 -------------------------------------------------------------------------------
4220 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4222 flag aSign, bSign;
4224 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4225 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4226 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4227 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4229 if ( floatx80_is_signaling_nan( a )
4230 || floatx80_is_signaling_nan( b ) ) {
4231 float_raise( float_flag_invalid );
4233 return 0;
4235 aSign = extractFloatx80Sign( a );
4236 bSign = extractFloatx80Sign( b );
4237 if ( aSign != bSign ) {
4238 return
4239 aSign
4240 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4241 != 0 );
4243 return
4244 aSign ? lt128( b.high, b.low, a.high, a.low )
4245 : lt128( a.high, a.low, b.high, b.low );
4249 #endif
4251 #ifdef FLOAT128
4254 -------------------------------------------------------------------------------
4255 Returns the result of converting the quadruple-precision floating-point
4256 value `a' to the 32-bit two's complement integer format. The conversion
4257 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4258 Arithmetic---which means in particular that the conversion is rounded
4259 according to the current rounding mode. If `a' is a NaN, the largest
4260 positive integer is returned. Otherwise, if the conversion overflows, the
4261 largest integer with the same sign as `a' is returned.
4262 -------------------------------------------------------------------------------
4264 int32 float128_to_int32( float128 a )
4266 flag aSign;
4267 int32 aExp, shiftCount;
4268 bits64 aSig0, aSig1;
4270 aSig1 = extractFloat128Frac1( a );
4271 aSig0 = extractFloat128Frac0( a );
4272 aExp = extractFloat128Exp( a );
4273 aSign = extractFloat128Sign( a );
4274 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4275 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4276 aSig0 |= ( aSig1 != 0 );
4277 shiftCount = 0x4028 - aExp;
4278 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4279 return roundAndPackInt32( aSign, aSig0 );
4284 -------------------------------------------------------------------------------
4285 Returns the result of converting the quadruple-precision floating-point
4286 value `a' to the 32-bit two's complement integer format. The conversion
4287 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4288 Arithmetic, except that the conversion is always rounded toward zero. If
4289 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4290 conversion overflows, the largest integer with the same sign as `a' is
4291 returned.
4292 -------------------------------------------------------------------------------
4294 int32 float128_to_int32_round_to_zero( float128 a )
4296 flag aSign;
4297 int32 aExp, shiftCount;
4298 bits64 aSig0, aSig1, savedASig;
4299 int32 z;
4301 aSig1 = extractFloat128Frac1( a );
4302 aSig0 = extractFloat128Frac0( a );
4303 aExp = extractFloat128Exp( a );
4304 aSign = extractFloat128Sign( a );
4305 aSig0 |= ( aSig1 != 0 );
4306 if ( 0x401E < aExp ) {
4307 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4308 goto invalid;
4310 else if ( aExp < 0x3FFF ) {
4311 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4312 return 0;
4314 aSig0 |= LIT64( 0x0001000000000000 );
4315 shiftCount = 0x402F - aExp;
4316 savedASig = aSig0;
4317 aSig0 >>= shiftCount;
4318 z = aSig0;
4319 if ( aSign ) z = - z;
4320 if ( ( z < 0 ) ^ aSign ) {
4321 invalid:
4322 float_raise( float_flag_invalid );
4323 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4325 if ( ( aSig0<<shiftCount ) != savedASig ) {
4326 float_exception_flags |= float_flag_inexact;
4328 return z;
4333 -------------------------------------------------------------------------------
4334 Returns the result of converting the quadruple-precision floating-point
4335 value `a' to the 64-bit two's complement integer format. The conversion
4336 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337 Arithmetic---which means in particular that the conversion is rounded
4338 according to the current rounding mode. If `a' is a NaN, the largest
4339 positive integer is returned. Otherwise, if the conversion overflows, the
4340 largest integer with the same sign as `a' is returned.
4341 -------------------------------------------------------------------------------
4343 int64 float128_to_int64( float128 a )
4345 flag aSign;
4346 int32 aExp, shiftCount;
4347 bits64 aSig0, aSig1;
4349 aSig1 = extractFloat128Frac1( a );
4350 aSig0 = extractFloat128Frac0( a );
4351 aExp = extractFloat128Exp( a );
4352 aSign = extractFloat128Sign( a );
4353 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4354 shiftCount = 0x402F - aExp;
4355 if ( shiftCount <= 0 ) {
4356 if ( 0x403E < aExp ) {
4357 float_raise( float_flag_invalid );
4358 if ( ! aSign
4359 || ( ( aExp == 0x7FFF )
4360 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4363 return LIT64( 0x7FFFFFFFFFFFFFFF );
4365 return (sbits64) LIT64( 0x8000000000000000 );
4367 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4369 else {
4370 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4372 return roundAndPackInt64( aSign, aSig0, aSig1 );
4377 -------------------------------------------------------------------------------
4378 Returns the result of converting the quadruple-precision floating-point
4379 value `a' to the 64-bit two's complement integer format. The conversion
4380 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4381 Arithmetic, except that the conversion is always rounded toward zero.
4382 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4383 the conversion overflows, the largest integer with the same sign as `a' is
4384 returned.
4385 -------------------------------------------------------------------------------
4387 int64 float128_to_int64_round_to_zero( float128 a )
4389 flag aSign;
4390 int32 aExp, shiftCount;
4391 bits64 aSig0, aSig1;
4392 int64 z;
4394 aSig1 = extractFloat128Frac1( a );
4395 aSig0 = extractFloat128Frac0( a );
4396 aExp = extractFloat128Exp( a );
4397 aSign = extractFloat128Sign( a );
4398 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4399 shiftCount = aExp - 0x402F;
4400 if ( 0 < shiftCount ) {
4401 if ( 0x403E <= aExp ) {
4402 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4403 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4404 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4405 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4407 else {
4408 float_raise( float_flag_invalid );
4409 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4410 return LIT64( 0x7FFFFFFFFFFFFFFF );
4413 return (sbits64) LIT64( 0x8000000000000000 );
4415 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4416 if ( (bits64) ( aSig1<<shiftCount ) ) {
4417 float_exception_flags |= float_flag_inexact;
4420 else {
4421 if ( aExp < 0x3FFF ) {
4422 if ( aExp | aSig0 | aSig1 ) {
4423 float_exception_flags |= float_flag_inexact;
4425 return 0;
4427 z = aSig0>>( - shiftCount );
4428 if ( aSig1
4429 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4430 float_exception_flags |= float_flag_inexact;
4433 if ( aSign ) z = - z;
4434 return z;
4439 * just like above - but do not care for overflow of signed results
4441 uint64 float128_to_uint64_round_to_zero( float128 a )
4443 flag aSign;
4444 int32 aExp, shiftCount;
4445 bits64 aSig0, aSig1;
4446 uint64 z;
4448 aSig1 = extractFloat128Frac1( a );
4449 aSig0 = extractFloat128Frac0( a );
4450 aExp = extractFloat128Exp( a );
4451 aSign = extractFloat128Sign( a );
4452 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4453 shiftCount = aExp - 0x402F;
4454 if ( 0 < shiftCount ) {
4455 if ( 0x403F <= aExp ) {
4456 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4457 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4458 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4459 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4461 else {
4462 float_raise( float_flag_invalid );
4464 return LIT64( 0xFFFFFFFFFFFFFFFF );
4466 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4467 if ( (bits64) ( aSig1<<shiftCount ) ) {
4468 float_exception_flags |= float_flag_inexact;
4471 else {
4472 if ( aExp < 0x3FFF ) {
4473 if ( aExp | aSig0 | aSig1 ) {
4474 float_exception_flags |= float_flag_inexact;
4476 return 0;
4478 z = aSig0>>( - shiftCount );
4479 if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4480 float_exception_flags |= float_flag_inexact;
4483 if ( aSign ) z = - z;
4484 return z;
4489 -------------------------------------------------------------------------------
4490 Returns the result of converting the quadruple-precision floating-point
4491 value `a' to the single-precision floating-point format. The conversion
4492 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4493 Arithmetic.
4494 -------------------------------------------------------------------------------
4496 float32 float128_to_float32( float128 a )
4498 flag aSign;
4499 int32 aExp;
4500 bits64 aSig0, aSig1;
4501 bits32 zSig;
4503 aSig1 = extractFloat128Frac1( a );
4504 aSig0 = extractFloat128Frac0( a );
4505 aExp = extractFloat128Exp( a );
4506 aSign = extractFloat128Sign( a );
4507 if ( aExp == 0x7FFF ) {
4508 if ( aSig0 | aSig1 ) {
4509 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4511 return packFloat32( aSign, 0xFF, 0 );
4513 aSig0 |= ( aSig1 != 0 );
4514 shift64RightJamming( aSig0, 18, &aSig0 );
4515 zSig = aSig0;
4516 if ( aExp || zSig ) {
4517 zSig |= 0x40000000;
4518 aExp -= 0x3F81;
4520 return roundAndPackFloat32( aSign, aExp, zSig );
4525 -------------------------------------------------------------------------------
4526 Returns the result of converting the quadruple-precision floating-point
4527 value `a' to the double-precision floating-point format. The conversion
4528 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4529 Arithmetic.
4530 -------------------------------------------------------------------------------
4532 float64 float128_to_float64( float128 a )
4534 flag aSign;
4535 int32 aExp;
4536 bits64 aSig0, aSig1;
4538 aSig1 = extractFloat128Frac1( a );
4539 aSig0 = extractFloat128Frac0( a );
4540 aExp = extractFloat128Exp( a );
4541 aSign = extractFloat128Sign( a );
4542 if ( aExp == 0x7FFF ) {
4543 if ( aSig0 | aSig1 ) {
4544 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4546 return packFloat64( aSign, 0x7FF, 0 );
4548 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4549 aSig0 |= ( aSig1 != 0 );
4550 if ( aExp || aSig0 ) {
4551 aSig0 |= LIT64( 0x4000000000000000 );
4552 aExp -= 0x3C01;
4554 return roundAndPackFloat64( aSign, aExp, aSig0 );
4558 #ifdef FLOATX80
4561 -------------------------------------------------------------------------------
4562 Returns the result of converting the quadruple-precision floating-point
4563 value `a' to the extended double-precision floating-point format. The
4564 conversion is performed according to the IEC/IEEE Standard for Binary
4565 Floating-Point Arithmetic.
4566 -------------------------------------------------------------------------------
4568 floatx80 float128_to_floatx80( float128 a )
4570 flag aSign;
4571 int32 aExp;
4572 bits64 aSig0, aSig1;
4574 aSig1 = extractFloat128Frac1( a );
4575 aSig0 = extractFloat128Frac0( a );
4576 aExp = extractFloat128Exp( a );
4577 aSign = extractFloat128Sign( a );
4578 if ( aExp == 0x7FFF ) {
4579 if ( aSig0 | aSig1 ) {
4580 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4582 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4584 if ( aExp == 0 ) {
4585 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4586 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4588 else {
4589 aSig0 |= LIT64( 0x0001000000000000 );
4591 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4592 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4596 #endif
4599 -------------------------------------------------------------------------------
4600 Rounds the quadruple-precision floating-point value `a' to an integer, and
4601 returns the result as a quadruple-precision floating-point value. The
4602 operation is performed according to the IEC/IEEE Standard for Binary
4603 Floating-Point Arithmetic.
4604 -------------------------------------------------------------------------------
4606 float128 float128_round_to_int( float128 a )
4608 flag aSign;
4609 int32 aExp;
4610 bits64 lastBitMask, roundBitsMask;
4611 int8 roundingMode;
4612 float128 z;
4614 aExp = extractFloat128Exp( a );
4615 if ( 0x402F <= aExp ) {
4616 if ( 0x406F <= aExp ) {
4617 if ( ( aExp == 0x7FFF )
4618 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4620 return propagateFloat128NaN( a, a );
4622 return a;
4624 lastBitMask = 1;
4625 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4626 roundBitsMask = lastBitMask - 1;
4627 z = a;
4628 roundingMode = float_rounding_mode;
4629 if ( roundingMode == float_round_nearest_even ) {
4630 if ( lastBitMask ) {
4631 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4632 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4634 else {
4635 if ( (sbits64) z.low < 0 ) {
4636 ++z.high;
4637 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4641 else if ( roundingMode != float_round_to_zero ) {
4642 if ( extractFloat128Sign( z )
4643 ^ ( roundingMode == float_round_up ) ) {
4644 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4647 z.low &= ~ roundBitsMask;
4649 else {
4650 if ( aExp < 0x3FFF ) {
4651 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4652 float_exception_flags |= float_flag_inexact;
4653 aSign = extractFloat128Sign( a );
4654 switch ( float_rounding_mode ) {
4655 case float_round_nearest_even:
4656 if ( ( aExp == 0x3FFE )
4657 && ( extractFloat128Frac0( a )
4658 | extractFloat128Frac1( a ) )
4660 return packFloat128( aSign, 0x3FFF, 0, 0 );
4662 break;
4663 case float_round_to_zero:
4664 break;
4665 case float_round_down:
4666 return
4667 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4668 : packFloat128( 0, 0, 0, 0 );
4669 case float_round_up:
4670 return
4671 aSign ? packFloat128( 1, 0, 0, 0 )
4672 : packFloat128( 0, 0x3FFF, 0, 0 );
4674 return packFloat128( aSign, 0, 0, 0 );
4676 lastBitMask = 1;
4677 lastBitMask <<= 0x402F - aExp;
4678 roundBitsMask = lastBitMask - 1;
4679 z.low = 0;
4680 z.high = a.high;
4681 roundingMode = float_rounding_mode;
4682 if ( roundingMode == float_round_nearest_even ) {
4683 z.high += lastBitMask>>1;
4684 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4685 z.high &= ~ lastBitMask;
4688 else if ( roundingMode != float_round_to_zero ) {
4689 if ( extractFloat128Sign( z )
4690 ^ ( roundingMode == float_round_up ) ) {
4691 z.high |= ( a.low != 0 );
4692 z.high += roundBitsMask;
4695 z.high &= ~ roundBitsMask;
4697 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4698 float_exception_flags |= float_flag_inexact;
4700 return z;
4705 -------------------------------------------------------------------------------
4706 Returns the result of adding the absolute values of the quadruple-precision
4707 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4708 before being returned. `zSign' is ignored if the result is a NaN.
4709 The addition is performed according to the IEC/IEEE Standard for Binary
4710 Floating-Point Arithmetic.
4711 -------------------------------------------------------------------------------
4713 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4715 int32 aExp, bExp, zExp;
4716 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4717 int32 expDiff;
4719 aSig1 = extractFloat128Frac1( a );
4720 aSig0 = extractFloat128Frac0( a );
4721 aExp = extractFloat128Exp( a );
4722 bSig1 = extractFloat128Frac1( b );
4723 bSig0 = extractFloat128Frac0( b );
4724 bExp = extractFloat128Exp( b );
4725 expDiff = aExp - bExp;
4726 if ( 0 < expDiff ) {
4727 if ( aExp == 0x7FFF ) {
4728 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4729 return a;
4731 if ( bExp == 0 ) {
4732 --expDiff;
4734 else {
4735 bSig0 |= LIT64( 0x0001000000000000 );
4737 shift128ExtraRightJamming(
4738 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4739 zExp = aExp;
4741 else if ( expDiff < 0 ) {
4742 if ( bExp == 0x7FFF ) {
4743 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4744 return packFloat128( zSign, 0x7FFF, 0, 0 );
4746 if ( aExp == 0 ) {
4747 ++expDiff;
4749 else {
4750 aSig0 |= LIT64( 0x0001000000000000 );
4752 shift128ExtraRightJamming(
4753 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4754 zExp = bExp;
4756 else {
4757 if ( aExp == 0x7FFF ) {
4758 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4759 return propagateFloat128NaN( a, b );
4761 return a;
4763 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4764 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4765 zSig2 = 0;
4766 zSig0 |= LIT64( 0x0002000000000000 );
4767 zExp = aExp;
4768 goto shiftRight1;
4770 aSig0 |= LIT64( 0x0001000000000000 );
4771 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4772 --zExp;
4773 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4774 ++zExp;
4775 shiftRight1:
4776 shift128ExtraRightJamming(
4777 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4778 roundAndPack:
4779 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4784 -------------------------------------------------------------------------------
4785 Returns the result of subtracting the absolute values of the quadruple-
4786 precision floating-point values `a' and `b'. If `zSign' is 1, the
4787 difference is negated before being returned. `zSign' is ignored if the
4788 result is a NaN. The subtraction is performed according to the IEC/IEEE
4789 Standard for Binary Floating-Point Arithmetic.
4790 -------------------------------------------------------------------------------
4792 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4794 int32 aExp, bExp, zExp;
4795 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4796 int32 expDiff;
4797 float128 z;
4799 aSig1 = extractFloat128Frac1( a );
4800 aSig0 = extractFloat128Frac0( a );
4801 aExp = extractFloat128Exp( a );
4802 bSig1 = extractFloat128Frac1( b );
4803 bSig0 = extractFloat128Frac0( b );
4804 bExp = extractFloat128Exp( b );
4805 expDiff = aExp - bExp;
4806 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4807 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4808 if ( 0 < expDiff ) goto aExpBigger;
4809 if ( expDiff < 0 ) goto bExpBigger;
4810 if ( aExp == 0x7FFF ) {
4811 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4812 return propagateFloat128NaN( a, b );
4814 float_raise( float_flag_invalid );
4815 z.low = float128_default_nan_low;
4816 z.high = float128_default_nan_high;
4817 return z;
4819 if ( aExp == 0 ) {
4820 aExp = 1;
4821 bExp = 1;
4823 if ( bSig0 < aSig0 ) goto aBigger;
4824 if ( aSig0 < bSig0 ) goto bBigger;
4825 if ( bSig1 < aSig1 ) goto aBigger;
4826 if ( aSig1 < bSig1 ) goto bBigger;
4827 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4828 bExpBigger:
4829 if ( bExp == 0x7FFF ) {
4830 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4831 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4833 if ( aExp == 0 ) {
4834 ++expDiff;
4836 else {
4837 aSig0 |= LIT64( 0x4000000000000000 );
4839 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4840 bSig0 |= LIT64( 0x4000000000000000 );
4841 bBigger:
4842 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4843 zExp = bExp;
4844 zSign ^= 1;
4845 goto normalizeRoundAndPack;
4846 aExpBigger:
4847 if ( aExp == 0x7FFF ) {
4848 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4849 return a;
4851 if ( bExp == 0 ) {
4852 --expDiff;
4854 else {
4855 bSig0 |= LIT64( 0x4000000000000000 );
4857 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4858 aSig0 |= LIT64( 0x4000000000000000 );
4859 aBigger:
4860 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4861 zExp = aExp;
4862 normalizeRoundAndPack:
4863 --zExp;
4864 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4869 -------------------------------------------------------------------------------
4870 Returns the result of adding the quadruple-precision floating-point values
4871 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4872 for Binary Floating-Point Arithmetic.
4873 -------------------------------------------------------------------------------
4875 float128 float128_add( float128 a, float128 b )
4877 flag aSign, bSign;
4879 aSign = extractFloat128Sign( a );
4880 bSign = extractFloat128Sign( b );
4881 if ( aSign == bSign ) {
4882 return addFloat128Sigs( a, b, aSign );
4884 else {
4885 return subFloat128Sigs( a, b, aSign );
4891 -------------------------------------------------------------------------------
4892 Returns the result of subtracting the quadruple-precision floating-point
4893 values `a' and `b'. The operation is performed according to the IEC/IEEE
4894 Standard for Binary Floating-Point Arithmetic.
4895 -------------------------------------------------------------------------------
4897 float128 float128_sub( float128 a, float128 b )
4899 flag aSign, bSign;
4901 aSign = extractFloat128Sign( a );
4902 bSign = extractFloat128Sign( b );
4903 if ( aSign == bSign ) {
4904 return subFloat128Sigs( a, b, aSign );
4906 else {
4907 return addFloat128Sigs( a, b, aSign );
4913 -------------------------------------------------------------------------------
4914 Returns the result of multiplying the quadruple-precision floating-point
4915 values `a' and `b'. The operation is performed according to the IEC/IEEE
4916 Standard for Binary Floating-Point Arithmetic.
4917 -------------------------------------------------------------------------------
4919 float128 float128_mul( float128 a, float128 b )
4921 flag aSign, bSign, zSign;
4922 int32 aExp, bExp, zExp;
4923 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4924 float128 z;
4926 aSig1 = extractFloat128Frac1( a );
4927 aSig0 = extractFloat128Frac0( a );
4928 aExp = extractFloat128Exp( a );
4929 aSign = extractFloat128Sign( a );
4930 bSig1 = extractFloat128Frac1( b );
4931 bSig0 = extractFloat128Frac0( b );
4932 bExp = extractFloat128Exp( b );
4933 bSign = extractFloat128Sign( b );
4934 zSign = aSign ^ bSign;
4935 if ( aExp == 0x7FFF ) {
4936 if ( ( aSig0 | aSig1 )
4937 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4938 return propagateFloat128NaN( a, b );
4940 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4941 return packFloat128( zSign, 0x7FFF, 0, 0 );
4943 if ( bExp == 0x7FFF ) {
4944 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4945 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4946 invalid:
4947 float_raise( float_flag_invalid );
4948 z.low = float128_default_nan_low;
4949 z.high = float128_default_nan_high;
4950 return z;
4952 return packFloat128( zSign, 0x7FFF, 0, 0 );
4954 if ( aExp == 0 ) {
4955 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4956 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4958 if ( bExp == 0 ) {
4959 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4960 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4962 zExp = aExp + bExp - 0x4000;
4963 aSig0 |= LIT64( 0x0001000000000000 );
4964 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4965 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4966 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4967 zSig2 |= ( zSig3 != 0 );
4968 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4969 shift128ExtraRightJamming(
4970 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4971 ++zExp;
4973 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4978 -------------------------------------------------------------------------------
4979 Returns the result of dividing the quadruple-precision floating-point value
4980 `a' by the corresponding value `b'. The operation is performed according to
4981 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4982 -------------------------------------------------------------------------------
4984 float128 float128_div( float128 a, float128 b )
4986 flag aSign, bSign, zSign;
4987 int32 aExp, bExp, zExp;
4988 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4989 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4990 float128 z;
4992 aSig1 = extractFloat128Frac1( a );
4993 aSig0 = extractFloat128Frac0( a );
4994 aExp = extractFloat128Exp( a );
4995 aSign = extractFloat128Sign( a );
4996 bSig1 = extractFloat128Frac1( b );
4997 bSig0 = extractFloat128Frac0( b );
4998 bExp = extractFloat128Exp( b );
4999 bSign = extractFloat128Sign( b );
5000 zSign = aSign ^ bSign;
5001 if ( aExp == 0x7FFF ) {
5002 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5003 if ( bExp == 0x7FFF ) {
5004 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5005 goto invalid;
5007 return packFloat128( zSign, 0x7FFF, 0, 0 );
5009 if ( bExp == 0x7FFF ) {
5010 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5011 return packFloat128( zSign, 0, 0, 0 );
5013 if ( bExp == 0 ) {
5014 if ( ( bSig0 | bSig1 ) == 0 ) {
5015 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5016 invalid:
5017 float_raise( float_flag_invalid );
5018 z.low = float128_default_nan_low;
5019 z.high = float128_default_nan_high;
5020 return z;
5022 float_raise( float_flag_divbyzero );
5023 return packFloat128( zSign, 0x7FFF, 0, 0 );
5025 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5027 if ( aExp == 0 ) {
5028 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5029 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5031 zExp = aExp - bExp + 0x3FFD;
5032 shortShift128Left(
5033 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5034 shortShift128Left(
5035 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5036 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5037 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5038 ++zExp;
5040 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5041 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5042 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5043 while ( (sbits64) rem0 < 0 ) {
5044 --zSig0;
5045 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5047 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5048 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5049 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5050 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5051 while ( (sbits64) rem1 < 0 ) {
5052 --zSig1;
5053 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5055 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5057 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5058 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5063 -------------------------------------------------------------------------------
5064 Returns the remainder of the quadruple-precision floating-point value `a'
5065 with respect to the corresponding value `b'. The operation is performed
5066 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5067 -------------------------------------------------------------------------------
5069 float128 float128_rem( float128 a, float128 b )
5071 flag aSign, bSign, zSign;
5072 int32 aExp, bExp, expDiff;
5073 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5074 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5075 sbits64 sigMean0;
5076 float128 z;
5078 aSig1 = extractFloat128Frac1( a );
5079 aSig0 = extractFloat128Frac0( a );
5080 aExp = extractFloat128Exp( a );
5081 aSign = extractFloat128Sign( a );
5082 bSig1 = extractFloat128Frac1( b );
5083 bSig0 = extractFloat128Frac0( b );
5084 bExp = extractFloat128Exp( b );
5085 bSign = extractFloat128Sign( b );
5086 if ( aExp == 0x7FFF ) {
5087 if ( ( aSig0 | aSig1 )
5088 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5089 return propagateFloat128NaN( a, b );
5091 goto invalid;
5093 if ( bExp == 0x7FFF ) {
5094 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5095 return a;
5097 if ( bExp == 0 ) {
5098 if ( ( bSig0 | bSig1 ) == 0 ) {
5099 invalid:
5100 float_raise( float_flag_invalid );
5101 z.low = float128_default_nan_low;
5102 z.high = float128_default_nan_high;
5103 return z;
5105 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5107 if ( aExp == 0 ) {
5108 if ( ( aSig0 | aSig1 ) == 0 ) return a;
5109 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5111 expDiff = aExp - bExp;
5112 if ( expDiff < -1 ) return a;
5113 shortShift128Left(
5114 aSig0 | LIT64( 0x0001000000000000 ),
5115 aSig1,
5116 15 - ( expDiff < 0 ),
5117 &aSig0,
5118 &aSig1
5120 shortShift128Left(
5121 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5122 q = le128( bSig0, bSig1, aSig0, aSig1 );
5123 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5124 expDiff -= 64;
5125 while ( 0 < expDiff ) {
5126 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5127 q = ( 4 < q ) ? q - 4 : 0;
5128 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5129 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5130 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5131 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5132 expDiff -= 61;
5134 if ( -64 < expDiff ) {
5135 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5136 q = ( 4 < q ) ? q - 4 : 0;
5137 q >>= - expDiff;
5138 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5139 expDiff += 52;
5140 if ( expDiff < 0 ) {
5141 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5143 else {
5144 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5146 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5147 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5149 else {
5150 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5151 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5153 do {
5154 alternateASig0 = aSig0;
5155 alternateASig1 = aSig1;
5156 ++q;
5157 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5158 } while ( 0 <= (sbits64) aSig0 );
5159 add128(
5160 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5161 if ( ( sigMean0 < 0 )
5162 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5163 aSig0 = alternateASig0;
5164 aSig1 = alternateASig1;
5166 zSign = ( (sbits64) aSig0 < 0 );
5167 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5168 return
5169 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5174 -------------------------------------------------------------------------------
5175 Returns the square root of the quadruple-precision floating-point value `a'.
5176 The operation is performed according to the IEC/IEEE Standard for Binary
5177 Floating-Point Arithmetic.
5178 -------------------------------------------------------------------------------
5180 float128 float128_sqrt( float128 a )
5182 flag aSign;
5183 int32 aExp, zExp;
5184 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5185 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5186 float128 z;
5188 aSig1 = extractFloat128Frac1( a );
5189 aSig0 = extractFloat128Frac0( a );
5190 aExp = extractFloat128Exp( a );
5191 aSign = extractFloat128Sign( a );
5192 if ( aExp == 0x7FFF ) {
5193 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5194 if ( ! aSign ) return a;
5195 goto invalid;
5197 if ( aSign ) {
5198 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5199 invalid:
5200 float_raise( float_flag_invalid );
5201 z.low = float128_default_nan_low;
5202 z.high = float128_default_nan_high;
5203 return z;
5205 if ( aExp == 0 ) {
5206 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5207 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5209 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5210 aSig0 |= LIT64( 0x0001000000000000 );
5211 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5212 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5213 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5214 doubleZSig0 = zSig0<<1;
5215 mul64To128( zSig0, zSig0, &term0, &term1 );
5216 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5217 while ( (sbits64) rem0 < 0 ) {
5218 --zSig0;
5219 doubleZSig0 -= 2;
5220 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5222 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5223 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5224 if ( zSig1 == 0 ) zSig1 = 1;
5225 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5226 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5227 mul64To128( zSig1, zSig1, &term2, &term3 );
5228 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5229 while ( (sbits64) rem1 < 0 ) {
5230 --zSig1;
5231 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5232 term3 |= 1;
5233 term2 |= doubleZSig0;
5234 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5236 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5238 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5239 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5244 -------------------------------------------------------------------------------
5245 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5246 the corresponding value `b', and 0 otherwise. The comparison is performed
5247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5248 -------------------------------------------------------------------------------
5250 flag float128_eq( float128 a, float128 b )
5253 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5254 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5255 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5256 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5258 if ( float128_is_signaling_nan( a )
5259 || float128_is_signaling_nan( b ) ) {
5260 float_raise( float_flag_invalid );
5262 return 0;
5264 return
5265 ( a.low == b.low )
5266 && ( ( a.high == b.high )
5267 || ( ( a.low == 0 )
5268 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5274 -------------------------------------------------------------------------------
5275 Returns 1 if the quadruple-precision floating-point value `a' is less than
5276 or equal to the corresponding value `b', and 0 otherwise. The comparison
5277 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5278 Arithmetic.
5279 -------------------------------------------------------------------------------
5281 flag float128_le( float128 a, float128 b )
5283 flag aSign, bSign;
5285 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5286 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5287 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5288 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5290 float_raise( float_flag_invalid );
5291 return 0;
5293 aSign = extractFloat128Sign( a );
5294 bSign = extractFloat128Sign( b );
5295 if ( aSign != bSign ) {
5296 return
5297 aSign
5298 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5299 == 0 );
5301 return
5302 aSign ? le128( b.high, b.low, a.high, a.low )
5303 : le128( a.high, a.low, b.high, b.low );
5308 -------------------------------------------------------------------------------
5309 Returns 1 if the quadruple-precision floating-point value `a' is less than
5310 the corresponding value `b', and 0 otherwise. The comparison is performed
5311 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5312 -------------------------------------------------------------------------------
5314 flag float128_lt( float128 a, float128 b )
5316 flag aSign, bSign;
5318 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5319 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5320 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5321 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5323 float_raise( float_flag_invalid );
5324 return 0;
5326 aSign = extractFloat128Sign( a );
5327 bSign = extractFloat128Sign( b );
5328 if ( aSign != bSign ) {
5329 return
5330 aSign
5331 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5332 != 0 );
5334 return
5335 aSign ? lt128( b.high, b.low, a.high, a.low )
5336 : lt128( a.high, a.low, b.high, b.low );
5341 -------------------------------------------------------------------------------
5342 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5343 the corresponding value `b', and 0 otherwise. The invalid exception is
5344 raised if either operand is a NaN. Otherwise, the comparison is performed
5345 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5346 -------------------------------------------------------------------------------
5348 flag float128_eq_signaling( float128 a, float128 b )
5351 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5352 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5353 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5354 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5356 float_raise( float_flag_invalid );
5357 return 0;
5359 return
5360 ( a.low == b.low )
5361 && ( ( a.high == b.high )
5362 || ( ( a.low == 0 )
5363 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5369 -------------------------------------------------------------------------------
5370 Returns 1 if the quadruple-precision floating-point value `a' is less than
5371 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5372 cause an exception. Otherwise, the comparison is performed according to the
5373 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5374 -------------------------------------------------------------------------------
5376 flag float128_le_quiet( float128 a, float128 b )
5378 flag aSign, bSign;
5380 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5381 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5382 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5383 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5385 if ( float128_is_signaling_nan( a )
5386 || float128_is_signaling_nan( b ) ) {
5387 float_raise( float_flag_invalid );
5389 return 0;
5391 aSign = extractFloat128Sign( a );
5392 bSign = extractFloat128Sign( b );
5393 if ( aSign != bSign ) {
5394 return
5395 aSign
5396 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5397 == 0 );
5399 return
5400 aSign ? le128( b.high, b.low, a.high, a.low )
5401 : le128( a.high, a.low, b.high, b.low );
5406 -------------------------------------------------------------------------------
5407 Returns 1 if the quadruple-precision floating-point value `a' is less than
5408 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5409 exception. Otherwise, the comparison is performed according to the IEC/IEEE
5410 Standard for Binary Floating-Point Arithmetic.
5411 -------------------------------------------------------------------------------
5413 flag float128_lt_quiet( float128 a, float128 b )
5415 flag aSign, bSign;
5417 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5418 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5419 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5420 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5422 if ( float128_is_signaling_nan( a )
5423 || float128_is_signaling_nan( b ) ) {
5424 float_raise( float_flag_invalid );
5426 return 0;
5428 aSign = extractFloat128Sign( a );
5429 bSign = extractFloat128Sign( b );
5430 if ( aSign != bSign ) {
5431 return
5432 aSign
5433 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5434 != 0 );
5436 return
5437 aSign ? lt128( b.high, b.low, a.high, a.low )
5438 : lt128( a.high, a.low, b.high, b.low );
5442 #endif
5445 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5448 * These two routines are not part of the original softfloat distribution.
5450 * They are based on the corresponding conversions to integer but return
5451 * unsigned numbers instead since these functions are required by GCC.
5453 * Added by Mark Brinicombe <mark@NetBSD.org> 27/09/97
5455 * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5459 -------------------------------------------------------------------------------
5460 Returns the result of converting the double-precision floating-point value
5461 `a' to the 32-bit unsigned integer format. The conversion is
5462 performed according to the IEC/IEEE Standard for Binary Floating-point
5463 Arithmetic, except that the conversion is always rounded toward zero. If
5464 `a' is a NaN, the largest positive integer is returned. If the conversion
5465 overflows, the largest integer positive is returned.
5466 -------------------------------------------------------------------------------
5468 uint32 float64_to_uint32_round_to_zero( float64 a )
5470 flag aSign;
5471 int16 aExp, shiftCount;
5472 bits64 aSig, savedASig;
5473 uint32 z;
5475 aSig = extractFloat64Frac( a );
5476 aExp = extractFloat64Exp( a );
5477 aSign = extractFloat64Sign( a );
5479 if (aSign) {
5480 float_raise( float_flag_invalid );
5481 return(0);
5484 if ( 0x41E < aExp ) {
5485 float_raise( float_flag_invalid );
5486 return 0xffffffff;
5488 else if ( aExp < 0x3FF ) {
5489 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5490 return 0;
5492 aSig |= LIT64( 0x0010000000000000 );
5493 shiftCount = 0x433 - aExp;
5494 savedASig = aSig;
5495 aSig >>= shiftCount;
5496 z = aSig;
5497 if ( ( aSig<<shiftCount ) != savedASig ) {
5498 float_exception_flags |= float_flag_inexact;
5500 return z;
5505 -------------------------------------------------------------------------------
5506 Returns the result of converting the single-precision floating-point value
5507 `a' to the 32-bit unsigned integer format. The conversion is
5508 performed according to the IEC/IEEE Standard for Binary Floating-point
5509 Arithmetic, except that the conversion is always rounded toward zero. If
5510 `a' is a NaN, the largest positive integer is returned. If the conversion
5511 overflows, the largest positive integer is returned.
5512 -------------------------------------------------------------------------------
5514 uint32 float32_to_uint32_round_to_zero( float32 a )
5516 flag aSign;
5517 int16 aExp, shiftCount;
5518 bits32 aSig;
5519 uint32 z;
5521 aSig = extractFloat32Frac( a );
5522 aExp = extractFloat32Exp( a );
5523 aSign = extractFloat32Sign( a );
5524 shiftCount = aExp - 0x9E;
5526 if (aSign) {
5527 float_raise( float_flag_invalid );
5528 return(0);
5530 if ( 0 < shiftCount ) {
5531 float_raise( float_flag_invalid );
5532 return 0xFFFFFFFF;
5534 else if ( aExp <= 0x7E ) {
5535 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5536 return 0;
5538 aSig = ( aSig | 0x800000 )<<8;
5539 z = aSig>>( - shiftCount );
5540 if ( aSig<<( shiftCount & 31 ) ) {
5541 float_exception_flags |= float_flag_inexact;
5543 return z;
5547 #endif