Remove building with NOCRYPTO option
[minix3.git] / lib / libc / softfloat / bits32 / softfloat.c
bloba3527e0d2e8b2a4ab3e55f76a1457e59e6c55901
1 /* $NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $ */
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
9 /*
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
21 * float64u.
25 ===============================================================================
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
51 ===============================================================================
54 #include <sys/cdefs.h>
55 #if defined(LIBC_SCCS) && !defined(lint)
56 __RCSID("$NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $");
57 #endif /* LIBC_SCCS and not lint */
59 #ifdef SOFTFLOAT_FOR_GCC
60 #include "softfloat-for-gcc.h"
61 #endif
63 #include "milieu.h"
64 #include "softfloat.h"
67 * Conversions between floats as stored in memory and floats as
68 * SoftFloat uses them
70 #ifndef FLOAT64_DEMANGLE
71 #define FLOAT64_DEMANGLE(a) (a)
72 #endif
73 #ifndef FLOAT64_MANGLE
74 #define FLOAT64_MANGLE(a) (a)
75 #endif
78 -------------------------------------------------------------------------------
79 Floating-point rounding mode and exception flags.
80 -------------------------------------------------------------------------------
82 #ifndef set_float_rounding_mode
83 fp_rnd float_rounding_mode = float_round_nearest_even;
84 fp_except float_exception_flags = 0;
85 #endif
86 #ifndef set_float_exception_inexact_flag
87 #define set_float_exception_inexact_flag() \
88 ((void)(float_exception_flags |= float_flag_inexact))
89 #endif
92 -------------------------------------------------------------------------------
93 Primitive arithmetic functions, including multi-word arithmetic, and
94 division and square root approximations. (Can be specialized to target if
95 desired.)
96 -------------------------------------------------------------------------------
98 #include "softfloat-macros"
101 -------------------------------------------------------------------------------
102 Functions and definitions to determine: (1) whether tininess for underflow
103 is detected before or after rounding by default, (2) what (if anything)
104 happens when exceptions are raised, (3) how signaling NaNs are distinguished
105 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
106 are propagated from function inputs to output. These details are target-
107 specific.
108 -------------------------------------------------------------------------------
110 #include "softfloat-specialize"
113 -------------------------------------------------------------------------------
114 Returns the fraction bits of the single-precision floating-point value `a'.
115 -------------------------------------------------------------------------------
117 INLINE bits32 extractFloat32Frac( float32 a )
120 return a & 0x007FFFFF;
125 -------------------------------------------------------------------------------
126 Returns the exponent bits of the single-precision floating-point value `a'.
127 -------------------------------------------------------------------------------
129 INLINE int16 extractFloat32Exp( float32 a )
132 return ( a>>23 ) & 0xFF;
137 -------------------------------------------------------------------------------
138 Returns the sign bit of the single-precision floating-point value `a'.
139 -------------------------------------------------------------------------------
141 INLINE flag extractFloat32Sign( float32 a )
144 return a>>31;
149 -------------------------------------------------------------------------------
150 Normalizes the subnormal single-precision floating-point value represented
151 by the denormalized significand `aSig'. The normalized exponent and
152 significand are stored at the locations pointed to by `zExpPtr' and
153 `zSigPtr', respectively.
154 -------------------------------------------------------------------------------
156 static void
157 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
159 int8 shiftCount;
161 shiftCount = countLeadingZeros32( aSig ) - 8;
162 *zSigPtr = aSig<<shiftCount;
163 *zExpPtr = 1 - shiftCount;
168 -------------------------------------------------------------------------------
169 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
170 single-precision floating-point value, returning the result. After being
171 shifted into the proper positions, the three fields are simply added
172 together to form the result. This means that any integer portion of `zSig'
173 will be added into the exponent. Since a properly normalized significand
174 will have an integer portion equal to 1, the `zExp' input should be 1 less
175 than the desired result exponent whenever `zSig' is a complete, normalized
176 significand.
177 -------------------------------------------------------------------------------
179 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
182 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
187 -------------------------------------------------------------------------------
188 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
189 and significand `zSig', and returns the proper single-precision floating-
190 point value corresponding to the abstract input. Ordinarily, the abstract
191 value is simply rounded and packed into the single-precision format, with
192 the inexact exception raised if the abstract input cannot be represented
193 exactly. However, if the abstract value is too large, the overflow and
194 inexact exceptions are raised and an infinity or maximal finite value is
195 returned. If the abstract value is too small, the input value is rounded to
196 a subnormal number, and the underflow and inexact exceptions are raised if
197 the abstract input cannot be represented exactly as a subnormal single-
198 precision floating-point number.
199 The input significand `zSig' has its binary point between bits 30
200 and 29, which is 7 bits to the left of the usual location. This shifted
201 significand must be normalized or smaller. If `zSig' is not normalized,
202 `zExp' must be 0; in that case, the result returned is a subnormal number,
203 and it must not require rounding. In the usual case that `zSig' is
204 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
205 The handling of underflow and overflow follows the IEC/IEEE Standard for
206 Binary Floating-Point Arithmetic.
207 -------------------------------------------------------------------------------
209 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
211 int8 roundingMode;
212 flag roundNearestEven;
213 int8 roundIncrement, roundBits;
214 flag isTiny;
216 roundingMode = float_rounding_mode;
217 roundNearestEven = roundingMode == float_round_nearest_even;
218 roundIncrement = 0x40;
219 if ( ! roundNearestEven ) {
220 if ( roundingMode == float_round_to_zero ) {
221 roundIncrement = 0;
223 else {
224 roundIncrement = 0x7F;
225 if ( zSign ) {
226 if ( roundingMode == float_round_up ) roundIncrement = 0;
228 else {
229 if ( roundingMode == float_round_down ) roundIncrement = 0;
233 roundBits = zSig & 0x7F;
234 if ( 0xFD <= (bits16) zExp ) {
235 if ( ( 0xFD < zExp )
236 || ( ( zExp == 0xFD )
237 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
239 float_raise( float_flag_overflow | float_flag_inexact );
240 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
242 if ( zExp < 0 ) {
243 isTiny =
244 ( float_detect_tininess == float_tininess_before_rounding )
245 || ( zExp < -1 )
246 || ( zSig + roundIncrement < (uint32)0x80000000 );
247 shift32RightJamming( zSig, - zExp, &zSig );
248 zExp = 0;
249 roundBits = zSig & 0x7F;
250 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
253 if ( roundBits ) set_float_exception_inexact_flag();
254 zSig = ( zSig + roundIncrement )>>7;
255 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
256 if ( zSig == 0 ) zExp = 0;
257 return packFloat32( zSign, zExp, zSig );
262 -------------------------------------------------------------------------------
263 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
264 and significand `zSig', and returns the proper single-precision floating-
265 point value corresponding to the abstract input. This routine is just like
266 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
267 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
268 floating-point exponent.
269 -------------------------------------------------------------------------------
271 static float32
272 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
274 int8 shiftCount;
276 shiftCount = countLeadingZeros32( zSig ) - 1;
277 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
282 -------------------------------------------------------------------------------
283 Returns the least-significant 32 fraction bits of the double-precision
284 floating-point value `a'.
285 -------------------------------------------------------------------------------
287 INLINE bits32 extractFloat64Frac1( float64 a )
290 return (bits32)(FLOAT64_DEMANGLE(a) & LIT64(0x00000000FFFFFFFF));
295 -------------------------------------------------------------------------------
296 Returns the most-significant 20 fraction bits of the double-precision
297 floating-point value `a'.
298 -------------------------------------------------------------------------------
300 INLINE bits32 extractFloat64Frac0( float64 a )
303 return (bits32)((FLOAT64_DEMANGLE(a) >> 32) & 0x000FFFFF);
308 -------------------------------------------------------------------------------
309 Returns the exponent bits of the double-precision floating-point value `a'.
310 -------------------------------------------------------------------------------
312 INLINE int16 extractFloat64Exp( float64 a )
315 return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
320 -------------------------------------------------------------------------------
321 Returns the sign bit of the double-precision floating-point value `a'.
322 -------------------------------------------------------------------------------
324 INLINE flag extractFloat64Sign( float64 a )
327 return (flag)(FLOAT64_DEMANGLE(a) >> 63);
332 -------------------------------------------------------------------------------
333 Normalizes the subnormal double-precision floating-point value represented
334 by the denormalized significand formed by the concatenation of `aSig0' and
335 `aSig1'. The normalized exponent is stored at the location pointed to by
336 `zExpPtr'. The most significant 21 bits of the normalized significand are
337 stored at the location pointed to by `zSig0Ptr', and the least significant
338 32 bits of the normalized significand are stored at the location pointed to
339 by `zSig1Ptr'.
340 -------------------------------------------------------------------------------
342 static void
343 normalizeFloat64Subnormal(
344 bits32 aSig0,
345 bits32 aSig1,
346 int16 *zExpPtr,
347 bits32 *zSig0Ptr,
348 bits32 *zSig1Ptr
351 int8 shiftCount;
353 if ( aSig0 == 0 ) {
354 shiftCount = countLeadingZeros32( aSig1 ) - 11;
355 if ( shiftCount < 0 ) {
356 *zSig0Ptr = aSig1>>( - shiftCount );
357 *zSig1Ptr = aSig1<<( shiftCount & 31 );
359 else {
360 *zSig0Ptr = aSig1<<shiftCount;
361 *zSig1Ptr = 0;
363 *zExpPtr = - shiftCount - 31;
365 else {
366 shiftCount = countLeadingZeros32( aSig0 ) - 11;
367 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
368 *zExpPtr = 1 - shiftCount;
374 -------------------------------------------------------------------------------
375 Packs the sign `zSign', the exponent `zExp', and the significand formed by
376 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
377 point value, returning the result. After being shifted into the proper
378 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
379 together to form the most significant 32 bits of the result. This means
380 that any integer portion of `zSig0' will be added into the exponent. Since
381 a properly normalized significand will have an integer portion equal to 1,
382 the `zExp' input should be 1 less than the desired result exponent whenever
383 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
384 -------------------------------------------------------------------------------
386 INLINE float64
387 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
390 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
391 ( ( (bits64) zExp )<<52 ) +
392 ( ( (bits64) zSig0 )<<32 ) + zSig1 );
398 -------------------------------------------------------------------------------
399 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
400 and extended significand formed by the concatenation of `zSig0', `zSig1',
401 and `zSig2', and returns the proper double-precision floating-point value
402 corresponding to the abstract input. Ordinarily, the abstract value is
403 simply rounded and packed into the double-precision format, with the inexact
404 exception raised if the abstract input cannot be represented exactly.
405 However, if the abstract value is too large, the overflow and inexact
406 exceptions are raised and an infinity or maximal finite value is returned.
407 If the abstract value is too small, the input value is rounded to a
408 subnormal number, and the underflow and inexact exceptions are raised if the
409 abstract input cannot be represented exactly as a subnormal double-precision
410 floating-point number.
411 The input significand must be normalized or smaller. If the input
412 significand is not normalized, `zExp' must be 0; in that case, the result
413 returned is a subnormal number, and it must not require rounding. In the
414 usual case that the input significand is normalized, `zExp' must be 1 less
415 than the ``true'' floating-point exponent. The handling of underflow and
416 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
417 -------------------------------------------------------------------------------
419 static float64
420 roundAndPackFloat64(
421 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
423 int8 roundingMode;
424 flag roundNearestEven, increment, isTiny;
426 roundingMode = float_rounding_mode;
427 roundNearestEven = ( roundingMode == float_round_nearest_even );
428 increment = ( (sbits32) zSig2 < 0 );
429 if ( ! roundNearestEven ) {
430 if ( roundingMode == float_round_to_zero ) {
431 increment = 0;
433 else {
434 if ( zSign ) {
435 increment = ( roundingMode == float_round_down ) && zSig2;
437 else {
438 increment = ( roundingMode == float_round_up ) && zSig2;
442 if ( 0x7FD <= (bits16) zExp ) {
443 if ( ( 0x7FD < zExp )
444 || ( ( zExp == 0x7FD )
445 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
446 && increment
449 float_raise( float_flag_overflow | float_flag_inexact );
450 if ( ( roundingMode == float_round_to_zero )
451 || ( zSign && ( roundingMode == float_round_up ) )
452 || ( ! zSign && ( roundingMode == float_round_down ) )
454 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
456 return packFloat64( zSign, 0x7FF, 0, 0 );
458 if ( zExp < 0 ) {
459 isTiny =
460 ( float_detect_tininess == float_tininess_before_rounding )
461 || ( zExp < -1 )
462 || ! increment
463 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
464 shift64ExtraRightJamming(
465 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
466 zExp = 0;
467 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
468 if ( roundNearestEven ) {
469 increment = ( (sbits32) zSig2 < 0 );
471 else {
472 if ( zSign ) {
473 increment = ( roundingMode == float_round_down ) && zSig2;
475 else {
476 increment = ( roundingMode == float_round_up ) && zSig2;
481 if ( zSig2 ) set_float_exception_inexact_flag();
482 if ( increment ) {
483 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
484 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
486 else {
487 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
489 return packFloat64( zSign, zExp, zSig0, zSig1 );
494 -------------------------------------------------------------------------------
495 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
496 and significand formed by the concatenation of `zSig0' and `zSig1', and
497 returns the proper double-precision floating-point value corresponding
498 to the abstract input. This routine is just like `roundAndPackFloat64'
499 except that the input significand has fewer bits and does not have to be
500 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
501 point exponent.
502 -------------------------------------------------------------------------------
504 static float64
505 normalizeRoundAndPackFloat64(
506 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
508 int8 shiftCount;
509 bits32 zSig2;
511 if ( zSig0 == 0 ) {
512 zSig0 = zSig1;
513 zSig1 = 0;
514 zExp -= 32;
516 shiftCount = countLeadingZeros32( zSig0 ) - 11;
517 if ( 0 <= shiftCount ) {
518 zSig2 = 0;
519 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
521 else {
522 shift64ExtraRightJamming(
523 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
525 zExp -= shiftCount;
526 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
531 -------------------------------------------------------------------------------
532 Returns the result of converting the 32-bit two's complement integer `a' to
533 the single-precision floating-point format. The conversion is performed
534 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
535 -------------------------------------------------------------------------------
537 float32 int32_to_float32( int32 a )
539 flag zSign;
541 if ( a == 0 ) return 0;
542 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
543 zSign = ( a < 0 );
544 return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
549 -------------------------------------------------------------------------------
550 Returns the result of converting the 32-bit two's complement integer `a' to
551 the double-precision floating-point format. The conversion is performed
552 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
553 -------------------------------------------------------------------------------
555 float64 int32_to_float64( int32 a )
557 flag zSign;
558 bits32 absA;
559 int8 shiftCount;
560 bits32 zSig0, zSig1;
562 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
563 zSign = ( a < 0 );
564 absA = zSign ? - a : a;
565 shiftCount = countLeadingZeros32( absA ) - 11;
566 if ( 0 <= shiftCount ) {
567 zSig0 = absA<<shiftCount;
568 zSig1 = 0;
570 else {
571 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
573 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
577 #ifndef SOFTFLOAT_FOR_GCC
579 -------------------------------------------------------------------------------
580 Returns the result of converting the single-precision floating-point value
581 `a' to the 32-bit two's complement integer format. The conversion is
582 performed according to the IEC/IEEE Standard for Binary Floating-Point
583 Arithmetic---which means in particular that the conversion is rounded
584 according to the current rounding mode. If `a' is a NaN, the largest
585 positive integer is returned. Otherwise, if the conversion overflows, the
586 largest integer with the same sign as `a' is returned.
587 -------------------------------------------------------------------------------
589 int32 float32_to_int32( float32 a )
591 flag aSign;
592 int16 aExp, shiftCount;
593 bits32 aSig, aSigExtra;
594 int32 z;
595 int8 roundingMode;
597 aSig = extractFloat32Frac( a );
598 aExp = extractFloat32Exp( a );
599 aSign = extractFloat32Sign( a );
600 shiftCount = aExp - 0x96;
601 if ( 0 <= shiftCount ) {
602 if ( 0x9E <= aExp ) {
603 if ( a != 0xCF000000 ) {
604 float_raise( float_flag_invalid );
605 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
606 return 0x7FFFFFFF;
609 return (sbits32) 0x80000000;
611 z = ( aSig | 0x00800000 )<<shiftCount;
612 if ( aSign ) z = - z;
614 else {
615 if ( aExp < 0x7E ) {
616 aSigExtra = aExp | aSig;
617 z = 0;
619 else {
620 aSig |= 0x00800000;
621 aSigExtra = aSig<<( shiftCount & 31 );
622 z = aSig>>( - shiftCount );
624 if ( aSigExtra ) set_float_exception_inexact_flag();
625 roundingMode = float_rounding_mode;
626 if ( roundingMode == float_round_nearest_even ) {
627 if ( (sbits32) aSigExtra < 0 ) {
628 ++z;
629 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
631 if ( aSign ) z = - z;
633 else {
634 aSigExtra = ( aSigExtra != 0 );
635 if ( aSign ) {
636 z += ( roundingMode == float_round_down ) & aSigExtra;
637 z = - z;
639 else {
640 z += ( roundingMode == float_round_up ) & aSigExtra;
644 return z;
647 #endif
650 -------------------------------------------------------------------------------
651 Returns the result of converting the single-precision floating-point value
652 `a' to the 32-bit two's complement integer format. The conversion is
653 performed according to the IEC/IEEE Standard for Binary Floating-Point
654 Arithmetic, except that the conversion is always rounded toward zero.
655 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
656 the conversion overflows, the largest integer with the same sign as `a' is
657 returned.
658 -------------------------------------------------------------------------------
660 int32 float32_to_int32_round_to_zero( float32 a )
662 flag aSign;
663 int16 aExp, shiftCount;
664 bits32 aSig;
665 int32 z;
667 aSig = extractFloat32Frac( a );
668 aExp = extractFloat32Exp( a );
669 aSign = extractFloat32Sign( a );
670 shiftCount = aExp - 0x9E;
671 if ( 0 <= shiftCount ) {
672 if ( a != 0xCF000000 ) {
673 float_raise( float_flag_invalid );
674 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
676 return (sbits32) 0x80000000;
678 else if ( aExp <= 0x7E ) {
679 if ( aExp | aSig ) set_float_exception_inexact_flag();
680 return 0;
682 aSig = ( aSig | 0x00800000 )<<8;
683 z = aSig>>( - shiftCount );
684 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
685 set_float_exception_inexact_flag();
687 if ( aSign ) z = - z;
688 return z;
693 -------------------------------------------------------------------------------
694 Returns the result of converting the single-precision floating-point value
695 `a' to the double-precision floating-point format. The conversion is
696 performed according to the IEC/IEEE Standard for Binary Floating-Point
697 Arithmetic.
698 -------------------------------------------------------------------------------
700 float64 float32_to_float64( float32 a )
702 flag aSign;
703 int16 aExp;
704 bits32 aSig, zSig0, zSig1;
706 aSig = extractFloat32Frac( a );
707 aExp = extractFloat32Exp( a );
708 aSign = extractFloat32Sign( a );
709 if ( aExp == 0xFF ) {
710 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
711 return packFloat64( aSign, 0x7FF, 0, 0 );
713 if ( aExp == 0 ) {
714 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
715 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
716 --aExp;
718 shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
719 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
723 #ifndef SOFTFLOAT_FOR_GCC
725 -------------------------------------------------------------------------------
726 Rounds the single-precision floating-point value `a' to an integer,
727 and returns the result as a single-precision floating-point value. The
728 operation is performed according to the IEC/IEEE Standard for Binary
729 Floating-Point Arithmetic.
730 -------------------------------------------------------------------------------
732 float32 float32_round_to_int( float32 a )
734 flag aSign;
735 int16 aExp;
736 bits32 lastBitMask, roundBitsMask;
737 int8 roundingMode;
738 float32 z;
740 aExp = extractFloat32Exp( a );
741 if ( 0x96 <= aExp ) {
742 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
743 return propagateFloat32NaN( a, a );
745 return a;
747 if ( aExp <= 0x7E ) {
748 if ( (bits32) ( a<<1 ) == 0 ) return a;
749 set_float_exception_inexact_flag();
750 aSign = extractFloat32Sign( a );
751 switch ( float_rounding_mode ) {
752 case float_round_nearest_even:
753 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
754 return packFloat32( aSign, 0x7F, 0 );
756 break;
757 case float_round_to_zero:
758 break;
759 case float_round_down:
760 return aSign ? 0xBF800000 : 0;
761 case float_round_up:
762 return aSign ? 0x80000000 : 0x3F800000;
764 return packFloat32( aSign, 0, 0 );
766 lastBitMask = 1;
767 lastBitMask <<= 0x96 - aExp;
768 roundBitsMask = lastBitMask - 1;
769 z = a;
770 roundingMode = float_rounding_mode;
771 if ( roundingMode == float_round_nearest_even ) {
772 z += lastBitMask>>1;
773 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
775 else if ( roundingMode != float_round_to_zero ) {
776 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
777 z += roundBitsMask;
780 z &= ~ roundBitsMask;
781 if ( z != a ) set_float_exception_inexact_flag();
782 return z;
785 #endif
788 -------------------------------------------------------------------------------
789 Returns the result of adding the absolute values of the single-precision
790 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
791 before being returned. `zSign' is ignored if the result is a NaN.
792 The addition is performed according to the IEC/IEEE Standard for Binary
793 Floating-Point Arithmetic.
794 -------------------------------------------------------------------------------
796 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
798 int16 aExp, bExp, zExp;
799 bits32 aSig, bSig, zSig;
800 int16 expDiff;
802 aSig = extractFloat32Frac( a );
803 aExp = extractFloat32Exp( a );
804 bSig = extractFloat32Frac( b );
805 bExp = extractFloat32Exp( b );
806 expDiff = aExp - bExp;
807 aSig <<= 6;
808 bSig <<= 6;
809 if ( 0 < expDiff ) {
810 if ( aExp == 0xFF ) {
811 if ( aSig ) return propagateFloat32NaN( a, b );
812 return a;
814 if ( bExp == 0 ) {
815 --expDiff;
817 else {
818 bSig |= 0x20000000;
820 shift32RightJamming( bSig, expDiff, &bSig );
821 zExp = aExp;
823 else if ( expDiff < 0 ) {
824 if ( bExp == 0xFF ) {
825 if ( bSig ) return propagateFloat32NaN( a, b );
826 return packFloat32( zSign, 0xFF, 0 );
828 if ( aExp == 0 ) {
829 ++expDiff;
831 else {
832 aSig |= 0x20000000;
834 shift32RightJamming( aSig, - expDiff, &aSig );
835 zExp = bExp;
837 else {
838 if ( aExp == 0xFF ) {
839 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
840 return a;
842 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
843 zSig = 0x40000000 + aSig + bSig;
844 zExp = aExp;
845 goto roundAndPack;
847 aSig |= 0x20000000;
848 zSig = ( aSig + bSig )<<1;
849 --zExp;
850 if ( (sbits32) zSig < 0 ) {
851 zSig = aSig + bSig;
852 ++zExp;
854 roundAndPack:
855 return roundAndPackFloat32( zSign, zExp, zSig );
860 -------------------------------------------------------------------------------
861 Returns the result of subtracting the absolute values of the single-
862 precision floating-point values `a' and `b'. If `zSign' is 1, the
863 difference is negated before being returned. `zSign' is ignored if the
864 result is a NaN. The subtraction is performed according to the IEC/IEEE
865 Standard for Binary Floating-Point Arithmetic.
866 -------------------------------------------------------------------------------
868 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
870 int16 aExp, bExp, zExp;
871 bits32 aSig, bSig, zSig;
872 int16 expDiff;
874 aSig = extractFloat32Frac( a );
875 aExp = extractFloat32Exp( a );
876 bSig = extractFloat32Frac( b );
877 bExp = extractFloat32Exp( b );
878 expDiff = aExp - bExp;
879 aSig <<= 7;
880 bSig <<= 7;
881 if ( 0 < expDiff ) goto aExpBigger;
882 if ( expDiff < 0 ) goto bExpBigger;
883 if ( aExp == 0xFF ) {
884 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
885 float_raise( float_flag_invalid );
886 return float32_default_nan;
888 if ( aExp == 0 ) {
889 aExp = 1;
890 bExp = 1;
892 if ( bSig < aSig ) goto aBigger;
893 if ( aSig < bSig ) goto bBigger;
894 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
895 bExpBigger:
896 if ( bExp == 0xFF ) {
897 if ( bSig ) return propagateFloat32NaN( a, b );
898 return packFloat32( zSign ^ 1, 0xFF, 0 );
900 if ( aExp == 0 ) {
901 ++expDiff;
903 else {
904 aSig |= 0x40000000;
906 shift32RightJamming( aSig, - expDiff, &aSig );
907 bSig |= 0x40000000;
908 bBigger:
909 zSig = bSig - aSig;
910 zExp = bExp;
911 zSign ^= 1;
912 goto normalizeRoundAndPack;
913 aExpBigger:
914 if ( aExp == 0xFF ) {
915 if ( aSig ) return propagateFloat32NaN( a, b );
916 return a;
918 if ( bExp == 0 ) {
919 --expDiff;
921 else {
922 bSig |= 0x40000000;
924 shift32RightJamming( bSig, expDiff, &bSig );
925 aSig |= 0x40000000;
926 aBigger:
927 zSig = aSig - bSig;
928 zExp = aExp;
929 normalizeRoundAndPack:
930 --zExp;
931 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
936 -------------------------------------------------------------------------------
937 Returns the result of adding the single-precision floating-point values `a'
938 and `b'. The operation is performed according to the IEC/IEEE Standard for
939 Binary Floating-Point Arithmetic.
940 -------------------------------------------------------------------------------
942 float32 float32_add( float32 a, float32 b )
944 flag aSign, bSign;
946 aSign = extractFloat32Sign( a );
947 bSign = extractFloat32Sign( b );
948 if ( aSign == bSign ) {
949 return addFloat32Sigs( a, b, aSign );
951 else {
952 return subFloat32Sigs( a, b, aSign );
958 -------------------------------------------------------------------------------
959 Returns the result of subtracting the single-precision floating-point values
960 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
961 for Binary Floating-Point Arithmetic.
962 -------------------------------------------------------------------------------
964 float32 float32_sub( float32 a, float32 b )
966 flag aSign, bSign;
968 aSign = extractFloat32Sign( a );
969 bSign = extractFloat32Sign( b );
970 if ( aSign == bSign ) {
971 return subFloat32Sigs( a, b, aSign );
973 else {
974 return addFloat32Sigs( a, b, aSign );
980 -------------------------------------------------------------------------------
981 Returns the result of multiplying the single-precision floating-point values
982 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
983 for Binary Floating-Point Arithmetic.
984 -------------------------------------------------------------------------------
986 float32 float32_mul( float32 a, float32 b )
988 flag aSign, bSign, zSign;
989 int16 aExp, bExp, zExp;
990 bits32 aSig, bSig, zSig0, zSig1;
992 aSig = extractFloat32Frac( a );
993 aExp = extractFloat32Exp( a );
994 aSign = extractFloat32Sign( a );
995 bSig = extractFloat32Frac( b );
996 bExp = extractFloat32Exp( b );
997 bSign = extractFloat32Sign( b );
998 zSign = aSign ^ bSign;
999 if ( aExp == 0xFF ) {
1000 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1001 return propagateFloat32NaN( a, b );
1003 if ( ( bExp | bSig ) == 0 ) {
1004 float_raise( float_flag_invalid );
1005 return float32_default_nan;
1007 return packFloat32( zSign, 0xFF, 0 );
1009 if ( bExp == 0xFF ) {
1010 if ( bSig ) return propagateFloat32NaN( a, b );
1011 if ( ( aExp | aSig ) == 0 ) {
1012 float_raise( float_flag_invalid );
1013 return float32_default_nan;
1015 return packFloat32( zSign, 0xFF, 0 );
1017 if ( aExp == 0 ) {
1018 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1019 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1021 if ( bExp == 0 ) {
1022 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1023 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1025 zExp = aExp + bExp - 0x7F;
1026 aSig = ( aSig | 0x00800000 )<<7;
1027 bSig = ( bSig | 0x00800000 )<<8;
1028 mul32To64( aSig, bSig, &zSig0, &zSig1 );
1029 zSig0 |= ( zSig1 != 0 );
1030 if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1031 zSig0 <<= 1;
1032 --zExp;
1034 return roundAndPackFloat32( zSign, zExp, zSig0 );
1039 -------------------------------------------------------------------------------
1040 Returns the result of dividing the single-precision floating-point value `a'
1041 by the corresponding value `b'. The operation is performed according to the
1042 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1043 -------------------------------------------------------------------------------
1045 float32 float32_div( float32 a, float32 b )
1047 flag aSign, bSign, zSign;
1048 int16 aExp, bExp, zExp;
1049 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1051 aSig = extractFloat32Frac( a );
1052 aExp = extractFloat32Exp( a );
1053 aSign = extractFloat32Sign( a );
1054 bSig = extractFloat32Frac( b );
1055 bExp = extractFloat32Exp( b );
1056 bSign = extractFloat32Sign( b );
1057 zSign = aSign ^ bSign;
1058 if ( aExp == 0xFF ) {
1059 if ( aSig ) return propagateFloat32NaN( a, b );
1060 if ( bExp == 0xFF ) {
1061 if ( bSig ) return propagateFloat32NaN( a, b );
1062 float_raise( float_flag_invalid );
1063 return float32_default_nan;
1065 return packFloat32( zSign, 0xFF, 0 );
1067 if ( bExp == 0xFF ) {
1068 if ( bSig ) return propagateFloat32NaN( a, b );
1069 return packFloat32( zSign, 0, 0 );
1071 if ( bExp == 0 ) {
1072 if ( bSig == 0 ) {
1073 if ( ( aExp | aSig ) == 0 ) {
1074 float_raise( float_flag_invalid );
1075 return float32_default_nan;
1077 float_raise( float_flag_divbyzero );
1078 return packFloat32( zSign, 0xFF, 0 );
1080 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1082 if ( aExp == 0 ) {
1083 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1084 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1086 zExp = aExp - bExp + 0x7D;
1087 aSig = ( aSig | 0x00800000 )<<7;
1088 bSig = ( bSig | 0x00800000 )<<8;
1089 if ( bSig <= ( aSig + aSig ) ) {
1090 aSig >>= 1;
1091 ++zExp;
1093 zSig = estimateDiv64To32( aSig, 0, bSig );
1094 if ( ( zSig & 0x3F ) <= 2 ) {
1095 mul32To64( bSig, zSig, &term0, &term1 );
1096 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1097 while ( (sbits32) rem0 < 0 ) {
1098 --zSig;
1099 add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1101 zSig |= ( rem1 != 0 );
1103 return roundAndPackFloat32( zSign, zExp, zSig );
1107 #ifndef SOFTFLOAT_FOR_GCC
1109 -------------------------------------------------------------------------------
1110 Returns the remainder of the single-precision floating-point value `a'
1111 with respect to the corresponding value `b'. The operation is performed
1112 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113 -------------------------------------------------------------------------------
1115 float32 float32_rem( float32 a, float32 b )
1117 flag aSign, bSign, zSign;
1118 int16 aExp, bExp, expDiff;
1119 bits32 aSig, bSig, q, allZero, alternateASig;
1120 sbits32 sigMean;
1122 aSig = extractFloat32Frac( a );
1123 aExp = extractFloat32Exp( a );
1124 aSign = extractFloat32Sign( a );
1125 bSig = extractFloat32Frac( b );
1126 bExp = extractFloat32Exp( b );
1127 bSign = extractFloat32Sign( b );
1128 if ( aExp == 0xFF ) {
1129 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1130 return propagateFloat32NaN( a, b );
1132 float_raise( float_flag_invalid );
1133 return float32_default_nan;
1135 if ( bExp == 0xFF ) {
1136 if ( bSig ) return propagateFloat32NaN( a, b );
1137 return a;
1139 if ( bExp == 0 ) {
1140 if ( bSig == 0 ) {
1141 float_raise( float_flag_invalid );
1142 return float32_default_nan;
1144 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1146 if ( aExp == 0 ) {
1147 if ( aSig == 0 ) return a;
1148 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1150 expDiff = aExp - bExp;
1151 aSig = ( aSig | 0x00800000 )<<8;
1152 bSig = ( bSig | 0x00800000 )<<8;
1153 if ( expDiff < 0 ) {
1154 if ( expDiff < -1 ) return a;
1155 aSig >>= 1;
1157 q = ( bSig <= aSig );
1158 if ( q ) aSig -= bSig;
1159 expDiff -= 32;
1160 while ( 0 < expDiff ) {
1161 q = estimateDiv64To32( aSig, 0, bSig );
1162 q = ( 2 < q ) ? q - 2 : 0;
1163 aSig = - ( ( bSig>>2 ) * q );
1164 expDiff -= 30;
1166 expDiff += 32;
1167 if ( 0 < expDiff ) {
1168 q = estimateDiv64To32( aSig, 0, bSig );
1169 q = ( 2 < q ) ? q - 2 : 0;
1170 q >>= 32 - expDiff;
1171 bSig >>= 2;
1172 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1174 else {
1175 aSig >>= 2;
1176 bSig >>= 2;
1178 do {
1179 alternateASig = aSig;
1180 ++q;
1181 aSig -= bSig;
1182 } while ( 0 <= (sbits32) aSig );
1183 sigMean = aSig + alternateASig;
1184 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1185 aSig = alternateASig;
1187 zSign = ( (sbits32) aSig < 0 );
1188 if ( zSign ) aSig = - aSig;
1189 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1192 #endif
1194 #ifndef SOFTFLOAT_FOR_GCC
1196 -------------------------------------------------------------------------------
1197 Returns the square root of the single-precision floating-point value `a'.
1198 The operation is performed according to the IEC/IEEE Standard for Binary
1199 Floating-Point Arithmetic.
1200 -------------------------------------------------------------------------------
1202 float32 float32_sqrt( float32 a )
1204 flag aSign;
1205 int16 aExp, zExp;
1206 bits32 aSig, zSig, rem0, rem1, term0, term1;
1208 aSig = extractFloat32Frac( a );
1209 aExp = extractFloat32Exp( a );
1210 aSign = extractFloat32Sign( a );
1211 if ( aExp == 0xFF ) {
1212 if ( aSig ) return propagateFloat32NaN( a, 0 );
1213 if ( ! aSign ) return a;
1214 float_raise( float_flag_invalid );
1215 return float32_default_nan;
1217 if ( aSign ) {
1218 if ( ( aExp | aSig ) == 0 ) return a;
1219 float_raise( float_flag_invalid );
1220 return float32_default_nan;
1222 if ( aExp == 0 ) {
1223 if ( aSig == 0 ) return 0;
1224 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1226 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1227 aSig = ( aSig | 0x00800000 )<<8;
1228 zSig = estimateSqrt32( aExp, aSig ) + 2;
1229 if ( ( zSig & 0x7F ) <= 5 ) {
1230 if ( zSig < 2 ) {
1231 zSig = 0x7FFFFFFF;
1232 goto roundAndPack;
1234 else {
1235 aSig >>= aExp & 1;
1236 mul32To64( zSig, zSig, &term0, &term1 );
1237 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1238 while ( (sbits32) rem0 < 0 ) {
1239 --zSig;
1240 shortShift64Left( 0, zSig, 1, &term0, &term1 );
1241 term1 |= 1;
1242 add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1244 zSig |= ( ( rem0 | rem1 ) != 0 );
1247 shift32RightJamming( zSig, 1, &zSig );
1248 roundAndPack:
1249 return roundAndPackFloat32( 0, zExp, zSig );
1252 #endif
1255 -------------------------------------------------------------------------------
1256 Returns 1 if the single-precision floating-point value `a' is equal to
1257 the corresponding value `b', and 0 otherwise. The comparison is performed
1258 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259 -------------------------------------------------------------------------------
1261 flag float32_eq( float32 a, float32 b )
1264 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1265 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1267 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1268 float_raise( float_flag_invalid );
1270 return 0;
1272 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1277 -------------------------------------------------------------------------------
1278 Returns 1 if the single-precision floating-point value `a' is less than
1279 or equal to the corresponding value `b', and 0 otherwise. The comparison
1280 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1281 Arithmetic.
1282 -------------------------------------------------------------------------------
1284 flag float32_le( float32 a, float32 b )
1286 flag aSign, bSign;
1288 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1289 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1291 float_raise( float_flag_invalid );
1292 return 0;
1294 aSign = extractFloat32Sign( a );
1295 bSign = extractFloat32Sign( b );
1296 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1297 return ( a == b ) || ( aSign ^ ( a < b ) );
1302 -------------------------------------------------------------------------------
1303 Returns 1 if the single-precision floating-point value `a' is less than
1304 the corresponding value `b', and 0 otherwise. The comparison is performed
1305 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1306 -------------------------------------------------------------------------------
1308 flag float32_lt( float32 a, float32 b )
1310 flag aSign, bSign;
1312 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1313 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1315 float_raise( float_flag_invalid );
1316 return 0;
1318 aSign = extractFloat32Sign( a );
1319 bSign = extractFloat32Sign( b );
1320 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1321 return ( a != b ) && ( aSign ^ ( a < b ) );
1325 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1327 -------------------------------------------------------------------------------
1328 Returns 1 if the single-precision floating-point value `a' is equal to
1329 the corresponding value `b', and 0 otherwise. The invalid exception is
1330 raised if either operand is a NaN. Otherwise, the comparison is performed
1331 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 -------------------------------------------------------------------------------
1334 flag float32_eq_signaling( float32 a, float32 b )
1337 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1338 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1340 float_raise( float_flag_invalid );
1341 return 0;
1343 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1348 -------------------------------------------------------------------------------
1349 Returns 1 if the single-precision floating-point value `a' is less than or
1350 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1351 cause an exception. Otherwise, the comparison is performed according to the
1352 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1353 -------------------------------------------------------------------------------
1355 flag float32_le_quiet( float32 a, float32 b )
1357 flag aSign, bSign;
1358 int16 aExp, bExp;
1360 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1361 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1363 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1364 float_raise( float_flag_invalid );
1366 return 0;
1368 aSign = extractFloat32Sign( a );
1369 bSign = extractFloat32Sign( b );
1370 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1371 return ( a == b ) || ( aSign ^ ( a < b ) );
1376 -------------------------------------------------------------------------------
1377 Returns 1 if the single-precision floating-point value `a' is less than
1378 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1379 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1380 Standard for Binary Floating-Point Arithmetic.
1381 -------------------------------------------------------------------------------
1383 flag float32_lt_quiet( float32 a, float32 b )
1385 flag aSign, bSign;
1387 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1388 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1390 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1391 float_raise( float_flag_invalid );
1393 return 0;
1395 aSign = extractFloat32Sign( a );
1396 bSign = extractFloat32Sign( b );
1397 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1398 return ( a != b ) && ( aSign ^ ( a < b ) );
1401 #endif /* !SOFTFLOAT_FOR_GCC */
1403 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1405 -------------------------------------------------------------------------------
1406 Returns the result of converting the double-precision floating-point value
1407 `a' to the 32-bit two's complement integer format. The conversion is
1408 performed according to the IEC/IEEE Standard for Binary Floating-Point
1409 Arithmetic---which means in particular that the conversion is rounded
1410 according to the current rounding mode. If `a' is a NaN, the largest
1411 positive integer is returned. Otherwise, if the conversion overflows, the
1412 largest integer with the same sign as `a' is returned.
1413 -------------------------------------------------------------------------------
1415 int32 float64_to_int32( float64 a )
1417 flag aSign;
1418 int16 aExp, shiftCount;
1419 bits32 aSig0, aSig1, absZ, aSigExtra;
1420 int32 z;
1421 int8 roundingMode;
1423 aSig1 = extractFloat64Frac1( a );
1424 aSig0 = extractFloat64Frac0( a );
1425 aExp = extractFloat64Exp( a );
1426 aSign = extractFloat64Sign( a );
1427 shiftCount = aExp - 0x413;
1428 if ( 0 <= shiftCount ) {
1429 if ( 0x41E < aExp ) {
1430 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1431 goto invalid;
1433 shortShift64Left(
1434 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1435 if ( 0x80000000 < absZ ) goto invalid;
1437 else {
1438 aSig1 = ( aSig1 != 0 );
1439 if ( aExp < 0x3FE ) {
1440 aSigExtra = aExp | aSig0 | aSig1;
1441 absZ = 0;
1443 else {
1444 aSig0 |= 0x00100000;
1445 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1446 absZ = aSig0>>( - shiftCount );
1449 roundingMode = float_rounding_mode;
1450 if ( roundingMode == float_round_nearest_even ) {
1451 if ( (sbits32) aSigExtra < 0 ) {
1452 ++absZ;
1453 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1455 z = aSign ? - absZ : absZ;
1457 else {
1458 aSigExtra = ( aSigExtra != 0 );
1459 if ( aSign ) {
1460 z = - ( absZ
1461 + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1463 else {
1464 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1467 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1468 invalid:
1469 float_raise( float_flag_invalid );
1470 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1472 if ( aSigExtra ) set_float_exception_inexact_flag();
1473 return z;
1476 #endif /* !SOFTFLOAT_FOR_GCC */
1479 -------------------------------------------------------------------------------
1480 Returns the result of converting the double-precision floating-point value
1481 `a' to the 32-bit two's complement integer format. The conversion is
1482 performed according to the IEC/IEEE Standard for Binary Floating-Point
1483 Arithmetic, except that the conversion is always rounded toward zero.
1484 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1485 the conversion overflows, the largest integer with the same sign as `a' is
1486 returned.
1487 -------------------------------------------------------------------------------
1489 int32 float64_to_int32_round_to_zero( float64 a )
1491 flag aSign;
1492 int16 aExp, shiftCount;
1493 bits32 aSig0, aSig1, absZ, aSigExtra;
1494 int32 z;
1496 aSig1 = extractFloat64Frac1( a );
1497 aSig0 = extractFloat64Frac0( a );
1498 aExp = extractFloat64Exp( a );
1499 aSign = extractFloat64Sign( a );
1500 shiftCount = aExp - 0x413;
1501 if ( 0 <= shiftCount ) {
1502 if ( 0x41E < aExp ) {
1503 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1504 goto invalid;
1506 shortShift64Left(
1507 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1509 else {
1510 if ( aExp < 0x3FF ) {
1511 if ( aExp | aSig0 | aSig1 ) {
1512 set_float_exception_inexact_flag();
1514 return 0;
1516 aSig0 |= 0x00100000;
1517 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1518 absZ = aSig0>>( - shiftCount );
1520 z = aSign ? - absZ : absZ;
1521 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1522 invalid:
1523 float_raise( float_flag_invalid );
1524 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1526 if ( aSigExtra ) set_float_exception_inexact_flag();
1527 return z;
1532 -------------------------------------------------------------------------------
1533 Returns the result of converting the double-precision floating-point value
1534 `a' to the single-precision floating-point format. The conversion is
1535 performed according to the IEC/IEEE Standard for Binary Floating-Point
1536 Arithmetic.
1537 -------------------------------------------------------------------------------
1539 float32 float64_to_float32( float64 a )
1541 flag aSign;
1542 int16 aExp;
1543 bits32 aSig0, aSig1, zSig;
1544 bits32 allZero;
1546 aSig1 = extractFloat64Frac1( a );
1547 aSig0 = extractFloat64Frac0( a );
1548 aExp = extractFloat64Exp( a );
1549 aSign = extractFloat64Sign( a );
1550 if ( aExp == 0x7FF ) {
1551 if ( aSig0 | aSig1 ) {
1552 return commonNaNToFloat32( float64ToCommonNaN( a ) );
1554 return packFloat32( aSign, 0xFF, 0 );
1556 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1557 if ( aExp ) zSig |= 0x40000000;
1558 return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1562 #ifndef SOFTFLOAT_FOR_GCC
1564 -------------------------------------------------------------------------------
1565 Rounds the double-precision floating-point value `a' to an integer,
1566 and returns the result as a double-precision floating-point value. The
1567 operation is performed according to the IEC/IEEE Standard for Binary
1568 Floating-Point Arithmetic.
1569 -------------------------------------------------------------------------------
1571 float64 float64_round_to_int( float64 a )
1573 flag aSign;
1574 int16 aExp;
1575 bits32 lastBitMask, roundBitsMask;
1576 int8 roundingMode;
1577 float64 z;
1579 aExp = extractFloat64Exp( a );
1580 if ( 0x413 <= aExp ) {
1581 if ( 0x433 <= aExp ) {
1582 if ( ( aExp == 0x7FF )
1583 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1584 return propagateFloat64NaN( a, a );
1586 return a;
1588 lastBitMask = 1;
1589 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1590 roundBitsMask = lastBitMask - 1;
1591 z = a;
1592 roundingMode = float_rounding_mode;
1593 if ( roundingMode == float_round_nearest_even ) {
1594 if ( lastBitMask ) {
1595 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1596 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1598 else {
1599 if ( (sbits32) z.low < 0 ) {
1600 ++z.high;
1601 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1605 else if ( roundingMode != float_round_to_zero ) {
1606 if ( extractFloat64Sign( z )
1607 ^ ( roundingMode == float_round_up ) ) {
1608 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1611 z.low &= ~ roundBitsMask;
1613 else {
1614 if ( aExp <= 0x3FE ) {
1615 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1616 set_float_exception_inexact_flag();
1617 aSign = extractFloat64Sign( a );
1618 switch ( float_rounding_mode ) {
1619 case float_round_nearest_even:
1620 if ( ( aExp == 0x3FE )
1621 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1623 return packFloat64( aSign, 0x3FF, 0, 0 );
1625 break;
1626 case float_round_down:
1627 return
1628 aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1629 : packFloat64( 0, 0, 0, 0 );
1630 case float_round_up:
1631 return
1632 aSign ? packFloat64( 1, 0, 0, 0 )
1633 : packFloat64( 0, 0x3FF, 0, 0 );
1635 return packFloat64( aSign, 0, 0, 0 );
1637 lastBitMask = 1;
1638 lastBitMask <<= 0x413 - aExp;
1639 roundBitsMask = lastBitMask - 1;
1640 z.low = 0;
1641 z.high = a.high;
1642 roundingMode = float_rounding_mode;
1643 if ( roundingMode == float_round_nearest_even ) {
1644 z.high += lastBitMask>>1;
1645 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1646 z.high &= ~ lastBitMask;
1649 else if ( roundingMode != float_round_to_zero ) {
1650 if ( extractFloat64Sign( z )
1651 ^ ( roundingMode == float_round_up ) ) {
1652 z.high |= ( a.low != 0 );
1653 z.high += roundBitsMask;
1656 z.high &= ~ roundBitsMask;
1658 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1659 set_float_exception_inexact_flag();
1661 return z;
1664 #endif
1667 -------------------------------------------------------------------------------
1668 Returns the result of adding the absolute values of the double-precision
1669 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1670 before being returned. `zSign' is ignored if the result is a NaN.
1671 The addition is performed according to the IEC/IEEE Standard for Binary
1672 Floating-Point Arithmetic.
1673 -------------------------------------------------------------------------------
1675 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1677 int16 aExp, bExp, zExp;
1678 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1679 int16 expDiff;
1681 aSig1 = extractFloat64Frac1( a );
1682 aSig0 = extractFloat64Frac0( a );
1683 aExp = extractFloat64Exp( a );
1684 bSig1 = extractFloat64Frac1( b );
1685 bSig0 = extractFloat64Frac0( b );
1686 bExp = extractFloat64Exp( b );
1687 expDiff = aExp - bExp;
1688 if ( 0 < expDiff ) {
1689 if ( aExp == 0x7FF ) {
1690 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1691 return a;
1693 if ( bExp == 0 ) {
1694 --expDiff;
1696 else {
1697 bSig0 |= 0x00100000;
1699 shift64ExtraRightJamming(
1700 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1701 zExp = aExp;
1703 else if ( expDiff < 0 ) {
1704 if ( bExp == 0x7FF ) {
1705 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1706 return packFloat64( zSign, 0x7FF, 0, 0 );
1708 if ( aExp == 0 ) {
1709 ++expDiff;
1711 else {
1712 aSig0 |= 0x00100000;
1714 shift64ExtraRightJamming(
1715 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1716 zExp = bExp;
1718 else {
1719 if ( aExp == 0x7FF ) {
1720 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1721 return propagateFloat64NaN( a, b );
1723 return a;
1725 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1726 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1727 zSig2 = 0;
1728 zSig0 |= 0x00200000;
1729 zExp = aExp;
1730 goto shiftRight1;
1732 aSig0 |= 0x00100000;
1733 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1734 --zExp;
1735 if ( zSig0 < 0x00200000 ) goto roundAndPack;
1736 ++zExp;
1737 shiftRight1:
1738 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1739 roundAndPack:
1740 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1745 -------------------------------------------------------------------------------
1746 Returns the result of subtracting the absolute values of the double-
1747 precision floating-point values `a' and `b'. If `zSign' is 1, the
1748 difference is negated before being returned. `zSign' is ignored if the
1749 result is a NaN. The subtraction is performed according to the IEC/IEEE
1750 Standard for Binary Floating-Point Arithmetic.
1751 -------------------------------------------------------------------------------
1753 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1755 int16 aExp, bExp, zExp;
1756 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1757 int16 expDiff;
1759 aSig1 = extractFloat64Frac1( a );
1760 aSig0 = extractFloat64Frac0( a );
1761 aExp = extractFloat64Exp( a );
1762 bSig1 = extractFloat64Frac1( b );
1763 bSig0 = extractFloat64Frac0( b );
1764 bExp = extractFloat64Exp( b );
1765 expDiff = aExp - bExp;
1766 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1767 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1768 if ( 0 < expDiff ) goto aExpBigger;
1769 if ( expDiff < 0 ) goto bExpBigger;
1770 if ( aExp == 0x7FF ) {
1771 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1772 return propagateFloat64NaN( a, b );
1774 float_raise( float_flag_invalid );
1775 return float64_default_nan;
1777 if ( aExp == 0 ) {
1778 aExp = 1;
1779 bExp = 1;
1781 if ( bSig0 < aSig0 ) goto aBigger;
1782 if ( aSig0 < bSig0 ) goto bBigger;
1783 if ( bSig1 < aSig1 ) goto aBigger;
1784 if ( aSig1 < bSig1 ) goto bBigger;
1785 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1786 bExpBigger:
1787 if ( bExp == 0x7FF ) {
1788 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1789 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1791 if ( aExp == 0 ) {
1792 ++expDiff;
1794 else {
1795 aSig0 |= 0x40000000;
1797 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1798 bSig0 |= 0x40000000;
1799 bBigger:
1800 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1801 zExp = bExp;
1802 zSign ^= 1;
1803 goto normalizeRoundAndPack;
1804 aExpBigger:
1805 if ( aExp == 0x7FF ) {
1806 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1807 return a;
1809 if ( bExp == 0 ) {
1810 --expDiff;
1812 else {
1813 bSig0 |= 0x40000000;
1815 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1816 aSig0 |= 0x40000000;
1817 aBigger:
1818 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1819 zExp = aExp;
1820 normalizeRoundAndPack:
1821 --zExp;
1822 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1827 -------------------------------------------------------------------------------
1828 Returns the result of adding the double-precision floating-point values `a'
1829 and `b'. The operation is performed according to the IEC/IEEE Standard for
1830 Binary Floating-Point Arithmetic.
1831 -------------------------------------------------------------------------------
1833 float64 float64_add( float64 a, float64 b )
1835 flag aSign, bSign;
1837 aSign = extractFloat64Sign( a );
1838 bSign = extractFloat64Sign( b );
1839 if ( aSign == bSign ) {
1840 return addFloat64Sigs( a, b, aSign );
1842 else {
1843 return subFloat64Sigs( a, b, aSign );
1849 -------------------------------------------------------------------------------
1850 Returns the result of subtracting the double-precision floating-point values
1851 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1852 for Binary Floating-Point Arithmetic.
1853 -------------------------------------------------------------------------------
1855 float64 float64_sub( float64 a, float64 b )
1857 flag aSign, bSign;
1859 aSign = extractFloat64Sign( a );
1860 bSign = extractFloat64Sign( b );
1861 if ( aSign == bSign ) {
1862 return subFloat64Sigs( a, b, aSign );
1864 else {
1865 return addFloat64Sigs( a, b, aSign );
1871 -------------------------------------------------------------------------------
1872 Returns the result of multiplying the double-precision floating-point values
1873 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1874 for Binary Floating-Point Arithmetic.
1875 -------------------------------------------------------------------------------
1877 float64 float64_mul( float64 a, float64 b )
1879 flag aSign, bSign, zSign;
1880 int16 aExp, bExp, zExp;
1881 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1883 aSig1 = extractFloat64Frac1( a );
1884 aSig0 = extractFloat64Frac0( a );
1885 aExp = extractFloat64Exp( a );
1886 aSign = extractFloat64Sign( a );
1887 bSig1 = extractFloat64Frac1( b );
1888 bSig0 = extractFloat64Frac0( b );
1889 bExp = extractFloat64Exp( b );
1890 bSign = extractFloat64Sign( b );
1891 zSign = aSign ^ bSign;
1892 if ( aExp == 0x7FF ) {
1893 if ( ( aSig0 | aSig1 )
1894 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1895 return propagateFloat64NaN( a, b );
1897 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1898 return packFloat64( zSign, 0x7FF, 0, 0 );
1900 if ( bExp == 0x7FF ) {
1901 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1902 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1903 invalid:
1904 float_raise( float_flag_invalid );
1905 return float64_default_nan;
1907 return packFloat64( zSign, 0x7FF, 0, 0 );
1909 if ( aExp == 0 ) {
1910 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1911 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1913 if ( bExp == 0 ) {
1914 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1915 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1917 zExp = aExp + bExp - 0x400;
1918 aSig0 |= 0x00100000;
1919 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1920 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1921 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1922 zSig2 |= ( zSig3 != 0 );
1923 if ( 0x00200000 <= zSig0 ) {
1924 shift64ExtraRightJamming(
1925 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1926 ++zExp;
1928 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1933 -------------------------------------------------------------------------------
1934 Returns the result of dividing the double-precision floating-point value `a'
1935 by the corresponding value `b'. The operation is performed according to the
1936 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1937 -------------------------------------------------------------------------------
1939 float64 float64_div( float64 a, float64 b )
1941 flag aSign, bSign, zSign;
1942 int16 aExp, bExp, zExp;
1943 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1944 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1946 aSig1 = extractFloat64Frac1( a );
1947 aSig0 = extractFloat64Frac0( a );
1948 aExp = extractFloat64Exp( a );
1949 aSign = extractFloat64Sign( a );
1950 bSig1 = extractFloat64Frac1( b );
1951 bSig0 = extractFloat64Frac0( b );
1952 bExp = extractFloat64Exp( b );
1953 bSign = extractFloat64Sign( b );
1954 zSign = aSign ^ bSign;
1955 if ( aExp == 0x7FF ) {
1956 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1957 if ( bExp == 0x7FF ) {
1958 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1959 goto invalid;
1961 return packFloat64( zSign, 0x7FF, 0, 0 );
1963 if ( bExp == 0x7FF ) {
1964 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1965 return packFloat64( zSign, 0, 0, 0 );
1967 if ( bExp == 0 ) {
1968 if ( ( bSig0 | bSig1 ) == 0 ) {
1969 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1970 invalid:
1971 float_raise( float_flag_invalid );
1972 return float64_default_nan;
1974 float_raise( float_flag_divbyzero );
1975 return packFloat64( zSign, 0x7FF, 0, 0 );
1977 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1979 if ( aExp == 0 ) {
1980 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1981 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1983 zExp = aExp - bExp + 0x3FD;
1984 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1985 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1986 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1987 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1988 ++zExp;
1990 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1991 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1992 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1993 while ( (sbits32) rem0 < 0 ) {
1994 --zSig0;
1995 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1997 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1998 if ( ( zSig1 & 0x3FF ) <= 4 ) {
1999 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
2000 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
2001 while ( (sbits32) rem1 < 0 ) {
2002 --zSig1;
2003 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
2005 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2007 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
2008 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
2012 #ifndef SOFTFLOAT_FOR_GCC
2014 -------------------------------------------------------------------------------
2015 Returns the remainder of the double-precision floating-point value `a'
2016 with respect to the corresponding value `b'. The operation is performed
2017 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2018 -------------------------------------------------------------------------------
2020 float64 float64_rem( float64 a, float64 b )
2022 flag aSign, bSign, zSign;
2023 int16 aExp, bExp, expDiff;
2024 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2025 bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2026 sbits32 sigMean0;
2027 float64 z;
2029 aSig1 = extractFloat64Frac1( a );
2030 aSig0 = extractFloat64Frac0( a );
2031 aExp = extractFloat64Exp( a );
2032 aSign = extractFloat64Sign( a );
2033 bSig1 = extractFloat64Frac1( b );
2034 bSig0 = extractFloat64Frac0( b );
2035 bExp = extractFloat64Exp( b );
2036 bSign = extractFloat64Sign( b );
2037 if ( aExp == 0x7FF ) {
2038 if ( ( aSig0 | aSig1 )
2039 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2040 return propagateFloat64NaN( a, b );
2042 goto invalid;
2044 if ( bExp == 0x7FF ) {
2045 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2046 return a;
2048 if ( bExp == 0 ) {
2049 if ( ( bSig0 | bSig1 ) == 0 ) {
2050 invalid:
2051 float_raise( float_flag_invalid );
2052 return float64_default_nan;
2054 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2056 if ( aExp == 0 ) {
2057 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2058 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2060 expDiff = aExp - bExp;
2061 if ( expDiff < -1 ) return a;
2062 shortShift64Left(
2063 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2064 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2065 q = le64( bSig0, bSig1, aSig0, aSig1 );
2066 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2067 expDiff -= 32;
2068 while ( 0 < expDiff ) {
2069 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2070 q = ( 4 < q ) ? q - 4 : 0;
2071 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2072 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2073 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2074 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2075 expDiff -= 29;
2077 if ( -32 < expDiff ) {
2078 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2079 q = ( 4 < q ) ? q - 4 : 0;
2080 q >>= - expDiff;
2081 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2082 expDiff += 24;
2083 if ( expDiff < 0 ) {
2084 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2086 else {
2087 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2089 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2090 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2092 else {
2093 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2094 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2096 do {
2097 alternateASig0 = aSig0;
2098 alternateASig1 = aSig1;
2099 ++q;
2100 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2101 } while ( 0 <= (sbits32) aSig0 );
2102 add64(
2103 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2104 if ( ( sigMean0 < 0 )
2105 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2106 aSig0 = alternateASig0;
2107 aSig1 = alternateASig1;
2109 zSign = ( (sbits32) aSig0 < 0 );
2110 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2111 return
2112 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2115 #endif
2117 #ifndef SOFTFLOAT_FOR_GCC
2119 -------------------------------------------------------------------------------
2120 Returns the square root of the double-precision floating-point value `a'.
2121 The operation is performed according to the IEC/IEEE Standard for Binary
2122 Floating-Point Arithmetic.
2123 -------------------------------------------------------------------------------
2125 float64 float64_sqrt( float64 a )
2127 flag aSign;
2128 int16 aExp, zExp;
2129 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2130 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2131 float64 z;
2133 aSig1 = extractFloat64Frac1( a );
2134 aSig0 = extractFloat64Frac0( a );
2135 aExp = extractFloat64Exp( a );
2136 aSign = extractFloat64Sign( a );
2137 if ( aExp == 0x7FF ) {
2138 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2139 if ( ! aSign ) return a;
2140 goto invalid;
2142 if ( aSign ) {
2143 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2144 invalid:
2145 float_raise( float_flag_invalid );
2146 return float64_default_nan;
2148 if ( aExp == 0 ) {
2149 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2150 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2152 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2153 aSig0 |= 0x00100000;
2154 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2155 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2156 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2157 doubleZSig0 = zSig0 + zSig0;
2158 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2159 mul32To64( zSig0, zSig0, &term0, &term1 );
2160 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2161 while ( (sbits32) rem0 < 0 ) {
2162 --zSig0;
2163 doubleZSig0 -= 2;
2164 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2166 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2167 if ( ( zSig1 & 0x1FF ) <= 5 ) {
2168 if ( zSig1 == 0 ) zSig1 = 1;
2169 mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2170 sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2171 mul32To64( zSig1, zSig1, &term2, &term3 );
2172 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2173 while ( (sbits32) rem1 < 0 ) {
2174 --zSig1;
2175 shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2176 term3 |= 1;
2177 term2 |= doubleZSig0;
2178 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2180 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2182 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2183 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2186 #endif
2189 -------------------------------------------------------------------------------
2190 Returns 1 if the double-precision floating-point value `a' is equal to
2191 the corresponding value `b', and 0 otherwise. The comparison is performed
2192 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2193 -------------------------------------------------------------------------------
2195 flag float64_eq( float64 a, float64 b )
2198 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2199 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2200 || ( ( extractFloat64Exp( b ) == 0x7FF )
2201 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2203 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2204 float_raise( float_flag_invalid );
2206 return 0;
2208 return ( a == b ) ||
2209 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2214 -------------------------------------------------------------------------------
2215 Returns 1 if the double-precision floating-point value `a' is less than
2216 or equal to the corresponding value `b', and 0 otherwise. The comparison
2217 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2218 Arithmetic.
2219 -------------------------------------------------------------------------------
2221 flag float64_le( float64 a, float64 b )
2223 flag aSign, bSign;
2225 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2226 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2227 || ( ( extractFloat64Exp( b ) == 0x7FF )
2228 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2230 float_raise( float_flag_invalid );
2231 return 0;
2233 aSign = extractFloat64Sign( a );
2234 bSign = extractFloat64Sign( b );
2235 if ( aSign != bSign )
2236 return aSign ||
2237 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2238 0 );
2239 return ( a == b ) ||
2240 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2244 -------------------------------------------------------------------------------
2245 Returns 1 if the double-precision floating-point value `a' is less than
2246 the corresponding value `b', and 0 otherwise. The comparison is performed
2247 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2248 -------------------------------------------------------------------------------
2250 flag float64_lt( float64 a, float64 b )
2252 flag aSign, bSign;
2254 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2255 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2256 || ( ( extractFloat64Exp( b ) == 0x7FF )
2257 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2259 float_raise( float_flag_invalid );
2260 return 0;
2262 aSign = extractFloat64Sign( a );
2263 bSign = extractFloat64Sign( b );
2264 if ( aSign != bSign )
2265 return aSign &&
2266 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2267 0 );
2268 return ( a != b ) &&
2269 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2273 #ifndef SOFTFLOAT_FOR_GCC
2275 -------------------------------------------------------------------------------
2276 Returns 1 if the double-precision floating-point value `a' is equal to
2277 the corresponding value `b', and 0 otherwise. The invalid exception is
2278 raised if either operand is a NaN. Otherwise, the comparison is performed
2279 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2280 -------------------------------------------------------------------------------
2282 flag float64_eq_signaling( float64 a, float64 b )
2285 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2286 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2287 || ( ( extractFloat64Exp( b ) == 0x7FF )
2288 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2290 float_raise( float_flag_invalid );
2291 return 0;
2293 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2298 -------------------------------------------------------------------------------
2299 Returns 1 if the double-precision floating-point value `a' is less than or
2300 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2301 cause an exception. Otherwise, the comparison is performed according to the
2302 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2303 -------------------------------------------------------------------------------
2305 flag float64_le_quiet( float64 a, float64 b )
2307 flag aSign, bSign;
2309 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2310 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2311 || ( ( extractFloat64Exp( b ) == 0x7FF )
2312 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2314 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2315 float_raise( float_flag_invalid );
2317 return 0;
2319 aSign = extractFloat64Sign( a );
2320 bSign = extractFloat64Sign( b );
2321 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2322 return ( a == b ) || ( aSign ^ ( a < b ) );
2327 -------------------------------------------------------------------------------
2328 Returns 1 if the double-precision floating-point value `a' is less than
2329 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2330 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2331 Standard for Binary Floating-Point Arithmetic.
2332 -------------------------------------------------------------------------------
2334 flag float64_lt_quiet( float64 a, float64 b )
2336 flag aSign, bSign;
2338 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2339 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2340 || ( ( extractFloat64Exp( b ) == 0x7FF )
2341 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2343 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2344 float_raise( float_flag_invalid );
2346 return 0;
2348 aSign = extractFloat64Sign( a );
2349 bSign = extractFloat64Sign( b );
2350 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2351 return ( a != b ) && ( aSign ^ ( a < b ) );
2355 #endif