add a simple fork test
[qemu/linux-user.git] / fpu / softfloat.c
blob3d5169db930b0f7b4f35766f251c94f7340fa216
2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also
34 be flushed to zero. */
35 #include "softfloat.h"
37 /*----------------------------------------------------------------------------
38 | Primitive arithmetic functions, including multi-word arithmetic, and
39 | division and square root approximations. (Can be specialized to target if
40 | desired.)
41 *----------------------------------------------------------------------------*/
42 #include "softfloat-macros.h"
44 /*----------------------------------------------------------------------------
45 | Functions and definitions to determine: (1) whether tininess for underflow
46 | is detected before or after rounding by default, (2) what (if anything)
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
49 | are propagated from function inputs to output. These details are target-
50 | specific.
51 *----------------------------------------------------------------------------*/
52 #include "softfloat-specialize.h"
54 void set_float_rounding_mode(int val STATUS_PARAM)
56 STATUS(float_rounding_mode) = val;
59 void set_float_exception_flags(int val STATUS_PARAM)
61 STATUS(float_exception_flags) = val;
64 #ifdef FLOATX80
65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
67 STATUS(floatx80_rounding_precision) = val;
69 #endif
71 /*----------------------------------------------------------------------------
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
74 | input. If `zSign' is 1, the input is negated before being converted to an
75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76 | is simply rounded to an integer, with the inexact exception raised if the
77 | input cannot be represented exactly as an integer. However, if the fixed-
78 | point input is too large, the invalid exception is raised and the largest
79 | positive or negative integer is returned.
80 *----------------------------------------------------------------------------*/
82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
84 int8 roundingMode;
85 flag roundNearestEven;
86 int8 roundIncrement, roundBits;
87 int32 z;
89 roundingMode = STATUS(float_rounding_mode);
90 roundNearestEven = ( roundingMode == float_round_nearest_even );
91 roundIncrement = 0x40;
92 if ( ! roundNearestEven ) {
93 if ( roundingMode == float_round_to_zero ) {
94 roundIncrement = 0;
96 else {
97 roundIncrement = 0x7F;
98 if ( zSign ) {
99 if ( roundingMode == float_round_up ) roundIncrement = 0;
101 else {
102 if ( roundingMode == float_round_down ) roundIncrement = 0;
106 roundBits = absZ & 0x7F;
107 absZ = ( absZ + roundIncrement )>>7;
108 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109 z = absZ;
110 if ( zSign ) z = - z;
111 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112 float_raise( float_flag_invalid STATUS_VAR);
113 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
115 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
116 return z;
120 /*----------------------------------------------------------------------------
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
123 | and returns the properly rounded 64-bit integer corresponding to the input.
124 | If `zSign' is 1, the input is negated before being converted to an integer.
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
126 | the inexact exception raised if the input cannot be represented exactly as
127 | an integer. However, if the fixed-point input is too large, the invalid
128 | exception is raised and the largest positive or negative integer is
129 | returned.
130 *----------------------------------------------------------------------------*/
132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
134 int8 roundingMode;
135 flag roundNearestEven, increment;
136 int64 z;
138 roundingMode = STATUS(float_rounding_mode);
139 roundNearestEven = ( roundingMode == float_round_nearest_even );
140 increment = ( (sbits64) absZ1 < 0 );
141 if ( ! roundNearestEven ) {
142 if ( roundingMode == float_round_to_zero ) {
143 increment = 0;
145 else {
146 if ( zSign ) {
147 increment = ( roundingMode == float_round_down ) && absZ1;
149 else {
150 increment = ( roundingMode == float_round_up ) && absZ1;
154 if ( increment ) {
155 ++absZ0;
156 if ( absZ0 == 0 ) goto overflow;
157 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
159 z = absZ0;
160 if ( zSign ) z = - z;
161 if ( z && ( ( z < 0 ) ^ zSign ) ) {
162 overflow:
163 float_raise( float_flag_invalid STATUS_VAR);
164 return
165 zSign ? (sbits64) LIT64( 0x8000000000000000 )
166 : LIT64( 0x7FFFFFFFFFFFFFFF );
168 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
169 return z;
173 /*----------------------------------------------------------------------------
174 | Returns the fraction bits of the single-precision floating-point value `a'.
175 *----------------------------------------------------------------------------*/
177 INLINE bits32 extractFloat32Frac( float32 a )
180 return float32_val(a) & 0x007FFFFF;
184 /*----------------------------------------------------------------------------
185 | Returns the exponent bits of the single-precision floating-point value `a'.
186 *----------------------------------------------------------------------------*/
188 INLINE int16 extractFloat32Exp( float32 a )
191 return ( float32_val(a)>>23 ) & 0xFF;
195 /*----------------------------------------------------------------------------
196 | Returns the sign bit of the single-precision floating-point value `a'.
197 *----------------------------------------------------------------------------*/
199 INLINE flag extractFloat32Sign( float32 a )
202 return float32_val(a)>>31;
206 /*----------------------------------------------------------------------------
207 | Normalizes the subnormal single-precision floating-point value represented
208 | by the denormalized significand `aSig'. The normalized exponent and
209 | significand are stored at the locations pointed to by `zExpPtr' and
210 | `zSigPtr', respectively.
211 *----------------------------------------------------------------------------*/
213 static void
214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
216 int8 shiftCount;
218 shiftCount = countLeadingZeros32( aSig ) - 8;
219 *zSigPtr = aSig<<shiftCount;
220 *zExpPtr = 1 - shiftCount;
224 /*----------------------------------------------------------------------------
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226 | single-precision floating-point value, returning the result. After being
227 | shifted into the proper positions, the three fields are simply added
228 | together to form the result. This means that any integer portion of `zSig'
229 | will be added into the exponent. Since a properly normalized significand
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
231 | than the desired result exponent whenever `zSig' is a complete, normalized
232 | significand.
233 *----------------------------------------------------------------------------*/
235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
238 return make_float32(
239 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
243 /*----------------------------------------------------------------------------
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245 | and significand `zSig', and returns the proper single-precision floating-
246 | point value corresponding to the abstract input. Ordinarily, the abstract
247 | value is simply rounded and packed into the single-precision format, with
248 | the inexact exception raised if the abstract input cannot be represented
249 | exactly. However, if the abstract value is too large, the overflow and
250 | inexact exceptions are raised and an infinity or maximal finite value is
251 | returned. If the abstract value is too small, the input value is rounded to
252 | a subnormal number, and the underflow and inexact exceptions are raised if
253 | the abstract input cannot be represented exactly as a subnormal single-
254 | precision floating-point number.
255 | The input significand `zSig' has its binary point between bits 30
256 | and 29, which is 7 bits to the left of the usual location. This shifted
257 | significand must be normalized or smaller. If `zSig' is not normalized,
258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
259 | and it must not require rounding. In the usual case that `zSig' is
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
262 | Binary Floating-Point Arithmetic.
263 *----------------------------------------------------------------------------*/
265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
267 int8 roundingMode;
268 flag roundNearestEven;
269 int8 roundIncrement, roundBits;
270 flag isTiny;
272 roundingMode = STATUS(float_rounding_mode);
273 roundNearestEven = ( roundingMode == float_round_nearest_even );
274 roundIncrement = 0x40;
275 if ( ! roundNearestEven ) {
276 if ( roundingMode == float_round_to_zero ) {
277 roundIncrement = 0;
279 else {
280 roundIncrement = 0x7F;
281 if ( zSign ) {
282 if ( roundingMode == float_round_up ) roundIncrement = 0;
284 else {
285 if ( roundingMode == float_round_down ) roundIncrement = 0;
289 roundBits = zSig & 0x7F;
290 if ( 0xFD <= (bits16) zExp ) {
291 if ( ( 0xFD < zExp )
292 || ( ( zExp == 0xFD )
293 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
295 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
296 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
298 if ( zExp < 0 ) {
299 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300 isTiny =
301 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302 || ( zExp < -1 )
303 || ( zSig + roundIncrement < 0x80000000 );
304 shift32RightJamming( zSig, - zExp, &zSig );
305 zExp = 0;
306 roundBits = zSig & 0x7F;
307 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
310 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
311 zSig = ( zSig + roundIncrement )>>7;
312 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313 if ( zSig == 0 ) zExp = 0;
314 return packFloat32( zSign, zExp, zSig );
318 /*----------------------------------------------------------------------------
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320 | and significand `zSig', and returns the proper single-precision floating-
321 | point value corresponding to the abstract input. This routine is just like
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324 | floating-point exponent.
325 *----------------------------------------------------------------------------*/
327 static float32
328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
330 int8 shiftCount;
332 shiftCount = countLeadingZeros32( zSig ) - 1;
333 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
337 /*----------------------------------------------------------------------------
338 | Returns the fraction bits of the double-precision floating-point value `a'.
339 *----------------------------------------------------------------------------*/
341 INLINE bits64 extractFloat64Frac( float64 a )
344 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
348 /*----------------------------------------------------------------------------
349 | Returns the exponent bits of the double-precision floating-point value `a'.
350 *----------------------------------------------------------------------------*/
352 INLINE int16 extractFloat64Exp( float64 a )
355 return ( float64_val(a)>>52 ) & 0x7FF;
359 /*----------------------------------------------------------------------------
360 | Returns the sign bit of the double-precision floating-point value `a'.
361 *----------------------------------------------------------------------------*/
363 INLINE flag extractFloat64Sign( float64 a )
366 return float64_val(a)>>63;
370 /*----------------------------------------------------------------------------
371 | Normalizes the subnormal double-precision floating-point value represented
372 | by the denormalized significand `aSig'. The normalized exponent and
373 | significand are stored at the locations pointed to by `zExpPtr' and
374 | `zSigPtr', respectively.
375 *----------------------------------------------------------------------------*/
377 static void
378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
380 int8 shiftCount;
382 shiftCount = countLeadingZeros64( aSig ) - 11;
383 *zSigPtr = aSig<<shiftCount;
384 *zExpPtr = 1 - shiftCount;
388 /*----------------------------------------------------------------------------
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390 | double-precision floating-point value, returning the result. After being
391 | shifted into the proper positions, the three fields are simply added
392 | together to form the result. This means that any integer portion of `zSig'
393 | will be added into the exponent. Since a properly normalized significand
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
395 | than the desired result exponent whenever `zSig' is a complete, normalized
396 | significand.
397 *----------------------------------------------------------------------------*/
399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
402 return make_float64(
403 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
407 /*----------------------------------------------------------------------------
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409 | and significand `zSig', and returns the proper double-precision floating-
410 | point value corresponding to the abstract input. Ordinarily, the abstract
411 | value is simply rounded and packed into the double-precision format, with
412 | the inexact exception raised if the abstract input cannot be represented
413 | exactly. However, if the abstract value is too large, the overflow and
414 | inexact exceptions are raised and an infinity or maximal finite value is
415 | returned. If the abstract value is too small, the input value is rounded
416 | to a subnormal number, and the underflow and inexact exceptions are raised
417 | if the abstract input cannot be represented exactly as a subnormal double-
418 | precision floating-point number.
419 | The input significand `zSig' has its binary point between bits 62
420 | and 61, which is 10 bits to the left of the usual location. This shifted
421 | significand must be normalized or smaller. If `zSig' is not normalized,
422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
423 | and it must not require rounding. In the usual case that `zSig' is
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
426 | Binary Floating-Point Arithmetic.
427 *----------------------------------------------------------------------------*/
429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
431 int8 roundingMode;
432 flag roundNearestEven;
433 int16 roundIncrement, roundBits;
434 flag isTiny;
436 roundingMode = STATUS(float_rounding_mode);
437 roundNearestEven = ( roundingMode == float_round_nearest_even );
438 roundIncrement = 0x200;
439 if ( ! roundNearestEven ) {
440 if ( roundingMode == float_round_to_zero ) {
441 roundIncrement = 0;
443 else {
444 roundIncrement = 0x3FF;
445 if ( zSign ) {
446 if ( roundingMode == float_round_up ) roundIncrement = 0;
448 else {
449 if ( roundingMode == float_round_down ) roundIncrement = 0;
453 roundBits = zSig & 0x3FF;
454 if ( 0x7FD <= (bits16) zExp ) {
455 if ( ( 0x7FD < zExp )
456 || ( ( zExp == 0x7FD )
457 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
459 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
460 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
462 if ( zExp < 0 ) {
463 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464 isTiny =
465 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466 || ( zExp < -1 )
467 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468 shift64RightJamming( zSig, - zExp, &zSig );
469 zExp = 0;
470 roundBits = zSig & 0x3FF;
471 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
474 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
475 zSig = ( zSig + roundIncrement )>>10;
476 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477 if ( zSig == 0 ) zExp = 0;
478 return packFloat64( zSign, zExp, zSig );
482 /*----------------------------------------------------------------------------
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484 | and significand `zSig', and returns the proper double-precision floating-
485 | point value corresponding to the abstract input. This routine is just like
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488 | floating-point exponent.
489 *----------------------------------------------------------------------------*/
491 static float64
492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
494 int8 shiftCount;
496 shiftCount = countLeadingZeros64( zSig ) - 1;
497 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
501 #ifdef FLOATX80
503 /*----------------------------------------------------------------------------
504 | Returns the fraction bits of the extended double-precision floating-point
505 | value `a'.
506 *----------------------------------------------------------------------------*/
508 INLINE bits64 extractFloatx80Frac( floatx80 a )
511 return a.low;
515 /*----------------------------------------------------------------------------
516 | Returns the exponent bits of the extended double-precision floating-point
517 | value `a'.
518 *----------------------------------------------------------------------------*/
520 INLINE int32 extractFloatx80Exp( floatx80 a )
523 return a.high & 0x7FFF;
527 /*----------------------------------------------------------------------------
528 | Returns the sign bit of the extended double-precision floating-point value
529 | `a'.
530 *----------------------------------------------------------------------------*/
532 INLINE flag extractFloatx80Sign( floatx80 a )
535 return a.high>>15;
539 /*----------------------------------------------------------------------------
540 | Normalizes the subnormal extended double-precision floating-point value
541 | represented by the denormalized significand `aSig'. The normalized exponent
542 | and significand are stored at the locations pointed to by `zExpPtr' and
543 | `zSigPtr', respectively.
544 *----------------------------------------------------------------------------*/
546 static void
547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
549 int8 shiftCount;
551 shiftCount = countLeadingZeros64( aSig );
552 *zSigPtr = aSig<<shiftCount;
553 *zExpPtr = 1 - shiftCount;
557 /*----------------------------------------------------------------------------
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559 | extended double-precision floating-point value, returning the result.
560 *----------------------------------------------------------------------------*/
562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
564 floatx80 z;
566 z.low = zSig;
567 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568 return z;
572 /*----------------------------------------------------------------------------
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
575 | and returns the proper extended double-precision floating-point value
576 | corresponding to the abstract input. Ordinarily, the abstract value is
577 | rounded and packed into the extended double-precision format, with the
578 | inexact exception raised if the abstract input cannot be represented
579 | exactly. However, if the abstract value is too large, the overflow and
580 | inexact exceptions are raised and an infinity or maximal finite value is
581 | returned. If the abstract value is too small, the input value is rounded to
582 | a subnormal number, and the underflow and inexact exceptions are raised if
583 | the abstract input cannot be represented exactly as a subnormal extended
584 | double-precision floating-point number.
585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
586 | number of bits as single or double precision, respectively. Otherwise, the
587 | result is rounded to the full precision of the extended double-precision
588 | format.
589 | The input significand must be normalized or smaller. If the input
590 | significand is not normalized, `zExp' must be 0; in that case, the result
591 | returned is a subnormal number, and it must not require rounding. The
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593 | Floating-Point Arithmetic.
594 *----------------------------------------------------------------------------*/
596 static floatx80
597 roundAndPackFloatx80(
598 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599 STATUS_PARAM)
601 int8 roundingMode;
602 flag roundNearestEven, increment, isTiny;
603 int64 roundIncrement, roundMask, roundBits;
605 roundingMode = STATUS(float_rounding_mode);
606 roundNearestEven = ( roundingMode == float_round_nearest_even );
607 if ( roundingPrecision == 80 ) goto precision80;
608 if ( roundingPrecision == 64 ) {
609 roundIncrement = LIT64( 0x0000000000000400 );
610 roundMask = LIT64( 0x00000000000007FF );
612 else if ( roundingPrecision == 32 ) {
613 roundIncrement = LIT64( 0x0000008000000000 );
614 roundMask = LIT64( 0x000000FFFFFFFFFF );
616 else {
617 goto precision80;
619 zSig0 |= ( zSig1 != 0 );
620 if ( ! roundNearestEven ) {
621 if ( roundingMode == float_round_to_zero ) {
622 roundIncrement = 0;
624 else {
625 roundIncrement = roundMask;
626 if ( zSign ) {
627 if ( roundingMode == float_round_up ) roundIncrement = 0;
629 else {
630 if ( roundingMode == float_round_down ) roundIncrement = 0;
634 roundBits = zSig0 & roundMask;
635 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636 if ( ( 0x7FFE < zExp )
637 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
639 goto overflow;
641 if ( zExp <= 0 ) {
642 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643 isTiny =
644 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645 || ( zExp < 0 )
646 || ( zSig0 <= zSig0 + roundIncrement );
647 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648 zExp = 0;
649 roundBits = zSig0 & roundMask;
650 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
652 zSig0 += roundIncrement;
653 if ( (sbits64) zSig0 < 0 ) zExp = 1;
654 roundIncrement = roundMask + 1;
655 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656 roundMask |= roundIncrement;
658 zSig0 &= ~ roundMask;
659 return packFloatx80( zSign, zExp, zSig0 );
662 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
663 zSig0 += roundIncrement;
664 if ( zSig0 < roundIncrement ) {
665 ++zExp;
666 zSig0 = LIT64( 0x8000000000000000 );
668 roundIncrement = roundMask + 1;
669 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670 roundMask |= roundIncrement;
672 zSig0 &= ~ roundMask;
673 if ( zSig0 == 0 ) zExp = 0;
674 return packFloatx80( zSign, zExp, zSig0 );
675 precision80:
676 increment = ( (sbits64) zSig1 < 0 );
677 if ( ! roundNearestEven ) {
678 if ( roundingMode == float_round_to_zero ) {
679 increment = 0;
681 else {
682 if ( zSign ) {
683 increment = ( roundingMode == float_round_down ) && zSig1;
685 else {
686 increment = ( roundingMode == float_round_up ) && zSig1;
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691 if ( ( 0x7FFE < zExp )
692 || ( ( zExp == 0x7FFE )
693 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694 && increment
697 roundMask = 0;
698 overflow:
699 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
700 if ( ( roundingMode == float_round_to_zero )
701 || ( zSign && ( roundingMode == float_round_up ) )
702 || ( ! zSign && ( roundingMode == float_round_down ) )
704 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
706 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
708 if ( zExp <= 0 ) {
709 isTiny =
710 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711 || ( zExp < 0 )
712 || ! increment
713 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715 zExp = 0;
716 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
718 if ( roundNearestEven ) {
719 increment = ( (sbits64) zSig1 < 0 );
721 else {
722 if ( zSign ) {
723 increment = ( roundingMode == float_round_down ) && zSig1;
725 else {
726 increment = ( roundingMode == float_round_up ) && zSig1;
729 if ( increment ) {
730 ++zSig0;
731 zSig0 &=
732 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733 if ( (sbits64) zSig0 < 0 ) zExp = 1;
735 return packFloatx80( zSign, zExp, zSig0 );
738 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
739 if ( increment ) {
740 ++zSig0;
741 if ( zSig0 == 0 ) {
742 ++zExp;
743 zSig0 = LIT64( 0x8000000000000000 );
745 else {
746 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
749 else {
750 if ( zSig0 == 0 ) zExp = 0;
752 return packFloatx80( zSign, zExp, zSig0 );
756 /*----------------------------------------------------------------------------
757 | Takes an abstract floating-point value having sign `zSign', exponent
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759 | and returns the proper extended double-precision floating-point value
760 | corresponding to the abstract input. This routine is just like
761 | `roundAndPackFloatx80' except that the input significand does not have to be
762 | normalized.
763 *----------------------------------------------------------------------------*/
765 static floatx80
766 normalizeRoundAndPackFloatx80(
767 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768 STATUS_PARAM)
770 int8 shiftCount;
772 if ( zSig0 == 0 ) {
773 zSig0 = zSig1;
774 zSig1 = 0;
775 zExp -= 64;
777 shiftCount = countLeadingZeros64( zSig0 );
778 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779 zExp -= shiftCount;
780 return
781 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
785 #endif
787 #ifdef FLOAT128
789 /*----------------------------------------------------------------------------
790 | Returns the least-significant 64 fraction bits of the quadruple-precision
791 | floating-point value `a'.
792 *----------------------------------------------------------------------------*/
794 INLINE bits64 extractFloat128Frac1( float128 a )
797 return a.low;
801 /*----------------------------------------------------------------------------
802 | Returns the most-significant 48 fraction bits of the quadruple-precision
803 | floating-point value `a'.
804 *----------------------------------------------------------------------------*/
806 INLINE bits64 extractFloat128Frac0( float128 a )
809 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
813 /*----------------------------------------------------------------------------
814 | Returns the exponent bits of the quadruple-precision floating-point value
815 | `a'.
816 *----------------------------------------------------------------------------*/
818 INLINE int32 extractFloat128Exp( float128 a )
821 return ( a.high>>48 ) & 0x7FFF;
825 /*----------------------------------------------------------------------------
826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
827 *----------------------------------------------------------------------------*/
829 INLINE flag extractFloat128Sign( float128 a )
832 return a.high>>63;
836 /*----------------------------------------------------------------------------
837 | Normalizes the subnormal quadruple-precision floating-point value
838 | represented by the denormalized significand formed by the concatenation of
839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841 | significand are stored at the location pointed to by `zSig0Ptr', and the
842 | least significant 64 bits of the normalized significand are stored at the
843 | location pointed to by `zSig1Ptr'.
844 *----------------------------------------------------------------------------*/
846 static void
847 normalizeFloat128Subnormal(
848 bits64 aSig0,
849 bits64 aSig1,
850 int32 *zExpPtr,
851 bits64 *zSig0Ptr,
852 bits64 *zSig1Ptr
855 int8 shiftCount;
857 if ( aSig0 == 0 ) {
858 shiftCount = countLeadingZeros64( aSig1 ) - 15;
859 if ( shiftCount < 0 ) {
860 *zSig0Ptr = aSig1>>( - shiftCount );
861 *zSig1Ptr = aSig1<<( shiftCount & 63 );
863 else {
864 *zSig0Ptr = aSig1<<shiftCount;
865 *zSig1Ptr = 0;
867 *zExpPtr = - shiftCount - 63;
869 else {
870 shiftCount = countLeadingZeros64( aSig0 ) - 15;
871 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872 *zExpPtr = 1 - shiftCount;
877 /*----------------------------------------------------------------------------
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880 | floating-point value, returning the result. After being shifted into the
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882 | added together to form the most significant 32 bits of the result. This
883 | means that any integer portion of `zSig0' will be added into the exponent.
884 | Since a properly normalized significand will have an integer portion equal
885 | to 1, the `zExp' input should be 1 less than the desired result exponent
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887 | significand.
888 *----------------------------------------------------------------------------*/
890 INLINE float128
891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
893 float128 z;
895 z.low = zSig1;
896 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897 return z;
901 /*----------------------------------------------------------------------------
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
904 | and `zSig2', and returns the proper quadruple-precision floating-point value
905 | corresponding to the abstract input. Ordinarily, the abstract value is
906 | simply rounded and packed into the quadruple-precision format, with the
907 | inexact exception raised if the abstract input cannot be represented
908 | exactly. However, if the abstract value is too large, the overflow and
909 | inexact exceptions are raised and an infinity or maximal finite value is
910 | returned. If the abstract value is too small, the input value is rounded to
911 | a subnormal number, and the underflow and inexact exceptions are raised if
912 | the abstract input cannot be represented exactly as a subnormal quadruple-
913 | precision floating-point number.
914 | The input significand must be normalized or smaller. If the input
915 | significand is not normalized, `zExp' must be 0; in that case, the result
916 | returned is a subnormal number, and it must not require rounding. In the
917 | usual case that the input significand is normalized, `zExp' must be 1 less
918 | than the ``true'' floating-point exponent. The handling of underflow and
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920 *----------------------------------------------------------------------------*/
922 static float128
923 roundAndPackFloat128(
924 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
926 int8 roundingMode;
927 flag roundNearestEven, increment, isTiny;
929 roundingMode = STATUS(float_rounding_mode);
930 roundNearestEven = ( roundingMode == float_round_nearest_even );
931 increment = ( (sbits64) zSig2 < 0 );
932 if ( ! roundNearestEven ) {
933 if ( roundingMode == float_round_to_zero ) {
934 increment = 0;
936 else {
937 if ( zSign ) {
938 increment = ( roundingMode == float_round_down ) && zSig2;
940 else {
941 increment = ( roundingMode == float_round_up ) && zSig2;
945 if ( 0x7FFD <= (bits32) zExp ) {
946 if ( ( 0x7FFD < zExp )
947 || ( ( zExp == 0x7FFD )
948 && eq128(
949 LIT64( 0x0001FFFFFFFFFFFF ),
950 LIT64( 0xFFFFFFFFFFFFFFFF ),
951 zSig0,
952 zSig1
954 && increment
957 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
958 if ( ( roundingMode == float_round_to_zero )
959 || ( zSign && ( roundingMode == float_round_up ) )
960 || ( ! zSign && ( roundingMode == float_round_down ) )
962 return
963 packFloat128(
964 zSign,
965 0x7FFE,
966 LIT64( 0x0000FFFFFFFFFFFF ),
967 LIT64( 0xFFFFFFFFFFFFFFFF )
970 return packFloat128( zSign, 0x7FFF, 0, 0 );
972 if ( zExp < 0 ) {
973 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974 isTiny =
975 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976 || ( zExp < -1 )
977 || ! increment
978 || lt128(
979 zSig0,
980 zSig1,
981 LIT64( 0x0001FFFFFFFFFFFF ),
982 LIT64( 0xFFFFFFFFFFFFFFFF )
984 shift128ExtraRightJamming(
985 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986 zExp = 0;
987 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988 if ( roundNearestEven ) {
989 increment = ( (sbits64) zSig2 < 0 );
991 else {
992 if ( zSign ) {
993 increment = ( roundingMode == float_round_down ) && zSig2;
995 else {
996 increment = ( roundingMode == float_round_up ) && zSig2;
1001 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1002 if ( increment ) {
1003 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1006 else {
1007 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1009 return packFloat128( zSign, zExp, zSig0, zSig1 );
1013 /*----------------------------------------------------------------------------
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1016 | returns the proper quadruple-precision floating-point value corresponding
1017 | to the abstract input. This routine is just like `roundAndPackFloat128'
1018 | except that the input significand has fewer bits and does not have to be
1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1020 | point exponent.
1021 *----------------------------------------------------------------------------*/
1023 static float128
1024 normalizeRoundAndPackFloat128(
1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1027 int8 shiftCount;
1028 bits64 zSig2;
1030 if ( zSig0 == 0 ) {
1031 zSig0 = zSig1;
1032 zSig1 = 0;
1033 zExp -= 64;
1035 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036 if ( 0 <= shiftCount ) {
1037 zSig2 = 0;
1038 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1040 else {
1041 shift128ExtraRightJamming(
1042 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1044 zExp -= shiftCount;
1045 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1049 #endif
1051 /*----------------------------------------------------------------------------
1052 | Returns the result of converting the 32-bit two's complement integer `a'
1053 | to the single-precision floating-point format. The conversion is performed
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055 *----------------------------------------------------------------------------*/
1057 float32 int32_to_float32( int32 a STATUS_PARAM )
1059 flag zSign;
1061 if ( a == 0 ) return float32_zero;
1062 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063 zSign = ( a < 0 );
1064 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1068 /*----------------------------------------------------------------------------
1069 | Returns the result of converting the 32-bit two's complement integer `a'
1070 | to the double-precision floating-point format. The conversion is performed
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072 *----------------------------------------------------------------------------*/
1074 float64 int32_to_float64( int32 a STATUS_PARAM )
1076 flag zSign;
1077 uint32 absA;
1078 int8 shiftCount;
1079 bits64 zSig;
1081 if ( a == 0 ) return float64_zero;
1082 zSign = ( a < 0 );
1083 absA = zSign ? - a : a;
1084 shiftCount = countLeadingZeros32( absA ) + 21;
1085 zSig = absA;
1086 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1090 #ifdef FLOATX80
1092 /*----------------------------------------------------------------------------
1093 | Returns the result of converting the 32-bit two's complement integer `a'
1094 | to the extended double-precision floating-point format. The conversion
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096 | Arithmetic.
1097 *----------------------------------------------------------------------------*/
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1101 flag zSign;
1102 uint32 absA;
1103 int8 shiftCount;
1104 bits64 zSig;
1106 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107 zSign = ( a < 0 );
1108 absA = zSign ? - a : a;
1109 shiftCount = countLeadingZeros32( absA ) + 32;
1110 zSig = absA;
1111 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1115 #endif
1117 #ifdef FLOAT128
1119 /*----------------------------------------------------------------------------
1120 | Returns the result of converting the 32-bit two's complement integer `a' to
1121 | the quadruple-precision floating-point format. The conversion is performed
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123 *----------------------------------------------------------------------------*/
1125 float128 int32_to_float128( int32 a STATUS_PARAM )
1127 flag zSign;
1128 uint32 absA;
1129 int8 shiftCount;
1130 bits64 zSig0;
1132 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133 zSign = ( a < 0 );
1134 absA = zSign ? - a : a;
1135 shiftCount = countLeadingZeros32( absA ) + 17;
1136 zSig0 = absA;
1137 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1141 #endif
1143 /*----------------------------------------------------------------------------
1144 | Returns the result of converting the 64-bit two's complement integer `a'
1145 | to the single-precision floating-point format. The conversion is performed
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147 *----------------------------------------------------------------------------*/
1149 float32 int64_to_float32( int64 a STATUS_PARAM )
1151 flag zSign;
1152 uint64 absA;
1153 int8 shiftCount;
1155 if ( a == 0 ) return float32_zero;
1156 zSign = ( a < 0 );
1157 absA = zSign ? - a : a;
1158 shiftCount = countLeadingZeros64( absA ) - 40;
1159 if ( 0 <= shiftCount ) {
1160 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1162 else {
1163 shiftCount += 7;
1164 if ( shiftCount < 0 ) {
1165 shift64RightJamming( absA, - shiftCount, &absA );
1167 else {
1168 absA <<= shiftCount;
1170 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1177 int8 shiftCount;
1179 if ( a == 0 ) return float32_zero;
1180 shiftCount = countLeadingZeros64( a ) - 40;
1181 if ( 0 <= shiftCount ) {
1182 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1184 else {
1185 shiftCount += 7;
1186 if ( shiftCount < 0 ) {
1187 shift64RightJamming( a, - shiftCount, &a );
1189 else {
1190 a <<= shiftCount;
1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1196 /*----------------------------------------------------------------------------
1197 | Returns the result of converting the 64-bit two's complement integer `a'
1198 | to the double-precision floating-point format. The conversion is performed
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200 *----------------------------------------------------------------------------*/
1202 float64 int64_to_float64( int64 a STATUS_PARAM )
1204 flag zSign;
1206 if ( a == 0 ) return float64_zero;
1207 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208 return packFloat64( 1, 0x43E, 0 );
1210 zSign = ( a < 0 );
1211 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1217 if ( a == 0 ) return float64_zero;
1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1222 #ifdef FLOATX80
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the extended double-precision floating-point format. The conversion
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228 | Arithmetic.
1229 *----------------------------------------------------------------------------*/
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1233 flag zSign;
1234 uint64 absA;
1235 int8 shiftCount;
1237 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238 zSign = ( a < 0 );
1239 absA = zSign ? - a : a;
1240 shiftCount = countLeadingZeros64( absA );
1241 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1245 #endif
1247 #ifdef FLOAT128
1249 /*----------------------------------------------------------------------------
1250 | Returns the result of converting the 64-bit two's complement integer `a' to
1251 | the quadruple-precision floating-point format. The conversion is performed
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253 *----------------------------------------------------------------------------*/
1255 float128 int64_to_float128( int64 a STATUS_PARAM )
1257 flag zSign;
1258 uint64 absA;
1259 int8 shiftCount;
1260 int32 zExp;
1261 bits64 zSig0, zSig1;
1263 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264 zSign = ( a < 0 );
1265 absA = zSign ? - a : a;
1266 shiftCount = countLeadingZeros64( absA ) + 49;
1267 zExp = 0x406E - shiftCount;
1268 if ( 64 <= shiftCount ) {
1269 zSig1 = 0;
1270 zSig0 = absA;
1271 shiftCount -= 64;
1273 else {
1274 zSig1 = absA;
1275 zSig0 = 0;
1277 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278 return packFloat128( zSign, zExp, zSig0, zSig1 );
1282 #endif
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the single-precision floating-point value
1286 | `a' to the 32-bit two's complement integer format. The conversion is
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic---which means in particular that the conversion is rounded
1289 | according to the current rounding mode. If `a' is a NaN, the largest
1290 | positive integer is returned. Otherwise, if the conversion overflows, the
1291 | largest integer with the same sign as `a' is returned.
1292 *----------------------------------------------------------------------------*/
1294 int32 float32_to_int32( float32 a STATUS_PARAM )
1296 flag aSign;
1297 int16 aExp, shiftCount;
1298 bits32 aSig;
1299 bits64 aSig64;
1301 aSig = extractFloat32Frac( a );
1302 aExp = extractFloat32Exp( a );
1303 aSign = extractFloat32Sign( a );
1304 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305 if ( aExp ) aSig |= 0x00800000;
1306 shiftCount = 0xAF - aExp;
1307 aSig64 = aSig;
1308 aSig64 <<= 32;
1309 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1314 /*----------------------------------------------------------------------------
1315 | Returns the result of converting the single-precision floating-point value
1316 | `a' to the 32-bit two's complement integer format. The conversion is
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1318 | Arithmetic, except that the conversion is always rounded toward zero.
1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320 | the conversion overflows, the largest integer with the same sign as `a' is
1321 | returned.
1322 *----------------------------------------------------------------------------*/
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1326 flag aSign;
1327 int16 aExp, shiftCount;
1328 bits32 aSig;
1329 int32 z;
1331 aSig = extractFloat32Frac( a );
1332 aExp = extractFloat32Exp( a );
1333 aSign = extractFloat32Sign( a );
1334 shiftCount = aExp - 0x9E;
1335 if ( 0 <= shiftCount ) {
1336 if ( float32_val(a) != 0xCF000000 ) {
1337 float_raise( float_flag_invalid STATUS_VAR);
1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1340 return (sbits32) 0x80000000;
1342 else if ( aExp <= 0x7E ) {
1343 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1344 return 0;
1346 aSig = ( aSig | 0x00800000 )<<8;
1347 z = aSig>>( - shiftCount );
1348 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349 STATUS(float_exception_flags) |= float_flag_inexact;
1351 if ( aSign ) z = - z;
1352 return z;
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 64-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode. If `a' is a NaN, the largest
1362 | positive integer is returned. Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1366 int64 float32_to_int64( float32 a STATUS_PARAM )
1368 flag aSign;
1369 int16 aExp, shiftCount;
1370 bits32 aSig;
1371 bits64 aSig64, aSigExtra;
1373 aSig = extractFloat32Frac( a );
1374 aExp = extractFloat32Exp( a );
1375 aSign = extractFloat32Sign( a );
1376 shiftCount = 0xBE - aExp;
1377 if ( shiftCount < 0 ) {
1378 float_raise( float_flag_invalid STATUS_VAR);
1379 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1380 return LIT64( 0x7FFFFFFFFFFFFFFF );
1382 return (sbits64) LIT64( 0x8000000000000000 );
1384 if ( aExp ) aSig |= 0x00800000;
1385 aSig64 = aSig;
1386 aSig64 <<= 40;
1387 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1392 /*----------------------------------------------------------------------------
1393 | Returns the result of converting the single-precision floating-point value
1394 | `a' to the 64-bit two's complement integer format. The conversion is
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1396 | Arithmetic, except that the conversion is always rounded toward zero. If
1397 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1398 | conversion overflows, the largest integer with the same sign as `a' is
1399 | returned.
1400 *----------------------------------------------------------------------------*/
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1404 flag aSign;
1405 int16 aExp, shiftCount;
1406 bits32 aSig;
1407 bits64 aSig64;
1408 int64 z;
1410 aSig = extractFloat32Frac( a );
1411 aExp = extractFloat32Exp( a );
1412 aSign = extractFloat32Sign( a );
1413 shiftCount = aExp - 0xBE;
1414 if ( 0 <= shiftCount ) {
1415 if ( float32_val(a) != 0xDF000000 ) {
1416 float_raise( float_flag_invalid STATUS_VAR);
1417 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1418 return LIT64( 0x7FFFFFFFFFFFFFFF );
1421 return (sbits64) LIT64( 0x8000000000000000 );
1423 else if ( aExp <= 0x7E ) {
1424 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1425 return 0;
1427 aSig64 = aSig | 0x00800000;
1428 aSig64 <<= 40;
1429 z = aSig64>>( - shiftCount );
1430 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431 STATUS(float_exception_flags) |= float_flag_inexact;
1433 if ( aSign ) z = - z;
1434 return z;
1438 /*----------------------------------------------------------------------------
1439 | Returns the result of converting the single-precision floating-point value
1440 | `a' to the double-precision floating-point format. The conversion is
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1442 | Arithmetic.
1443 *----------------------------------------------------------------------------*/
1445 float64 float32_to_float64( float32 a STATUS_PARAM )
1447 flag aSign;
1448 int16 aExp;
1449 bits32 aSig;
1451 aSig = extractFloat32Frac( a );
1452 aExp = extractFloat32Exp( a );
1453 aSign = extractFloat32Sign( a );
1454 if ( aExp == 0xFF ) {
1455 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456 return packFloat64( aSign, 0x7FF, 0 );
1458 if ( aExp == 0 ) {
1459 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461 --aExp;
1463 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1467 #ifdef FLOATX80
1469 /*----------------------------------------------------------------------------
1470 | Returns the result of converting the single-precision floating-point value
1471 | `a' to the extended double-precision floating-point format. The conversion
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473 | Arithmetic.
1474 *----------------------------------------------------------------------------*/
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1478 flag aSign;
1479 int16 aExp;
1480 bits32 aSig;
1482 aSig = extractFloat32Frac( a );
1483 aExp = extractFloat32Exp( a );
1484 aSign = extractFloat32Sign( a );
1485 if ( aExp == 0xFF ) {
1486 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1489 if ( aExp == 0 ) {
1490 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1493 aSig |= 0x00800000;
1494 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1498 #endif
1500 #ifdef FLOAT128
1502 /*----------------------------------------------------------------------------
1503 | Returns the result of converting the single-precision floating-point value
1504 | `a' to the double-precision floating-point format. The conversion is
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1506 | Arithmetic.
1507 *----------------------------------------------------------------------------*/
1509 float128 float32_to_float128( float32 a STATUS_PARAM )
1511 flag aSign;
1512 int16 aExp;
1513 bits32 aSig;
1515 aSig = extractFloat32Frac( a );
1516 aExp = extractFloat32Exp( a );
1517 aSign = extractFloat32Sign( a );
1518 if ( aExp == 0xFF ) {
1519 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520 return packFloat128( aSign, 0x7FFF, 0, 0 );
1522 if ( aExp == 0 ) {
1523 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525 --aExp;
1527 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1531 #endif
1533 /*----------------------------------------------------------------------------
1534 | Rounds the single-precision floating-point value `a' to an integer, and
1535 | returns the result as a single-precision floating-point value. The
1536 | operation is performed according to the IEC/IEEE Standard for Binary
1537 | Floating-Point Arithmetic.
1538 *----------------------------------------------------------------------------*/
1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
1542 flag aSign;
1543 int16 aExp;
1544 bits32 lastBitMask, roundBitsMask;
1545 int8 roundingMode;
1546 bits32 z;
1548 aExp = extractFloat32Exp( a );
1549 if ( 0x96 <= aExp ) {
1550 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551 return propagateFloat32NaN( a, a STATUS_VAR );
1553 return a;
1555 if ( aExp <= 0x7E ) {
1556 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557 STATUS(float_exception_flags) |= float_flag_inexact;
1558 aSign = extractFloat32Sign( a );
1559 switch ( STATUS(float_rounding_mode) ) {
1560 case float_round_nearest_even:
1561 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562 return packFloat32( aSign, 0x7F, 0 );
1564 break;
1565 case float_round_down:
1566 return make_float32(aSign ? 0xBF800000 : 0);
1567 case float_round_up:
1568 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1570 return packFloat32( aSign, 0, 0 );
1572 lastBitMask = 1;
1573 lastBitMask <<= 0x96 - aExp;
1574 roundBitsMask = lastBitMask - 1;
1575 z = float32_val(a);
1576 roundingMode = STATUS(float_rounding_mode);
1577 if ( roundingMode == float_round_nearest_even ) {
1578 z += lastBitMask>>1;
1579 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1581 else if ( roundingMode != float_round_to_zero ) {
1582 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583 z += roundBitsMask;
1586 z &= ~ roundBitsMask;
1587 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1588 return make_float32(z);
1592 /*----------------------------------------------------------------------------
1593 | Returns the result of adding the absolute values of the single-precision
1594 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1595 | before being returned. `zSign' is ignored if the result is a NaN.
1596 | The addition is performed according to the IEC/IEEE Standard for Binary
1597 | Floating-Point Arithmetic.
1598 *----------------------------------------------------------------------------*/
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1602 int16 aExp, bExp, zExp;
1603 bits32 aSig, bSig, zSig;
1604 int16 expDiff;
1606 aSig = extractFloat32Frac( a );
1607 aExp = extractFloat32Exp( a );
1608 bSig = extractFloat32Frac( b );
1609 bExp = extractFloat32Exp( b );
1610 expDiff = aExp - bExp;
1611 aSig <<= 6;
1612 bSig <<= 6;
1613 if ( 0 < expDiff ) {
1614 if ( aExp == 0xFF ) {
1615 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616 return a;
1618 if ( bExp == 0 ) {
1619 --expDiff;
1621 else {
1622 bSig |= 0x20000000;
1624 shift32RightJamming( bSig, expDiff, &bSig );
1625 zExp = aExp;
1627 else if ( expDiff < 0 ) {
1628 if ( bExp == 0xFF ) {
1629 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630 return packFloat32( zSign, 0xFF, 0 );
1632 if ( aExp == 0 ) {
1633 ++expDiff;
1635 else {
1636 aSig |= 0x20000000;
1638 shift32RightJamming( aSig, - expDiff, &aSig );
1639 zExp = bExp;
1641 else {
1642 if ( aExp == 0xFF ) {
1643 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644 return a;
1646 if ( aExp == 0 ) {
1647 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1650 zSig = 0x40000000 + aSig + bSig;
1651 zExp = aExp;
1652 goto roundAndPack;
1654 aSig |= 0x20000000;
1655 zSig = ( aSig + bSig )<<1;
1656 --zExp;
1657 if ( (sbits32) zSig < 0 ) {
1658 zSig = aSig + bSig;
1659 ++zExp;
1661 roundAndPack:
1662 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1666 /*----------------------------------------------------------------------------
1667 | Returns the result of subtracting the absolute values of the single-
1668 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1669 | difference is negated before being returned. `zSign' is ignored if the
1670 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1671 | Standard for Binary Floating-Point Arithmetic.
1672 *----------------------------------------------------------------------------*/
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1676 int16 aExp, bExp, zExp;
1677 bits32 aSig, bSig, zSig;
1678 int16 expDiff;
1680 aSig = extractFloat32Frac( a );
1681 aExp = extractFloat32Exp( a );
1682 bSig = extractFloat32Frac( b );
1683 bExp = extractFloat32Exp( b );
1684 expDiff = aExp - bExp;
1685 aSig <<= 7;
1686 bSig <<= 7;
1687 if ( 0 < expDiff ) goto aExpBigger;
1688 if ( expDiff < 0 ) goto bExpBigger;
1689 if ( aExp == 0xFF ) {
1690 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691 float_raise( float_flag_invalid STATUS_VAR);
1692 return float32_default_nan;
1694 if ( aExp == 0 ) {
1695 aExp = 1;
1696 bExp = 1;
1698 if ( bSig < aSig ) goto aBigger;
1699 if ( aSig < bSig ) goto bBigger;
1700 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701 bExpBigger:
1702 if ( bExp == 0xFF ) {
1703 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704 return packFloat32( zSign ^ 1, 0xFF, 0 );
1706 if ( aExp == 0 ) {
1707 ++expDiff;
1709 else {
1710 aSig |= 0x40000000;
1712 shift32RightJamming( aSig, - expDiff, &aSig );
1713 bSig |= 0x40000000;
1714 bBigger:
1715 zSig = bSig - aSig;
1716 zExp = bExp;
1717 zSign ^= 1;
1718 goto normalizeRoundAndPack;
1719 aExpBigger:
1720 if ( aExp == 0xFF ) {
1721 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722 return a;
1724 if ( bExp == 0 ) {
1725 --expDiff;
1727 else {
1728 bSig |= 0x40000000;
1730 shift32RightJamming( bSig, expDiff, &bSig );
1731 aSig |= 0x40000000;
1732 aBigger:
1733 zSig = aSig - bSig;
1734 zExp = aExp;
1735 normalizeRoundAndPack:
1736 --zExp;
1737 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1741 /*----------------------------------------------------------------------------
1742 | Returns the result of adding the single-precision floating-point values `a'
1743 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1744 | Binary Floating-Point Arithmetic.
1745 *----------------------------------------------------------------------------*/
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1749 flag aSign, bSign;
1751 aSign = extractFloat32Sign( a );
1752 bSign = extractFloat32Sign( b );
1753 if ( aSign == bSign ) {
1754 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1756 else {
1757 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of subtracting the single-precision floating-point values
1764 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1765 | for Binary Floating-Point Arithmetic.
1766 *----------------------------------------------------------------------------*/
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1770 flag aSign, bSign;
1772 aSign = extractFloat32Sign( a );
1773 bSign = extractFloat32Sign( b );
1774 if ( aSign == bSign ) {
1775 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1777 else {
1778 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of multiplying the single-precision floating-point values
1785 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1786 | for Binary Floating-Point Arithmetic.
1787 *----------------------------------------------------------------------------*/
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1791 flag aSign, bSign, zSign;
1792 int16 aExp, bExp, zExp;
1793 bits32 aSig, bSig;
1794 bits64 zSig64;
1795 bits32 zSig;
1797 aSig = extractFloat32Frac( a );
1798 aExp = extractFloat32Exp( a );
1799 aSign = extractFloat32Sign( a );
1800 bSig = extractFloat32Frac( b );
1801 bExp = extractFloat32Exp( b );
1802 bSign = extractFloat32Sign( b );
1803 zSign = aSign ^ bSign;
1804 if ( aExp == 0xFF ) {
1805 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1806 return propagateFloat32NaN( a, b STATUS_VAR );
1808 if ( ( bExp | bSig ) == 0 ) {
1809 float_raise( float_flag_invalid STATUS_VAR);
1810 return float32_default_nan;
1812 return packFloat32( zSign, 0xFF, 0 );
1814 if ( bExp == 0xFF ) {
1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816 if ( ( aExp | aSig ) == 0 ) {
1817 float_raise( float_flag_invalid STATUS_VAR);
1818 return float32_default_nan;
1820 return packFloat32( zSign, 0xFF, 0 );
1822 if ( aExp == 0 ) {
1823 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1826 if ( bExp == 0 ) {
1827 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1830 zExp = aExp + bExp - 0x7F;
1831 aSig = ( aSig | 0x00800000 )<<7;
1832 bSig = ( bSig | 0x00800000 )<<8;
1833 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834 zSig = zSig64;
1835 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836 zSig <<= 1;
1837 --zExp;
1839 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of dividing the single-precision floating-point value `a'
1845 | by the corresponding value `b'. The operation is performed according to the
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847 *----------------------------------------------------------------------------*/
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1851 flag aSign, bSign, zSign;
1852 int16 aExp, bExp, zExp;
1853 bits32 aSig, bSig, zSig;
1855 aSig = extractFloat32Frac( a );
1856 aExp = extractFloat32Exp( a );
1857 aSign = extractFloat32Sign( a );
1858 bSig = extractFloat32Frac( b );
1859 bExp = extractFloat32Exp( b );
1860 bSign = extractFloat32Sign( b );
1861 zSign = aSign ^ bSign;
1862 if ( aExp == 0xFF ) {
1863 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864 if ( bExp == 0xFF ) {
1865 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866 float_raise( float_flag_invalid STATUS_VAR);
1867 return float32_default_nan;
1869 return packFloat32( zSign, 0xFF, 0 );
1871 if ( bExp == 0xFF ) {
1872 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873 return packFloat32( zSign, 0, 0 );
1875 if ( bExp == 0 ) {
1876 if ( bSig == 0 ) {
1877 if ( ( aExp | aSig ) == 0 ) {
1878 float_raise( float_flag_invalid STATUS_VAR);
1879 return float32_default_nan;
1881 float_raise( float_flag_divbyzero STATUS_VAR);
1882 return packFloat32( zSign, 0xFF, 0 );
1884 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1886 if ( aExp == 0 ) {
1887 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1890 zExp = aExp - bExp + 0x7D;
1891 aSig = ( aSig | 0x00800000 )<<7;
1892 bSig = ( bSig | 0x00800000 )<<8;
1893 if ( bSig <= ( aSig + aSig ) ) {
1894 aSig >>= 1;
1895 ++zExp;
1897 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898 if ( ( zSig & 0x3F ) == 0 ) {
1899 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1901 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1905 /*----------------------------------------------------------------------------
1906 | Returns the remainder of the single-precision floating-point value `a'
1907 | with respect to the corresponding value `b'. The operation is performed
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909 *----------------------------------------------------------------------------*/
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1913 flag aSign, bSign, zSign;
1914 int16 aExp, bExp, expDiff;
1915 bits32 aSig, bSig;
1916 bits32 q;
1917 bits64 aSig64, bSig64, q64;
1918 bits32 alternateASig;
1919 sbits32 sigMean;
1921 aSig = extractFloat32Frac( a );
1922 aExp = extractFloat32Exp( a );
1923 aSign = extractFloat32Sign( a );
1924 bSig = extractFloat32Frac( b );
1925 bExp = extractFloat32Exp( b );
1926 bSign = extractFloat32Sign( b );
1927 if ( aExp == 0xFF ) {
1928 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1929 return propagateFloat32NaN( a, b STATUS_VAR );
1931 float_raise( float_flag_invalid STATUS_VAR);
1932 return float32_default_nan;
1934 if ( bExp == 0xFF ) {
1935 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1936 return a;
1938 if ( bExp == 0 ) {
1939 if ( bSig == 0 ) {
1940 float_raise( float_flag_invalid STATUS_VAR);
1941 return float32_default_nan;
1943 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1945 if ( aExp == 0 ) {
1946 if ( aSig == 0 ) return a;
1947 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949 expDiff = aExp - bExp;
1950 aSig |= 0x00800000;
1951 bSig |= 0x00800000;
1952 if ( expDiff < 32 ) {
1953 aSig <<= 8;
1954 bSig <<= 8;
1955 if ( expDiff < 0 ) {
1956 if ( expDiff < -1 ) return a;
1957 aSig >>= 1;
1959 q = ( bSig <= aSig );
1960 if ( q ) aSig -= bSig;
1961 if ( 0 < expDiff ) {
1962 q = ( ( (bits64) aSig )<<32 ) / bSig;
1963 q >>= 32 - expDiff;
1964 bSig >>= 2;
1965 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1967 else {
1968 aSig >>= 2;
1969 bSig >>= 2;
1972 else {
1973 if ( bSig <= aSig ) aSig -= bSig;
1974 aSig64 = ( (bits64) aSig )<<40;
1975 bSig64 = ( (bits64) bSig )<<40;
1976 expDiff -= 64;
1977 while ( 0 < expDiff ) {
1978 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1979 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1980 aSig64 = - ( ( bSig * q64 )<<38 );
1981 expDiff -= 62;
1983 expDiff += 64;
1984 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1985 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1986 q = q64>>( 64 - expDiff );
1987 bSig <<= 6;
1988 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1990 do {
1991 alternateASig = aSig;
1992 ++q;
1993 aSig -= bSig;
1994 } while ( 0 <= (sbits32) aSig );
1995 sigMean = aSig + alternateASig;
1996 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1997 aSig = alternateASig;
1999 zSign = ( (sbits32) aSig < 0 );
2000 if ( zSign ) aSig = - aSig;
2001 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2005 /*----------------------------------------------------------------------------
2006 | Returns the square root of the single-precision floating-point value `a'.
2007 | The operation is performed according to the IEC/IEEE Standard for Binary
2008 | Floating-Point Arithmetic.
2009 *----------------------------------------------------------------------------*/
2011 float32 float32_sqrt( float32 a STATUS_PARAM )
2013 flag aSign;
2014 int16 aExp, zExp;
2015 bits32 aSig, zSig;
2016 bits64 rem, term;
2018 aSig = extractFloat32Frac( a );
2019 aExp = extractFloat32Exp( a );
2020 aSign = extractFloat32Sign( a );
2021 if ( aExp == 0xFF ) {
2022 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2023 if ( ! aSign ) return a;
2024 float_raise( float_flag_invalid STATUS_VAR);
2025 return float32_default_nan;
2027 if ( aSign ) {
2028 if ( ( aExp | aSig ) == 0 ) return a;
2029 float_raise( float_flag_invalid STATUS_VAR);
2030 return float32_default_nan;
2032 if ( aExp == 0 ) {
2033 if ( aSig == 0 ) return float32_zero;
2034 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2036 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2037 aSig = ( aSig | 0x00800000 )<<8;
2038 zSig = estimateSqrt32( aExp, aSig ) + 2;
2039 if ( ( zSig & 0x7F ) <= 5 ) {
2040 if ( zSig < 2 ) {
2041 zSig = 0x7FFFFFFF;
2042 goto roundAndPack;
2044 aSig >>= aExp & 1;
2045 term = ( (bits64) zSig ) * zSig;
2046 rem = ( ( (bits64) aSig )<<32 ) - term;
2047 while ( (sbits64) rem < 0 ) {
2048 --zSig;
2049 rem += ( ( (bits64) zSig )<<1 ) | 1;
2051 zSig |= ( rem != 0 );
2053 shift32RightJamming( zSig, 1, &zSig );
2054 roundAndPack:
2055 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2059 /*----------------------------------------------------------------------------
2060 | Returns 1 if the single-precision floating-point value `a' is equal to
2061 | the corresponding value `b', and 0 otherwise. The comparison is performed
2062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2063 *----------------------------------------------------------------------------*/
2065 int float32_eq( float32 a, float32 b STATUS_PARAM )
2068 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2069 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2071 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2072 float_raise( float_flag_invalid STATUS_VAR);
2074 return 0;
2076 return ( float32_val(a) == float32_val(b) ) ||
2077 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2081 /*----------------------------------------------------------------------------
2082 | Returns 1 if the single-precision floating-point value `a' is less than
2083 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2084 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2085 | Arithmetic.
2086 *----------------------------------------------------------------------------*/
2088 int float32_le( float32 a, float32 b STATUS_PARAM )
2090 flag aSign, bSign;
2091 bits32 av, bv;
2093 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2094 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2096 float_raise( float_flag_invalid STATUS_VAR);
2097 return 0;
2099 aSign = extractFloat32Sign( a );
2100 bSign = extractFloat32Sign( b );
2101 av = float32_val(a);
2102 bv = float32_val(b);
2103 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2104 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2108 /*----------------------------------------------------------------------------
2109 | Returns 1 if the single-precision floating-point value `a' is less than
2110 | the corresponding value `b', and 0 otherwise. The comparison is performed
2111 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2112 *----------------------------------------------------------------------------*/
2114 int float32_lt( float32 a, float32 b STATUS_PARAM )
2116 flag aSign, bSign;
2117 bits32 av, bv;
2119 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2122 float_raise( float_flag_invalid STATUS_VAR);
2123 return 0;
2125 aSign = extractFloat32Sign( a );
2126 bSign = extractFloat32Sign( b );
2127 av = float32_val(a);
2128 bv = float32_val(b);
2129 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2130 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2134 /*----------------------------------------------------------------------------
2135 | Returns 1 if the single-precision floating-point value `a' is equal to
2136 | the corresponding value `b', and 0 otherwise. The invalid exception is
2137 | raised if either operand is a NaN. Otherwise, the comparison is performed
2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2141 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2143 bits32 av, bv;
2145 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2148 float_raise( float_flag_invalid STATUS_VAR);
2149 return 0;
2151 av = float32_val(a);
2152 bv = float32_val(b);
2153 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2157 /*----------------------------------------------------------------------------
2158 | Returns 1 if the single-precision floating-point value `a' is less than or
2159 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2160 | cause an exception. Otherwise, the comparison is performed according to the
2161 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2162 *----------------------------------------------------------------------------*/
2164 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2166 flag aSign, bSign;
2167 bits32 av, bv;
2169 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2170 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2172 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2173 float_raise( float_flag_invalid STATUS_VAR);
2175 return 0;
2177 aSign = extractFloat32Sign( a );
2178 bSign = extractFloat32Sign( b );
2179 av = float32_val(a);
2180 bv = float32_val(b);
2181 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2182 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2186 /*----------------------------------------------------------------------------
2187 | Returns 1 if the single-precision floating-point value `a' is less than
2188 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2189 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2190 | Standard for Binary Floating-Point Arithmetic.
2191 *----------------------------------------------------------------------------*/
2193 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2195 flag aSign, bSign;
2196 bits32 av, bv;
2198 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2199 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2201 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2202 float_raise( float_flag_invalid STATUS_VAR);
2204 return 0;
2206 aSign = extractFloat32Sign( a );
2207 bSign = extractFloat32Sign( b );
2208 av = float32_val(a);
2209 bv = float32_val(b);
2210 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2211 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2215 /*----------------------------------------------------------------------------
2216 | Returns the result of converting the double-precision floating-point value
2217 | `a' to the 32-bit two's complement integer format. The conversion is
2218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2219 | Arithmetic---which means in particular that the conversion is rounded
2220 | according to the current rounding mode. If `a' is a NaN, the largest
2221 | positive integer is returned. Otherwise, if the conversion overflows, the
2222 | largest integer with the same sign as `a' is returned.
2223 *----------------------------------------------------------------------------*/
2225 int32 float64_to_int32( float64 a STATUS_PARAM )
2227 flag aSign;
2228 int16 aExp, shiftCount;
2229 bits64 aSig;
2231 aSig = extractFloat64Frac( a );
2232 aExp = extractFloat64Exp( a );
2233 aSign = extractFloat64Sign( a );
2234 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2235 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2236 shiftCount = 0x42C - aExp;
2237 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2238 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2242 /*----------------------------------------------------------------------------
2243 | Returns the result of converting the double-precision floating-point value
2244 | `a' to the 32-bit two's complement integer format. The conversion is
2245 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2246 | Arithmetic, except that the conversion is always rounded toward zero.
2247 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2248 | the conversion overflows, the largest integer with the same sign as `a' is
2249 | returned.
2250 *----------------------------------------------------------------------------*/
2252 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2254 flag aSign;
2255 int16 aExp, shiftCount;
2256 bits64 aSig, savedASig;
2257 int32 z;
2259 aSig = extractFloat64Frac( a );
2260 aExp = extractFloat64Exp( a );
2261 aSign = extractFloat64Sign( a );
2262 if ( 0x41E < aExp ) {
2263 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2264 goto invalid;
2266 else if ( aExp < 0x3FF ) {
2267 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2268 return 0;
2270 aSig |= LIT64( 0x0010000000000000 );
2271 shiftCount = 0x433 - aExp;
2272 savedASig = aSig;
2273 aSig >>= shiftCount;
2274 z = aSig;
2275 if ( aSign ) z = - z;
2276 if ( ( z < 0 ) ^ aSign ) {
2277 invalid:
2278 float_raise( float_flag_invalid STATUS_VAR);
2279 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2281 if ( ( aSig<<shiftCount ) != savedASig ) {
2282 STATUS(float_exception_flags) |= float_flag_inexact;
2284 return z;
2288 /*----------------------------------------------------------------------------
2289 | Returns the result of converting the double-precision floating-point value
2290 | `a' to the 64-bit two's complement integer format. The conversion is
2291 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2292 | Arithmetic---which means in particular that the conversion is rounded
2293 | according to the current rounding mode. If `a' is a NaN, the largest
2294 | positive integer is returned. Otherwise, if the conversion overflows, the
2295 | largest integer with the same sign as `a' is returned.
2296 *----------------------------------------------------------------------------*/
2298 int64 float64_to_int64( float64 a STATUS_PARAM )
2300 flag aSign;
2301 int16 aExp, shiftCount;
2302 bits64 aSig, aSigExtra;
2304 aSig = extractFloat64Frac( a );
2305 aExp = extractFloat64Exp( a );
2306 aSign = extractFloat64Sign( a );
2307 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2308 shiftCount = 0x433 - aExp;
2309 if ( shiftCount <= 0 ) {
2310 if ( 0x43E < aExp ) {
2311 float_raise( float_flag_invalid STATUS_VAR);
2312 if ( ! aSign
2313 || ( ( aExp == 0x7FF )
2314 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2316 return LIT64( 0x7FFFFFFFFFFFFFFF );
2318 return (sbits64) LIT64( 0x8000000000000000 );
2320 aSigExtra = 0;
2321 aSig <<= - shiftCount;
2323 else {
2324 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2326 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2330 /*----------------------------------------------------------------------------
2331 | Returns the result of converting the double-precision floating-point value
2332 | `a' to the 64-bit two's complement integer format. The conversion is
2333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2334 | Arithmetic, except that the conversion is always rounded toward zero.
2335 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2336 | the conversion overflows, the largest integer with the same sign as `a' is
2337 | returned.
2338 *----------------------------------------------------------------------------*/
2340 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2342 flag aSign;
2343 int16 aExp, shiftCount;
2344 bits64 aSig;
2345 int64 z;
2347 aSig = extractFloat64Frac( a );
2348 aExp = extractFloat64Exp( a );
2349 aSign = extractFloat64Sign( a );
2350 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2351 shiftCount = aExp - 0x433;
2352 if ( 0 <= shiftCount ) {
2353 if ( 0x43E <= aExp ) {
2354 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2355 float_raise( float_flag_invalid STATUS_VAR);
2356 if ( ! aSign
2357 || ( ( aExp == 0x7FF )
2358 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2360 return LIT64( 0x7FFFFFFFFFFFFFFF );
2363 return (sbits64) LIT64( 0x8000000000000000 );
2365 z = aSig<<shiftCount;
2367 else {
2368 if ( aExp < 0x3FE ) {
2369 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2370 return 0;
2372 z = aSig>>( - shiftCount );
2373 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2374 STATUS(float_exception_flags) |= float_flag_inexact;
2377 if ( aSign ) z = - z;
2378 return z;
2382 /*----------------------------------------------------------------------------
2383 | Returns the result of converting the double-precision floating-point value
2384 | `a' to the single-precision floating-point format. The conversion is
2385 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2386 | Arithmetic.
2387 *----------------------------------------------------------------------------*/
2389 float32 float64_to_float32( float64 a STATUS_PARAM )
2391 flag aSign;
2392 int16 aExp;
2393 bits64 aSig;
2394 bits32 zSig;
2396 aSig = extractFloat64Frac( a );
2397 aExp = extractFloat64Exp( a );
2398 aSign = extractFloat64Sign( a );
2399 if ( aExp == 0x7FF ) {
2400 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2401 return packFloat32( aSign, 0xFF, 0 );
2403 shift64RightJamming( aSig, 22, &aSig );
2404 zSig = aSig;
2405 if ( aExp || zSig ) {
2406 zSig |= 0x40000000;
2407 aExp -= 0x381;
2409 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2413 #ifdef FLOATX80
2415 /*----------------------------------------------------------------------------
2416 | Returns the result of converting the double-precision floating-point value
2417 | `a' to the extended double-precision floating-point format. The conversion
2418 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2419 | Arithmetic.
2420 *----------------------------------------------------------------------------*/
2422 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2424 flag aSign;
2425 int16 aExp;
2426 bits64 aSig;
2428 aSig = extractFloat64Frac( a );
2429 aExp = extractFloat64Exp( a );
2430 aSign = extractFloat64Sign( a );
2431 if ( aExp == 0x7FF ) {
2432 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2433 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2435 if ( aExp == 0 ) {
2436 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2437 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2439 return
2440 packFloatx80(
2441 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2445 #endif
2447 #ifdef FLOAT128
2449 /*----------------------------------------------------------------------------
2450 | Returns the result of converting the double-precision floating-point value
2451 | `a' to the quadruple-precision floating-point format. The conversion is
2452 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2453 | Arithmetic.
2454 *----------------------------------------------------------------------------*/
2456 float128 float64_to_float128( float64 a STATUS_PARAM )
2458 flag aSign;
2459 int16 aExp;
2460 bits64 aSig, zSig0, zSig1;
2462 aSig = extractFloat64Frac( a );
2463 aExp = extractFloat64Exp( a );
2464 aSign = extractFloat64Sign( a );
2465 if ( aExp == 0x7FF ) {
2466 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2467 return packFloat128( aSign, 0x7FFF, 0, 0 );
2469 if ( aExp == 0 ) {
2470 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2471 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2472 --aExp;
2474 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2475 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2479 #endif
2481 /*----------------------------------------------------------------------------
2482 | Rounds the double-precision floating-point value `a' to an integer, and
2483 | returns the result as a double-precision floating-point value. The
2484 | operation is performed according to the IEC/IEEE Standard for Binary
2485 | Floating-Point Arithmetic.
2486 *----------------------------------------------------------------------------*/
2488 float64 float64_round_to_int( float64 a STATUS_PARAM )
2490 flag aSign;
2491 int16 aExp;
2492 bits64 lastBitMask, roundBitsMask;
2493 int8 roundingMode;
2494 bits64 z;
2496 aExp = extractFloat64Exp( a );
2497 if ( 0x433 <= aExp ) {
2498 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2499 return propagateFloat64NaN( a, a STATUS_VAR );
2501 return a;
2503 if ( aExp < 0x3FF ) {
2504 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2505 STATUS(float_exception_flags) |= float_flag_inexact;
2506 aSign = extractFloat64Sign( a );
2507 switch ( STATUS(float_rounding_mode) ) {
2508 case float_round_nearest_even:
2509 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2510 return packFloat64( aSign, 0x3FF, 0 );
2512 break;
2513 case float_round_down:
2514 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2515 case float_round_up:
2516 return make_float64(
2517 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2519 return packFloat64( aSign, 0, 0 );
2521 lastBitMask = 1;
2522 lastBitMask <<= 0x433 - aExp;
2523 roundBitsMask = lastBitMask - 1;
2524 z = float64_val(a);
2525 roundingMode = STATUS(float_rounding_mode);
2526 if ( roundingMode == float_round_nearest_even ) {
2527 z += lastBitMask>>1;
2528 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2530 else if ( roundingMode != float_round_to_zero ) {
2531 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2532 z += roundBitsMask;
2535 z &= ~ roundBitsMask;
2536 if ( z != float64_val(a) )
2537 STATUS(float_exception_flags) |= float_flag_inexact;
2538 return make_float64(z);
2542 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2544 int oldmode;
2545 float64 res;
2546 oldmode = STATUS(float_rounding_mode);
2547 STATUS(float_rounding_mode) = float_round_to_zero;
2548 res = float64_round_to_int(a STATUS_VAR);
2549 STATUS(float_rounding_mode) = oldmode;
2550 return res;
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of adding the absolute values of the double-precision
2555 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2556 | before being returned. `zSign' is ignored if the result is a NaN.
2557 | The addition is performed according to the IEC/IEEE Standard for Binary
2558 | Floating-Point Arithmetic.
2559 *----------------------------------------------------------------------------*/
2561 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2563 int16 aExp, bExp, zExp;
2564 bits64 aSig, bSig, zSig;
2565 int16 expDiff;
2567 aSig = extractFloat64Frac( a );
2568 aExp = extractFloat64Exp( a );
2569 bSig = extractFloat64Frac( b );
2570 bExp = extractFloat64Exp( b );
2571 expDiff = aExp - bExp;
2572 aSig <<= 9;
2573 bSig <<= 9;
2574 if ( 0 < expDiff ) {
2575 if ( aExp == 0x7FF ) {
2576 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2577 return a;
2579 if ( bExp == 0 ) {
2580 --expDiff;
2582 else {
2583 bSig |= LIT64( 0x2000000000000000 );
2585 shift64RightJamming( bSig, expDiff, &bSig );
2586 zExp = aExp;
2588 else if ( expDiff < 0 ) {
2589 if ( bExp == 0x7FF ) {
2590 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2591 return packFloat64( zSign, 0x7FF, 0 );
2593 if ( aExp == 0 ) {
2594 ++expDiff;
2596 else {
2597 aSig |= LIT64( 0x2000000000000000 );
2599 shift64RightJamming( aSig, - expDiff, &aSig );
2600 zExp = bExp;
2602 else {
2603 if ( aExp == 0x7FF ) {
2604 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2605 return a;
2607 if ( aExp == 0 ) {
2608 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2609 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2611 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2612 zExp = aExp;
2613 goto roundAndPack;
2615 aSig |= LIT64( 0x2000000000000000 );
2616 zSig = ( aSig + bSig )<<1;
2617 --zExp;
2618 if ( (sbits64) zSig < 0 ) {
2619 zSig = aSig + bSig;
2620 ++zExp;
2622 roundAndPack:
2623 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2627 /*----------------------------------------------------------------------------
2628 | Returns the result of subtracting the absolute values of the double-
2629 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2630 | difference is negated before being returned. `zSign' is ignored if the
2631 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2632 | Standard for Binary Floating-Point Arithmetic.
2633 *----------------------------------------------------------------------------*/
2635 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2637 int16 aExp, bExp, zExp;
2638 bits64 aSig, bSig, zSig;
2639 int16 expDiff;
2641 aSig = extractFloat64Frac( a );
2642 aExp = extractFloat64Exp( a );
2643 bSig = extractFloat64Frac( b );
2644 bExp = extractFloat64Exp( b );
2645 expDiff = aExp - bExp;
2646 aSig <<= 10;
2647 bSig <<= 10;
2648 if ( 0 < expDiff ) goto aExpBigger;
2649 if ( expDiff < 0 ) goto bExpBigger;
2650 if ( aExp == 0x7FF ) {
2651 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2652 float_raise( float_flag_invalid STATUS_VAR);
2653 return float64_default_nan;
2655 if ( aExp == 0 ) {
2656 aExp = 1;
2657 bExp = 1;
2659 if ( bSig < aSig ) goto aBigger;
2660 if ( aSig < bSig ) goto bBigger;
2661 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2662 bExpBigger:
2663 if ( bExp == 0x7FF ) {
2664 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2665 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2667 if ( aExp == 0 ) {
2668 ++expDiff;
2670 else {
2671 aSig |= LIT64( 0x4000000000000000 );
2673 shift64RightJamming( aSig, - expDiff, &aSig );
2674 bSig |= LIT64( 0x4000000000000000 );
2675 bBigger:
2676 zSig = bSig - aSig;
2677 zExp = bExp;
2678 zSign ^= 1;
2679 goto normalizeRoundAndPack;
2680 aExpBigger:
2681 if ( aExp == 0x7FF ) {
2682 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2683 return a;
2685 if ( bExp == 0 ) {
2686 --expDiff;
2688 else {
2689 bSig |= LIT64( 0x4000000000000000 );
2691 shift64RightJamming( bSig, expDiff, &bSig );
2692 aSig |= LIT64( 0x4000000000000000 );
2693 aBigger:
2694 zSig = aSig - bSig;
2695 zExp = aExp;
2696 normalizeRoundAndPack:
2697 --zExp;
2698 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2702 /*----------------------------------------------------------------------------
2703 | Returns the result of adding the double-precision floating-point values `a'
2704 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2705 | Binary Floating-Point Arithmetic.
2706 *----------------------------------------------------------------------------*/
2708 float64 float64_add( float64 a, float64 b STATUS_PARAM )
2710 flag aSign, bSign;
2712 aSign = extractFloat64Sign( a );
2713 bSign = extractFloat64Sign( b );
2714 if ( aSign == bSign ) {
2715 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2717 else {
2718 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2723 /*----------------------------------------------------------------------------
2724 | Returns the result of subtracting the double-precision floating-point values
2725 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2726 | for Binary Floating-Point Arithmetic.
2727 *----------------------------------------------------------------------------*/
2729 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2731 flag aSign, bSign;
2733 aSign = extractFloat64Sign( a );
2734 bSign = extractFloat64Sign( b );
2735 if ( aSign == bSign ) {
2736 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2738 else {
2739 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2744 /*----------------------------------------------------------------------------
2745 | Returns the result of multiplying the double-precision floating-point values
2746 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2747 | for Binary Floating-Point Arithmetic.
2748 *----------------------------------------------------------------------------*/
2750 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2752 flag aSign, bSign, zSign;
2753 int16 aExp, bExp, zExp;
2754 bits64 aSig, bSig, zSig0, zSig1;
2756 aSig = extractFloat64Frac( a );
2757 aExp = extractFloat64Exp( a );
2758 aSign = extractFloat64Sign( a );
2759 bSig = extractFloat64Frac( b );
2760 bExp = extractFloat64Exp( b );
2761 bSign = extractFloat64Sign( b );
2762 zSign = aSign ^ bSign;
2763 if ( aExp == 0x7FF ) {
2764 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2765 return propagateFloat64NaN( a, b STATUS_VAR );
2767 if ( ( bExp | bSig ) == 0 ) {
2768 float_raise( float_flag_invalid STATUS_VAR);
2769 return float64_default_nan;
2771 return packFloat64( zSign, 0x7FF, 0 );
2773 if ( bExp == 0x7FF ) {
2774 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775 if ( ( aExp | aSig ) == 0 ) {
2776 float_raise( float_flag_invalid STATUS_VAR);
2777 return float64_default_nan;
2779 return packFloat64( zSign, 0x7FF, 0 );
2781 if ( aExp == 0 ) {
2782 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2783 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2785 if ( bExp == 0 ) {
2786 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2787 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2789 zExp = aExp + bExp - 0x3FF;
2790 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2791 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2792 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2793 zSig0 |= ( zSig1 != 0 );
2794 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2795 zSig0 <<= 1;
2796 --zExp;
2798 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2802 /*----------------------------------------------------------------------------
2803 | Returns the result of dividing the double-precision floating-point value `a'
2804 | by the corresponding value `b'. The operation is performed according to
2805 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2806 *----------------------------------------------------------------------------*/
2808 float64 float64_div( float64 a, float64 b STATUS_PARAM )
2810 flag aSign, bSign, zSign;
2811 int16 aExp, bExp, zExp;
2812 bits64 aSig, bSig, zSig;
2813 bits64 rem0, rem1;
2814 bits64 term0, term1;
2816 aSig = extractFloat64Frac( a );
2817 aExp = extractFloat64Exp( a );
2818 aSign = extractFloat64Sign( a );
2819 bSig = extractFloat64Frac( b );
2820 bExp = extractFloat64Exp( b );
2821 bSign = extractFloat64Sign( b );
2822 zSign = aSign ^ bSign;
2823 if ( aExp == 0x7FF ) {
2824 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2825 if ( bExp == 0x7FF ) {
2826 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2827 float_raise( float_flag_invalid STATUS_VAR);
2828 return float64_default_nan;
2830 return packFloat64( zSign, 0x7FF, 0 );
2832 if ( bExp == 0x7FF ) {
2833 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2834 return packFloat64( zSign, 0, 0 );
2836 if ( bExp == 0 ) {
2837 if ( bSig == 0 ) {
2838 if ( ( aExp | aSig ) == 0 ) {
2839 float_raise( float_flag_invalid STATUS_VAR);
2840 return float64_default_nan;
2842 float_raise( float_flag_divbyzero STATUS_VAR);
2843 return packFloat64( zSign, 0x7FF, 0 );
2845 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2847 if ( aExp == 0 ) {
2848 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2849 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2851 zExp = aExp - bExp + 0x3FD;
2852 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2853 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2854 if ( bSig <= ( aSig + aSig ) ) {
2855 aSig >>= 1;
2856 ++zExp;
2858 zSig = estimateDiv128To64( aSig, 0, bSig );
2859 if ( ( zSig & 0x1FF ) <= 2 ) {
2860 mul64To128( bSig, zSig, &term0, &term1 );
2861 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2862 while ( (sbits64) rem0 < 0 ) {
2863 --zSig;
2864 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2866 zSig |= ( rem1 != 0 );
2868 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2872 /*----------------------------------------------------------------------------
2873 | Returns the remainder of the double-precision floating-point value `a'
2874 | with respect to the corresponding value `b'. The operation is performed
2875 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2876 *----------------------------------------------------------------------------*/
2878 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2880 flag aSign, bSign, zSign;
2881 int16 aExp, bExp, expDiff;
2882 bits64 aSig, bSig;
2883 bits64 q, alternateASig;
2884 sbits64 sigMean;
2886 aSig = extractFloat64Frac( a );
2887 aExp = extractFloat64Exp( a );
2888 aSign = extractFloat64Sign( a );
2889 bSig = extractFloat64Frac( b );
2890 bExp = extractFloat64Exp( b );
2891 bSign = extractFloat64Sign( b );
2892 if ( aExp == 0x7FF ) {
2893 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2894 return propagateFloat64NaN( a, b STATUS_VAR );
2896 float_raise( float_flag_invalid STATUS_VAR);
2897 return float64_default_nan;
2899 if ( bExp == 0x7FF ) {
2900 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2901 return a;
2903 if ( bExp == 0 ) {
2904 if ( bSig == 0 ) {
2905 float_raise( float_flag_invalid STATUS_VAR);
2906 return float64_default_nan;
2908 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2910 if ( aExp == 0 ) {
2911 if ( aSig == 0 ) return a;
2912 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2914 expDiff = aExp - bExp;
2915 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2916 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2917 if ( expDiff < 0 ) {
2918 if ( expDiff < -1 ) return a;
2919 aSig >>= 1;
2921 q = ( bSig <= aSig );
2922 if ( q ) aSig -= bSig;
2923 expDiff -= 64;
2924 while ( 0 < expDiff ) {
2925 q = estimateDiv128To64( aSig, 0, bSig );
2926 q = ( 2 < q ) ? q - 2 : 0;
2927 aSig = - ( ( bSig>>2 ) * q );
2928 expDiff -= 62;
2930 expDiff += 64;
2931 if ( 0 < expDiff ) {
2932 q = estimateDiv128To64( aSig, 0, bSig );
2933 q = ( 2 < q ) ? q - 2 : 0;
2934 q >>= 64 - expDiff;
2935 bSig >>= 2;
2936 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2938 else {
2939 aSig >>= 2;
2940 bSig >>= 2;
2942 do {
2943 alternateASig = aSig;
2944 ++q;
2945 aSig -= bSig;
2946 } while ( 0 <= (sbits64) aSig );
2947 sigMean = aSig + alternateASig;
2948 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2949 aSig = alternateASig;
2951 zSign = ( (sbits64) aSig < 0 );
2952 if ( zSign ) aSig = - aSig;
2953 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2957 /*----------------------------------------------------------------------------
2958 | Returns the square root of the double-precision floating-point value `a'.
2959 | The operation is performed according to the IEC/IEEE Standard for Binary
2960 | Floating-Point Arithmetic.
2961 *----------------------------------------------------------------------------*/
2963 float64 float64_sqrt( float64 a STATUS_PARAM )
2965 flag aSign;
2966 int16 aExp, zExp;
2967 bits64 aSig, zSig, doubleZSig;
2968 bits64 rem0, rem1, term0, term1;
2970 aSig = extractFloat64Frac( a );
2971 aExp = extractFloat64Exp( a );
2972 aSign = extractFloat64Sign( a );
2973 if ( aExp == 0x7FF ) {
2974 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2975 if ( ! aSign ) return a;
2976 float_raise( float_flag_invalid STATUS_VAR);
2977 return float64_default_nan;
2979 if ( aSign ) {
2980 if ( ( aExp | aSig ) == 0 ) return a;
2981 float_raise( float_flag_invalid STATUS_VAR);
2982 return float64_default_nan;
2984 if ( aExp == 0 ) {
2985 if ( aSig == 0 ) return float64_zero;
2986 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2988 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2989 aSig |= LIT64( 0x0010000000000000 );
2990 zSig = estimateSqrt32( aExp, aSig>>21 );
2991 aSig <<= 9 - ( aExp & 1 );
2992 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2993 if ( ( zSig & 0x1FF ) <= 5 ) {
2994 doubleZSig = zSig<<1;
2995 mul64To128( zSig, zSig, &term0, &term1 );
2996 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2997 while ( (sbits64) rem0 < 0 ) {
2998 --zSig;
2999 doubleZSig -= 2;
3000 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3002 zSig |= ( ( rem0 | rem1 ) != 0 );
3004 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3008 /*----------------------------------------------------------------------------
3009 | Returns 1 if the double-precision floating-point value `a' is equal to the
3010 | corresponding value `b', and 0 otherwise. The comparison is performed
3011 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3012 *----------------------------------------------------------------------------*/
3014 int float64_eq( float64 a, float64 b STATUS_PARAM )
3016 bits64 av, bv;
3018 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3019 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3021 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3022 float_raise( float_flag_invalid STATUS_VAR);
3024 return 0;
3026 av = float64_val(a);
3027 bv = float64_val(b);
3028 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3032 /*----------------------------------------------------------------------------
3033 | Returns 1 if the double-precision floating-point value `a' is less than or
3034 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3035 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3036 | Arithmetic.
3037 *----------------------------------------------------------------------------*/
3039 int float64_le( float64 a, float64 b STATUS_PARAM )
3041 flag aSign, bSign;
3042 bits64 av, bv;
3044 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3045 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3047 float_raise( float_flag_invalid STATUS_VAR);
3048 return 0;
3050 aSign = extractFloat64Sign( a );
3051 bSign = extractFloat64Sign( b );
3052 av = float64_val(a);
3053 bv = float64_val(b);
3054 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3055 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3059 /*----------------------------------------------------------------------------
3060 | Returns 1 if the double-precision floating-point value `a' is less than
3061 | the corresponding value `b', and 0 otherwise. The comparison is performed
3062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3063 *----------------------------------------------------------------------------*/
3065 int float64_lt( float64 a, float64 b STATUS_PARAM )
3067 flag aSign, bSign;
3068 bits64 av, bv;
3070 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3071 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3073 float_raise( float_flag_invalid STATUS_VAR);
3074 return 0;
3076 aSign = extractFloat64Sign( a );
3077 bSign = extractFloat64Sign( b );
3078 av = float64_val(a);
3079 bv = float64_val(b);
3080 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3081 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3085 /*----------------------------------------------------------------------------
3086 | Returns 1 if the double-precision floating-point value `a' is equal to the
3087 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3088 | if either operand is a NaN. Otherwise, the comparison is performed
3089 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3090 *----------------------------------------------------------------------------*/
3092 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3094 bits64 av, bv;
3096 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3097 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3099 float_raise( float_flag_invalid STATUS_VAR);
3100 return 0;
3102 av = float64_val(a);
3103 bv = float64_val(b);
3104 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3108 /*----------------------------------------------------------------------------
3109 | Returns 1 if the double-precision floating-point value `a' is less than or
3110 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3111 | cause an exception. Otherwise, the comparison is performed according to the
3112 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3113 *----------------------------------------------------------------------------*/
3115 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3117 flag aSign, bSign;
3118 bits64 av, bv;
3120 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3121 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3123 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3124 float_raise( float_flag_invalid STATUS_VAR);
3126 return 0;
3128 aSign = extractFloat64Sign( a );
3129 bSign = extractFloat64Sign( b );
3130 av = float64_val(a);
3131 bv = float64_val(b);
3132 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3133 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3137 /*----------------------------------------------------------------------------
3138 | Returns 1 if the double-precision floating-point value `a' is less than
3139 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3140 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3141 | Standard for Binary Floating-Point Arithmetic.
3142 *----------------------------------------------------------------------------*/
3144 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3146 flag aSign, bSign;
3147 bits64 av, bv;
3149 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3150 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3152 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3153 float_raise( float_flag_invalid STATUS_VAR);
3155 return 0;
3157 aSign = extractFloat64Sign( a );
3158 bSign = extractFloat64Sign( b );
3159 av = float64_val(a);
3160 bv = float64_val(b);
3161 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3162 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3166 #ifdef FLOATX80
3168 /*----------------------------------------------------------------------------
3169 | Returns the result of converting the extended double-precision floating-
3170 | point value `a' to the 32-bit two's complement integer format. The
3171 | conversion is performed according to the IEC/IEEE Standard for Binary
3172 | Floating-Point Arithmetic---which means in particular that the conversion
3173 | is rounded according to the current rounding mode. If `a' is a NaN, the
3174 | largest positive integer is returned. Otherwise, if the conversion
3175 | overflows, the largest integer with the same sign as `a' is returned.
3176 *----------------------------------------------------------------------------*/
3178 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3180 flag aSign;
3181 int32 aExp, shiftCount;
3182 bits64 aSig;
3184 aSig = extractFloatx80Frac( a );
3185 aExp = extractFloatx80Exp( a );
3186 aSign = extractFloatx80Sign( a );
3187 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3188 shiftCount = 0x4037 - aExp;
3189 if ( shiftCount <= 0 ) shiftCount = 1;
3190 shift64RightJamming( aSig, shiftCount, &aSig );
3191 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3195 /*----------------------------------------------------------------------------
3196 | Returns the result of converting the extended double-precision floating-
3197 | point value `a' to the 32-bit two's complement integer format. The
3198 | conversion is performed according to the IEC/IEEE Standard for Binary
3199 | Floating-Point Arithmetic, except that the conversion is always rounded
3200 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3201 | Otherwise, if the conversion overflows, the largest integer with the same
3202 | sign as `a' is returned.
3203 *----------------------------------------------------------------------------*/
3205 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3207 flag aSign;
3208 int32 aExp, shiftCount;
3209 bits64 aSig, savedASig;
3210 int32 z;
3212 aSig = extractFloatx80Frac( a );
3213 aExp = extractFloatx80Exp( a );
3214 aSign = extractFloatx80Sign( a );
3215 if ( 0x401E < aExp ) {
3216 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3217 goto invalid;
3219 else if ( aExp < 0x3FFF ) {
3220 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3221 return 0;
3223 shiftCount = 0x403E - aExp;
3224 savedASig = aSig;
3225 aSig >>= shiftCount;
3226 z = aSig;
3227 if ( aSign ) z = - z;
3228 if ( ( z < 0 ) ^ aSign ) {
3229 invalid:
3230 float_raise( float_flag_invalid STATUS_VAR);
3231 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3233 if ( ( aSig<<shiftCount ) != savedASig ) {
3234 STATUS(float_exception_flags) |= float_flag_inexact;
3236 return z;
3240 /*----------------------------------------------------------------------------
3241 | Returns the result of converting the extended double-precision floating-
3242 | point value `a' to the 64-bit two's complement integer format. The
3243 | conversion is performed according to the IEC/IEEE Standard for Binary
3244 | Floating-Point Arithmetic---which means in particular that the conversion
3245 | is rounded according to the current rounding mode. If `a' is a NaN,
3246 | the largest positive integer is returned. Otherwise, if the conversion
3247 | overflows, the largest integer with the same sign as `a' is returned.
3248 *----------------------------------------------------------------------------*/
3250 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3252 flag aSign;
3253 int32 aExp, shiftCount;
3254 bits64 aSig, aSigExtra;
3256 aSig = extractFloatx80Frac( a );
3257 aExp = extractFloatx80Exp( a );
3258 aSign = extractFloatx80Sign( a );
3259 shiftCount = 0x403E - aExp;
3260 if ( shiftCount <= 0 ) {
3261 if ( shiftCount ) {
3262 float_raise( float_flag_invalid STATUS_VAR);
3263 if ( ! aSign
3264 || ( ( aExp == 0x7FFF )
3265 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3267 return LIT64( 0x7FFFFFFFFFFFFFFF );
3269 return (sbits64) LIT64( 0x8000000000000000 );
3271 aSigExtra = 0;
3273 else {
3274 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3276 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3280 /*----------------------------------------------------------------------------
3281 | Returns the result of converting the extended double-precision floating-
3282 | point value `a' to the 64-bit two's complement integer format. The
3283 | conversion is performed according to the IEC/IEEE Standard for Binary
3284 | Floating-Point Arithmetic, except that the conversion is always rounded
3285 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3286 | Otherwise, if the conversion overflows, the largest integer with the same
3287 | sign as `a' is returned.
3288 *----------------------------------------------------------------------------*/
3290 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3292 flag aSign;
3293 int32 aExp, shiftCount;
3294 bits64 aSig;
3295 int64 z;
3297 aSig = extractFloatx80Frac( a );
3298 aExp = extractFloatx80Exp( a );
3299 aSign = extractFloatx80Sign( a );
3300 shiftCount = aExp - 0x403E;
3301 if ( 0 <= shiftCount ) {
3302 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3303 if ( ( a.high != 0xC03E ) || aSig ) {
3304 float_raise( float_flag_invalid STATUS_VAR);
3305 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3306 return LIT64( 0x7FFFFFFFFFFFFFFF );
3309 return (sbits64) LIT64( 0x8000000000000000 );
3311 else if ( aExp < 0x3FFF ) {
3312 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3313 return 0;
3315 z = aSig>>( - shiftCount );
3316 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3317 STATUS(float_exception_flags) |= float_flag_inexact;
3319 if ( aSign ) z = - z;
3320 return z;
3324 /*----------------------------------------------------------------------------
3325 | Returns the result of converting the extended double-precision floating-
3326 | point value `a' to the single-precision floating-point format. The
3327 | conversion is performed according to the IEC/IEEE Standard for Binary
3328 | Floating-Point Arithmetic.
3329 *----------------------------------------------------------------------------*/
3331 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3333 flag aSign;
3334 int32 aExp;
3335 bits64 aSig;
3337 aSig = extractFloatx80Frac( a );
3338 aExp = extractFloatx80Exp( a );
3339 aSign = extractFloatx80Sign( a );
3340 if ( aExp == 0x7FFF ) {
3341 if ( (bits64) ( aSig<<1 ) ) {
3342 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3344 return packFloat32( aSign, 0xFF, 0 );
3346 shift64RightJamming( aSig, 33, &aSig );
3347 if ( aExp || aSig ) aExp -= 0x3F81;
3348 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3352 /*----------------------------------------------------------------------------
3353 | Returns the result of converting the extended double-precision floating-
3354 | point value `a' to the double-precision floating-point format. The
3355 | conversion is performed according to the IEC/IEEE Standard for Binary
3356 | Floating-Point Arithmetic.
3357 *----------------------------------------------------------------------------*/
3359 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3361 flag aSign;
3362 int32 aExp;
3363 bits64 aSig, zSig;
3365 aSig = extractFloatx80Frac( a );
3366 aExp = extractFloatx80Exp( a );
3367 aSign = extractFloatx80Sign( a );
3368 if ( aExp == 0x7FFF ) {
3369 if ( (bits64) ( aSig<<1 ) ) {
3370 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3372 return packFloat64( aSign, 0x7FF, 0 );
3374 shift64RightJamming( aSig, 1, &zSig );
3375 if ( aExp || aSig ) aExp -= 0x3C01;
3376 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3380 #ifdef FLOAT128
3382 /*----------------------------------------------------------------------------
3383 | Returns the result of converting the extended double-precision floating-
3384 | point value `a' to the quadruple-precision floating-point format. The
3385 | conversion is performed according to the IEC/IEEE Standard for Binary
3386 | Floating-Point Arithmetic.
3387 *----------------------------------------------------------------------------*/
3389 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3391 flag aSign;
3392 int16 aExp;
3393 bits64 aSig, zSig0, zSig1;
3395 aSig = extractFloatx80Frac( a );
3396 aExp = extractFloatx80Exp( a );
3397 aSign = extractFloatx80Sign( a );
3398 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3399 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3401 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3402 return packFloat128( aSign, aExp, zSig0, zSig1 );
3406 #endif
3408 /*----------------------------------------------------------------------------
3409 | Rounds the extended double-precision floating-point value `a' to an integer,
3410 | and returns the result as an extended quadruple-precision floating-point
3411 | value. The operation is performed according to the IEC/IEEE Standard for
3412 | Binary Floating-Point Arithmetic.
3413 *----------------------------------------------------------------------------*/
3415 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3417 flag aSign;
3418 int32 aExp;
3419 bits64 lastBitMask, roundBitsMask;
3420 int8 roundingMode;
3421 floatx80 z;
3423 aExp = extractFloatx80Exp( a );
3424 if ( 0x403E <= aExp ) {
3425 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3426 return propagateFloatx80NaN( a, a STATUS_VAR );
3428 return a;
3430 if ( aExp < 0x3FFF ) {
3431 if ( ( aExp == 0 )
3432 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3433 return a;
3435 STATUS(float_exception_flags) |= float_flag_inexact;
3436 aSign = extractFloatx80Sign( a );
3437 switch ( STATUS(float_rounding_mode) ) {
3438 case float_round_nearest_even:
3439 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3441 return
3442 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3444 break;
3445 case float_round_down:
3446 return
3447 aSign ?
3448 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3449 : packFloatx80( 0, 0, 0 );
3450 case float_round_up:
3451 return
3452 aSign ? packFloatx80( 1, 0, 0 )
3453 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3455 return packFloatx80( aSign, 0, 0 );
3457 lastBitMask = 1;
3458 lastBitMask <<= 0x403E - aExp;
3459 roundBitsMask = lastBitMask - 1;
3460 z = a;
3461 roundingMode = STATUS(float_rounding_mode);
3462 if ( roundingMode == float_round_nearest_even ) {
3463 z.low += lastBitMask>>1;
3464 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3466 else if ( roundingMode != float_round_to_zero ) {
3467 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3468 z.low += roundBitsMask;
3471 z.low &= ~ roundBitsMask;
3472 if ( z.low == 0 ) {
3473 ++z.high;
3474 z.low = LIT64( 0x8000000000000000 );
3476 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3477 return z;
3481 /*----------------------------------------------------------------------------
3482 | Returns the result of adding the absolute values of the extended double-
3483 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3484 | negated before being returned. `zSign' is ignored if the result is a NaN.
3485 | The addition is performed according to the IEC/IEEE Standard for Binary
3486 | Floating-Point Arithmetic.
3487 *----------------------------------------------------------------------------*/
3489 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3491 int32 aExp, bExp, zExp;
3492 bits64 aSig, bSig, zSig0, zSig1;
3493 int32 expDiff;
3495 aSig = extractFloatx80Frac( a );
3496 aExp = extractFloatx80Exp( a );
3497 bSig = extractFloatx80Frac( b );
3498 bExp = extractFloatx80Exp( b );
3499 expDiff = aExp - bExp;
3500 if ( 0 < expDiff ) {
3501 if ( aExp == 0x7FFF ) {
3502 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3503 return a;
3505 if ( bExp == 0 ) --expDiff;
3506 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3507 zExp = aExp;
3509 else if ( expDiff < 0 ) {
3510 if ( bExp == 0x7FFF ) {
3511 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3512 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3514 if ( aExp == 0 ) ++expDiff;
3515 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3516 zExp = bExp;
3518 else {
3519 if ( aExp == 0x7FFF ) {
3520 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3521 return propagateFloatx80NaN( a, b STATUS_VAR );
3523 return a;
3525 zSig1 = 0;
3526 zSig0 = aSig + bSig;
3527 if ( aExp == 0 ) {
3528 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3529 goto roundAndPack;
3531 zExp = aExp;
3532 goto shiftRight1;
3534 zSig0 = aSig + bSig;
3535 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3536 shiftRight1:
3537 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3538 zSig0 |= LIT64( 0x8000000000000000 );
3539 ++zExp;
3540 roundAndPack:
3541 return
3542 roundAndPackFloatx80(
3543 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3547 /*----------------------------------------------------------------------------
3548 | Returns the result of subtracting the absolute values of the extended
3549 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3550 | difference is negated before being returned. `zSign' is ignored if the
3551 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3552 | Standard for Binary Floating-Point Arithmetic.
3553 *----------------------------------------------------------------------------*/
3555 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3557 int32 aExp, bExp, zExp;
3558 bits64 aSig, bSig, zSig0, zSig1;
3559 int32 expDiff;
3560 floatx80 z;
3562 aSig = extractFloatx80Frac( a );
3563 aExp = extractFloatx80Exp( a );
3564 bSig = extractFloatx80Frac( b );
3565 bExp = extractFloatx80Exp( b );
3566 expDiff = aExp - bExp;
3567 if ( 0 < expDiff ) goto aExpBigger;
3568 if ( expDiff < 0 ) goto bExpBigger;
3569 if ( aExp == 0x7FFF ) {
3570 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3571 return propagateFloatx80NaN( a, b STATUS_VAR );
3573 float_raise( float_flag_invalid STATUS_VAR);
3574 z.low = floatx80_default_nan_low;
3575 z.high = floatx80_default_nan_high;
3576 return z;
3578 if ( aExp == 0 ) {
3579 aExp = 1;
3580 bExp = 1;
3582 zSig1 = 0;
3583 if ( bSig < aSig ) goto aBigger;
3584 if ( aSig < bSig ) goto bBigger;
3585 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3586 bExpBigger:
3587 if ( bExp == 0x7FFF ) {
3588 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3589 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3591 if ( aExp == 0 ) ++expDiff;
3592 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3593 bBigger:
3594 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3595 zExp = bExp;
3596 zSign ^= 1;
3597 goto normalizeRoundAndPack;
3598 aExpBigger:
3599 if ( aExp == 0x7FFF ) {
3600 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3601 return a;
3603 if ( bExp == 0 ) --expDiff;
3604 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3605 aBigger:
3606 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3607 zExp = aExp;
3608 normalizeRoundAndPack:
3609 return
3610 normalizeRoundAndPackFloatx80(
3611 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3615 /*----------------------------------------------------------------------------
3616 | Returns the result of adding the extended double-precision floating-point
3617 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3618 | Standard for Binary Floating-Point Arithmetic.
3619 *----------------------------------------------------------------------------*/
3621 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3623 flag aSign, bSign;
3625 aSign = extractFloatx80Sign( a );
3626 bSign = extractFloatx80Sign( b );
3627 if ( aSign == bSign ) {
3628 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3630 else {
3631 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3636 /*----------------------------------------------------------------------------
3637 | Returns the result of subtracting the extended double-precision floating-
3638 | point values `a' and `b'. The operation is performed according to the
3639 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3640 *----------------------------------------------------------------------------*/
3642 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3644 flag aSign, bSign;
3646 aSign = extractFloatx80Sign( a );
3647 bSign = extractFloatx80Sign( b );
3648 if ( aSign == bSign ) {
3649 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3651 else {
3652 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3657 /*----------------------------------------------------------------------------
3658 | Returns the result of multiplying the extended double-precision floating-
3659 | point values `a' and `b'. The operation is performed according to the
3660 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3661 *----------------------------------------------------------------------------*/
3663 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3665 flag aSign, bSign, zSign;
3666 int32 aExp, bExp, zExp;
3667 bits64 aSig, bSig, zSig0, zSig1;
3668 floatx80 z;
3670 aSig = extractFloatx80Frac( a );
3671 aExp = extractFloatx80Exp( a );
3672 aSign = extractFloatx80Sign( a );
3673 bSig = extractFloatx80Frac( b );
3674 bExp = extractFloatx80Exp( b );
3675 bSign = extractFloatx80Sign( b );
3676 zSign = aSign ^ bSign;
3677 if ( aExp == 0x7FFF ) {
3678 if ( (bits64) ( aSig<<1 )
3679 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3680 return propagateFloatx80NaN( a, b STATUS_VAR );
3682 if ( ( bExp | bSig ) == 0 ) goto invalid;
3683 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3685 if ( bExp == 0x7FFF ) {
3686 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3687 if ( ( aExp | aSig ) == 0 ) {
3688 invalid:
3689 float_raise( float_flag_invalid STATUS_VAR);
3690 z.low = floatx80_default_nan_low;
3691 z.high = floatx80_default_nan_high;
3692 return z;
3694 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3696 if ( aExp == 0 ) {
3697 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3698 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3700 if ( bExp == 0 ) {
3701 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3702 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3704 zExp = aExp + bExp - 0x3FFE;
3705 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3706 if ( 0 < (sbits64) zSig0 ) {
3707 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3708 --zExp;
3710 return
3711 roundAndPackFloatx80(
3712 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3716 /*----------------------------------------------------------------------------
3717 | Returns the result of dividing the extended double-precision floating-point
3718 | value `a' by the corresponding value `b'. The operation is performed
3719 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3720 *----------------------------------------------------------------------------*/
3722 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3724 flag aSign, bSign, zSign;
3725 int32 aExp, bExp, zExp;
3726 bits64 aSig, bSig, zSig0, zSig1;
3727 bits64 rem0, rem1, rem2, term0, term1, term2;
3728 floatx80 z;
3730 aSig = extractFloatx80Frac( a );
3731 aExp = extractFloatx80Exp( a );
3732 aSign = extractFloatx80Sign( a );
3733 bSig = extractFloatx80Frac( b );
3734 bExp = extractFloatx80Exp( b );
3735 bSign = extractFloatx80Sign( b );
3736 zSign = aSign ^ bSign;
3737 if ( aExp == 0x7FFF ) {
3738 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3739 if ( bExp == 0x7FFF ) {
3740 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3741 goto invalid;
3743 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3745 if ( bExp == 0x7FFF ) {
3746 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3747 return packFloatx80( zSign, 0, 0 );
3749 if ( bExp == 0 ) {
3750 if ( bSig == 0 ) {
3751 if ( ( aExp | aSig ) == 0 ) {
3752 invalid:
3753 float_raise( float_flag_invalid STATUS_VAR);
3754 z.low = floatx80_default_nan_low;
3755 z.high = floatx80_default_nan_high;
3756 return z;
3758 float_raise( float_flag_divbyzero STATUS_VAR);
3759 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3761 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3763 if ( aExp == 0 ) {
3764 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3765 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3767 zExp = aExp - bExp + 0x3FFE;
3768 rem1 = 0;
3769 if ( bSig <= aSig ) {
3770 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3771 ++zExp;
3773 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3774 mul64To128( bSig, zSig0, &term0, &term1 );
3775 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3776 while ( (sbits64) rem0 < 0 ) {
3777 --zSig0;
3778 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3780 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3781 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3782 mul64To128( bSig, zSig1, &term1, &term2 );
3783 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3784 while ( (sbits64) rem1 < 0 ) {
3785 --zSig1;
3786 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3788 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3790 return
3791 roundAndPackFloatx80(
3792 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3796 /*----------------------------------------------------------------------------
3797 | Returns the remainder of the extended double-precision floating-point value
3798 | `a' with respect to the corresponding value `b'. The operation is performed
3799 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3800 *----------------------------------------------------------------------------*/
3802 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3804 flag aSign, bSign, zSign;
3805 int32 aExp, bExp, expDiff;
3806 bits64 aSig0, aSig1, bSig;
3807 bits64 q, term0, term1, alternateASig0, alternateASig1;
3808 floatx80 z;
3810 aSig0 = extractFloatx80Frac( a );
3811 aExp = extractFloatx80Exp( a );
3812 aSign = extractFloatx80Sign( a );
3813 bSig = extractFloatx80Frac( b );
3814 bExp = extractFloatx80Exp( b );
3815 bSign = extractFloatx80Sign( b );
3816 if ( aExp == 0x7FFF ) {
3817 if ( (bits64) ( aSig0<<1 )
3818 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3819 return propagateFloatx80NaN( a, b STATUS_VAR );
3821 goto invalid;
3823 if ( bExp == 0x7FFF ) {
3824 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3825 return a;
3827 if ( bExp == 0 ) {
3828 if ( bSig == 0 ) {
3829 invalid:
3830 float_raise( float_flag_invalid STATUS_VAR);
3831 z.low = floatx80_default_nan_low;
3832 z.high = floatx80_default_nan_high;
3833 return z;
3835 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3837 if ( aExp == 0 ) {
3838 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3839 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3841 bSig |= LIT64( 0x8000000000000000 );
3842 zSign = aSign;
3843 expDiff = aExp - bExp;
3844 aSig1 = 0;
3845 if ( expDiff < 0 ) {
3846 if ( expDiff < -1 ) return a;
3847 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3848 expDiff = 0;
3850 q = ( bSig <= aSig0 );
3851 if ( q ) aSig0 -= bSig;
3852 expDiff -= 64;
3853 while ( 0 < expDiff ) {
3854 q = estimateDiv128To64( aSig0, aSig1, bSig );
3855 q = ( 2 < q ) ? q - 2 : 0;
3856 mul64To128( bSig, q, &term0, &term1 );
3857 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3858 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3859 expDiff -= 62;
3861 expDiff += 64;
3862 if ( 0 < expDiff ) {
3863 q = estimateDiv128To64( aSig0, aSig1, bSig );
3864 q = ( 2 < q ) ? q - 2 : 0;
3865 q >>= 64 - expDiff;
3866 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3867 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3868 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3869 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3870 ++q;
3871 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3874 else {
3875 term1 = 0;
3876 term0 = bSig;
3878 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3879 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3880 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3881 && ( q & 1 ) )
3883 aSig0 = alternateASig0;
3884 aSig1 = alternateASig1;
3885 zSign = ! zSign;
3887 return
3888 normalizeRoundAndPackFloatx80(
3889 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3893 /*----------------------------------------------------------------------------
3894 | Returns the square root of the extended double-precision floating-point
3895 | value `a'. The operation is performed according to the IEC/IEEE Standard
3896 | for Binary Floating-Point Arithmetic.
3897 *----------------------------------------------------------------------------*/
3899 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3901 flag aSign;
3902 int32 aExp, zExp;
3903 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3904 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3905 floatx80 z;
3907 aSig0 = extractFloatx80Frac( a );
3908 aExp = extractFloatx80Exp( a );
3909 aSign = extractFloatx80Sign( a );
3910 if ( aExp == 0x7FFF ) {
3911 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3912 if ( ! aSign ) return a;
3913 goto invalid;
3915 if ( aSign ) {
3916 if ( ( aExp | aSig0 ) == 0 ) return a;
3917 invalid:
3918 float_raise( float_flag_invalid STATUS_VAR);
3919 z.low = floatx80_default_nan_low;
3920 z.high = floatx80_default_nan_high;
3921 return z;
3923 if ( aExp == 0 ) {
3924 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3925 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3927 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3928 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3929 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3930 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3931 doubleZSig0 = zSig0<<1;
3932 mul64To128( zSig0, zSig0, &term0, &term1 );
3933 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3934 while ( (sbits64) rem0 < 0 ) {
3935 --zSig0;
3936 doubleZSig0 -= 2;
3937 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3939 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3940 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3941 if ( zSig1 == 0 ) zSig1 = 1;
3942 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3943 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3944 mul64To128( zSig1, zSig1, &term2, &term3 );
3945 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3946 while ( (sbits64) rem1 < 0 ) {
3947 --zSig1;
3948 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3949 term3 |= 1;
3950 term2 |= doubleZSig0;
3951 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3953 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3955 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3956 zSig0 |= doubleZSig0;
3957 return
3958 roundAndPackFloatx80(
3959 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3963 /*----------------------------------------------------------------------------
3964 | Returns 1 if the extended double-precision floating-point value `a' is
3965 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3966 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3967 | Arithmetic.
3968 *----------------------------------------------------------------------------*/
3970 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3973 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3974 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3975 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3976 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3978 if ( floatx80_is_signaling_nan( a )
3979 || floatx80_is_signaling_nan( b ) ) {
3980 float_raise( float_flag_invalid STATUS_VAR);
3982 return 0;
3984 return
3985 ( a.low == b.low )
3986 && ( ( a.high == b.high )
3987 || ( ( a.low == 0 )
3988 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3993 /*----------------------------------------------------------------------------
3994 | Returns 1 if the extended double-precision floating-point value `a' is
3995 | less than or equal to the corresponding value `b', and 0 otherwise. The
3996 | comparison is performed according to the IEC/IEEE Standard for Binary
3997 | Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
4000 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4002 flag aSign, bSign;
4004 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4005 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4006 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4007 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4009 float_raise( float_flag_invalid STATUS_VAR);
4010 return 0;
4012 aSign = extractFloatx80Sign( a );
4013 bSign = extractFloatx80Sign( b );
4014 if ( aSign != bSign ) {
4015 return
4016 aSign
4017 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4018 == 0 );
4020 return
4021 aSign ? le128( b.high, b.low, a.high, a.low )
4022 : le128( a.high, a.low, b.high, b.low );
4026 /*----------------------------------------------------------------------------
4027 | Returns 1 if the extended double-precision floating-point value `a' is
4028 | less than the corresponding value `b', and 0 otherwise. The comparison
4029 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4030 | Arithmetic.
4031 *----------------------------------------------------------------------------*/
4033 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4035 flag aSign, bSign;
4037 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4038 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4039 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4040 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4042 float_raise( float_flag_invalid STATUS_VAR);
4043 return 0;
4045 aSign = extractFloatx80Sign( a );
4046 bSign = extractFloatx80Sign( b );
4047 if ( aSign != bSign ) {
4048 return
4049 aSign
4050 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4051 != 0 );
4053 return
4054 aSign ? lt128( b.high, b.low, a.high, a.low )
4055 : lt128( a.high, a.low, b.high, b.low );
4059 /*----------------------------------------------------------------------------
4060 | Returns 1 if the extended double-precision floating-point value `a' is equal
4061 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4062 | raised if either operand is a NaN. Otherwise, the comparison is performed
4063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4064 *----------------------------------------------------------------------------*/
4066 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4069 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4070 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4071 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4072 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4074 float_raise( float_flag_invalid STATUS_VAR);
4075 return 0;
4077 return
4078 ( a.low == b.low )
4079 && ( ( a.high == b.high )
4080 || ( ( a.low == 0 )
4081 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4086 /*----------------------------------------------------------------------------
4087 | Returns 1 if the extended double-precision floating-point value `a' is less
4088 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4089 | do not cause an exception. Otherwise, the comparison is performed according
4090 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4093 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4095 flag aSign, bSign;
4097 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4098 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4099 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4100 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4102 if ( floatx80_is_signaling_nan( a )
4103 || floatx80_is_signaling_nan( b ) ) {
4104 float_raise( float_flag_invalid STATUS_VAR);
4106 return 0;
4108 aSign = extractFloatx80Sign( a );
4109 bSign = extractFloatx80Sign( b );
4110 if ( aSign != bSign ) {
4111 return
4112 aSign
4113 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4114 == 0 );
4116 return
4117 aSign ? le128( b.high, b.low, a.high, a.low )
4118 : le128( a.high, a.low, b.high, b.low );
4122 /*----------------------------------------------------------------------------
4123 | Returns 1 if the extended double-precision floating-point value `a' is less
4124 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4125 | an exception. Otherwise, the comparison is performed according to the
4126 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4127 *----------------------------------------------------------------------------*/
4129 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4131 flag aSign, bSign;
4133 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4134 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4135 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4136 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4138 if ( floatx80_is_signaling_nan( a )
4139 || floatx80_is_signaling_nan( b ) ) {
4140 float_raise( float_flag_invalid STATUS_VAR);
4142 return 0;
4144 aSign = extractFloatx80Sign( a );
4145 bSign = extractFloatx80Sign( b );
4146 if ( aSign != bSign ) {
4147 return
4148 aSign
4149 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4150 != 0 );
4152 return
4153 aSign ? lt128( b.high, b.low, a.high, a.low )
4154 : lt128( a.high, a.low, b.high, b.low );
4158 #endif
4160 #ifdef FLOAT128
4162 /*----------------------------------------------------------------------------
4163 | Returns the result of converting the quadruple-precision floating-point
4164 | value `a' to the 32-bit two's complement integer format. The conversion
4165 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4166 | Arithmetic---which means in particular that the conversion is rounded
4167 | according to the current rounding mode. If `a' is a NaN, the largest
4168 | positive integer is returned. Otherwise, if the conversion overflows, the
4169 | largest integer with the same sign as `a' is returned.
4170 *----------------------------------------------------------------------------*/
4172 int32 float128_to_int32( float128 a STATUS_PARAM )
4174 flag aSign;
4175 int32 aExp, shiftCount;
4176 bits64 aSig0, aSig1;
4178 aSig1 = extractFloat128Frac1( a );
4179 aSig0 = extractFloat128Frac0( a );
4180 aExp = extractFloat128Exp( a );
4181 aSign = extractFloat128Sign( a );
4182 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4183 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4184 aSig0 |= ( aSig1 != 0 );
4185 shiftCount = 0x4028 - aExp;
4186 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4187 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4191 /*----------------------------------------------------------------------------
4192 | Returns the result of converting the quadruple-precision floating-point
4193 | value `a' to the 32-bit two's complement integer format. The conversion
4194 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4195 | Arithmetic, except that the conversion is always rounded toward zero. If
4196 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4197 | conversion overflows, the largest integer with the same sign as `a' is
4198 | returned.
4199 *----------------------------------------------------------------------------*/
4201 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4203 flag aSign;
4204 int32 aExp, shiftCount;
4205 bits64 aSig0, aSig1, savedASig;
4206 int32 z;
4208 aSig1 = extractFloat128Frac1( a );
4209 aSig0 = extractFloat128Frac0( a );
4210 aExp = extractFloat128Exp( a );
4211 aSign = extractFloat128Sign( a );
4212 aSig0 |= ( aSig1 != 0 );
4213 if ( 0x401E < aExp ) {
4214 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4215 goto invalid;
4217 else if ( aExp < 0x3FFF ) {
4218 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4219 return 0;
4221 aSig0 |= LIT64( 0x0001000000000000 );
4222 shiftCount = 0x402F - aExp;
4223 savedASig = aSig0;
4224 aSig0 >>= shiftCount;
4225 z = aSig0;
4226 if ( aSign ) z = - z;
4227 if ( ( z < 0 ) ^ aSign ) {
4228 invalid:
4229 float_raise( float_flag_invalid STATUS_VAR);
4230 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4232 if ( ( aSig0<<shiftCount ) != savedASig ) {
4233 STATUS(float_exception_flags) |= float_flag_inexact;
4235 return z;
4239 /*----------------------------------------------------------------------------
4240 | Returns the result of converting the quadruple-precision floating-point
4241 | value `a' to the 64-bit two's complement integer format. The conversion
4242 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4243 | Arithmetic---which means in particular that the conversion is rounded
4244 | according to the current rounding mode. If `a' is a NaN, the largest
4245 | positive integer is returned. Otherwise, if the conversion overflows, the
4246 | largest integer with the same sign as `a' is returned.
4247 *----------------------------------------------------------------------------*/
4249 int64 float128_to_int64( float128 a STATUS_PARAM )
4251 flag aSign;
4252 int32 aExp, shiftCount;
4253 bits64 aSig0, aSig1;
4255 aSig1 = extractFloat128Frac1( a );
4256 aSig0 = extractFloat128Frac0( a );
4257 aExp = extractFloat128Exp( a );
4258 aSign = extractFloat128Sign( a );
4259 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4260 shiftCount = 0x402F - aExp;
4261 if ( shiftCount <= 0 ) {
4262 if ( 0x403E < aExp ) {
4263 float_raise( float_flag_invalid STATUS_VAR);
4264 if ( ! aSign
4265 || ( ( aExp == 0x7FFF )
4266 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4269 return LIT64( 0x7FFFFFFFFFFFFFFF );
4271 return (sbits64) LIT64( 0x8000000000000000 );
4273 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4275 else {
4276 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4278 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4282 /*----------------------------------------------------------------------------
4283 | Returns the result of converting the quadruple-precision floating-point
4284 | value `a' to the 64-bit two's complement integer format. The conversion
4285 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4286 | Arithmetic, except that the conversion is always rounded toward zero.
4287 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4288 | the conversion overflows, the largest integer with the same sign as `a' is
4289 | returned.
4290 *----------------------------------------------------------------------------*/
4292 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4294 flag aSign;
4295 int32 aExp, shiftCount;
4296 bits64 aSig0, aSig1;
4297 int64 z;
4299 aSig1 = extractFloat128Frac1( a );
4300 aSig0 = extractFloat128Frac0( a );
4301 aExp = extractFloat128Exp( a );
4302 aSign = extractFloat128Sign( a );
4303 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4304 shiftCount = aExp - 0x402F;
4305 if ( 0 < shiftCount ) {
4306 if ( 0x403E <= aExp ) {
4307 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4308 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4309 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4310 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4312 else {
4313 float_raise( float_flag_invalid STATUS_VAR);
4314 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4315 return LIT64( 0x7FFFFFFFFFFFFFFF );
4318 return (sbits64) LIT64( 0x8000000000000000 );
4320 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4321 if ( (bits64) ( aSig1<<shiftCount ) ) {
4322 STATUS(float_exception_flags) |= float_flag_inexact;
4325 else {
4326 if ( aExp < 0x3FFF ) {
4327 if ( aExp | aSig0 | aSig1 ) {
4328 STATUS(float_exception_flags) |= float_flag_inexact;
4330 return 0;
4332 z = aSig0>>( - shiftCount );
4333 if ( aSig1
4334 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4335 STATUS(float_exception_flags) |= float_flag_inexact;
4338 if ( aSign ) z = - z;
4339 return z;
4343 /*----------------------------------------------------------------------------
4344 | Returns the result of converting the quadruple-precision floating-point
4345 | value `a' to the single-precision floating-point format. The conversion
4346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4347 | Arithmetic.
4348 *----------------------------------------------------------------------------*/
4350 float32 float128_to_float32( float128 a STATUS_PARAM )
4352 flag aSign;
4353 int32 aExp;
4354 bits64 aSig0, aSig1;
4355 bits32 zSig;
4357 aSig1 = extractFloat128Frac1( a );
4358 aSig0 = extractFloat128Frac0( a );
4359 aExp = extractFloat128Exp( a );
4360 aSign = extractFloat128Sign( a );
4361 if ( aExp == 0x7FFF ) {
4362 if ( aSig0 | aSig1 ) {
4363 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4365 return packFloat32( aSign, 0xFF, 0 );
4367 aSig0 |= ( aSig1 != 0 );
4368 shift64RightJamming( aSig0, 18, &aSig0 );
4369 zSig = aSig0;
4370 if ( aExp || zSig ) {
4371 zSig |= 0x40000000;
4372 aExp -= 0x3F81;
4374 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4378 /*----------------------------------------------------------------------------
4379 | Returns the result of converting the quadruple-precision floating-point
4380 | value `a' to the double-precision floating-point format. The conversion
4381 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4382 | Arithmetic.
4383 *----------------------------------------------------------------------------*/
4385 float64 float128_to_float64( float128 a STATUS_PARAM )
4387 flag aSign;
4388 int32 aExp;
4389 bits64 aSig0, aSig1;
4391 aSig1 = extractFloat128Frac1( a );
4392 aSig0 = extractFloat128Frac0( a );
4393 aExp = extractFloat128Exp( a );
4394 aSign = extractFloat128Sign( a );
4395 if ( aExp == 0x7FFF ) {
4396 if ( aSig0 | aSig1 ) {
4397 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4399 return packFloat64( aSign, 0x7FF, 0 );
4401 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4402 aSig0 |= ( aSig1 != 0 );
4403 if ( aExp || aSig0 ) {
4404 aSig0 |= LIT64( 0x4000000000000000 );
4405 aExp -= 0x3C01;
4407 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4411 #ifdef FLOATX80
4413 /*----------------------------------------------------------------------------
4414 | Returns the result of converting the quadruple-precision floating-point
4415 | value `a' to the extended double-precision floating-point format. The
4416 | conversion is performed according to the IEC/IEEE Standard for Binary
4417 | Floating-Point Arithmetic.
4418 *----------------------------------------------------------------------------*/
4420 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4422 flag aSign;
4423 int32 aExp;
4424 bits64 aSig0, aSig1;
4426 aSig1 = extractFloat128Frac1( a );
4427 aSig0 = extractFloat128Frac0( a );
4428 aExp = extractFloat128Exp( a );
4429 aSign = extractFloat128Sign( a );
4430 if ( aExp == 0x7FFF ) {
4431 if ( aSig0 | aSig1 ) {
4432 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4434 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4436 if ( aExp == 0 ) {
4437 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4438 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4440 else {
4441 aSig0 |= LIT64( 0x0001000000000000 );
4443 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4444 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4448 #endif
4450 /*----------------------------------------------------------------------------
4451 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4452 | returns the result as a quadruple-precision floating-point value. The
4453 | operation is performed according to the IEC/IEEE Standard for Binary
4454 | Floating-Point Arithmetic.
4455 *----------------------------------------------------------------------------*/
4457 float128 float128_round_to_int( float128 a STATUS_PARAM )
4459 flag aSign;
4460 int32 aExp;
4461 bits64 lastBitMask, roundBitsMask;
4462 int8 roundingMode;
4463 float128 z;
4465 aExp = extractFloat128Exp( a );
4466 if ( 0x402F <= aExp ) {
4467 if ( 0x406F <= aExp ) {
4468 if ( ( aExp == 0x7FFF )
4469 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4471 return propagateFloat128NaN( a, a STATUS_VAR );
4473 return a;
4475 lastBitMask = 1;
4476 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4477 roundBitsMask = lastBitMask - 1;
4478 z = a;
4479 roundingMode = STATUS(float_rounding_mode);
4480 if ( roundingMode == float_round_nearest_even ) {
4481 if ( lastBitMask ) {
4482 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4483 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4485 else {
4486 if ( (sbits64) z.low < 0 ) {
4487 ++z.high;
4488 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4492 else if ( roundingMode != float_round_to_zero ) {
4493 if ( extractFloat128Sign( z )
4494 ^ ( roundingMode == float_round_up ) ) {
4495 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4498 z.low &= ~ roundBitsMask;
4500 else {
4501 if ( aExp < 0x3FFF ) {
4502 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4503 STATUS(float_exception_flags) |= float_flag_inexact;
4504 aSign = extractFloat128Sign( a );
4505 switch ( STATUS(float_rounding_mode) ) {
4506 case float_round_nearest_even:
4507 if ( ( aExp == 0x3FFE )
4508 && ( extractFloat128Frac0( a )
4509 | extractFloat128Frac1( a ) )
4511 return packFloat128( aSign, 0x3FFF, 0, 0 );
4513 break;
4514 case float_round_down:
4515 return
4516 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4517 : packFloat128( 0, 0, 0, 0 );
4518 case float_round_up:
4519 return
4520 aSign ? packFloat128( 1, 0, 0, 0 )
4521 : packFloat128( 0, 0x3FFF, 0, 0 );
4523 return packFloat128( aSign, 0, 0, 0 );
4525 lastBitMask = 1;
4526 lastBitMask <<= 0x402F - aExp;
4527 roundBitsMask = lastBitMask - 1;
4528 z.low = 0;
4529 z.high = a.high;
4530 roundingMode = STATUS(float_rounding_mode);
4531 if ( roundingMode == float_round_nearest_even ) {
4532 z.high += lastBitMask>>1;
4533 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4534 z.high &= ~ lastBitMask;
4537 else if ( roundingMode != float_round_to_zero ) {
4538 if ( extractFloat128Sign( z )
4539 ^ ( roundingMode == float_round_up ) ) {
4540 z.high |= ( a.low != 0 );
4541 z.high += roundBitsMask;
4544 z.high &= ~ roundBitsMask;
4546 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4547 STATUS(float_exception_flags) |= float_flag_inexact;
4549 return z;
4553 /*----------------------------------------------------------------------------
4554 | Returns the result of adding the absolute values of the quadruple-precision
4555 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4556 | before being returned. `zSign' is ignored if the result is a NaN.
4557 | The addition is performed according to the IEC/IEEE Standard for Binary
4558 | Floating-Point Arithmetic.
4559 *----------------------------------------------------------------------------*/
4561 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4563 int32 aExp, bExp, zExp;
4564 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4565 int32 expDiff;
4567 aSig1 = extractFloat128Frac1( a );
4568 aSig0 = extractFloat128Frac0( a );
4569 aExp = extractFloat128Exp( a );
4570 bSig1 = extractFloat128Frac1( b );
4571 bSig0 = extractFloat128Frac0( b );
4572 bExp = extractFloat128Exp( b );
4573 expDiff = aExp - bExp;
4574 if ( 0 < expDiff ) {
4575 if ( aExp == 0x7FFF ) {
4576 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4577 return a;
4579 if ( bExp == 0 ) {
4580 --expDiff;
4582 else {
4583 bSig0 |= LIT64( 0x0001000000000000 );
4585 shift128ExtraRightJamming(
4586 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4587 zExp = aExp;
4589 else if ( expDiff < 0 ) {
4590 if ( bExp == 0x7FFF ) {
4591 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4592 return packFloat128( zSign, 0x7FFF, 0, 0 );
4594 if ( aExp == 0 ) {
4595 ++expDiff;
4597 else {
4598 aSig0 |= LIT64( 0x0001000000000000 );
4600 shift128ExtraRightJamming(
4601 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4602 zExp = bExp;
4604 else {
4605 if ( aExp == 0x7FFF ) {
4606 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4607 return propagateFloat128NaN( a, b STATUS_VAR );
4609 return a;
4611 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4612 if ( aExp == 0 ) {
4613 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4614 return packFloat128( zSign, 0, zSig0, zSig1 );
4616 zSig2 = 0;
4617 zSig0 |= LIT64( 0x0002000000000000 );
4618 zExp = aExp;
4619 goto shiftRight1;
4621 aSig0 |= LIT64( 0x0001000000000000 );
4622 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4623 --zExp;
4624 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4625 ++zExp;
4626 shiftRight1:
4627 shift128ExtraRightJamming(
4628 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4629 roundAndPack:
4630 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4634 /*----------------------------------------------------------------------------
4635 | Returns the result of subtracting the absolute values of the quadruple-
4636 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4637 | difference is negated before being returned. `zSign' is ignored if the
4638 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4639 | Standard for Binary Floating-Point Arithmetic.
4640 *----------------------------------------------------------------------------*/
4642 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4644 int32 aExp, bExp, zExp;
4645 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4646 int32 expDiff;
4647 float128 z;
4649 aSig1 = extractFloat128Frac1( a );
4650 aSig0 = extractFloat128Frac0( a );
4651 aExp = extractFloat128Exp( a );
4652 bSig1 = extractFloat128Frac1( b );
4653 bSig0 = extractFloat128Frac0( b );
4654 bExp = extractFloat128Exp( b );
4655 expDiff = aExp - bExp;
4656 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4657 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4658 if ( 0 < expDiff ) goto aExpBigger;
4659 if ( expDiff < 0 ) goto bExpBigger;
4660 if ( aExp == 0x7FFF ) {
4661 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4662 return propagateFloat128NaN( a, b STATUS_VAR );
4664 float_raise( float_flag_invalid STATUS_VAR);
4665 z.low = float128_default_nan_low;
4666 z.high = float128_default_nan_high;
4667 return z;
4669 if ( aExp == 0 ) {
4670 aExp = 1;
4671 bExp = 1;
4673 if ( bSig0 < aSig0 ) goto aBigger;
4674 if ( aSig0 < bSig0 ) goto bBigger;
4675 if ( bSig1 < aSig1 ) goto aBigger;
4676 if ( aSig1 < bSig1 ) goto bBigger;
4677 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4678 bExpBigger:
4679 if ( bExp == 0x7FFF ) {
4680 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4681 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4683 if ( aExp == 0 ) {
4684 ++expDiff;
4686 else {
4687 aSig0 |= LIT64( 0x4000000000000000 );
4689 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4690 bSig0 |= LIT64( 0x4000000000000000 );
4691 bBigger:
4692 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4693 zExp = bExp;
4694 zSign ^= 1;
4695 goto normalizeRoundAndPack;
4696 aExpBigger:
4697 if ( aExp == 0x7FFF ) {
4698 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4699 return a;
4701 if ( bExp == 0 ) {
4702 --expDiff;
4704 else {
4705 bSig0 |= LIT64( 0x4000000000000000 );
4707 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4708 aSig0 |= LIT64( 0x4000000000000000 );
4709 aBigger:
4710 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4711 zExp = aExp;
4712 normalizeRoundAndPack:
4713 --zExp;
4714 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4718 /*----------------------------------------------------------------------------
4719 | Returns the result of adding the quadruple-precision floating-point values
4720 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4721 | for Binary Floating-Point Arithmetic.
4722 *----------------------------------------------------------------------------*/
4724 float128 float128_add( float128 a, float128 b STATUS_PARAM )
4726 flag aSign, bSign;
4728 aSign = extractFloat128Sign( a );
4729 bSign = extractFloat128Sign( b );
4730 if ( aSign == bSign ) {
4731 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4733 else {
4734 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4739 /*----------------------------------------------------------------------------
4740 | Returns the result of subtracting the quadruple-precision floating-point
4741 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4742 | Standard for Binary Floating-Point Arithmetic.
4743 *----------------------------------------------------------------------------*/
4745 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4747 flag aSign, bSign;
4749 aSign = extractFloat128Sign( a );
4750 bSign = extractFloat128Sign( b );
4751 if ( aSign == bSign ) {
4752 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4754 else {
4755 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4760 /*----------------------------------------------------------------------------
4761 | Returns the result of multiplying the quadruple-precision floating-point
4762 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4763 | Standard for Binary Floating-Point Arithmetic.
4764 *----------------------------------------------------------------------------*/
4766 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4768 flag aSign, bSign, zSign;
4769 int32 aExp, bExp, zExp;
4770 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4771 float128 z;
4773 aSig1 = extractFloat128Frac1( a );
4774 aSig0 = extractFloat128Frac0( a );
4775 aExp = extractFloat128Exp( a );
4776 aSign = extractFloat128Sign( a );
4777 bSig1 = extractFloat128Frac1( b );
4778 bSig0 = extractFloat128Frac0( b );
4779 bExp = extractFloat128Exp( b );
4780 bSign = extractFloat128Sign( b );
4781 zSign = aSign ^ bSign;
4782 if ( aExp == 0x7FFF ) {
4783 if ( ( aSig0 | aSig1 )
4784 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4785 return propagateFloat128NaN( a, b STATUS_VAR );
4787 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4788 return packFloat128( zSign, 0x7FFF, 0, 0 );
4790 if ( bExp == 0x7FFF ) {
4791 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4792 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4793 invalid:
4794 float_raise( float_flag_invalid STATUS_VAR);
4795 z.low = float128_default_nan_low;
4796 z.high = float128_default_nan_high;
4797 return z;
4799 return packFloat128( zSign, 0x7FFF, 0, 0 );
4801 if ( aExp == 0 ) {
4802 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4803 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4805 if ( bExp == 0 ) {
4806 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4807 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4809 zExp = aExp + bExp - 0x4000;
4810 aSig0 |= LIT64( 0x0001000000000000 );
4811 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4812 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4813 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4814 zSig2 |= ( zSig3 != 0 );
4815 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4816 shift128ExtraRightJamming(
4817 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4818 ++zExp;
4820 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4824 /*----------------------------------------------------------------------------
4825 | Returns the result of dividing the quadruple-precision floating-point value
4826 | `a' by the corresponding value `b'. The operation is performed according to
4827 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4828 *----------------------------------------------------------------------------*/
4830 float128 float128_div( float128 a, float128 b STATUS_PARAM )
4832 flag aSign, bSign, zSign;
4833 int32 aExp, bExp, zExp;
4834 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4835 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4836 float128 z;
4838 aSig1 = extractFloat128Frac1( a );
4839 aSig0 = extractFloat128Frac0( a );
4840 aExp = extractFloat128Exp( a );
4841 aSign = extractFloat128Sign( a );
4842 bSig1 = extractFloat128Frac1( b );
4843 bSig0 = extractFloat128Frac0( b );
4844 bExp = extractFloat128Exp( b );
4845 bSign = extractFloat128Sign( b );
4846 zSign = aSign ^ bSign;
4847 if ( aExp == 0x7FFF ) {
4848 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4849 if ( bExp == 0x7FFF ) {
4850 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4851 goto invalid;
4853 return packFloat128( zSign, 0x7FFF, 0, 0 );
4855 if ( bExp == 0x7FFF ) {
4856 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4857 return packFloat128( zSign, 0, 0, 0 );
4859 if ( bExp == 0 ) {
4860 if ( ( bSig0 | bSig1 ) == 0 ) {
4861 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4862 invalid:
4863 float_raise( float_flag_invalid STATUS_VAR);
4864 z.low = float128_default_nan_low;
4865 z.high = float128_default_nan_high;
4866 return z;
4868 float_raise( float_flag_divbyzero STATUS_VAR);
4869 return packFloat128( zSign, 0x7FFF, 0, 0 );
4871 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4873 if ( aExp == 0 ) {
4874 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4875 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4877 zExp = aExp - bExp + 0x3FFD;
4878 shortShift128Left(
4879 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4880 shortShift128Left(
4881 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4882 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4883 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4884 ++zExp;
4886 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4887 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4888 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4889 while ( (sbits64) rem0 < 0 ) {
4890 --zSig0;
4891 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4893 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4894 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4895 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4896 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4897 while ( (sbits64) rem1 < 0 ) {
4898 --zSig1;
4899 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4901 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4903 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4904 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4908 /*----------------------------------------------------------------------------
4909 | Returns the remainder of the quadruple-precision floating-point value `a'
4910 | with respect to the corresponding value `b'. The operation is performed
4911 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4912 *----------------------------------------------------------------------------*/
4914 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4916 flag aSign, bSign, zSign;
4917 int32 aExp, bExp, expDiff;
4918 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4919 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4920 sbits64 sigMean0;
4921 float128 z;
4923 aSig1 = extractFloat128Frac1( a );
4924 aSig0 = extractFloat128Frac0( a );
4925 aExp = extractFloat128Exp( a );
4926 aSign = extractFloat128Sign( a );
4927 bSig1 = extractFloat128Frac1( b );
4928 bSig0 = extractFloat128Frac0( b );
4929 bExp = extractFloat128Exp( b );
4930 bSign = extractFloat128Sign( b );
4931 if ( aExp == 0x7FFF ) {
4932 if ( ( aSig0 | aSig1 )
4933 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4934 return propagateFloat128NaN( a, b STATUS_VAR );
4936 goto invalid;
4938 if ( bExp == 0x7FFF ) {
4939 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4940 return a;
4942 if ( bExp == 0 ) {
4943 if ( ( bSig0 | bSig1 ) == 0 ) {
4944 invalid:
4945 float_raise( float_flag_invalid STATUS_VAR);
4946 z.low = float128_default_nan_low;
4947 z.high = float128_default_nan_high;
4948 return z;
4950 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4952 if ( aExp == 0 ) {
4953 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4954 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4956 expDiff = aExp - bExp;
4957 if ( expDiff < -1 ) return a;
4958 shortShift128Left(
4959 aSig0 | LIT64( 0x0001000000000000 ),
4960 aSig1,
4961 15 - ( expDiff < 0 ),
4962 &aSig0,
4963 &aSig1
4965 shortShift128Left(
4966 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4967 q = le128( bSig0, bSig1, aSig0, aSig1 );
4968 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4969 expDiff -= 64;
4970 while ( 0 < expDiff ) {
4971 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4972 q = ( 4 < q ) ? q - 4 : 0;
4973 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4974 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4975 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4976 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4977 expDiff -= 61;
4979 if ( -64 < expDiff ) {
4980 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4981 q = ( 4 < q ) ? q - 4 : 0;
4982 q >>= - expDiff;
4983 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4984 expDiff += 52;
4985 if ( expDiff < 0 ) {
4986 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4988 else {
4989 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4991 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4992 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4994 else {
4995 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4996 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4998 do {
4999 alternateASig0 = aSig0;
5000 alternateASig1 = aSig1;
5001 ++q;
5002 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5003 } while ( 0 <= (sbits64) aSig0 );
5004 add128(
5005 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5006 if ( ( sigMean0 < 0 )
5007 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5008 aSig0 = alternateASig0;
5009 aSig1 = alternateASig1;
5011 zSign = ( (sbits64) aSig0 < 0 );
5012 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5013 return
5014 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5018 /*----------------------------------------------------------------------------
5019 | Returns the square root of the quadruple-precision floating-point value `a'.
5020 | The operation is performed according to the IEC/IEEE Standard for Binary
5021 | Floating-Point Arithmetic.
5022 *----------------------------------------------------------------------------*/
5024 float128 float128_sqrt( float128 a STATUS_PARAM )
5026 flag aSign;
5027 int32 aExp, zExp;
5028 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5029 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5030 float128 z;
5032 aSig1 = extractFloat128Frac1( a );
5033 aSig0 = extractFloat128Frac0( a );
5034 aExp = extractFloat128Exp( a );
5035 aSign = extractFloat128Sign( a );
5036 if ( aExp == 0x7FFF ) {
5037 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5038 if ( ! aSign ) return a;
5039 goto invalid;
5041 if ( aSign ) {
5042 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5043 invalid:
5044 float_raise( float_flag_invalid STATUS_VAR);
5045 z.low = float128_default_nan_low;
5046 z.high = float128_default_nan_high;
5047 return z;
5049 if ( aExp == 0 ) {
5050 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5051 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5053 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5054 aSig0 |= LIT64( 0x0001000000000000 );
5055 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5056 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5057 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5058 doubleZSig0 = zSig0<<1;
5059 mul64To128( zSig0, zSig0, &term0, &term1 );
5060 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5061 while ( (sbits64) rem0 < 0 ) {
5062 --zSig0;
5063 doubleZSig0 -= 2;
5064 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5066 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5067 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5068 if ( zSig1 == 0 ) zSig1 = 1;
5069 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5070 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5071 mul64To128( zSig1, zSig1, &term2, &term3 );
5072 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5073 while ( (sbits64) rem1 < 0 ) {
5074 --zSig1;
5075 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5076 term3 |= 1;
5077 term2 |= doubleZSig0;
5078 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5080 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5082 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5083 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5087 /*----------------------------------------------------------------------------
5088 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5089 | the corresponding value `b', and 0 otherwise. The comparison is performed
5090 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5091 *----------------------------------------------------------------------------*/
5093 int float128_eq( float128 a, float128 b STATUS_PARAM )
5096 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5097 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5098 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5099 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5101 if ( float128_is_signaling_nan( a )
5102 || float128_is_signaling_nan( b ) ) {
5103 float_raise( float_flag_invalid STATUS_VAR);
5105 return 0;
5107 return
5108 ( a.low == b.low )
5109 && ( ( a.high == b.high )
5110 || ( ( a.low == 0 )
5111 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5116 /*----------------------------------------------------------------------------
5117 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5118 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5119 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5120 | Arithmetic.
5121 *----------------------------------------------------------------------------*/
5123 int float128_le( float128 a, float128 b STATUS_PARAM )
5125 flag aSign, bSign;
5127 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5128 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5129 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5130 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5132 float_raise( float_flag_invalid STATUS_VAR);
5133 return 0;
5135 aSign = extractFloat128Sign( a );
5136 bSign = extractFloat128Sign( b );
5137 if ( aSign != bSign ) {
5138 return
5139 aSign
5140 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5141 == 0 );
5143 return
5144 aSign ? le128( b.high, b.low, a.high, a.low )
5145 : le128( a.high, a.low, b.high, b.low );
5149 /*----------------------------------------------------------------------------
5150 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5151 | the corresponding value `b', and 0 otherwise. The comparison is performed
5152 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5153 *----------------------------------------------------------------------------*/
5155 int float128_lt( float128 a, float128 b STATUS_PARAM )
5157 flag aSign, bSign;
5159 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5160 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5161 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5162 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5164 float_raise( float_flag_invalid STATUS_VAR);
5165 return 0;
5167 aSign = extractFloat128Sign( a );
5168 bSign = extractFloat128Sign( b );
5169 if ( aSign != bSign ) {
5170 return
5171 aSign
5172 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5173 != 0 );
5175 return
5176 aSign ? lt128( b.high, b.low, a.high, a.low )
5177 : lt128( a.high, a.low, b.high, b.low );
5181 /*----------------------------------------------------------------------------
5182 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5183 | the corresponding value `b', and 0 otherwise. The invalid exception is
5184 | raised if either operand is a NaN. Otherwise, the comparison is performed
5185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5186 *----------------------------------------------------------------------------*/
5188 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5191 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5192 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5193 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5194 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5196 float_raise( float_flag_invalid STATUS_VAR);
5197 return 0;
5199 return
5200 ( a.low == b.low )
5201 && ( ( a.high == b.high )
5202 || ( ( a.low == 0 )
5203 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5208 /*----------------------------------------------------------------------------
5209 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5210 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5211 | cause an exception. Otherwise, the comparison is performed according to the
5212 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5213 *----------------------------------------------------------------------------*/
5215 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5217 flag aSign, bSign;
5219 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5220 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5221 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5222 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5224 if ( float128_is_signaling_nan( a )
5225 || float128_is_signaling_nan( b ) ) {
5226 float_raise( float_flag_invalid STATUS_VAR);
5228 return 0;
5230 aSign = extractFloat128Sign( a );
5231 bSign = extractFloat128Sign( b );
5232 if ( aSign != bSign ) {
5233 return
5234 aSign
5235 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5236 == 0 );
5238 return
5239 aSign ? le128( b.high, b.low, a.high, a.low )
5240 : le128( a.high, a.low, b.high, b.low );
5244 /*----------------------------------------------------------------------------
5245 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5246 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5247 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5248 | Standard for Binary Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5251 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5253 flag aSign, bSign;
5255 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5256 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5257 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5258 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5260 if ( float128_is_signaling_nan( a )
5261 || float128_is_signaling_nan( b ) ) {
5262 float_raise( float_flag_invalid STATUS_VAR);
5264 return 0;
5266 aSign = extractFloat128Sign( a );
5267 bSign = extractFloat128Sign( b );
5268 if ( aSign != bSign ) {
5269 return
5270 aSign
5271 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5272 != 0 );
5274 return
5275 aSign ? lt128( b.high, b.low, a.high, a.low )
5276 : lt128( a.high, a.low, b.high, b.low );
5280 #endif
5282 /* misc functions */
5283 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5285 return int64_to_float32(a STATUS_VAR);
5288 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5290 return int64_to_float64(a STATUS_VAR);
5293 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5295 int64_t v;
5296 unsigned int res;
5298 v = float32_to_int64(a STATUS_VAR);
5299 if (v < 0) {
5300 res = 0;
5301 float_raise( float_flag_invalid STATUS_VAR);
5302 } else if (v > 0xffffffff) {
5303 res = 0xffffffff;
5304 float_raise( float_flag_invalid STATUS_VAR);
5305 } else {
5306 res = v;
5308 return res;
5311 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5313 int64_t v;
5314 unsigned int res;
5316 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5317 if (v < 0) {
5318 res = 0;
5319 float_raise( float_flag_invalid STATUS_VAR);
5320 } else if (v > 0xffffffff) {
5321 res = 0xffffffff;
5322 float_raise( float_flag_invalid STATUS_VAR);
5323 } else {
5324 res = v;
5326 return res;
5329 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5331 int64_t v;
5332 unsigned int res;
5334 v = float64_to_int64(a STATUS_VAR);
5335 if (v < 0) {
5336 res = 0;
5337 float_raise( float_flag_invalid STATUS_VAR);
5338 } else if (v > 0xffffffff) {
5339 res = 0xffffffff;
5340 float_raise( float_flag_invalid STATUS_VAR);
5341 } else {
5342 res = v;
5344 return res;
5347 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5349 int64_t v;
5350 unsigned int res;
5352 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5353 if (v < 0) {
5354 res = 0;
5355 float_raise( float_flag_invalid STATUS_VAR);
5356 } else if (v > 0xffffffff) {
5357 res = 0xffffffff;
5358 float_raise( float_flag_invalid STATUS_VAR);
5359 } else {
5360 res = v;
5362 return res;
5365 /* FIXME: This looks broken. */
5366 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5368 int64_t v;
5370 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5371 v += float64_val(a);
5372 v = float64_to_int64(make_float64(v) STATUS_VAR);
5374 return v - INT64_MIN;
5377 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5379 int64_t v;
5381 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5382 v += float64_val(a);
5383 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5385 return v - INT64_MIN;
5388 #define COMPARE(s, nan_exp) \
5389 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5390 int is_quiet STATUS_PARAM ) \
5392 flag aSign, bSign; \
5393 bits ## s av, bv; \
5395 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5396 extractFloat ## s ## Frac( a ) ) || \
5397 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5398 extractFloat ## s ## Frac( b ) )) { \
5399 if (!is_quiet || \
5400 float ## s ## _is_signaling_nan( a ) || \
5401 float ## s ## _is_signaling_nan( b ) ) { \
5402 float_raise( float_flag_invalid STATUS_VAR); \
5404 return float_relation_unordered; \
5406 aSign = extractFloat ## s ## Sign( a ); \
5407 bSign = extractFloat ## s ## Sign( b ); \
5408 av = float ## s ## _val(a); \
5409 bv = float ## s ## _val(b); \
5410 if ( aSign != bSign ) { \
5411 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5412 /* zero case */ \
5413 return float_relation_equal; \
5414 } else { \
5415 return 1 - (2 * aSign); \
5417 } else { \
5418 if (av == bv) { \
5419 return float_relation_equal; \
5420 } else { \
5421 return 1 - 2 * (aSign ^ ( av < bv )); \
5426 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5428 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5431 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5433 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5436 COMPARE(32, 0xff)
5437 COMPARE(64, 0x7ff)
5439 INLINE int float128_compare_internal( float128 a, float128 b,
5440 int is_quiet STATUS_PARAM )
5442 flag aSign, bSign;
5444 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5445 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5446 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5447 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5448 if (!is_quiet ||
5449 float128_is_signaling_nan( a ) ||
5450 float128_is_signaling_nan( b ) ) {
5451 float_raise( float_flag_invalid STATUS_VAR);
5453 return float_relation_unordered;
5455 aSign = extractFloat128Sign( a );
5456 bSign = extractFloat128Sign( b );
5457 if ( aSign != bSign ) {
5458 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5459 /* zero case */
5460 return float_relation_equal;
5461 } else {
5462 return 1 - (2 * aSign);
5464 } else {
5465 if (a.low == b.low && a.high == b.high) {
5466 return float_relation_equal;
5467 } else {
5468 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5473 int float128_compare( float128 a, float128 b STATUS_PARAM )
5475 return float128_compare_internal(a, b, 0 STATUS_VAR);
5478 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5480 return float128_compare_internal(a, b, 1 STATUS_VAR);
5483 /* Multiply A by 2 raised to the power N. */
5484 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5486 flag aSign;
5487 int16 aExp;
5488 bits32 aSig;
5490 aSig = extractFloat32Frac( a );
5491 aExp = extractFloat32Exp( a );
5492 aSign = extractFloat32Sign( a );
5494 if ( aExp == 0xFF ) {
5495 return a;
5497 if ( aExp != 0 )
5498 aSig |= 0x00800000;
5499 else if ( aSig == 0 )
5500 return a;
5502 aExp += n - 1;
5503 aSig <<= 7;
5504 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5507 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5509 flag aSign;
5510 int16 aExp;
5511 bits64 aSig;
5513 aSig = extractFloat64Frac( a );
5514 aExp = extractFloat64Exp( a );
5515 aSign = extractFloat64Sign( a );
5517 if ( aExp == 0x7FF ) {
5518 return a;
5520 if ( aExp != 0 )
5521 aSig |= LIT64( 0x0010000000000000 );
5522 else if ( aSig == 0 )
5523 return a;
5525 aExp += n - 1;
5526 aSig <<= 10;
5527 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5530 #ifdef FLOATX80
5531 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5533 flag aSign;
5534 int16 aExp;
5535 bits64 aSig;
5537 aSig = extractFloatx80Frac( a );
5538 aExp = extractFloatx80Exp( a );
5539 aSign = extractFloatx80Sign( a );
5541 if ( aExp == 0x7FF ) {
5542 return a;
5544 if (aExp == 0 && aSig == 0)
5545 return a;
5547 aExp += n;
5548 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5549 aSign, aExp, aSig, 0 STATUS_VAR );
5551 #endif
5553 #ifdef FLOAT128
5554 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5556 flag aSign;
5557 int32 aExp;
5558 bits64 aSig0, aSig1;
5560 aSig1 = extractFloat128Frac1( a );
5561 aSig0 = extractFloat128Frac0( a );
5562 aExp = extractFloat128Exp( a );
5563 aSign = extractFloat128Sign( a );
5564 if ( aExp == 0x7FFF ) {
5565 return a;
5567 if ( aExp != 0 )
5568 aSig0 |= LIT64( 0x0001000000000000 );
5569 else if ( aSig0 == 0 && aSig1 == 0 )
5570 return a;
5572 aExp += n - 1;
5573 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5574 STATUS_VAR );
5577 #endif