- fixed endianness problems at PCI DMA block transfers using getHostMemAddr()
[bochs-mirror.git] / fpu / softfloat.cc
blob5d82213e1bb1a1c4728e5e8b87b19b344b1d40df
1 /*============================================================================
2 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
3 Package, Release 2b.
5 Written by John R. Hauser. This work was made possible in part by the
6 International Computer Science Institute, located at Suite 600, 1947 Center
7 Street, Berkeley, California 94704. Funding was partially provided by the
8 National Science Foundation under grant MIP-9311980. The original version
9 of this code was written as part of a project to build a fixed-point vector
10 processor in collaboration with the University of California at Berkeley,
11 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
12 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
13 arithmetic/SoftFloat.html'.
15 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
16 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
17 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
18 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
19 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
20 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
21 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
22 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
24 Derivative works are acceptable, even for commercial purposes, so long as
25 (1) the source code for the derivative work includes prominent notice that
26 the work is derivative, and (2) the source code includes prominent notice with
27 these four paragraphs for those parts of this code that are retained.
28 =============================================================================*/
30 #define FLOAT128
32 /*============================================================================
33 * Adapted for Bochs (x86 achitecture simulator) by
34 * Stanislav Shwartsman [sshwarts at sourceforge net]
35 * ==========================================================================*/
37 #include "softfloat.h"
38 #include "softfloat-round-pack.h"
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target
43 | if desired).
44 *----------------------------------------------------------------------------*/
45 #define USE_estimateDiv128To64
46 #define USE_estimateSqrt32
47 #include "softfloat-macros.h"
49 /*----------------------------------------------------------------------------
50 | Functions and definitions to determine: (1) whether tininess for underflow
51 | is detected before or after rounding by default, (2) what (if anything)
52 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
53 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
54 | are propagated from function inputs to output. These details are target-
55 | specific.
56 *----------------------------------------------------------------------------*/
57 #include "softfloat-specialize.h"
59 /*----------------------------------------------------------------------------
60 | Returns the result of converting the 32-bit two's complement integer `a'
61 | to the single-precision floating-point format. The conversion is performed
62 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
63 *----------------------------------------------------------------------------*/
65 float32 int32_to_float32(Bit32s a, float_status_t &status)
67 if (a == 0) return 0;
68 if (a == (Bit32s) 0x80000000) return packFloat32(1, 0x9E, 0);
69 int zSign = (a < 0);
70 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
73 /*----------------------------------------------------------------------------
74 | Returns the result of converting the 32-bit two's complement integer `a'
75 | to the double-precision floating-point format. The conversion is performed
76 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
77 *----------------------------------------------------------------------------*/
79 float64 int32_to_float64(Bit32s a)
81 if (a == 0) return 0;
82 int zSign = (a < 0);
83 Bit32u absA = zSign ? -a : a;
84 int shiftCount = countLeadingZeros32(absA) + 21;
85 Bit64u zSig = absA;
86 return packFloat64(zSign, 0x432 - shiftCount, zSig<<shiftCount);
89 /*----------------------------------------------------------------------------
90 | Returns the result of converting the 64-bit two's complement integer `a'
91 | to the single-precision floating-point format. The conversion is performed
92 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
93 *----------------------------------------------------------------------------*/
95 float32 int64_to_float32(Bit64s a, float_status_t &status)
97 if (a == 0) return 0;
98 int zSign = (a < 0);
99 Bit64u absA = zSign ? -a : a;
100 int shiftCount = countLeadingZeros64(absA) - 40;
101 if (0 <= shiftCount) {
102 return packFloat32(zSign, 0x95 - shiftCount, (Bit32u)(absA<<shiftCount));
104 else {
105 shiftCount += 7;
106 if (shiftCount < 0) {
107 absA = shift64RightJamming(absA, -shiftCount);
109 else {
110 absA <<= shiftCount;
112 return roundAndPackFloat32(zSign, 0x9C - shiftCount, (Bit32u) absA, status);
116 /*----------------------------------------------------------------------------
117 | Returns the result of converting the 64-bit two's complement integer `a'
118 | to the double-precision floating-point format. The conversion is performed
119 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
120 *----------------------------------------------------------------------------*/
122 float64 int64_to_float64(Bit64s a, float_status_t &status)
124 if (a == 0) return 0;
125 if (a == (Bit64s) BX_CONST64(0x8000000000000000)) {
126 return packFloat64(1, 0x43E, 0);
128 int zSign = (a < 0);
129 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
132 /*----------------------------------------------------------------------------
133 | Returns the result of converting the single-precision floating-point value
134 | `a' to the 32-bit two's complement integer format. The conversion is
135 | performed according to the IEC/IEEE Standard for Binary Floating-Point
136 | Arithmetic - which means in particular that the conversion is rounded
137 | according to the current rounding mode. If `a' is a NaN or the
138 | conversion overflows the integer indefinite value is returned.
139 *----------------------------------------------------------------------------*/
141 Bit32s float32_to_int32(float32 a, float_status_t &status)
143 Bit32u aSig = extractFloat32Frac(a);
144 Bit16s aExp = extractFloat32Exp(a);
145 int aSign = extractFloat32Sign(a);
146 if ((aExp == 0xFF) && aSig) aSign = 0;
147 if (aExp) aSig |= 0x00800000;
148 int shiftCount = 0xAF - aExp;
149 Bit64u aSig64 = aSig;
150 aSig64 <<= 32;
151 if (0 < shiftCount) aSig64 = shift64RightJamming(aSig64, shiftCount);
152 return roundAndPackInt32(aSign, aSig64, status);
155 /*----------------------------------------------------------------------------
156 | Returns the result of converting the single-precision floating-point value
157 | `a' to the 32-bit two's complement integer format. The conversion is
158 | performed according to the IEC/IEEE Standard for Binary Floating-Point
159 | Arithmetic, except that the conversion is always rounded toward zero.
160 | If `a' is a NaN or the conversion overflows, the integer indefinite
161 | value is returned.
162 *----------------------------------------------------------------------------*/
164 Bit32s float32_to_int32_round_to_zero(float32 a, float_status_t &status)
166 int aSign;
167 Bit16s aExp;
168 Bit32u aSig;
169 Bit32s z;
171 aSig = extractFloat32Frac(a);
172 aExp = extractFloat32Exp(a);
173 aSign = extractFloat32Sign(a);
174 int shiftCount = aExp - 0x9E;
175 if (0 <= shiftCount) {
176 if (a != 0xCF000000) {
177 float_raise(status, float_flag_invalid);
179 return (Bit32s)(int32_indefinite);
181 else if (aExp <= 0x7E) {
182 if (aExp | aSig) float_raise(status, float_flag_inexact);
183 return 0;
185 aSig = (aSig | 0x00800000)<<8;
186 z = aSig>>(-shiftCount);
187 if ((Bit32u) (aSig<<(shiftCount & 31))) {
188 float_raise(status, float_flag_inexact);
190 if (aSign) z = -z;
191 return z;
194 /*----------------------------------------------------------------------------
195 | Returns the result of converting the single-precision floating-point value
196 | `a' to the 64-bit two's complement integer format. The conversion is
197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
198 | Arithmetic - which means in particular that the conversion is rounded
199 | according to the current rounding mode. If `a' is a NaN or the
200 | conversion overflows, the integer indefinite value is returned.
201 *----------------------------------------------------------------------------*/
203 Bit64s float32_to_int64(float32 a, float_status_t &status)
205 Bit64u aSig64, aSigExtra;
207 Bit32u aSig = extractFloat32Frac(a);
208 Bit16s aExp = extractFloat32Exp(a);
209 int aSign = extractFloat32Sign(a);
211 int shiftCount = 0xBE - aExp;
212 if (shiftCount < 0) {
213 float_raise(status, float_flag_invalid);
214 return (Bit64s)(int64_indefinite);
216 if (aExp) aSig |= 0x00800000;
217 aSig64 = aSig;
218 aSig64 <<= 40;
219 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
220 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
223 /*----------------------------------------------------------------------------
224 | Returns the result of converting the single-precision floating-point value
225 | `a' to the 64-bit two's complement integer format. The conversion is
226 | performed according to the IEC/IEEE Standard for Binary Floating-Point
227 | Arithmetic, except that the conversion is always rounded toward zero.
228 | If `a' is a NaN or the conversion overflows, the integer indefinite
229 | value is returned.
230 *----------------------------------------------------------------------------*/
232 Bit64s float32_to_int64_round_to_zero(float32 a, float_status_t &status)
234 int aSign;
235 Bit16s aExp;
236 Bit32u aSig;
237 Bit64u aSig64;
238 Bit64s z;
240 aSig = extractFloat32Frac(a);
241 aExp = extractFloat32Exp(a);
242 aSign = extractFloat32Sign(a);
243 int shiftCount = aExp - 0xBE;
244 if (0 <= shiftCount) {
245 if (a != 0xDF000000) {
246 float_raise(status, float_flag_invalid);
248 return (Bit64s)(int64_indefinite);
250 else if (aExp <= 0x7E) {
251 if (aExp | aSig) float_raise(status, float_flag_inexact);
252 return 0;
254 aSig64 = aSig | 0x00800000;
255 aSig64 <<= 40;
256 z = aSig64>>(-shiftCount);
257 if ((Bit64u) (aSig64<<(shiftCount & 63))) {
258 float_raise(status, float_flag_inexact);
260 if (aSign) z = -z;
261 return z;
264 /*----------------------------------------------------------------------------
265 | Returns the result of converting the single-precision floating-point value
266 | `a' to the double-precision floating-point format. The conversion is
267 | performed according to the IEC/IEEE Standard for Binary Floating-Point
268 | Arithmetic.
269 *----------------------------------------------------------------------------*/
271 float64 float32_to_float64(float32 a, float_status_t &status)
273 Bit32u aSig = extractFloat32Frac(a);
274 Bit16s aExp = extractFloat32Exp(a);
275 int aSign = extractFloat32Sign(a);
277 if (aExp == 0xFF) {
278 if (aSig) return commonNaNToFloat64(float32ToCommonNaN(a, status));
279 return packFloat64(aSign, 0x7FF, 0);
281 if (aExp == 0) {
282 if (aSig == 0) return packFloat64(aSign, 0, 0);
283 float_raise(status, float_flag_denormal);
284 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
285 --aExp;
287 return packFloat64(aSign, aExp + 0x380, ((Bit64u) aSig)<<29);
290 /*----------------------------------------------------------------------------
291 | Rounds the single-precision floating-point value `a' to an integer, and
292 | returns the result as a single-precision floating-point value. The
293 | operation is performed according to the IEC/IEEE Standard for Binary
294 | Floating-Point Arithmetic.
295 *----------------------------------------------------------------------------*/
297 float32 float32_round_to_int(float32 a, float_status_t &status)
299 int aSign;
300 Bit16s aExp;
301 Bit32u lastBitMask, roundBitsMask;
302 int roundingMode;
303 float32 z;
305 aExp = extractFloat32Exp(a);
306 if (0x96 <= aExp) {
307 if ((aExp == 0xFF) && extractFloat32Frac(a)) {
308 return propagateFloat32NaN(a, status);
310 return a;
312 if (aExp <= 0x7E) {
313 if ((Bit32u) (a<<1) == 0) return a;
314 float_raise(status, float_flag_inexact);
315 aSign = extractFloat32Sign(a);
316 switch (get_float_rounding_mode(status)) {
317 case float_round_nearest_even:
318 if ((aExp == 0x7E) && extractFloat32Frac(a)) {
319 return packFloat32(aSign, 0x7F, 0);
321 break;
322 case float_round_down:
323 return aSign ? 0xBF800000 : 0;
324 case float_round_up:
325 return aSign ? 0x80000000 : 0x3F800000;
327 return packFloat32(aSign, 0, 0);
329 lastBitMask = 1;
330 lastBitMask <<= 0x96 - aExp;
331 roundBitsMask = lastBitMask - 1;
332 z = a;
333 roundingMode = get_float_rounding_mode(status);
334 if (roundingMode == float_round_nearest_even) {
335 z += lastBitMask>>1;
336 if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
338 else if (roundingMode != float_round_to_zero) {
339 if (extractFloat32Sign(z) ^ (roundingMode == float_round_up)) {
340 z += roundBitsMask;
343 z &= ~roundBitsMask;
344 if (z != a) float_raise(status, float_flag_inexact);
345 return z;
348 /*----------------------------------------------------------------------------
349 | Returns the result of adding the absolute values of the single-precision
350 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
351 | before being returned. `zSign' is ignored if the result is a NaN.
352 | The addition is performed according to the IEC/IEEE Standard for Binary
353 | Floating-Point Arithmetic.
354 *----------------------------------------------------------------------------*/
356 static float32 addFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
358 Bit16s aExp, bExp, zExp;
359 Bit32u aSig, bSig, zSig;
360 Bit16s expDiff;
362 aSig = extractFloat32Frac(a);
363 aExp = extractFloat32Exp(a);
364 bSig = extractFloat32Frac(b);
365 bExp = extractFloat32Exp(b);
367 expDiff = aExp - bExp;
368 aSig <<= 6;
369 bSig <<= 6;
371 if (0 < expDiff) {
372 if (aExp == 0xFF) {
373 if (aSig) return propagateFloat32NaN(a, b, status);
374 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
375 return a;
377 if ((aExp == 0) && aSig)
378 float_raise(status, float_flag_denormal);
380 if (bExp == 0) {
381 if (bSig) float_raise(status, float_flag_denormal);
382 --expDiff;
384 else bSig |= 0x20000000;
386 bSig = shift32RightJamming(bSig, expDiff);
387 zExp = aExp;
389 else if (expDiff < 0) {
390 if (bExp == 0xFF) {
391 if (bSig) return propagateFloat32NaN(a, b, status);
392 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
393 return packFloat32(zSign, 0xFF, 0);
395 if ((bExp == 0) && bSig)
396 float_raise(status, float_flag_denormal);
398 if (aExp == 0) {
399 if (aSig) float_raise(status, float_flag_denormal);
400 ++expDiff;
402 else aSig |= 0x20000000;
404 aSig = shift32RightJamming(aSig, -expDiff);
405 zExp = bExp;
407 else {
408 if (aExp == 0xFF) {
409 if (aSig | bSig) return propagateFloat32NaN(a, b, status);
410 return a;
412 if (aExp == 0) {
413 if (aSig | bSig) float_raise(status, float_flag_denormal);
414 return packFloat32(zSign, 0, (aSig + bSig)>>6);
416 zSig = 0x40000000 + aSig + bSig;
417 zExp = aExp;
418 goto roundAndPack;
420 aSig |= 0x20000000;
421 zSig = (aSig + bSig)<<1;
422 --zExp;
423 if ((Bit32s) zSig < 0) {
424 zSig = aSig + bSig;
425 ++zExp;
427 roundAndPack:
428 return roundAndPackFloat32(zSign, zExp, zSig, status);
431 /*----------------------------------------------------------------------------
432 | Returns the result of subtracting the absolute values of the single-
433 | precision floating-point values `a' and `b'. If `zSign' is 1, the
434 | difference is negated before being returned. `zSign' is ignored if the
435 | result is a NaN. The subtraction is performed according to the IEC/IEEE
436 | Standard for Binary Floating-Point Arithmetic.
437 *----------------------------------------------------------------------------*/
439 static float32 subFloat32Sigs(float32 a, float32 b, int zSign, float_status_t &status)
441 Bit16s aExp, bExp, zExp;
442 Bit32u aSig, bSig, zSig;
443 Bit16s expDiff;
445 aSig = extractFloat32Frac(a);
446 aExp = extractFloat32Exp(a);
447 bSig = extractFloat32Frac(b);
448 bExp = extractFloat32Exp(b);
450 expDiff = aExp - bExp;
451 aSig <<= 7;
452 bSig <<= 7;
453 if (0 < expDiff) goto aExpBigger;
454 if (expDiff < 0) goto bExpBigger;
455 if (aExp == 0xFF) {
456 if (aSig | bSig) return propagateFloat32NaN(a, b, status);
457 float_raise(status, float_flag_invalid);
458 return float32_default_nan;
460 if (aExp == 0) {
461 if (aSig | bSig) float_raise(status, float_flag_denormal);
462 aExp = 1;
463 bExp = 1;
465 if (bSig < aSig) goto aBigger;
466 if (aSig < bSig) goto bBigger;
467 return packFloat32(get_float_rounding_mode(status) == float_round_down, 0, 0);
468 bExpBigger:
469 if (bExp == 0xFF) {
470 if (bSig) return propagateFloat32NaN(a, b, status);
471 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
472 return packFloat32(zSign ^ 1, 0xFF, 0);
474 if ((bExp == 0) && bSig)
475 float_raise(status, float_flag_denormal);
477 if (aExp == 0) {
478 if (aSig) float_raise(status, float_flag_denormal);
479 ++expDiff;
481 else aSig |= 0x40000000;
483 aSig = shift32RightJamming(aSig, -expDiff);
484 bSig |= 0x40000000;
485 bBigger:
486 zSig = bSig - aSig;
487 zExp = bExp;
488 zSign ^= 1;
489 goto normalizeRoundAndPack;
490 aExpBigger:
491 if (aExp == 0xFF) {
492 if (aSig) return propagateFloat32NaN(a, b, status);
493 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
494 return a;
496 if ((aExp == 0) && aSig)
497 float_raise(status, float_flag_denormal);
499 if (bExp == 0) {
500 if (bSig) float_raise(status, float_flag_denormal);
501 --expDiff;
503 else bSig |= 0x40000000;
505 bSig = shift32RightJamming(bSig, expDiff);
506 aSig |= 0x40000000;
507 aBigger:
508 zSig = aSig - bSig;
509 zExp = aExp;
510 normalizeRoundAndPack:
511 --zExp;
512 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
515 /*----------------------------------------------------------------------------
516 | Returns the result of adding the single-precision floating-point values `a'
517 | and `b'. The operation is performed according to the IEC/IEEE Standard for
518 | Binary Floating-Point Arithmetic.
519 *----------------------------------------------------------------------------*/
521 float32 float32_add(float32 a, float32 b, float_status_t &status)
523 int aSign = extractFloat32Sign(a);
524 int bSign = extractFloat32Sign(b);
526 if (aSign == bSign) {
527 return addFloat32Sigs(a, b, aSign, status);
529 else {
530 return subFloat32Sigs(a, b, aSign, status);
534 /*----------------------------------------------------------------------------
535 | Returns the result of subtracting the single-precision floating-point values
536 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
537 | for Binary Floating-Point Arithmetic.
538 *----------------------------------------------------------------------------*/
540 float32 float32_sub(float32 a, float32 b, float_status_t &status)
542 int aSign = extractFloat32Sign(a);
543 int bSign = extractFloat32Sign(b);
545 if (aSign == bSign) {
546 return subFloat32Sigs(a, b, aSign, status);
548 else {
549 return addFloat32Sigs(a, b, aSign, status);
553 /*----------------------------------------------------------------------------
554 | Returns the result of multiplying the single-precision floating-point values
555 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
556 | for Binary Floating-Point Arithmetic.
557 *----------------------------------------------------------------------------*/
559 float32 float32_mul(float32 a, float32 b, float_status_t &status)
561 int aSign, bSign, zSign;
562 Bit16s aExp, bExp, zExp;
563 Bit32u aSig, bSig;
564 Bit64u zSig64;
565 Bit32u zSig;
567 aSig = extractFloat32Frac(a);
568 aExp = extractFloat32Exp(a);
569 aSign = extractFloat32Sign(a);
570 bSig = extractFloat32Frac(b);
571 bExp = extractFloat32Exp(b);
572 bSign = extractFloat32Sign(b);
573 zSign = aSign ^ bSign;
574 if (aExp == 0xFF) {
575 if (aSig || ((bExp == 0xFF) && bSig))
576 return propagateFloat32NaN(a, b, status);
578 if ((bExp | bSig) == 0) {
579 float_raise(status, float_flag_invalid);
580 return float32_default_nan;
582 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
583 return packFloat32(zSign, 0xFF, 0);
585 if (bExp == 0xFF) {
586 if (bSig) return propagateFloat32NaN(a, b, status);
587 if ((aExp | aSig) == 0) {
588 float_raise(status, float_flag_invalid);
589 return float32_default_nan;
591 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
592 return packFloat32(zSign, 0xFF, 0);
594 if (aExp == 0) {
595 if (aSig == 0) {
596 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
597 return packFloat32(zSign, 0, 0);
599 float_raise(status, float_flag_denormal);
600 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
602 if (bExp == 0) {
603 if (bSig == 0) return packFloat32(zSign, 0, 0);
604 float_raise(status, float_flag_denormal);
605 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
607 zExp = aExp + bExp - 0x7F;
608 aSig = (aSig | 0x00800000)<<7;
609 bSig = (bSig | 0x00800000)<<8;
610 zSig64 = shift64RightJamming(((Bit64u) aSig) * bSig, 32);
611 zSig = (Bit32u) zSig64;
612 if (0 <= (Bit32s) (zSig<<1)) {
613 zSig <<= 1;
614 --zExp;
616 return roundAndPackFloat32(zSign, zExp, zSig, status);
619 /*----------------------------------------------------------------------------
620 | Returns the result of dividing the single-precision floating-point value `a'
621 | by the corresponding value `b'. The operation is performed according to the
622 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
623 *----------------------------------------------------------------------------*/
625 float32 float32_div(float32 a, float32 b, float_status_t &status)
627 int aSign, bSign, zSign;
628 Bit16s aExp, bExp, zExp;
629 Bit32u aSig, bSig, zSig;
631 aSig = extractFloat32Frac(a);
632 aExp = extractFloat32Exp(a);
633 aSign = extractFloat32Sign(a);
634 bSig = extractFloat32Frac(b);
635 bExp = extractFloat32Exp(b);
636 bSign = extractFloat32Sign(b);
637 zSign = aSign ^ bSign;
638 if (aExp == 0xFF) {
639 if (aSig) return propagateFloat32NaN(a, b, status);
640 if (bExp == 0xFF) {
641 if (bSig) return propagateFloat32NaN(a, b, status);
642 invalid:
643 float_raise(status, float_flag_invalid);
644 return float32_default_nan;
646 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
647 return packFloat32(zSign, 0xFF, 0);
649 if (bExp == 0xFF) {
650 if (bSig) return propagateFloat32NaN(a, b, status);
651 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
652 return packFloat32(zSign, 0, 0);
654 if (bExp == 0) {
655 if (bSig == 0) {
656 if ((aExp | aSig) == 0) goto invalid;
657 float_raise(status, float_flag_divbyzero);
658 return packFloat32(zSign, 0xFF, 0);
660 float_raise(status, float_flag_denormal);
661 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
663 if (aExp == 0) {
664 if (aSig == 0) return packFloat32(zSign, 0, 0);
665 float_raise(status, float_flag_denormal);
666 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
668 zExp = aExp - bExp + 0x7D;
669 aSig = (aSig | 0x00800000)<<7;
670 bSig = (bSig | 0x00800000)<<8;
671 if (bSig <= (aSig + aSig)) {
672 aSig >>= 1;
673 ++zExp;
675 zSig = (((Bit64u) aSig)<<32) / bSig;
676 if ((zSig & 0x3F) == 0) {
677 zSig |= ((Bit64u) bSig * zSig != ((Bit64u) aSig)<<32);
679 return roundAndPackFloat32(zSign, zExp, zSig, status);
682 /*----------------------------------------------------------------------------
683 | Returns the square root of the single-precision floating-point value `a'.
684 | The operation is performed according to the IEC/IEEE Standard for Binary
685 | Floating-Point Arithmetic.
686 *----------------------------------------------------------------------------*/
688 float32 float32_sqrt(float32 a, float_status_t &status)
690 int aSign;
691 Bit16s aExp, zExp;
692 Bit32u aSig, zSig;
693 Bit64u rem, term;
695 aSig = extractFloat32Frac(a);
696 aExp = extractFloat32Exp(a);
697 aSign = extractFloat32Sign(a);
698 if (aExp == 0xFF) {
699 if (aSig) return propagateFloat32NaN(a, status);
700 if (! aSign) return a;
701 float_raise(status, float_flag_invalid);
702 return float32_default_nan;
704 if (aSign) {
705 if ((aExp | aSig) == 0) return a;
706 float_raise(status, float_flag_invalid);
707 return float32_default_nan;
709 if (aExp == 0) {
710 if (aSig == 0) return 0;
711 float_raise(status, float_flag_denormal);
712 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
714 zExp = ((aExp - 0x7F)>>1) + 0x7E;
715 aSig = (aSig | 0x00800000)<<8;
716 zSig = estimateSqrt32(aExp, aSig) + 2;
717 if ((zSig & 0x7F) <= 5) {
718 if (zSig < 2) {
719 zSig = 0x7FFFFFFF;
720 goto roundAndPack;
722 aSig >>= aExp & 1;
723 term = ((Bit64u) zSig) * zSig;
724 rem = (((Bit64u) aSig)<<32) - term;
725 while ((Bit64s) rem < 0) {
726 --zSig;
727 rem += (((Bit64u) zSig)<<1) | 1;
729 zSig |= (rem != 0);
731 zSig = shift32RightJamming(zSig, 1);
732 roundAndPack:
733 return roundAndPackFloat32(0, zExp, zSig, status);
736 /*----------------------------------------------------------------------------
737 | Determine single-precision floating-point number class
738 *----------------------------------------------------------------------------*/
740 float_class_t float32_class(float32 a)
742 Bit16s aExp = extractFloat32Exp(a);
743 Bit32u aSig = extractFloat32Frac(a);
744 int aSign = extractFloat32Sign(a);
746 if(aExp == 0xFF) {
747 if (aSig == 0)
748 return (aSign) ? float_negative_inf : float_positive_inf;
750 return float_NaN;
753 if(aExp == 0) {
754 if (aSig == 0) return float_zero;
755 return float_denormal;
758 return float_normalized;
761 /*----------------------------------------------------------------------------
762 | Returns 1 if the single-precision floating-point value `a' is equal to
763 | the corresponding value `b', and 0 otherwise. The comparison is performed
764 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
765 *----------------------------------------------------------------------------*/
767 int float32_eq(float32 a, float32 b, float_status_t &status)
769 float_class_t aClass = float32_class(a);
770 float_class_t bClass = float32_class(b);
772 if (aClass == float_NaN || bClass == float_NaN)
774 if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
776 float_raise(status, float_flag_invalid);
778 return 0;
781 if (aClass == float_denormal || bClass == float_denormal)
783 float_raise(status, float_flag_denormal);
786 return (a == b) || ((Bit32u) ((a | b)<<1) == 0);
789 /*----------------------------------------------------------------------------
790 | Returns 1 if the single-precision floating-point value `a' is less than
791 | or equal to the corresponding value `b', and 0 otherwise. The comparison
792 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
793 | Arithmetic.
794 *----------------------------------------------------------------------------*/
796 int float32_le(float32 a, float32 b, float_status_t &status)
798 float_class_t aClass = float32_class(a);
799 float_class_t bClass = float32_class(b);
801 if (aClass == float_NaN || bClass == float_NaN) {
802 float_raise(status, float_flag_invalid);
803 return 0;
806 if (aClass == float_denormal || bClass == float_denormal)
808 float_raise(status, float_flag_denormal);
811 int aSign = extractFloat32Sign(a);
812 int bSign = extractFloat32Sign(b);
813 if (aSign != bSign) return aSign || ((Bit32u) ((a | b)<<1) == 0);
814 return (a == b) || (aSign ^ (a < b));
817 /*----------------------------------------------------------------------------
818 | Returns 1 if the single-precision floating-point value `a' is less than
819 | the corresponding value `b', and 0 otherwise. The comparison is performed
820 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
821 *----------------------------------------------------------------------------*/
823 int float32_lt(float32 a, float32 b, float_status_t &status)
825 float_class_t aClass = float32_class(a);
826 float_class_t bClass = float32_class(b);
828 if (aClass == float_NaN || bClass == float_NaN) {
829 float_raise(status, float_flag_invalid);
830 return 0;
833 if (aClass == float_denormal || bClass == float_denormal)
835 float_raise(status, float_flag_denormal);
838 int aSign = extractFloat32Sign(a);
839 int bSign = extractFloat32Sign(b);
840 if (aSign != bSign) return aSign && ((Bit32u) ((a | b)<<1) != 0);
841 return (a != b) && (aSign ^ (a < b));
844 /*----------------------------------------------------------------------------
845 | Returns 1 if the single-precision floating-point value `a' is equal to
846 | the corresponding value `b', and 0 otherwise. The invalid exception is
847 | raised if either operand is a NaN. Otherwise, the comparison is performed
848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
849 *----------------------------------------------------------------------------*/
851 int float32_eq_signaling(float32 a, float32 b, float_status_t &status)
853 float_class_t aClass = float32_class(a);
854 float_class_t bClass = float32_class(b);
856 if (aClass == float_NaN || bClass == float_NaN) {
857 float_raise(status, float_flag_invalid);
858 return 0;
861 if (aClass == float_denormal || bClass == float_denormal)
863 float_raise(status, float_flag_denormal);
866 return (a == b) || ((Bit32u) ((a | b)<<1) == 0);
869 /*----------------------------------------------------------------------------
870 | Returns 1 if the single-precision floating-point value `a' is less than or
871 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
872 | cause an exception. Otherwise, the comparison is performed according to the
873 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
874 *----------------------------------------------------------------------------*/
876 int float32_le_quiet(float32 a, float32 b, float_status_t &status)
878 float_class_t aClass = float32_class(a);
879 float_class_t bClass = float32_class(b);
881 if (aClass == float_NaN || bClass == float_NaN)
883 if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
885 float_raise(status, float_flag_invalid);
887 return 0;
890 if (aClass == float_denormal || bClass == float_denormal)
892 float_raise(status, float_flag_denormal);
895 int aSign = extractFloat32Sign(a);
896 int bSign = extractFloat32Sign(b);
897 if (aSign != bSign) return aSign || ((Bit32u) ((a | b)<<1) == 0);
898 return (a == b) || (aSign ^ (a < b));
901 /*----------------------------------------------------------------------------
902 | Returns 1 if the single-precision floating-point value `a' is less than
903 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
904 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
905 | Standard for Binary Floating-Point Arithmetic.
906 *----------------------------------------------------------------------------*/
908 int float32_lt_quiet(float32 a, float32 b, float_status_t &status)
910 float_class_t aClass = float32_class(a);
911 float_class_t bClass = float32_class(b);
913 if (aClass == float_NaN || bClass == float_NaN)
915 if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
917 float_raise(status, float_flag_invalid);
919 return 0;
922 if (aClass == float_denormal || bClass == float_denormal)
924 float_raise(status, float_flag_denormal);
927 int aSign = extractFloat32Sign(a);
928 int bSign = extractFloat32Sign(b);
929 if (aSign != bSign) return aSign && ((Bit32u) ((a | b)<<1) != 0);
930 return (a != b) && (aSign ^ (a < b));
933 /*----------------------------------------------------------------------------
934 | The unordered relationship is true when at least one of two source operands
935 | being compared is a NaN. Quiet NaNs do not cause an exception.
936 *----------------------------------------------------------------------------*/
938 int float32_unordered(float32 a, float32 b, float_status_t &status)
940 float_class_t aClass = float32_class(a);
941 float_class_t bClass = float32_class(b);
943 if (aClass == float_NaN || bClass == float_NaN)
945 if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
947 float_raise(status, float_flag_invalid);
949 return 1;
952 if (aClass == float_denormal || bClass == float_denormal)
954 float_raise(status, float_flag_denormal);
957 return 0;
960 /*----------------------------------------------------------------------------
961 | Compare between two single precision floating point numbers. Returns
962 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
963 | the value 'a' is less than the corresponding value `b',
964 | 'float_relation_greater' if the value 'a' is greater than the corresponding
965 | value `b', or 'float_relation_unordered' otherwise.
966 *----------------------------------------------------------------------------*/
968 int float32_compare(float32 a, float32 b, float_status_t &status)
970 float_class_t aClass = float32_class(a);
971 float_class_t bClass = float32_class(b);
973 if (aClass == float_NaN || bClass == float_NaN) {
974 float_raise(status, float_flag_invalid);
975 return float_relation_unordered;
978 if (aClass == float_denormal || bClass == float_denormal)
980 float_raise(status, float_flag_denormal);
983 if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
985 int aSign = extractFloat32Sign(a);
986 int bSign = extractFloat32Sign(b);
987 if (aSign != bSign)
988 return (aSign) ? float_relation_less : float_relation_greater;
990 if (aSign ^ (a < b)) return float_relation_less;
991 return float_relation_greater;
994 /*----------------------------------------------------------------------------
995 | Compare between two double precision floating point numbers. Returns
996 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
997 | the value 'a' is less than the corresponding value `b',
998 | 'float_relation_greater' if the value 'a' is greater than the corresponding
999 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
1000 | an exception.
1001 *----------------------------------------------------------------------------*/
1003 int float32_compare_quiet(float32 a, float32 b, float_status_t &status)
1005 float_class_t aClass = float32_class(a);
1006 float_class_t bClass = float32_class(b);
1008 if (aClass == float_NaN || bClass == float_NaN)
1010 if (float32_is_signaling_nan(a) || float32_is_signaling_nan(b))
1012 float_raise(status, float_flag_invalid);
1014 return float_relation_unordered;
1017 if (aClass == float_denormal || bClass == float_denormal)
1019 float_raise(status, float_flag_denormal);
1022 if ((a == b) || ((Bit32u) ((a | b)<<1) == 0)) return float_relation_equal;
1024 int aSign = extractFloat32Sign(a);
1025 int bSign = extractFloat32Sign(b);
1026 if (aSign != bSign)
1027 return (aSign) ? float_relation_less : float_relation_greater;
1029 if (aSign ^ (a < b)) return float_relation_less;
1030 return float_relation_greater;
1033 /*----------------------------------------------------------------------------
1034 | Returns the result of converting the double-precision floating-point value
1035 | `a' to the 32-bit two's complement integer format. The conversion is
1036 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1037 | Arithmetic - which means in particular that the conversion is rounded
1038 | according to the current rounding mode. If `a' is a NaN or the
1039 | conversion overflows, the integer indefinite value is returned.
1040 *----------------------------------------------------------------------------*/
1042 Bit32s float64_to_int32(float64 a, float_status_t &status)
1044 Bit64u aSig = extractFloat64Frac(a);
1045 Bit16s aExp = extractFloat64Exp(a);
1046 int aSign = extractFloat64Sign(a);
1047 if ((aExp == 0x7FF) && aSig) aSign = 0;
1048 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1049 int shiftCount = 0x42C - aExp;
1050 if (0 < shiftCount) aSig = shift64RightJamming(aSig, shiftCount);
1051 return roundAndPackInt32(aSign, aSig, status);
1054 /*----------------------------------------------------------------------------
1055 | Returns the result of converting the double-precision floating-point value
1056 | `a' to the 32-bit two's complement integer format. The conversion is
1057 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1058 | Arithmetic, except that the conversion is always rounded toward zero.
1059 | If `a' is a NaN or the conversion overflows, the integer indefinite
1060 | value is returned.
1061 *----------------------------------------------------------------------------*/
1063 Bit32s float64_to_int32_round_to_zero(float64 a, float_status_t &status)
1065 int aSign;
1066 Bit16s aExp;
1067 Bit64u aSig, savedASig;
1068 Bit32s z;
1069 int shiftCount;
1071 aSig = extractFloat64Frac(a);
1072 aExp = extractFloat64Exp(a);
1073 aSign = extractFloat64Sign(a);
1074 if (0x41E < aExp) {
1075 if ((aExp == 0x7FF) && aSig) aSign = 0;
1076 goto invalid;
1078 else if (aExp < 0x3FF) {
1079 if (aExp || aSig) float_raise(status, float_flag_inexact);
1080 return 0;
1082 aSig |= BX_CONST64(0x0010000000000000);
1083 shiftCount = 0x433 - aExp;
1084 savedASig = aSig;
1085 aSig >>= shiftCount;
1086 z = (Bit32s) aSig;
1087 if (aSign) z = -z;
1088 if ((z < 0) ^ aSign) {
1089 invalid:
1090 float_raise(status, float_flag_invalid);
1091 return (Bit32s)(int32_indefinite);
1093 if ((aSig<<shiftCount) != savedASig) {
1094 float_raise(status, float_flag_inexact);
1096 return z;
1099 /*----------------------------------------------------------------------------
1100 | Returns the result of converting the double-precision floating-point value
1101 | `a' to the 64-bit two's complement integer format. The conversion is
1102 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1103 | Arithmetic - which means in particular that the conversion is rounded
1104 | according to the current rounding mode. If `a' is a NaN, the largest
1105 | positive integer is returned. Otherwise, if the conversion overflows, the
1106 | largest integer with the same sign as `a' is returned.
1107 *----------------------------------------------------------------------------*/
1109 Bit64s float64_to_int64(float64 a, float_status_t &status)
1111 int aSign;
1112 Bit16s aExp;
1113 Bit64u aSig, aSigExtra;
1115 aSig = extractFloat64Frac(a);
1116 aExp = extractFloat64Exp(a);
1117 aSign = extractFloat64Sign(a);
1118 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1119 int shiftCount = 0x433 - aExp;
1120 if (shiftCount <= 0) {
1121 if (0x43E < aExp) {
1122 float_raise(status, float_flag_invalid);
1123 return (Bit64s)(int64_indefinite);
1125 aSigExtra = 0;
1126 aSig <<= -shiftCount;
1128 else {
1129 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
1131 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
1134 /*----------------------------------------------------------------------------
1135 | Returns the result of converting the double-precision floating-point value
1136 | `a' to the 64-bit two's complement integer format. The conversion is
1137 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1138 | Arithmetic, except that the conversion is always rounded toward zero.
1139 | If `a' is a NaN or the conversion overflows, the integer indefinite
1140 | value is returned.
1141 *----------------------------------------------------------------------------*/
1143 Bit64s float64_to_int64_round_to_zero(float64 a, float_status_t &status)
1145 int aSign;
1146 Bit16s aExp;
1147 Bit64u aSig;
1148 Bit64s z;
1150 aSig = extractFloat64Frac(a);
1151 aExp = extractFloat64Exp(a);
1152 aSign = extractFloat64Sign(a);
1153 if (aExp) aSig |= BX_CONST64(0x0010000000000000);
1154 int shiftCount = aExp - 0x433;
1155 if (0 <= shiftCount) {
1156 if (0x43E <= aExp) {
1157 if (a != BX_CONST64(0xC3E0000000000000)) {
1158 float_raise(status, float_flag_invalid);
1160 return (Bit64s)(int64_indefinite);
1162 z = aSig<<shiftCount;
1164 else {
1165 if (aExp < 0x3FE) {
1166 if (aExp | aSig) float_raise(status, float_flag_inexact);
1167 return 0;
1169 z = aSig>>(-shiftCount);
1170 if ((Bit64u) (aSig<<(shiftCount & 63))) {
1171 float_raise(status, float_flag_inexact);
1174 if (aSign) z = -z;
1175 return z;
1178 /*----------------------------------------------------------------------------
1179 | Returns the result of converting the double-precision floating-point value
1180 | `a' to the single-precision floating-point format. The conversion is
1181 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1182 | Arithmetic.
1183 *----------------------------------------------------------------------------*/
1185 float32 float64_to_float32(float64 a, float_status_t &status)
1187 int aSign;
1188 Bit16s aExp;
1189 Bit64u aSig;
1190 Bit32u zSig;
1192 aSig = extractFloat64Frac(a);
1193 aExp = extractFloat64Exp(a);
1194 aSign = extractFloat64Sign(a);
1195 if (aExp == 0x7FF) {
1196 if (aSig) return commonNaNToFloat32(float64ToCommonNaN(a, status));
1197 return packFloat32(aSign, 0xFF, 0);
1199 if (aExp == 0) {
1200 if (aSig == 0) return packFloat32(aSign, 0, 0);
1201 float_raise(status, float_flag_denormal);
1203 aSig = shift64RightJamming(aSig, 22);
1204 zSig = (Bit32u) aSig;
1205 if (aExp || zSig) {
1206 zSig |= 0x40000000;
1207 aExp -= 0x381;
1209 return roundAndPackFloat32(aSign, aExp, zSig, status);
1212 /*----------------------------------------------------------------------------
1213 | Rounds the double-precision floating-point value `a' to an integer, and
1214 | returns the result as a double-precision floating-point value. The
1215 | operation is performed according to the IEC/IEEE Standard for Binary
1216 | Floating-Point Arithmetic.
1217 *----------------------------------------------------------------------------*/
1219 float64 float64_round_to_int(float64 a, float_status_t &status)
1221 int aSign;
1222 Bit16s aExp;
1223 Bit64u lastBitMask, roundBitsMask;
1224 int roundingMode;
1225 float64 z;
1227 aExp = extractFloat64Exp(a);
1228 if (0x433 <= aExp) {
1229 if ((aExp == 0x7FF) && extractFloat64Frac(a)) {
1230 return propagateFloat64NaN(a, status);
1232 return a;
1234 if (aExp < 0x3FF) {
1235 if ((Bit64u) (a<<1) == 0) return a;
1236 float_raise(status, float_flag_inexact);
1237 aSign = extractFloat64Sign(a);
1238 switch (get_float_rounding_mode(status)) {
1239 case float_round_nearest_even:
1240 if ((aExp == 0x3FE) && extractFloat64Frac(a)) {
1241 return packFloat64(aSign, 0x3FF, 0);
1243 break;
1244 case float_round_down:
1245 return aSign ? BX_CONST64(0xBFF0000000000000) : 0;
1246 case float_round_up:
1247 return
1248 aSign ? BX_CONST64(0x8000000000000000) : BX_CONST64(0x3FF0000000000000);
1250 return packFloat64(aSign, 0, 0);
1252 lastBitMask = 1;
1253 lastBitMask <<= 0x433 - aExp;
1254 roundBitsMask = lastBitMask - 1;
1255 z = a;
1256 roundingMode = get_float_rounding_mode(status);
1257 if (roundingMode == float_round_nearest_even) {
1258 z += lastBitMask>>1;
1259 if ((z & roundBitsMask) == 0) z &= ~lastBitMask;
1261 else if (roundingMode != float_round_to_zero) {
1262 if (extractFloat64Sign(z) ^ (roundingMode == float_round_up)) {
1263 z += roundBitsMask;
1266 z &= ~roundBitsMask;
1267 if (z != a) float_raise(status, float_flag_inexact);
1268 return z;
1271 /*----------------------------------------------------------------------------
1272 | Returns the result of adding the absolute values of the double-precision
1273 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1274 | before being returned. `zSign' is ignored if the result is a NaN.
1275 | The addition is performed according to the IEC/IEEE Standard for Binary
1276 | Floating-Point Arithmetic.
1277 *----------------------------------------------------------------------------*/
1279 static float64 addFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
1281 Bit16s aExp, bExp, zExp;
1282 Bit64u aSig, bSig, zSig;
1283 Bit16s expDiff;
1285 aSig = extractFloat64Frac(a);
1286 aExp = extractFloat64Exp(a);
1287 bSig = extractFloat64Frac(b);
1288 bExp = extractFloat64Exp(b);
1290 expDiff = aExp - bExp;
1291 aSig <<= 9;
1292 bSig <<= 9;
1293 if (0 < expDiff) {
1294 if (aExp == 0x7FF) {
1295 if (aSig) return propagateFloat64NaN(a, b, status);
1296 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1297 return a;
1299 if ((aExp == 0) && aSig)
1300 float_raise(status, float_flag_denormal);
1302 if (bExp == 0) {
1303 if (bSig) float_raise(status, float_flag_denormal);
1304 --expDiff;
1306 else bSig |= BX_CONST64(0x2000000000000000);
1308 bSig = shift64RightJamming(bSig, expDiff);
1309 zExp = aExp;
1311 else if (expDiff < 0) {
1312 if (bExp == 0x7FF) {
1313 if (bSig) return propagateFloat64NaN(a, b, status);
1314 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1315 return packFloat64(zSign, 0x7FF, 0);
1317 if ((bExp == 0) && bSig)
1318 float_raise(status, float_flag_denormal);
1320 if (aExp == 0) {
1321 if (aSig) float_raise(status, float_flag_denormal);
1322 ++expDiff;
1324 else aSig |= BX_CONST64(0x2000000000000000);
1326 aSig = shift64RightJamming(aSig, -expDiff);
1327 zExp = bExp;
1329 else {
1330 if (aExp == 0x7FF) {
1331 if (aSig | bSig) return propagateFloat64NaN(a, b, status);
1332 return a;
1334 if (aExp == 0) {
1335 if (aSig | bSig) float_raise(status, float_flag_denormal);
1336 return packFloat64(zSign, 0, (aSig + bSig)>>9);
1338 zSig = BX_CONST64(0x4000000000000000) + aSig + bSig;
1339 zExp = aExp;
1340 goto roundAndPack;
1342 aSig |= BX_CONST64(0x2000000000000000);
1343 zSig = (aSig + bSig)<<1;
1344 --zExp;
1345 if ((Bit64s) zSig < 0) {
1346 zSig = aSig + bSig;
1347 ++zExp;
1349 roundAndPack:
1350 return roundAndPackFloat64(zSign, zExp, zSig, status);
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of subtracting the absolute values of the double-
1355 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1356 | difference is negated before being returned. `zSign' is ignored if the
1357 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1358 | Standard for Binary Floating-Point Arithmetic.
1359 *----------------------------------------------------------------------------*/
1361 static float64 subFloat64Sigs(float64 a, float64 b, int zSign, float_status_t &status)
1363 Bit16s aExp, bExp, zExp;
1364 Bit64u aSig, bSig, zSig;
1365 Bit16s expDiff;
1367 aSig = extractFloat64Frac(a);
1368 aExp = extractFloat64Exp(a);
1369 bSig = extractFloat64Frac(b);
1370 bExp = extractFloat64Exp(b);
1372 expDiff = aExp - bExp;
1373 aSig <<= 10;
1374 bSig <<= 10;
1375 if (0 < expDiff) goto aExpBigger;
1376 if (expDiff < 0) goto bExpBigger;
1377 if (aExp == 0x7FF) {
1378 if (aSig | bSig) return propagateFloat64NaN(a, b, status);
1379 float_raise(status, float_flag_invalid);
1380 return float64_default_nan;
1382 if (aExp == 0) {
1383 if (aSig | bSig) float_raise(status, float_flag_denormal);
1384 aExp = 1;
1385 bExp = 1;
1387 if (bSig < aSig) goto aBigger;
1388 if (aSig < bSig) goto bBigger;
1389 return packFloat64(get_float_rounding_mode(status) == float_round_down, 0, 0);
1390 bExpBigger:
1391 if (bExp == 0x7FF) {
1392 if (bSig) return propagateFloat64NaN(a, b, status);
1393 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1394 return packFloat64(zSign ^ 1, 0x7FF, 0);
1396 if ((bExp == 0) && bSig)
1397 float_raise(status, float_flag_denormal);
1399 if (aExp == 0) {
1400 if (aSig) float_raise(status, float_flag_denormal);
1401 ++expDiff;
1403 else aSig |= BX_CONST64(0x4000000000000000);
1405 aSig = shift64RightJamming(aSig, -expDiff);
1406 bSig |= BX_CONST64(0x4000000000000000);
1407 bBigger:
1408 zSig = bSig - aSig;
1409 zExp = bExp;
1410 zSign ^= 1;
1411 goto normalizeRoundAndPack;
1412 aExpBigger:
1413 if (aExp == 0x7FF) {
1414 if (aSig) return propagateFloat64NaN(a, b, status);
1415 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1416 return a;
1418 if ((aExp == 0) && aSig)
1419 float_raise(status, float_flag_denormal);
1421 if (bExp == 0) {
1422 if (bSig) float_raise(status, float_flag_denormal);
1423 --expDiff;
1425 else bSig |= BX_CONST64(0x4000000000000000);
1427 bSig = shift64RightJamming(bSig, expDiff);
1428 aSig |= BX_CONST64(0x4000000000000000);
1429 aBigger:
1430 zSig = aSig - bSig;
1431 zExp = aExp;
1432 normalizeRoundAndPack:
1433 --zExp;
1434 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
1437 /*----------------------------------------------------------------------------
1438 | Returns the result of adding the double-precision floating-point values `a'
1439 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1440 | Binary Floating-Point Arithmetic.
1441 *----------------------------------------------------------------------------*/
1443 float64 float64_add(float64 a, float64 b, float_status_t &status)
1445 int aSign = extractFloat64Sign(a);
1446 int bSign = extractFloat64Sign(b);
1448 if (aSign == bSign) {
1449 return addFloat64Sigs(a, b, aSign, status);
1451 else {
1452 return subFloat64Sigs(a, b, aSign, status);
1456 /*----------------------------------------------------------------------------
1457 | Returns the result of subtracting the double-precision floating-point values
1458 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1459 | for Binary Floating-Point Arithmetic.
1460 *----------------------------------------------------------------------------*/
1462 float64 float64_sub(float64 a, float64 b, float_status_t &status)
1464 int aSign = extractFloat64Sign(a);
1465 int bSign = extractFloat64Sign(b);
1467 if (aSign == bSign) {
1468 return subFloat64Sigs(a, b, aSign, status);
1470 else {
1471 return addFloat64Sigs(a, b, aSign, status);
1475 /*----------------------------------------------------------------------------
1476 | Returns the result of multiplying the double-precision floating-point values
1477 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1478 | for Binary Floating-Point Arithmetic.
1479 *----------------------------------------------------------------------------*/
1481 float64 float64_mul(float64 a, float64 b, float_status_t &status)
1483 int aSign, bSign, zSign;
1484 Bit16s aExp, bExp, zExp;
1485 Bit64u aSig, bSig, zSig0, zSig1;
1487 aSig = extractFloat64Frac(a);
1488 aExp = extractFloat64Exp(a);
1489 aSign = extractFloat64Sign(a);
1490 bSig = extractFloat64Frac(b);
1491 bExp = extractFloat64Exp(b);
1492 bSign = extractFloat64Sign(b);
1493 zSign = aSign ^ bSign;
1494 if (aExp == 0x7FF) {
1495 if (aSig || ((bExp == 0x7FF) && bSig)) {
1496 return propagateFloat64NaN(a, b, status);
1498 if ((bExp | bSig) == 0) {
1499 float_raise(status, float_flag_invalid);
1500 return float64_default_nan;
1502 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1503 return packFloat64(zSign, 0x7FF, 0);
1505 if (bExp == 0x7FF) {
1506 if (bSig) return propagateFloat64NaN(a, b, status);
1507 if ((aExp | aSig) == 0) {
1508 float_raise(status, float_flag_invalid);
1509 return float64_default_nan;
1511 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1512 return packFloat64(zSign, 0x7FF, 0);
1514 if (aExp == 0) {
1515 if (aSig == 0) {
1516 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1517 return packFloat64(zSign, 0, 0);
1519 float_raise(status, float_flag_denormal);
1520 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1522 if (bExp == 0) {
1523 if (bSig == 0) return packFloat64(zSign, 0, 0);
1524 float_raise(status, float_flag_denormal);
1525 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
1527 zExp = aExp + bExp - 0x3FF;
1528 aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
1529 bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
1530 mul64To128(aSig, bSig, &zSig0, &zSig1);
1531 zSig0 |= (zSig1 != 0);
1532 if (0 <= (Bit64s) (zSig0<<1)) {
1533 zSig0 <<= 1;
1534 --zExp;
1536 return roundAndPackFloat64(zSign, zExp, zSig0, status);
1539 /*----------------------------------------------------------------------------
1540 | Returns the result of dividing the double-precision floating-point value `a'
1541 | by the corresponding value `b'. The operation is performed according to
1542 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1543 *----------------------------------------------------------------------------*/
1545 float64 float64_div(float64 a, float64 b, float_status_t &status)
1547 int aSign, bSign, zSign;
1548 Bit16s aExp, bExp, zExp;
1549 Bit64u aSig, bSig, zSig;
1550 Bit64u rem0, rem1;
1551 Bit64u term0, term1;
1553 aSig = extractFloat64Frac(a);
1554 aExp = extractFloat64Exp(a);
1555 aSign = extractFloat64Sign(a);
1556 bSig = extractFloat64Frac(b);
1557 bExp = extractFloat64Exp(b);
1558 bSign = extractFloat64Sign(b);
1559 zSign = aSign ^ bSign;
1560 if (aExp == 0x7FF) {
1561 if (aSig) return propagateFloat64NaN(a, b, status);
1562 if (bExp == 0x7FF) {
1563 if (bSig) return propagateFloat64NaN(a, b, status);
1564 invalid:
1565 float_raise(status, float_flag_invalid);
1566 return float64_default_nan;
1568 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
1569 return packFloat64(zSign, 0x7FF, 0);
1571 if (bExp == 0x7FF) {
1572 if (bSig) return propagateFloat64NaN(a, b, status);
1573 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
1574 return packFloat64(zSign, 0, 0);
1576 if (bExp == 0) {
1577 if (bSig == 0) {
1578 if ((aExp | aSig) == 0) goto invalid;
1579 float_raise(status, float_flag_divbyzero);
1580 return packFloat64(zSign, 0x7FF, 0);
1582 float_raise(status, float_flag_denormal);
1583 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
1585 if (aExp == 0) {
1586 if (aSig == 0) return packFloat64(zSign, 0, 0);
1587 float_raise(status, float_flag_denormal);
1588 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1590 zExp = aExp - bExp + 0x3FD;
1591 aSig = (aSig | BX_CONST64(0x0010000000000000))<<10;
1592 bSig = (bSig | BX_CONST64(0x0010000000000000))<<11;
1593 if (bSig <= (aSig + aSig)) {
1594 aSig >>= 1;
1595 ++zExp;
1597 zSig = estimateDiv128To64(aSig, 0, bSig);
1598 if ((zSig & 0x1FF) <= 2) {
1599 mul64To128(bSig, zSig, &term0, &term1);
1600 sub128(aSig, 0, term0, term1, &rem0, &rem1);
1601 while ((Bit64s) rem0 < 0) {
1602 --zSig;
1603 add128(rem0, rem1, 0, bSig, &rem0, &rem1);
1605 zSig |= (rem1 != 0);
1607 return roundAndPackFloat64(zSign, zExp, zSig, status);
1610 /*----------------------------------------------------------------------------
1611 | Returns the square root of the double-precision floating-point value `a'.
1612 | The operation is performed according to the IEC/IEEE Standard for Binary
1613 | Floating-Point Arithmetic.
1614 *----------------------------------------------------------------------------*/
1616 float64 float64_sqrt(float64 a, float_status_t &status)
1618 int aSign;
1619 Bit16s aExp, zExp;
1620 Bit64u aSig, zSig, doubleZSig;
1621 Bit64u rem0, rem1, term0, term1;
1623 aSig = extractFloat64Frac(a);
1624 aExp = extractFloat64Exp(a);
1625 aSign = extractFloat64Sign(a);
1626 if (aExp == 0x7FF) {
1627 if (aSig) return propagateFloat64NaN(a, status);
1628 if (! aSign) return a;
1629 float_raise(status, float_flag_invalid);
1630 return float64_default_nan;
1632 if (aSign) {
1633 if ((aExp | aSig) == 0) return a;
1634 float_raise(status, float_flag_invalid);
1635 return float64_default_nan;
1637 if (aExp == 0) {
1638 if (aSig == 0) return 0;
1639 float_raise(status, float_flag_denormal);
1640 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
1642 zExp = ((aExp - 0x3FF)>>1) + 0x3FE;
1643 aSig |= BX_CONST64(0x0010000000000000);
1644 zSig = estimateSqrt32(aExp, (Bit32u)(aSig>>21));
1645 aSig <<= 9 - (aExp & 1);
1646 zSig = estimateDiv128To64(aSig, 0, zSig<<32) + (zSig<<30);
1647 if ((zSig & 0x1FF) <= 5) {
1648 doubleZSig = zSig<<1;
1649 mul64To128(zSig, zSig, &term0, &term1);
1650 sub128(aSig, 0, term0, term1, &rem0, &rem1);
1651 while ((Bit64s) rem0 < 0) {
1652 --zSig;
1653 doubleZSig -= 2;
1654 add128(rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1);
1656 zSig |= ((rem0 | rem1) != 0);
1658 return roundAndPackFloat64(0, zExp, zSig, status);
1661 /*----------------------------------------------------------------------------
1662 | Determine double-precision floating-point number class
1663 *----------------------------------------------------------------------------*/
1665 float_class_t float64_class(float64 a)
1667 Bit16s aExp = extractFloat64Exp(a);
1668 Bit64u aSig = extractFloat64Frac(a);
1669 int aSign = extractFloat64Sign(a);
1671 if(aExp == 0x7FF) {
1672 if (aSig == 0)
1673 return (aSign) ? float_negative_inf : float_positive_inf;
1675 return float_NaN;
1678 if(aExp == 0) {
1679 if (aSig == 0)
1680 return float_zero;
1681 return float_denormal;
1684 return float_normalized;
1687 /*----------------------------------------------------------------------------
1688 | Returns 1 if the double-precision floating-point value `a' is equal to the
1689 | corresponding value `b', and 0 otherwise. The comparison is performed
1690 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1691 *----------------------------------------------------------------------------*/
1693 int float64_eq(float64 a, float64 b, float_status_t &status)
1695 float_class_t aClass = float64_class(a);
1696 float_class_t bClass = float64_class(b);
1698 if (aClass == float_NaN || bClass == float_NaN)
1700 if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1702 float_raise(status, float_flag_invalid);
1704 return 0;
1707 if (aClass == float_denormal || bClass == float_denormal)
1709 float_raise(status, float_flag_denormal);
1712 return (a == b) || ((Bit64u) ((a | b)<<1) == 0);
1715 /*----------------------------------------------------------------------------
1716 | Returns 1 if the double-precision floating-point value `a' is less than or
1717 | equal to the corresponding value `b', and 0 otherwise. The comparison is
1718 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1719 | Arithmetic.
1720 *----------------------------------------------------------------------------*/
1722 int float64_le(float64 a, float64 b, float_status_t &status)
1724 float_class_t aClass = float64_class(a);
1725 float_class_t bClass = float64_class(b);
1727 if (aClass == float_NaN || bClass == float_NaN) {
1728 float_raise(status, float_flag_invalid);
1729 return 0;
1732 if (aClass == float_denormal || bClass == float_denormal)
1734 float_raise(status, float_flag_denormal);
1737 int aSign = extractFloat64Sign(a);
1738 int bSign = extractFloat64Sign(b);
1739 if (aSign != bSign) return aSign || ((Bit64u) ((a | b)<<1) == 0);
1740 return (a == b) || (aSign ^ (a < b));
1743 /*----------------------------------------------------------------------------
1744 | Returns 1 if the double-precision floating-point value `a' is less than
1745 | the corresponding value `b', and 0 otherwise. The comparison is performed
1746 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1747 *----------------------------------------------------------------------------*/
1749 int float64_lt(float64 a, float64 b, float_status_t &status)
1751 float_class_t aClass = float64_class(a);
1752 float_class_t bClass = float64_class(b);
1754 if (aClass == float_NaN || bClass == float_NaN) {
1755 float_raise(status, float_flag_invalid);
1756 return 0;
1759 if (aClass == float_denormal || bClass == float_denormal)
1761 float_raise(status, float_flag_denormal);
1764 int aSign = extractFloat64Sign(a);
1765 int bSign = extractFloat64Sign(b);
1766 if (aSign != bSign) return aSign && ((Bit64u) ((a | b)<<1) != 0);
1767 return (a != b) && (aSign ^ (a < b));
1770 /*----------------------------------------------------------------------------
1771 | Returns 1 if the double-precision floating-point value `a' is equal to the
1772 | corresponding value `b', and 0 otherwise. The invalid exception is raised
1773 | if either operand is a NaN. Otherwise, the comparison is performed
1774 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1775 *----------------------------------------------------------------------------*/
1777 int float64_eq_signaling(float64 a, float64 b, float_status_t &status)
1779 float_class_t aClass = float64_class(a);
1780 float_class_t bClass = float64_class(b);
1782 if (aClass == float_NaN || bClass == float_NaN) {
1783 float_raise(status, float_flag_invalid);
1784 return 0;
1787 if (aClass == float_denormal || bClass == float_denormal)
1789 float_raise(status, float_flag_denormal);
1792 return (a == b) || ((Bit64u) ((a | b)<<1) == 0);
1795 /*----------------------------------------------------------------------------
1796 | Returns 1 if the double-precision floating-point value `a' is less than or
1797 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1798 | cause an exception. Otherwise, the comparison is performed according to the
1799 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1800 *----------------------------------------------------------------------------*/
1802 int float64_le_quiet(float64 a, float64 b, float_status_t &status)
1804 float_class_t aClass = float64_class(a);
1805 float_class_t bClass = float64_class(b);
1807 if (aClass == float_NaN || bClass == float_NaN)
1809 if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1811 float_raise(status, float_flag_invalid);
1813 return 0;
1816 if (aClass == float_denormal || bClass == float_denormal)
1818 float_raise(status, float_flag_denormal);
1821 int aSign = extractFloat64Sign(a);
1822 int bSign = extractFloat64Sign(b);
1823 if (aSign != bSign) return aSign || ((Bit64u) ((a | b)<<1) == 0);
1824 return (a == b) || (aSign ^ (a < b));
1827 /*----------------------------------------------------------------------------
1828 | Returns 1 if the double-precision floating-point value `a' is less than
1829 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1830 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
1831 | Standard for Binary Floating-Point Arithmetic.
1832 *----------------------------------------------------------------------------*/
1834 int float64_lt_quiet(float64 a, float64 b, float_status_t &status)
1836 float_class_t aClass = float64_class(a);
1837 float_class_t bClass = float64_class(b);
1839 if (aClass == float_NaN || bClass == float_NaN)
1841 if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1843 float_raise(status, float_flag_invalid);
1845 return 0;
1848 if (aClass == float_denormal || bClass == float_denormal)
1850 float_raise(status, float_flag_denormal);
1853 int aSign = extractFloat64Sign(a);
1854 int bSign = extractFloat64Sign(b);
1855 if (aSign != bSign) return aSign && ((Bit64u) ((a | b)<<1) != 0);
1856 return (a != b) && (aSign ^ (a < b));
1859 /*----------------------------------------------------------------------------
1860 | The unordered relationship is true when at least one of two source operands
1861 | being compared is a NaN. Quiet NaNs do not cause an exception.
1862 *----------------------------------------------------------------------------*/
1864 int float64_unordered(float64 a, float64 b, float_status_t &status)
1866 float_class_t aClass = float64_class(a);
1867 float_class_t bClass = float64_class(b);
1869 if (aClass == float_NaN || bClass == float_NaN)
1871 if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1873 float_raise(status, float_flag_invalid);
1875 return 1;
1878 if (aClass == float_denormal || bClass == float_denormal)
1880 float_raise(status, float_flag_denormal);
1883 return 0;
1886 /*----------------------------------------------------------------------------
1887 | Compare between two double precision floating point numbers. Returns
1888 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
1889 | the value 'a' is less than the corresponding value `b',
1890 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1891 | value `b', or 'float_relation_unordered' otherwise.
1892 *----------------------------------------------------------------------------*/
1894 int float64_compare(float64 a, float64 b, float_status_t &status)
1896 float_class_t aClass = float64_class(a);
1897 float_class_t bClass = float64_class(b);
1899 if (aClass == float_NaN || bClass == float_NaN) {
1900 float_raise(status, float_flag_invalid);
1901 return float_relation_unordered;
1904 if (aClass == float_denormal || bClass == float_denormal)
1906 float_raise(status, float_flag_denormal);
1909 if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
1911 int aSign = extractFloat64Sign(a);
1912 int bSign = extractFloat64Sign(b);
1913 if (aSign != bSign)
1914 return (aSign) ? float_relation_less : float_relation_greater;
1916 if (aSign ^ (a < b)) return float_relation_less;
1917 return float_relation_greater;
1920 /*----------------------------------------------------------------------------
1921 | Compare between two double precision floating point numbers. Returns
1922 | 'float_relation_equal' if the operands are equal, 'float_relation_less' if
1923 | the value 'a' is less than the corresponding value `b',
1924 | 'float_relation_greater' if the value 'a' is greater than the corresponding
1925 | value `b', or 'float_relation_unordered' otherwise. Quiet NaNs do not cause
1926 | an exception.
1927 *----------------------------------------------------------------------------*/
1929 int float64_compare_quiet(float64 a, float64 b, float_status_t &status)
1931 float_class_t aClass = float64_class(a);
1932 float_class_t bClass = float64_class(b);
1934 if (aClass == float_NaN || bClass == float_NaN)
1936 if (float64_is_signaling_nan(a) || float64_is_signaling_nan(b))
1938 float_raise(status, float_flag_invalid);
1940 return float_relation_unordered;
1943 if (aClass == float_denormal || bClass == float_denormal)
1945 float_raise(status, float_flag_denormal);
1948 if ((a == b) || ((Bit64u) ((a | b)<<1) == 0)) return float_relation_equal;
1950 int aSign = extractFloat64Sign(a);
1951 int bSign = extractFloat64Sign(b);
1952 if (aSign != bSign)
1953 return (aSign) ? float_relation_less : float_relation_greater;
1955 if (aSign ^ (a < b)) return float_relation_less;
1956 return float_relation_greater;
1959 #ifdef FLOATX80
1961 /*----------------------------------------------------------------------------
1962 | Returns the result of converting the 32-bit two's complement integer `a'
1963 | to the extended double-precision floating-point format. The conversion
1964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1965 | Arithmetic.
1966 *----------------------------------------------------------------------------*/
1968 floatx80 int32_to_floatx80(Bit32s a)
1970 if (a == 0) return packFloatx80(0, 0, 0);
1971 int zSign = (a < 0);
1972 Bit32u absA = zSign ? -a : a;
1973 int shiftCount = countLeadingZeros32(absA) + 32;
1974 Bit64u zSig = absA;
1975 return packFloatx80(zSign, 0x403E - shiftCount, zSig<<shiftCount);
1978 /*----------------------------------------------------------------------------
1979 | Returns the result of converting the 64-bit two's complement integer `a'
1980 | to the extended double-precision floating-point format. The conversion
1981 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1982 | Arithmetic.
1983 *----------------------------------------------------------------------------*/
1985 floatx80 int64_to_floatx80(Bit64s a)
1987 if (a == 0) return packFloatx80(0, 0, 0);
1988 int zSign = (a < 0);
1989 Bit64u absA = zSign ? -a : a;
1990 int shiftCount = countLeadingZeros64(absA);
1991 return packFloatx80(zSign, 0x403E - shiftCount, absA<<shiftCount);
1994 /*----------------------------------------------------------------------------
1995 | Returns the result of converting the single-precision floating-point value
1996 | `a' to the extended double-precision floating-point format. The conversion
1997 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1998 | Arithmetic.
1999 *----------------------------------------------------------------------------*/
2001 floatx80 float32_to_floatx80(float32 a, float_status_t &status)
2003 Bit32u aSig = extractFloat32Frac(a);
2004 Bit16s aExp = extractFloat32Exp(a);
2005 int aSign = extractFloat32Sign(a);
2006 if (aExp == 0xFF) {
2007 if (aSig) return commonNaNToFloatx80(float32ToCommonNaN(a, status));
2008 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2010 if (aExp == 0) {
2011 if (aSig == 0) return packFloatx80(aSign, 0, 0);
2012 float_raise(status, float_flag_denormal);
2013 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2015 aSig |= 0x00800000;
2016 return packFloatx80(aSign, aExp + 0x3F80, ((Bit64u) aSig)<<40);
2019 /*----------------------------------------------------------------------------
2020 | Returns the result of converting the double-precision floating-point value
2021 | `a' to the extended double-precision floating-point format. The conversion
2022 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2023 | Arithmetic.
2024 *----------------------------------------------------------------------------*/
2026 floatx80 float64_to_floatx80(float64 a, float_status_t &status)
2028 Bit64u aSig = extractFloat64Frac(a);
2029 Bit16s aExp = extractFloat64Exp(a);
2030 int aSign = extractFloat64Sign(a);
2032 if (aExp == 0x7FF) {
2033 if (aSig) return commonNaNToFloatx80(float64ToCommonNaN(a, status));
2034 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2036 if (aExp == 0) {
2037 if (aSig == 0) return packFloatx80(aSign, 0, 0);
2038 float_raise(status, float_flag_denormal);
2039 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
2041 return
2042 packFloatx80(
2043 aSign, aExp + 0x3C00, (aSig | BX_CONST64(0x0010000000000000))<<11);
2046 /*----------------------------------------------------------------------------
2047 | Returns the result of converting the extended double-precision floating-
2048 | point value `a' to the 32-bit two's complement integer format. The
2049 | conversion is performed according to the IEC/IEEE Standard for Binary
2050 | Floating-Point Arithmetic - which means in particular that the conversion
2051 | is rounded according to the current rounding mode. If `a' is a NaN or the
2052 | conversion overflows, the integer indefinite value is returned.
2053 *----------------------------------------------------------------------------*/
2055 Bit32s floatx80_to_int32(floatx80 a, float_status_t &status)
2057 Bit64u aSig = extractFloatx80Frac(a);
2058 Bit32s aExp = extractFloatx80Exp(a);
2059 int aSign = extractFloatx80Sign(a);
2061 // handle unsupported extended double-precision floating encodings
2062 if (floatx80_is_unsupported(a))
2064 float_raise(status, float_flag_invalid);
2065 return int32_indefinite;
2068 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) aSign = 0;
2069 int shiftCount = 0x4037 - aExp;
2070 if (shiftCount <= 0) shiftCount = 1;
2071 aSig = shift64RightJamming(aSig, shiftCount);
2072 return roundAndPackInt32(aSign, aSig, status);
2075 /*----------------------------------------------------------------------------
2076 | Returns the result of converting the extended double-precision floating-
2077 | point value `a' to the 32-bit two's complement integer format. The
2078 | conversion is performed according to the IEC/IEEE Standard for Binary
2079 | Floating-Point Arithmetic, except that the conversion is always rounded
2080 | toward zero. If `a' is a NaN or the conversion overflows, the integer
2081 | indefinite value is returned.
2082 *----------------------------------------------------------------------------*/
2084 Bit32s floatx80_to_int32_round_to_zero(floatx80 a, float_status_t &status)
2086 Bit32s aExp;
2087 Bit64u aSig, savedASig;
2088 Bit32s z;
2089 int shiftCount;
2091 // handle unsupported extended double-precision floating encodings
2092 if (floatx80_is_unsupported(a))
2094 float_raise(status, float_flag_invalid);
2095 return int32_indefinite;
2098 aSig = extractFloatx80Frac(a);
2099 aExp = extractFloatx80Exp(a);
2100 int aSign = extractFloatx80Sign(a);
2102 if (aExp > 0x401E) goto invalid;
2103 if (aExp < 0x3FFF) {
2104 if (aExp || aSig) float_raise(status, float_flag_inexact);
2105 return 0;
2107 shiftCount = 0x403E - aExp;
2108 savedASig = aSig;
2109 aSig >>= shiftCount;
2110 z = (Bit32s) aSig;
2111 if (aSign) z = -z;
2112 if ((z < 0) ^ aSign) {
2113 invalid:
2114 float_raise(status, float_flag_invalid);
2115 return (Bit32s)(int32_indefinite);
2117 if ((aSig<<shiftCount) != savedASig)
2119 float_raise(status, float_flag_inexact);
2121 return z;
2124 /*----------------------------------------------------------------------------
2125 | Returns the result of converting the extended double-precision floating-
2126 | point value `a' to the 64-bit two's complement integer format. The
2127 | conversion is performed according to the IEC/IEEE Standard for Binary
2128 | Floating-Point Arithmetic - which means in particular that the conversion
2129 | is rounded according to the current rounding mode. If `a' is a NaN or the
2130 | conversion overflows, the integer indefinite value is returned.
2131 *----------------------------------------------------------------------------*/
2133 Bit64s floatx80_to_int64(floatx80 a, float_status_t &status)
2135 Bit32s aExp;
2136 Bit64u aSig, aSigExtra;
2138 // handle unsupported extended double-precision floating encodings
2139 if (floatx80_is_unsupported(a))
2141 float_raise(status, float_flag_invalid);
2142 return int64_indefinite;
2145 aSig = extractFloatx80Frac(a);
2146 aExp = extractFloatx80Exp(a);
2147 int aSign = extractFloatx80Sign(a);
2149 int shiftCount = 0x403E - aExp;
2150 if (shiftCount <= 0)
2152 if (shiftCount)
2154 float_raise(status, float_flag_invalid);
2155 return (Bit64s)(int64_indefinite);
2157 aSigExtra = 0;
2159 else {
2160 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
2163 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
2166 /*----------------------------------------------------------------------------
2167 | Returns the result of converting the extended double-precision floating-
2168 | point value `a' to the 64-bit two's complement integer format. The
2169 | conversion is performed according to the IEC/IEEE Standard for Binary
2170 | Floating-Point Arithmetic, except that the conversion is always rounded
2171 | toward zero. If `a' is a NaN or the conversion overflows, the integer
2172 | indefinite value is returned.
2173 *----------------------------------------------------------------------------*/
2175 Bit64s floatx80_to_int64_round_to_zero(floatx80 a, float_status_t &status)
2177 int aSign;
2178 Bit32s aExp;
2179 Bit64u aSig;
2180 Bit64s z;
2182 // handle unsupported extended double-precision floating encodings
2183 if (floatx80_is_unsupported(a))
2185 float_raise(status, float_flag_invalid);
2186 return int64_indefinite;
2189 aSig = extractFloatx80Frac(a);
2190 aExp = extractFloatx80Exp(a);
2191 aSign = extractFloatx80Sign(a);
2192 int shiftCount = aExp - 0x403E;
2193 if (0 <= shiftCount) {
2194 aSig &= BX_CONST64(0x7FFFFFFFFFFFFFFF);
2195 if ((a.exp != 0xC03E) || aSig) {
2196 float_raise(status, float_flag_invalid);
2198 return (Bit64s)(int64_indefinite);
2200 else if (aExp < 0x3FFF) {
2201 if (aExp | aSig) float_raise(status, float_flag_inexact);
2202 return 0;
2204 z = aSig>>(-shiftCount);
2205 if ((Bit64u) (aSig<<(shiftCount & 63))) {
2206 float_raise(status, float_flag_inexact);
2208 if (aSign) z = -z;
2209 return z;
2212 /*----------------------------------------------------------------------------
2213 | Returns the result of converting the extended double-precision floating-
2214 | point value `a' to the single-precision floating-point format. The
2215 | conversion is performed according to the IEC/IEEE Standard for Binary
2216 | Floating-Point Arithmetic.
2217 *----------------------------------------------------------------------------*/
2219 float32 floatx80_to_float32(floatx80 a, float_status_t &status)
2221 Bit64u aSig = extractFloatx80Frac(a);
2222 Bit32s aExp = extractFloatx80Exp(a);
2223 int aSign = extractFloatx80Sign(a);
2225 // handle unsupported extended double-precision floating encodings
2226 if (floatx80_is_unsupported(a))
2228 float_raise(status, float_flag_invalid);
2229 return float32_default_nan;
2232 if (aExp == 0x7FFF) {
2233 if ((Bit64u) (aSig<<1))
2234 return commonNaNToFloat32(floatx80ToCommonNaN(a, status));
2236 return packFloat32(aSign, 0xFF, 0);
2238 aSig = shift64RightJamming(aSig, 33);
2239 if (aExp || aSig) aExp -= 0x3F81;
2240 return roundAndPackFloat32(aSign, aExp, (Bit32u) aSig, status);
2243 /*----------------------------------------------------------------------------
2244 | Returns the result of converting the extended double-precision floating-
2245 | point value `a' to the double-precision floating-point format. The
2246 | conversion is performed according to the IEC/IEEE Standard for Binary
2247 | Floating-Point Arithmetic.
2248 *----------------------------------------------------------------------------*/
2250 float64 floatx80_to_float64(floatx80 a, float_status_t &status)
2252 Bit32s aExp;
2253 Bit64u aSig, zSig;
2255 // handle unsupported extended double-precision floating encodings
2256 if (floatx80_is_unsupported(a))
2258 float_raise(status, float_flag_invalid);
2259 return float64_default_nan;
2262 aSig = extractFloatx80Frac(a);
2263 aExp = extractFloatx80Exp(a);
2264 int aSign = extractFloatx80Sign(a);
2266 if (aExp == 0x7FFF) {
2267 if ((Bit64u) (aSig<<1)) {
2268 return commonNaNToFloat64(floatx80ToCommonNaN(a, status));
2270 return packFloat64(aSign, 0x7FF, 0);
2272 zSig = shift64RightJamming(aSig, 1);
2273 if (aExp || aSig) aExp -= 0x3C01;
2274 return roundAndPackFloat64(aSign, aExp, zSig, status);
2277 /*----------------------------------------------------------------------------
2278 | Rounds the extended double-precision floating-point value `a' to an integer,
2279 | and returns the result as an extended double-precision floating-point
2280 | value. The operation is performed according to the IEC/IEEE Standard for
2281 | Binary Floating-Point Arithmetic.
2282 *----------------------------------------------------------------------------*/
2284 floatx80 floatx80_round_to_int(floatx80 a, float_status_t &status)
2286 int aSign;
2287 Bit64u lastBitMask, roundBitsMask;
2288 Bit8u roundingMode;
2289 floatx80 z;
2291 // handle unsupported extended double-precision floating encodings
2292 if (floatx80_is_unsupported(a))
2294 float_raise(status, float_flag_invalid);
2295 return floatx80_default_nan;
2298 Bit32s aExp = extractFloatx80Exp(a);
2299 Bit64u aSig = extractFloatx80Frac(a);
2300 if (0x403E <= aExp) {
2301 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1)) {
2302 return propagateFloatx80NaN(a, status);
2304 return a;
2306 if (aExp < 0x3FFF) {
2307 if ((aExp == 0) && ((Bit64u) (aSig<<1) == 0)) {
2308 return a;
2310 float_raise(status, float_flag_inexact);
2311 aSign = extractFloatx80Sign(a);
2312 switch (get_float_rounding_mode(status)) {
2313 case float_round_nearest_even:
2314 if ((aExp == 0x3FFE) && (Bit64u) (aSig<<1))
2315 return packFloatx80(aSign, 0x3FFF, BX_CONST64(0x8000000000000000));
2316 break;
2317 case float_round_down:
2318 return aSign ?
2319 packFloatx80(1, 0x3FFF, BX_CONST64(0x8000000000000000))
2320 : packFloatx80(0, 0, 0);
2321 case float_round_up:
2322 return aSign ?
2323 packFloatx80(1, 0, 0)
2324 : packFloatx80(0, 0x3FFF, BX_CONST64(0x8000000000000000));
2326 return packFloatx80(aSign, 0, 0);
2328 lastBitMask = 1;
2329 lastBitMask <<= 0x403E - aExp;
2330 roundBitsMask = lastBitMask - 1;
2331 z = a;
2332 roundingMode = get_float_rounding_mode(status);
2333 if (roundingMode == float_round_nearest_even) {
2334 z.fraction += lastBitMask>>1;
2335 if ((z.fraction & roundBitsMask) == 0) z.fraction &= ~lastBitMask;
2337 else if (roundingMode != float_round_to_zero) {
2338 if (extractFloatx80Sign(z) ^ (roundingMode == float_round_up))
2339 z.fraction += roundBitsMask;
2341 z.fraction &= ~roundBitsMask;
2342 if (z.fraction == 0) {
2343 z.exp++;
2344 z.fraction = BX_CONST64(0x8000000000000000);
2346 if (z.fraction != a.fraction) float_raise(status, float_flag_inexact);
2347 return z;
2350 /*----------------------------------------------------------------------------
2351 | Returns the result of adding the absolute values of the extended double-
2352 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
2353 | negated before being returned. `zSign' is ignored if the result is a NaN.
2354 | The addition is performed according to the IEC/IEEE Standard for Binary
2355 | Floating-Point Arithmetic.
2356 *----------------------------------------------------------------------------*/
2358 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
2360 Bit32s aExp, bExp, zExp;
2361 Bit64u aSig, bSig, zSig0, zSig1;
2362 Bit32s expDiff;
2364 // handle unsupported extended double-precision floating encodings
2365 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2367 float_raise(status, float_flag_invalid);
2368 return floatx80_default_nan;
2371 aSig = extractFloatx80Frac(a);
2372 aExp = extractFloatx80Exp(a);
2373 bSig = extractFloatx80Frac(b);
2374 bExp = extractFloatx80Exp(b);
2376 if (aExp == 0x7FFF) {
2377 if ((Bit64u) (aSig<<1)
2378 || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
2380 return propagateFloatx80NaN(a, b, status);
2382 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2383 return a;
2385 if (bExp == 0x7FFF) {
2386 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2387 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2388 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2390 if (aExp == 0) {
2391 if (aSig == 0) {
2392 if ((bExp == 0) && bSig) {
2393 float_raise(status, float_flag_denormal);
2394 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2396 return roundAndPackFloatx80(get_float_rounding_precision(status),
2397 zSign, bExp, bSig, 0, status);
2399 float_raise(status, float_flag_denormal);
2400 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2402 if (bExp == 0) {
2403 if (bSig == 0)
2404 return roundAndPackFloatx80(get_float_rounding_precision(status),
2405 zSign, aExp, aSig, 0, status);
2407 float_raise(status, float_flag_denormal);
2408 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2410 expDiff = aExp - bExp;
2411 zExp = aExp;
2412 if (0 < expDiff) {
2413 shift64ExtraRightJamming(bSig, 0, expDiff, &bSig, &zSig1);
2415 else if (expDiff < 0) {
2416 shift64ExtraRightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
2417 zExp = bExp;
2419 else {
2420 zSig0 = aSig + bSig;
2421 zSig1 = 0;
2422 goto shiftRight1;
2424 zSig0 = aSig + bSig;
2425 if ((Bit64s) zSig0 < 0) goto roundAndPack;
2426 shiftRight1:
2427 shift64ExtraRightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
2428 zSig0 |= BX_CONST64(0x8000000000000000);
2429 zExp++;
2430 roundAndPack:
2431 return
2432 roundAndPackFloatx80(get_float_rounding_precision(status),
2433 zSign, zExp, zSig0, zSig1, status);
2436 /*----------------------------------------------------------------------------
2437 | Returns the result of subtracting the absolute values of the extended
2438 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
2439 | difference is negated before being returned. `zSign' is ignored if the
2440 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2441 | Standard for Binary Floating-Point Arithmetic.
2442 *----------------------------------------------------------------------------*/
2444 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, int zSign, float_status_t &status)
2446 Bit32s aExp, bExp, zExp;
2447 Bit64u aSig, bSig, zSig0, zSig1;
2448 Bit32s expDiff;
2450 // handle unsupported extended double-precision floating encodings
2451 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2453 float_raise(status, float_flag_invalid);
2454 return floatx80_default_nan;
2457 aSig = extractFloatx80Frac(a);
2458 aExp = extractFloatx80Exp(a);
2459 bSig = extractFloatx80Frac(b);
2460 bExp = extractFloatx80Exp(b);
2462 expDiff = aExp - bExp;
2463 if (0 < expDiff) goto aExpBigger;
2464 if (expDiff < 0) goto bExpBigger;
2465 if (aExp == 0x7FFF) {
2466 if ((Bit64u) ((aSig | bSig)<<1))
2467 return propagateFloatx80NaN(a, b, status);
2468 float_raise(status, float_flag_invalid);
2469 return floatx80_default_nan;
2471 if (aExp == 0) {
2472 if (aSig | bSig) float_raise(status, float_flag_denormal);
2473 aExp = 1;
2474 bExp = 1;
2476 zSig1 = 0;
2477 if (bSig < aSig) goto aBigger;
2478 if (aSig < bSig) goto bBigger;
2479 return packFloatx80(get_float_rounding_mode(status) == float_round_down, 0, 0);
2480 bExpBigger:
2481 if (bExp == 0x7FFF) {
2482 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2483 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2484 return packFloatx80(zSign ^ 1, 0x7FFF, BX_CONST64(0x8000000000000000));
2486 if ((bExp == 0) && bSig)
2487 float_raise(status, float_flag_denormal);
2488 if (aExp == 0) {
2489 if (aSig) float_raise(status, float_flag_denormal);
2490 ++expDiff;
2492 shift128RightJamming(aSig, 0, -expDiff, &aSig, &zSig1);
2493 bBigger:
2494 sub128(bSig, 0, aSig, zSig1, &zSig0, &zSig1);
2495 zExp = bExp;
2496 zSign ^= 1;
2497 goto normalizeRoundAndPack;
2498 aExpBigger:
2499 if (aExp == 0x7FFF) {
2500 if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
2501 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2502 return a;
2504 if ((aExp == 0) && aSig)
2505 float_raise(status, float_flag_denormal);
2507 if (bExp == 0) {
2508 if (bSig) float_raise(status, float_flag_denormal);
2509 --expDiff;
2511 shift128RightJamming(bSig, 0, expDiff, &bSig, &zSig1);
2512 aBigger:
2513 sub128(aSig, 0, bSig, zSig1, &zSig0, &zSig1);
2514 zExp = aExp;
2515 normalizeRoundAndPack:
2516 return
2517 normalizeRoundAndPackFloatx80(get_float_rounding_precision(status),
2518 zSign, zExp, zSig0, zSig1, status);
2521 /*----------------------------------------------------------------------------
2522 | Returns the result of adding the extended double-precision floating-point
2523 | values `a' and `b'. The operation is performed according to the IEC/IEEE
2524 | Standard for Binary Floating-Point Arithmetic.
2525 *----------------------------------------------------------------------------*/
2527 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status_t &status)
2529 int aSign = extractFloatx80Sign(a);
2530 int bSign = extractFloatx80Sign(b);
2532 if (aSign == bSign)
2533 return addFloatx80Sigs(a, b, aSign, status);
2534 else
2535 return subFloatx80Sigs(a, b, aSign, status);
2538 /*----------------------------------------------------------------------------
2539 | Returns the result of subtracting the extended double-precision floating-
2540 | point values `a' and `b'. The operation is performed according to the
2541 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2542 *----------------------------------------------------------------------------*/
2544 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status_t &status)
2546 int aSign = extractFloatx80Sign(a);
2547 int bSign = extractFloatx80Sign(b);
2549 if (aSign == bSign)
2550 return subFloatx80Sigs(a, b, aSign, status);
2551 else
2552 return addFloatx80Sigs(a, b, aSign, status);
2555 /*----------------------------------------------------------------------------
2556 | Returns the result of multiplying the extended double-precision floating-
2557 | point values `a' and `b'. The operation is performed according to the
2558 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2559 *----------------------------------------------------------------------------*/
2561 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status_t &status)
2563 int aSign, bSign, zSign;
2564 Bit32s aExp, bExp, zExp;
2565 Bit64u aSig, bSig, zSig0, zSig1;
2567 // handle unsupported extended double-precision floating encodings
2568 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2570 invalid:
2571 float_raise(status, float_flag_invalid);
2572 return floatx80_default_nan;
2575 aSig = extractFloatx80Frac(a);
2576 aExp = extractFloatx80Exp(a);
2577 aSign = extractFloatx80Sign(a);
2578 bSig = extractFloatx80Frac(b);
2579 bExp = extractFloatx80Exp(b);
2580 bSign = extractFloatx80Sign(b);
2581 zSign = aSign ^ bSign;
2583 if (aExp == 0x7FFF) {
2584 if ((Bit64u) (aSig<<1)
2585 || ((bExp == 0x7FFF) && (Bit64u) (bSig<<1)))
2587 return propagateFloatx80NaN(a, b, status);
2589 if (bExp == 0) {
2590 if (bSig == 0) goto invalid;
2591 float_raise(status, float_flag_denormal);
2593 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2595 if (bExp == 0x7FFF) {
2596 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2597 if (aExp == 0) {
2598 if (aSig == 0) goto invalid;
2599 float_raise(status, float_flag_denormal);
2601 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2603 if (aExp == 0) {
2604 if (aSig == 0) {
2605 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2606 return packFloatx80(zSign, 0, 0);
2608 float_raise(status, float_flag_denormal);
2609 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2611 if (bExp == 0) {
2612 if (bSig == 0) return packFloatx80(zSign, 0, 0);
2613 float_raise(status, float_flag_denormal);
2614 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2616 zExp = aExp + bExp - 0x3FFE;
2617 mul64To128(aSig, bSig, &zSig0, &zSig1);
2618 if (0 < (Bit64s) zSig0) {
2619 shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
2620 --zExp;
2622 return
2623 roundAndPackFloatx80(get_float_rounding_precision(status),
2624 zSign, zExp, zSig0, zSig1, status);
2627 /*----------------------------------------------------------------------------
2628 | Returns the result of dividing the extended double-precision floating-point
2629 | value `a' by the corresponding value `b'. The operation is performed
2630 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2631 *----------------------------------------------------------------------------*/
2633 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status_t &status)
2635 int aSign, bSign, zSign;
2636 Bit32s aExp, bExp, zExp;
2637 Bit64u aSig, bSig, zSig0, zSig1;
2638 Bit64u rem0, rem1, rem2, term0, term1, term2;
2640 // handle unsupported extended double-precision floating encodings
2641 if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
2643 float_raise(status, float_flag_invalid);
2644 return floatx80_default_nan;
2647 aSig = extractFloatx80Frac(a);
2648 aExp = extractFloatx80Exp(a);
2649 aSign = extractFloatx80Sign(a);
2650 bSig = extractFloatx80Frac(b);
2651 bExp = extractFloatx80Exp(b);
2652 bSign = extractFloatx80Sign(b);
2654 zSign = aSign ^ bSign;
2655 if (aExp == 0x7FFF) {
2656 if ((Bit64u) (aSig<<1)) return propagateFloatx80NaN(a, b, status);
2657 if (bExp == 0x7FFF) {
2658 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2659 goto invalid;
2661 if (bSig && (bExp == 0)) float_raise(status, float_flag_denormal);
2662 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2664 if (bExp == 0x7FFF) {
2665 if ((Bit64u) (bSig<<1)) return propagateFloatx80NaN(a, b, status);
2666 if (aSig && (aExp == 0)) float_raise(status, float_flag_denormal);
2667 return packFloatx80(zSign, 0, 0);
2669 if (bExp == 0) {
2670 if (bSig == 0) {
2671 if ((aExp | aSig) == 0) {
2672 invalid:
2673 float_raise(status, float_flag_invalid);
2674 return floatx80_default_nan;
2676 float_raise(status, float_flag_divbyzero);
2677 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2679 float_raise(status, float_flag_denormal);
2680 normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
2682 if (aExp == 0) {
2683 if (aSig == 0) return packFloatx80(zSign, 0, 0);
2684 float_raise(status, float_flag_denormal);
2685 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2687 zExp = aExp - bExp + 0x3FFE;
2688 rem1 = 0;
2689 if (bSig <= aSig) {
2690 shift128Right(aSig, 0, 1, &aSig, &rem1);
2691 ++zExp;
2693 zSig0 = estimateDiv128To64(aSig, rem1, bSig);
2694 mul64To128(bSig, zSig0, &term0, &term1);
2695 sub128(aSig, rem1, term0, term1, &rem0, &rem1);
2696 while ((Bit64s) rem0 < 0) {
2697 --zSig0;
2698 add128(rem0, rem1, 0, bSig, &rem0, &rem1);
2700 zSig1 = estimateDiv128To64(rem1, 0, bSig);
2701 if ((Bit64u) (zSig1<<1) <= 8) {
2702 mul64To128(bSig, zSig1, &term1, &term2);
2703 sub128(rem1, 0, term1, term2, &rem1, &rem2);
2704 while ((Bit64s) rem1 < 0) {
2705 --zSig1;
2706 add128(rem1, rem2, 0, bSig, &rem1, &rem2);
2708 zSig1 |= ((rem1 | rem2) != 0);
2710 return
2711 roundAndPackFloatx80(get_float_rounding_precision(status),
2712 zSign, zExp, zSig0, zSig1, status);
2715 /*----------------------------------------------------------------------------
2716 | Returns the square root of the extended double-precision floating-point
2717 | value `a'. The operation is performed according to the IEC/IEEE Standard
2718 | for Binary Floating-Point Arithmetic.
2719 *----------------------------------------------------------------------------*/
2721 floatx80 floatx80_sqrt(floatx80 a, float_status_t &status)
2723 int aSign;
2724 Bit32s aExp, zExp;
2725 Bit64u aSig0, aSig1, zSig0, zSig1, doubleZSig0;
2726 Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2728 // handle unsupported extended double-precision floating encodings
2729 if (floatx80_is_unsupported(a))
2731 float_raise(status, float_flag_invalid);
2732 return floatx80_default_nan;
2735 aSig0 = extractFloatx80Frac(a);
2736 aExp = extractFloatx80Exp(a);
2737 aSign = extractFloatx80Sign(a);
2738 if (aExp == 0x7FFF) {
2739 if ((Bit64u) (aSig0<<1)) return propagateFloatx80NaN(a, status);
2740 if (! aSign) return a;
2741 goto invalid;
2743 if (aSign) {
2744 if ((aExp | aSig0) == 0) return a;
2745 invalid:
2746 float_raise(status, float_flag_invalid);
2747 return floatx80_default_nan;
2749 if (aExp == 0) {
2750 if (aSig0 == 0) return packFloatx80(0, 0, 0);
2751 float_raise(status, float_flag_denormal);
2752 normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
2754 zExp = ((aExp - 0x3FFF)>>1) + 0x3FFF;
2755 zSig0 = estimateSqrt32(aExp, aSig0>>32);
2756 shift128Right(aSig0, 0, 2 + (aExp & 1), &aSig0, &aSig1);
2757 zSig0 = estimateDiv128To64(aSig0, aSig1, zSig0<<32) + (zSig0<<30);
2758 doubleZSig0 = zSig0<<1;
2759 mul64To128(zSig0, zSig0, &term0, &term1);
2760 sub128(aSig0, aSig1, term0, term1, &rem0, &rem1);
2761 while ((Bit64s) rem0 < 0) {
2762 --zSig0;
2763 doubleZSig0 -= 2;
2764 add128(rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1);
2766 zSig1 = estimateDiv128To64(rem1, 0, doubleZSig0);
2767 if ((zSig1 & BX_CONST64(0x3FFFFFFFFFFFFFFF)) <= 5) {
2768 if (zSig1 == 0) zSig1 = 1;
2769 mul64To128(doubleZSig0, zSig1, &term1, &term2);
2770 sub128(rem1, 0, term1, term2, &rem1, &rem2);
2771 mul64To128(zSig1, zSig1, &term2, &term3);
2772 sub192(rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3);
2773 while ((Bit64s) rem1 < 0) {
2774 --zSig1;
2775 shortShift128Left(0, zSig1, 1, &term2, &term3);
2776 term3 |= 1;
2777 term2 |= doubleZSig0;
2778 add192(rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3);
2780 zSig1 |= ((rem1 | rem2 | rem3) != 0);
2782 shortShift128Left(0, zSig1, 1, &zSig0, &zSig1);
2783 zSig0 |= doubleZSig0;
2784 return
2785 roundAndPackFloatx80(get_float_rounding_precision(status),
2786 0, zExp, zSig0, zSig1, status);
2789 #endif
2791 #ifdef FLOAT128
2793 /*----------------------------------------------------------------------------
2794 | Returns the result of converting the extended double-precision floating-
2795 | point value `a' to the quadruple-precision floating-point format. The
2796 | conversion is performed according to the IEC/IEEE Standard for Binary
2797 | Floating-Point Arithmetic.
2798 *----------------------------------------------------------------------------*/
2800 float128 floatx80_to_float128(floatx80 a, float_status_t &status)
2802 Bit64u zSig0, zSig1;
2804 Bit64u aSig = extractFloatx80Frac(a);
2805 Bit32s aExp = extractFloatx80Exp(a);
2806 int aSign = extractFloatx80Sign(a);
2808 if ((aExp == 0x7FFF) && (Bit64u) (aSig<<1))
2809 return commonNaNToFloat128(floatx80ToCommonNaN(a, status));
2811 shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1);
2812 return packFloat128(aSign, aExp, zSig0, zSig1);
2815 /*----------------------------------------------------------------------------
2816 | Returns the result of converting the quadruple-precision floating-point
2817 | value `a' to the extended double-precision floating-point format. The
2818 | conversion is performed according to the IEC/IEEE Standard for Binary
2819 | Floating-Point Arithmetic.
2820 *----------------------------------------------------------------------------*/
2822 floatx80 float128_to_floatx80(float128 a, float_status_t &status)
2824 Bit32s aExp;
2825 Bit64u aSig0, aSig1;
2827 aSig1 = extractFloat128Frac1(a);
2828 aSig0 = extractFloat128Frac0(a);
2829 aExp = extractFloat128Exp(a);
2830 int aSign = extractFloat128Sign(a);
2832 if (aExp == 0x7FFF) {
2833 if (aSig0 | aSig1)
2834 return commonNaNToFloatx80(float128ToCommonNaN(a, status));
2836 return packFloatx80(aSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2839 if (aExp == 0) {
2840 if ((aSig0 | aSig1) == 0) return packFloatx80(aSign, 0, 0);
2841 float_raise(status, float_flag_denormal);
2842 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
2844 else aSig0 |= BX_CONST64(0x0001000000000000);
2846 shortShift128Left(aSig0, aSig1, 15, &aSig0, &aSig1);
2847 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
2850 /*----------------------------------------------------------------------------
2851 | Returns the result of multiplying the extended double-precision floating-
2852 | point value `a' and quadruple-precision floating point value `b'. The
2853 | operation is performed according to the IEC/IEEE Standard for Binary
2854 | Floating-Point Arithmetic.
2855 *----------------------------------------------------------------------------*/
2857 floatx80 floatx80_mul(floatx80 a, float128 b, float_status_t &status)
2859 Bit32s aExp, bExp, zExp;
2860 Bit64u aSig, bSig0, bSig1, zSig0, zSig1, zSig2;
2861 int aSign, bSign, zSign;
2863 // handle unsupported extended double-precision floating encodings
2864 if (floatx80_is_unsupported(a))
2866 invalid:
2867 float_raise(status, float_flag_invalid);
2868 return floatx80_default_nan;
2871 aSig = extractFloatx80Frac(a);
2872 aExp = extractFloatx80Exp(a);
2873 aSign = extractFloatx80Sign(a);
2874 bSig0 = extractFloat128Frac0(b);
2875 bSig1 = extractFloat128Frac1(b);
2876 bExp = extractFloat128Exp(b);
2877 bSign = extractFloat128Sign(b);
2879 zSign = aSign ^ bSign;
2881 if (aExp == 0x7FFF) {
2882 if ((Bit64u) (aSig<<1)
2883 || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
2885 floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
2886 return propagateFloatx80NaN(a, r, status);
2888 if (bExp == 0) {
2889 if ((bSig0 | bSig1) == 0) goto invalid;
2890 float_raise(status, float_flag_denormal);
2892 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2894 if (bExp == 0x7FFF) {
2895 if (bSig0 | bSig1) {
2896 floatx80 r = commonNaNToFloatx80(float128ToCommonNaN(b, status));
2897 return propagateFloatx80NaN(a, r, status);
2899 if (aExp == 0) {
2900 if (aSig == 0) goto invalid;
2901 float_raise(status, float_flag_denormal);
2903 return packFloatx80(zSign, 0x7FFF, BX_CONST64(0x8000000000000000));
2905 if (aExp == 0) {
2906 if (aSig == 0) {
2907 if ((bExp == 0) && (bSig0 | bSig1)) float_raise(status, float_flag_denormal);
2908 return packFloatx80(zSign, 0, 0);
2910 float_raise(status, float_flag_denormal);
2911 normalizeFloatx80Subnormal(aSig, &aExp, &aSig);
2913 if (bExp == 0) {
2914 if ((bSig0 | bSig1) == 0) return packFloatx80(zSign, 0, 0);
2915 float_raise(status, float_flag_denormal);
2916 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
2918 else bSig0 |= BX_CONST64(0x0001000000000000);
2920 zExp = aExp + bExp - 0x3FFE;
2921 shortShift128Left(bSig0, bSig1, 15, &bSig0, &bSig1);
2922 mul128By64To192(bSig0, bSig1, aSig, &zSig0, &zSig1, &zSig2);
2923 if (0 < (Bit64s) zSig0) {
2924 shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1);
2925 --zExp;
2927 return
2928 roundAndPackFloatx80(get_float_rounding_precision(status),
2929 zSign, zExp, zSig0, zSig1, status);
2932 /*----------------------------------------------------------------------------
2933 | Returns the result of adding the absolute values of the quadruple-precision
2934 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2935 | before being returned. `zSign' is ignored if the result is a NaN.
2936 | The addition is performed according to the IEC/IEEE Standard for Binary
2937 | Floating-Point Arithmetic.
2938 *----------------------------------------------------------------------------*/
2940 static float128 addFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
2942 Bit32s aExp, bExp, zExp;
2943 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
2944 Bit32s expDiff;
2946 aSig1 = extractFloat128Frac1(a);
2947 aSig0 = extractFloat128Frac0(a);
2948 aExp = extractFloat128Exp(a);
2949 bSig1 = extractFloat128Frac1(b);
2950 bSig0 = extractFloat128Frac0(b);
2951 bExp = extractFloat128Exp(b);
2952 expDiff = aExp - bExp;
2954 if (0 < expDiff) {
2955 if (aExp == 0x7FFF) {
2956 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
2957 return a;
2959 if (bExp == 0) --expDiff;
2960 else bSig0 |= BX_CONST64(0x0001000000000000);
2961 shift128ExtraRightJamming(bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2);
2962 zExp = aExp;
2964 else if (expDiff < 0) {
2965 if (bExp == 0x7FFF) {
2966 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
2967 return packFloat128(zSign, 0x7FFF, 0, 0);
2969 if (aExp == 0) ++expDiff;
2970 else aSig0 |= BX_CONST64(0x0001000000000000);
2971 shift128ExtraRightJamming(aSig0, aSig1, 0, -expDiff, &aSig0, &aSig1, &zSig2);
2972 zExp = bExp;
2974 else {
2975 if (aExp == 0x7FFF) {
2976 if (aSig0 | aSig1 | bSig0 | bSig1)
2977 return propagateFloat128NaN(a, b, status);
2979 return a;
2981 add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
2982 if (aExp == 0) return packFloat128(zSign, 0, zSig0, zSig1);
2983 zSig2 = 0;
2984 zSig0 |= BX_CONST64(0x0002000000000000);
2985 zExp = aExp;
2986 goto shiftRight1;
2988 aSig0 |= BX_CONST64(0x0001000000000000);
2989 add128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
2990 --zExp;
2991 if (zSig0 < BX_CONST64(0x0002000000000000)) goto roundAndPack;
2992 ++zExp;
2993 shiftRight1:
2994 shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
2995 roundAndPack:
2996 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
2999 /*----------------------------------------------------------------------------
3000 | Returns the result of subtracting the absolute values of the quadruple-
3001 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3002 | difference is negated before being returned. `zSign' is ignored if the
3003 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3004 | Standard for Binary Floating-Point Arithmetic.
3005 *----------------------------------------------------------------------------*/
3007 static float128 subFloat128Sigs(float128 a, float128 b, int zSign, float_status_t &status)
3009 Bit32s aExp, bExp, zExp;
3010 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
3011 Bit32s expDiff;
3013 aSig1 = extractFloat128Frac1(a);
3014 aSig0 = extractFloat128Frac0(a);
3015 aExp = extractFloat128Exp(a);
3016 bSig1 = extractFloat128Frac1(b);
3017 bSig0 = extractFloat128Frac0(b);
3018 bExp = extractFloat128Exp(b);
3020 expDiff = aExp - bExp;
3021 shortShift128Left(aSig0, aSig1, 14, &aSig0, &aSig1);
3022 shortShift128Left(bSig0, bSig1, 14, &bSig0, &bSig1);
3023 if (0 < expDiff) goto aExpBigger;
3024 if (expDiff < 0) goto bExpBigger;
3025 if (aExp == 0x7FFF) {
3026 if (aSig0 | aSig1 | bSig0 | bSig1)
3027 return propagateFloat128NaN(a, b, status);
3029 float_raise(status, float_flag_invalid);
3030 return float128_default_nan;
3032 if (aExp == 0) {
3033 aExp = 1;
3034 bExp = 1;
3036 if (bSig0 < aSig0) goto aBigger;
3037 if (aSig0 < bSig0) goto bBigger;
3038 if (bSig1 < aSig1) goto aBigger;
3039 if (aSig1 < bSig1) goto bBigger;
3040 return packFloat128(0, 0);
3042 bExpBigger:
3043 if (bExp == 0x7FFF) {
3044 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3045 return packFloat128(zSign ^ 1, 0x7FFF, 0, 0);
3047 if (aExp == 0) ++expDiff;
3048 else {
3049 aSig0 |= BX_CONST64(0x4000000000000000);
3051 shift128RightJamming(aSig0, aSig1, - expDiff, &aSig0, &aSig1);
3052 bSig0 |= BX_CONST64(0x4000000000000000);
3053 bBigger:
3054 sub128(bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1);
3055 zExp = bExp;
3056 zSign ^= 1;
3057 goto normalizeRoundAndPack;
3058 aExpBigger:
3059 if (aExp == 0x7FFF) {
3060 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3061 return a;
3063 if (bExp == 0) --expDiff;
3064 else {
3065 bSig0 |= BX_CONST64(0x4000000000000000);
3067 shift128RightJamming(bSig0, bSig1, expDiff, &bSig0, &bSig1);
3068 aSig0 |= BX_CONST64(0x4000000000000000);
3069 aBigger:
3070 sub128(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1);
3071 zExp = aExp;
3072 normalizeRoundAndPack:
3073 --zExp;
3074 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1, status);
3077 /*----------------------------------------------------------------------------
3078 | Returns the result of adding the quadruple-precision floating-point values
3079 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3080 | for Binary Floating-Point Arithmetic.
3081 *----------------------------------------------------------------------------*/
3083 float128 float128_add(float128 a, float128 b, float_status_t &status)
3085 int aSign = extractFloat128Sign(a);
3086 int bSign = extractFloat128Sign(b);
3088 if (aSign == bSign) {
3089 return addFloat128Sigs(a, b, aSign, status);
3091 else {
3092 return subFloat128Sigs(a, b, aSign, status);
3096 /*----------------------------------------------------------------------------
3097 | Returns the result of subtracting the quadruple-precision floating-point
3098 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3099 | Standard for Binary Floating-Point Arithmetic.
3100 *----------------------------------------------------------------------------*/
3102 float128 float128_sub(float128 a, float128 b, float_status_t &status)
3104 int aSign = extractFloat128Sign(a);
3105 int bSign = extractFloat128Sign(b);
3107 if (aSign == bSign) {
3108 return subFloat128Sigs(a, b, aSign, status);
3110 else {
3111 return addFloat128Sigs(a, b, aSign, status);
3115 /*----------------------------------------------------------------------------
3116 | Returns the result of multiplying the quadruple-precision floating-point
3117 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3118 | Standard for Binary Floating-Point Arithmetic.
3119 *----------------------------------------------------------------------------*/
3121 float128 float128_mul(float128 a, float128 b, float_status_t &status)
3123 int aSign, bSign, zSign;
3124 Bit32s aExp, bExp, zExp;
3125 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
3127 aSig1 = extractFloat128Frac1(a);
3128 aSig0 = extractFloat128Frac0(a);
3129 aExp = extractFloat128Exp(a);
3130 aSign = extractFloat128Sign(a);
3131 bSig1 = extractFloat128Frac1(b);
3132 bSig0 = extractFloat128Frac0(b);
3133 bExp = extractFloat128Exp(b);
3134 bSign = extractFloat128Sign(b);
3136 zSign = aSign ^ bSign;
3137 if (aExp == 0x7FFF) {
3138 if ((aSig0 | aSig1) || ((bExp == 0x7FFF) && (bSig0 | bSig1)))
3140 return propagateFloat128NaN(a, b, status);
3142 if ((bExp | bSig0 | bSig1) == 0) goto invalid;
3143 return packFloat128(zSign, 0x7FFF, 0, 0);
3145 if (bExp == 0x7FFF) {
3146 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3147 if ((aExp | aSig0 | aSig1) == 0) {
3148 invalid:
3149 float_raise(status, float_flag_invalid);
3150 return float128_default_nan;
3152 return packFloat128(zSign, 0x7FFF, 0, 0);
3154 if (aExp == 0) {
3155 if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3156 float_raise(status, float_flag_denormal);
3157 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3159 if (bExp == 0) {
3160 if ((bSig0 | bSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3161 float_raise(status, float_flag_denormal);
3162 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3164 zExp = aExp + bExp - 0x4000;
3165 aSig0 |= BX_CONST64(0x0001000000000000);
3166 shortShift128Left(bSig0, bSig1, 16, &bSig0, &bSig1);
3167 mul128To256(aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3);
3168 add128(zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1);
3169 zSig2 |= (zSig3 != 0);
3170 if (BX_CONST64(0x0002000000000000) <= zSig0) {
3171 shift128ExtraRightJamming(zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2);
3172 ++zExp;
3174 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3177 /*----------------------------------------------------------------------------
3178 | Returns the result of dividing the quadruple-precision floating-point value
3179 | `a' by the corresponding value `b'. The operation is performed according to
3180 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3181 *----------------------------------------------------------------------------*/
3183 float128 float128_div(float128 a, float128 b, float_status_t &status)
3185 int aSign, bSign, zSign;
3186 Bit32s aExp, bExp, zExp;
3187 Bit64u aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
3188 Bit64u rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3190 aSig1 = extractFloat128Frac1(a);
3191 aSig0 = extractFloat128Frac0(a);
3192 aExp = extractFloat128Exp(a);
3193 aSign = extractFloat128Sign(a);
3194 bSig1 = extractFloat128Frac1(b);
3195 bSig0 = extractFloat128Frac0(b);
3196 bExp = extractFloat128Exp(b);
3197 bSign = extractFloat128Sign(b);
3199 zSign = aSign ^ bSign;
3200 if (aExp == 0x7FFF) {
3201 if (aSig0 | aSig1) return propagateFloat128NaN(a, b, status);
3202 if (bExp == 0x7FFF) {
3203 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3204 goto invalid;
3206 return packFloat128(zSign, 0x7FFF, 0, 0);
3208 if (bExp == 0x7FFF) {
3209 if (bSig0 | bSig1) return propagateFloat128NaN(a, b, status);
3210 return packFloat128(zSign, 0, 0, 0);
3212 if (bExp == 0) {
3213 if ((bSig0 | bSig1) == 0) {
3214 if ((aExp | aSig0 | aSig1) == 0) {
3215 invalid:
3216 float_raise(status, float_flag_invalid);
3217 return float128_default_nan;
3219 float_raise(status, float_flag_divbyzero);
3220 return packFloat128(zSign, 0x7FFF, 0, 0);
3222 float_raise(status, float_flag_denormal);
3223 normalizeFloat128Subnormal(bSig0, bSig1, &bExp, &bSig0, &bSig1);
3225 if (aExp == 0) {
3226 if ((aSig0 | aSig1) == 0) return packFloat128(zSign, 0, 0, 0);
3227 float_raise(status, float_flag_denormal);
3228 normalizeFloat128Subnormal(aSig0, aSig1, &aExp, &aSig0, &aSig1);
3230 zExp = aExp - bExp + 0x3FFD;
3231 shortShift128Left(
3232 aSig0 | BX_CONST64(0x0001000000000000), aSig1, 15, &aSig0, &aSig1);
3233 shortShift128Left(
3234 bSig0 | BX_CONST64(0x0001000000000000), bSig1, 15, &bSig0, &bSig1);
3235 if (le128(bSig0, bSig1, aSig0, aSig1)) {
3236 shift128Right(aSig0, aSig1, 1, &aSig0, &aSig1);
3237 ++zExp;
3239 zSig0 = estimateDiv128To64(aSig0, aSig1, bSig0);
3240 mul128By64To192(bSig0, bSig1, zSig0, &term0, &term1, &term2);
3241 sub192(aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2);
3242 while ((Bit64s) rem0 < 0) {
3243 --zSig0;
3244 add192(rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2);
3246 zSig1 = estimateDiv128To64(rem1, rem2, bSig0);
3247 if ((zSig1 & 0x3FFF) <= 4) {
3248 mul128By64To192(bSig0, bSig1, zSig1, &term1, &term2, &term3);
3249 sub192(rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3);
3250 while ((Bit64s) rem1 < 0) {
3251 --zSig1;
3252 add192(rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3);
3254 zSig1 |= ((rem1 | rem2 | rem3) != 0);
3256 shift128ExtraRightJamming(zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2);
3257 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3260 #endif