target/i386/tcg/fpu_helper.c

   1 /*
   2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include <math.h>
  22 #include "cpu.h"
  23 #include "tcg-cpu.h"
  24 #include "exec/helper-proto.h"
  25 #include "fpu/softfloat.h"
  26 #include "fpu/softfloat-macros.h"
  27 #include "helper-tcg.h"
  28
/*
 * float macros
 *
 * Shorthand for the x87 operands used by the helpers in this file.  Each
 * macro expands to an expression on a variable named 'env', which must be
 * in scope at the point of use.
 *
 * FT0   - temporary operand (env->ft0), filled by the *_FT0 load helpers
 *         and consumed by the ST0/FT0 arithmetic and compare helpers.
 * ST0   - the register at the current stack top (index env->fpstt).
 * ST(n) - the n-th register counted from the stack top, wrapping modulo 8.
 * ST1   - ST(1).
 */
#define FT0    (env->ft0)
#define ST0    (env->fpregs[env->fpstt].d)
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)
  34
/*
 * Rounding-control (RC) field of the FPU control word (env->fpuc): two
 * bits starting at bit FPU_RC_SHIFT.  The FPU_RC_* values are the field
 * already shifted into position, so they compare directly against
 * (env->fpuc & FPU_RC_MASK).  In field order 0..3 they select round to
 * nearest-even, round down, round up and round toward zero (see
 * set_x86_rounding_mode()).
 */
#define FPU_RC_SHIFT        10
#define FPU_RC_MASK         (3 << FPU_RC_SHIFT)
#define FPU_RC_NEAR         0x000
#define FPU_RC_DOWN         0x400
#define FPU_RC_UP           0x800
#define FPU_RC_CHOP         0xc00

/*
 * 2^63 as a double.
 * NOTE(review): no user is visible in this part of the file; presumably
 * the operand-magnitude limit for the trigonometric helpers - confirm.
 */
#define MAXTAN 9223372036854775808.0
  43
  44 /* the following deal with x86 long double-precision numbers */
  45 #define MAXEXPD 0x7fff
  46 #define EXPBIAS 16383
  47 #define EXPD(fp)        (fp.l.upper & 0x7fff)
  48 #define SIGND(fp)       ((fp.l.upper) & 0x8000)
  49 #define MANTD(fp)       (fp.l.lower)
  50 #define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
  51
/*
 * Bits of the FPU status word (env->fpus).
 *
 * FPUS_IE, FPUS_DE, FPUS_ZE, FPUS_OE, FPUS_UE and FPUS_PE are the
 * exception flags that merge_exception_flags() derives from the softfloat
 * invalid, input-denormal, divide-by-zero, overflow, underflow and inexact
 * flags respectively.  FPUS_SE and FPUS_B are set by fpu_set_exception()
 * when a pending exception is not masked in the control word.  FPUS_SF is
 * not referenced in the part of the file shown here.
 */
#define FPUS_IE (1 << 0)
#define FPUS_DE (1 << 1)
#define FPUS_ZE (1 << 2)
#define FPUS_OE (1 << 3)
#define FPUS_UE (1 << 4)
#define FPUS_PE (1 << 5)
#define FPUS_SF (1 << 6)
#define FPUS_SE (1 << 7)
#define FPUS_B  (1 << 15)

/*
 * Low six bits of the control word (env->fpuc): the exception mask bits
 * that fpu_set_exception() tests against the status-word flags.
 */
#define FPUC_EM 0x3f
  63
/*
 * floatx80 constants pushed by the helper_fld*_ST0 constant loaders.
 *
 * The unsuffixed value is the default.  The "_d" variant (one unit lower
 * in the last significand bit) is selected when the rounding control is
 * FPU_RC_DOWN or FPU_RC_CHOP, and the "_u" variant (one unit higher) when
 * it is FPU_RC_UP.  For ln2 and pi only the "_d" variant is defined here;
 * the defaults (floatx80_ln2, floatx80_pi) are defined elsewhere.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
  72
  73 static inline void fpush(CPUX86State *env)
  74 {
  75     env->fpstt = (env->fpstt - 1) & 7;
  76     env->fptags[env->fpstt] = 0; /* validate stack entry */
  77 }
  78
  79 static inline void fpop(CPUX86State *env)
  80 {
  81     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
  82     env->fpstt = (env->fpstt + 1) & 7;
  83 }
  84
  85 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
  86 {
  87     CPU_LDoubleU temp;
  88
  89     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
  90     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
  91     return temp.d;
  92 }
  93
  94 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
  95                     uintptr_t retaddr)
  96 {
  97     CPU_LDoubleU temp;
  98
  99     temp.d = f;
 100     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
 101     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
 102 }
 103
 104 /* x87 FPU helpers */
 105
 106 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
 107 {
 108     union {
 109         float64 f64;
 110         double d;
 111     } u;
 112
 113     u.f64 = floatx80_to_float64(a, &env->fp_status);
 114     return u.d;
 115 }
 116
 117 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
 118 {
 119     union {
 120         float64 f64;
 121         double d;
 122     } u;
 123
 124     u.d = a;
 125     return float64_to_floatx80(u.f64, &env->fp_status);
 126 }
 127
 128 static void fpu_set_exception(CPUX86State *env, int mask)
 129 {
 130     env->fpus |= mask;
 131     if (env->fpus & (~env->fpuc & FPUC_EM)) {
 132         env->fpus |= FPUS_SE | FPUS_B;
 133     }
 134 }
 135
 136 static inline uint8_t save_exception_flags(CPUX86State *env)
 137 {
 138     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
 139     set_float_exception_flags(0, &env->fp_status);
 140     return old_flags;
 141 }
 142
 143 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
 144 {
 145     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
 146     float_raise(old_flags, &env->fp_status);
 147     fpu_set_exception(env,
 148                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
 149                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
 150                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
 151                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
 152                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
 153                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
 154 }
 155
 156 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 157 {
 158     uint8_t old_flags = save_exception_flags(env);
 159     floatx80 ret = floatx80_div(a, b, &env->fp_status);
 160     merge_exception_flags(env, old_flags);
 161     return ret;
 162 }
 163
 164 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
 165 {
 166     if (env->cr[0] & CR0_NE_MASK) {
 167         raise_exception_ra(env, EXCP10_COPR, retaddr);
 168     }
 169 #if !defined(CONFIG_USER_ONLY)
 170     else {
 171         fpu_check_raise_ferr_irq(env);
 172     }
 173 #endif
 174 }
 175
 176 void helper_flds_FT0(CPUX86State *env, uint32_t val)
 177 {
 178     uint8_t old_flags = save_exception_flags(env);
 179     union {
 180         float32 f;
 181         uint32_t i;
 182     } u;
 183
 184     u.i = val;
 185     FT0 = float32_to_floatx80(u.f, &env->fp_status);
 186     merge_exception_flags(env, old_flags);
 187 }
 188
 189 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
 190 {
 191     uint8_t old_flags = save_exception_flags(env);
 192     union {
 193         float64 f;
 194         uint64_t i;
 195     } u;
 196
 197     u.i = val;
 198     FT0 = float64_to_floatx80(u.f, &env->fp_status);
 199     merge_exception_flags(env, old_flags);
 200 }
 201
 202 void helper_fildl_FT0(CPUX86State *env, int32_t val)
 203 {
 204     FT0 = int32_to_floatx80(val, &env->fp_status);
 205 }
 206
 207 void helper_flds_ST0(CPUX86State *env, uint32_t val)
 208 {
 209     uint8_t old_flags = save_exception_flags(env);
 210     int new_fpstt;
 211     union {
 212         float32 f;
 213         uint32_t i;
 214     } u;
 215
 216     new_fpstt = (env->fpstt - 1) & 7;
 217     u.i = val;
 218     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
 219     env->fpstt = new_fpstt;
 220     env->fptags[new_fpstt] = 0; /* validate stack entry */
 221     merge_exception_flags(env, old_flags);
 222 }
 223
 224 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
 225 {
 226     uint8_t old_flags = save_exception_flags(env);
 227     int new_fpstt;
 228     union {
 229         float64 f;
 230         uint64_t i;
 231     } u;
 232
 233     new_fpstt = (env->fpstt - 1) & 7;
 234     u.i = val;
 235     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
 236     env->fpstt = new_fpstt;
 237     env->fptags[new_fpstt] = 0; /* validate stack entry */
 238     merge_exception_flags(env, old_flags);
 239 }
 240
 241 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
 242 {
 243     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
 244     set_floatx80_rounding_precision(floatx80_precision_x, st);
 245     return old;
 246 }
 247
 248 void helper_fildl_ST0(CPUX86State *env, int32_t val)
 249 {
 250     int new_fpstt;
 251     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
 252
 253     new_fpstt = (env->fpstt - 1) & 7;
 254     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
 255     env->fpstt = new_fpstt;
 256     env->fptags[new_fpstt] = 0; /* validate stack entry */
 257
 258     set_floatx80_rounding_precision(old, &env->fp_status);
 259 }
 260
 261 void helper_fildll_ST0(CPUX86State *env, int64_t val)
 262 {
 263     int new_fpstt;
 264     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
 265
 266     new_fpstt = (env->fpstt - 1) & 7;
 267     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
 268     env->fpstt = new_fpstt;
 269     env->fptags[new_fpstt] = 0; /* validate stack entry */
 270
 271     set_floatx80_rounding_precision(old, &env->fp_status);
 272 }
 273
 274 uint32_t helper_fsts_ST0(CPUX86State *env)
 275 {
 276     uint8_t old_flags = save_exception_flags(env);
 277     union {
 278         float32 f;
 279         uint32_t i;
 280     } u;
 281
 282     u.f = floatx80_to_float32(ST0, &env->fp_status);
 283     merge_exception_flags(env, old_flags);
 284     return u.i;
 285 }
 286
 287 uint64_t helper_fstl_ST0(CPUX86State *env)
 288 {
 289     uint8_t old_flags = save_exception_flags(env);
 290     union {
 291         float64 f;
 292         uint64_t i;
 293     } u;
 294
 295     u.f = floatx80_to_float64(ST0, &env->fp_status);
 296     merge_exception_flags(env, old_flags);
 297     return u.i;
 298 }
 299
 300 int32_t helper_fist_ST0(CPUX86State *env)
 301 {
 302     uint8_t old_flags = save_exception_flags(env);
 303     int32_t val;
 304
 305     val = floatx80_to_int32(ST0, &env->fp_status);
 306     if (val != (int16_t)val) {
 307         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 308         val = -32768;
 309     }
 310     merge_exception_flags(env, old_flags);
 311     return val;
 312 }
 313
 314 int32_t helper_fistl_ST0(CPUX86State *env)
 315 {
 316     uint8_t old_flags = save_exception_flags(env);
 317     int32_t val;
 318
 319     val = floatx80_to_int32(ST0, &env->fp_status);
 320     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 321         val = 0x80000000;
 322     }
 323     merge_exception_flags(env, old_flags);
 324     return val;
 325 }
 326
 327 int64_t helper_fistll_ST0(CPUX86State *env)
 328 {
 329     uint8_t old_flags = save_exception_flags(env);
 330     int64_t val;
 331
 332     val = floatx80_to_int64(ST0, &env->fp_status);
 333     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 334         val = 0x8000000000000000ULL;
 335     }
 336     merge_exception_flags(env, old_flags);
 337     return val;
 338 }
 339
 340 int32_t helper_fistt_ST0(CPUX86State *env)
 341 {
 342     uint8_t old_flags = save_exception_flags(env);
 343     int32_t val;
 344
 345     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 346     if (val != (int16_t)val) {
 347         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 348         val = -32768;
 349     }
 350     merge_exception_flags(env, old_flags);
 351     return val;
 352 }
 353
 354 int32_t helper_fisttl_ST0(CPUX86State *env)
 355 {
 356     uint8_t old_flags = save_exception_flags(env);
 357     int32_t val;
 358
 359     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 360     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 361         val = 0x80000000;
 362     }
 363     merge_exception_flags(env, old_flags);
 364     return val;
 365 }
 366
 367 int64_t helper_fisttll_ST0(CPUX86State *env)
 368 {
 369     uint8_t old_flags = save_exception_flags(env);
 370     int64_t val;
 371
 372     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
 373     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 374         val = 0x8000000000000000ULL;
 375     }
 376     merge_exception_flags(env, old_flags);
 377     return val;
 378 }
 379
 380 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
 381 {
 382     int new_fpstt;
 383
 384     new_fpstt = (env->fpstt - 1) & 7;
 385     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
 386     env->fpstt = new_fpstt;
 387     env->fptags[new_fpstt] = 0; /* validate stack entry */
 388 }
 389
 390 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
 391 {
 392     do_fstt(env, ST0, ptr, GETPC());
 393 }
 394
 395 void helper_fpush(CPUX86State *env)
 396 {
 397     fpush(env);
 398 }
 399
 400 void helper_fpop(CPUX86State *env)
 401 {
 402     fpop(env);
 403 }
 404
 405 void helper_fdecstp(CPUX86State *env)
 406 {
 407     env->fpstt = (env->fpstt - 1) & 7;
 408     env->fpus &= ~0x4700;
 409 }
 410
 411 void helper_fincstp(CPUX86State *env)
 412 {
 413     env->fpstt = (env->fpstt + 1) & 7;
 414     env->fpus &= ~0x4700;
 415 }
 416
 417 /* FPU move */
 418
 419 void helper_ffree_STN(CPUX86State *env, int st_index)
 420 {
 421     env->fptags[(env->fpstt + st_index) & 7] = 1;
 422 }
 423
 424 void helper_fmov_ST0_FT0(CPUX86State *env)
 425 {
 426     ST0 = FT0;
 427 }
 428
 429 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
 430 {
 431     FT0 = ST(st_index);
 432 }
 433
 434 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
 435 {
 436     ST0 = ST(st_index);
 437 }
 438
 439 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
 440 {
 441     ST(st_index) = ST0;
 442 }
 443
 444 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 445 {
 446     floatx80 tmp;
 447
 448     tmp = ST(st_index);
 449     ST(st_index) = ST0;
 450     ST0 = tmp;
 451 }
 452
 453 /* FPU operations */
 454
 455 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 456
 457 void helper_fcom_ST0_FT0(CPUX86State *env)
 458 {
 459     uint8_t old_flags = save_exception_flags(env);
 460     FloatRelation ret;
 461
 462     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 463     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 464     merge_exception_flags(env, old_flags);
 465 }
 466
 467 void helper_fucom_ST0_FT0(CPUX86State *env)
 468 {
 469     uint8_t old_flags = save_exception_flags(env);
 470     FloatRelation ret;
 471
 472     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 473     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 474     merge_exception_flags(env, old_flags);
 475 }
 476
 477 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 478
 479 void helper_fcomi_ST0_FT0(CPUX86State *env)
 480 {
 481     uint8_t old_flags = save_exception_flags(env);
 482     int eflags;
 483     FloatRelation ret;
 484
 485     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 486     eflags = cpu_cc_compute_all(env, CC_OP);
 487     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 488     CC_SRC = eflags;
 489     merge_exception_flags(env, old_flags);
 490 }
 491
 492 void helper_fucomi_ST0_FT0(CPUX86State *env)
 493 {
 494     uint8_t old_flags = save_exception_flags(env);
 495     int eflags;
 496     FloatRelation ret;
 497
 498     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 499     eflags = cpu_cc_compute_all(env, CC_OP);
 500     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 501     CC_SRC = eflags;
 502     merge_exception_flags(env, old_flags);
 503 }
 504
 505 void helper_fadd_ST0_FT0(CPUX86State *env)
 506 {
 507     uint8_t old_flags = save_exception_flags(env);
 508     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
 509     merge_exception_flags(env, old_flags);
 510 }
 511
 512 void helper_fmul_ST0_FT0(CPUX86State *env)
 513 {
 514     uint8_t old_flags = save_exception_flags(env);
 515     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
 516     merge_exception_flags(env, old_flags);
 517 }
 518
 519 void helper_fsub_ST0_FT0(CPUX86State *env)
 520 {
 521     uint8_t old_flags = save_exception_flags(env);
 522     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
 523     merge_exception_flags(env, old_flags);
 524 }
 525
 526 void helper_fsubr_ST0_FT0(CPUX86State *env)
 527 {
 528     uint8_t old_flags = save_exception_flags(env);
 529     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
 530     merge_exception_flags(env, old_flags);
 531 }
 532
 533 void helper_fdiv_ST0_FT0(CPUX86State *env)
 534 {
 535     ST0 = helper_fdiv(env, ST0, FT0);
 536 }
 537
 538 void helper_fdivr_ST0_FT0(CPUX86State *env)
 539 {
 540     ST0 = helper_fdiv(env, FT0, ST0);
 541 }
 542
 543 /* fp operations between STN and ST0 */
 544
 545 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
 546 {
 547     uint8_t old_flags = save_exception_flags(env);
 548     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
 549     merge_exception_flags(env, old_flags);
 550 }
 551
 552 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
 553 {
 554     uint8_t old_flags = save_exception_flags(env);
 555     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
 556     merge_exception_flags(env, old_flags);
 557 }
 558
 559 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
 560 {
 561     uint8_t old_flags = save_exception_flags(env);
 562     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
 563     merge_exception_flags(env, old_flags);
 564 }
 565
 566 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
 567 {
 568     uint8_t old_flags = save_exception_flags(env);
 569     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
 570     merge_exception_flags(env, old_flags);
 571 }
 572
 573 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
 574 {
 575     floatx80 *p;
 576
 577     p = &ST(st_index);
 578     *p = helper_fdiv(env, *p, ST0);
 579 }
 580
 581 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
 582 {
 583     floatx80 *p;
 584
 585     p = &ST(st_index);
 586     *p = helper_fdiv(env, ST0, *p);
 587 }
 588
 589 /* misc FPU operations */
 590 void helper_fchs_ST0(CPUX86State *env)
 591 {
 592     ST0 = floatx80_chs(ST0);
 593 }
 594
 595 void helper_fabs_ST0(CPUX86State *env)
 596 {
 597     ST0 = floatx80_abs(ST0);
 598 }
 599
 600 void helper_fld1_ST0(CPUX86State *env)
 601 {
 602     ST0 = floatx80_one;
 603 }
 604
 605 void helper_fldl2t_ST0(CPUX86State *env)
 606 {
 607     switch (env->fpuc & FPU_RC_MASK) {
 608     case FPU_RC_UP:
 609         ST0 = floatx80_l2t_u;
 610         break;
 611     default:
 612         ST0 = floatx80_l2t;
 613         break;
 614     }
 615 }
 616
 617 void helper_fldl2e_ST0(CPUX86State *env)
 618 {
 619     switch (env->fpuc & FPU_RC_MASK) {
 620     case FPU_RC_DOWN:
 621     case FPU_RC_CHOP:
 622         ST0 = floatx80_l2e_d;
 623         break;
 624     default:
 625         ST0 = floatx80_l2e;
 626         break;
 627     }
 628 }
 629
 630 void helper_fldpi_ST0(CPUX86State *env)
 631 {
 632     switch (env->fpuc & FPU_RC_MASK) {
 633     case FPU_RC_DOWN:
 634     case FPU_RC_CHOP:
 635         ST0 = floatx80_pi_d;
 636         break;
 637     default:
 638         ST0 = floatx80_pi;
 639         break;
 640     }
 641 }
 642
 643 void helper_fldlg2_ST0(CPUX86State *env)
 644 {
 645     switch (env->fpuc & FPU_RC_MASK) {
 646     case FPU_RC_DOWN:
 647     case FPU_RC_CHOP:
 648         ST0 = floatx80_lg2_d;
 649         break;
 650     default:
 651         ST0 = floatx80_lg2;
 652         break;
 653     }
 654 }
 655
 656 void helper_fldln2_ST0(CPUX86State *env)
 657 {
 658     switch (env->fpuc & FPU_RC_MASK) {
 659     case FPU_RC_DOWN:
 660     case FPU_RC_CHOP:
 661         ST0 = floatx80_ln2_d;
 662         break;
 663     default:
 664         ST0 = floatx80_ln2;
 665         break;
 666     }
 667 }
 668
 669 void helper_fldz_ST0(CPUX86State *env)
 670 {
 671     ST0 = floatx80_zero;
 672 }
 673
 674 void helper_fldz_FT0(CPUX86State *env)
 675 {
 676     FT0 = floatx80_zero;
 677 }
 678
 679 uint32_t helper_fnstsw(CPUX86State *env)
 680 {
 681     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
 682 }
 683
 684 uint32_t helper_fnstcw(CPUX86State *env)
 685 {
 686     return env->fpuc;
 687 }
 688
 689 static void set_x86_rounding_mode(unsigned mode, float_status *status)
 690 {
 691     static FloatRoundMode x86_round_mode[4] = {
 692         float_round_nearest_even,
 693         float_round_down,
 694         float_round_up,
 695         float_round_to_zero
 696     };
 697     assert(mode < ARRAY_SIZE(x86_round_mode));
 698     set_float_rounding_mode(x86_round_mode[mode], status);
 699 }
 700
 701 void update_fp_status(CPUX86State *env)
 702 {
 703     int rnd_mode;
 704     FloatX80RoundPrec rnd_prec;
 705
 706     /* set rounding mode */
 707     rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT;
 708     set_x86_rounding_mode(rnd_mode, &env->fp_status);
 709
 710     switch ((env->fpuc >> 8) & 3) {
 711     case 0:
 712         rnd_prec = floatx80_precision_s;
 713         break;
 714     case 2:
 715         rnd_prec = floatx80_precision_d;
 716         break;
 717     case 3:
 718     default:
 719         rnd_prec = floatx80_precision_x;
 720         break;
 721     }
 722     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
 723 }
 724
 725 void helper_fldcw(CPUX86State *env, uint32_t val)
 726 {
 727     cpu_set_fpuc(env, val);
 728 }
 729
 730 void helper_fclex(CPUX86State *env)
 731 {
 732     env->fpus &= 0x7f00;
 733 }
 734
 735 void helper_fwait(CPUX86State *env)
 736 {
 737     if (env->fpus & FPUS_SE) {
 738         fpu_raise_exception(env, GETPC());
 739     }
 740 }
 741
 742 static void do_fninit(CPUX86State *env)
 743 {
 744     env->fpus = 0;
 745     env->fpstt = 0;
 746     env->fpcs = 0;
 747     env->fpds = 0;
 748     env->fpip = 0;
 749     env->fpdp = 0;
 750     cpu_set_fpuc(env, 0x37f);
 751     env->fptags[0] = 1;
 752     env->fptags[1] = 1;
 753     env->fptags[2] = 1;
 754     env->fptags[3] = 1;
 755     env->fptags[4] = 1;
 756     env->fptags[5] = 1;
 757     env->fptags[6] = 1;
 758     env->fptags[7] = 1;
 759 }
 760
 761 void helper_fninit(CPUX86State *env)
 762 {
 763     do_fninit(env);
 764 }
 765
 766 /* BCD ops */
 767
 768 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
 769 {
 770     floatx80 tmp;
 771     uint64_t val;
 772     unsigned int v;
 773     int i;
 774
 775     val = 0;
 776     for (i = 8; i >= 0; i--) {
 777         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
 778         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
 779     }
 780     tmp = int64_to_floatx80(val, &env->fp_status);
 781     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
 782         tmp = floatx80_chs(tmp);
 783     }
 784     fpush(env);
 785     ST0 = tmp;
 786 }
 787
 788 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
 789 {
 790     uint8_t old_flags = save_exception_flags(env);
 791     int v;
 792     target_ulong mem_ref, mem_end;
 793     int64_t val;
 794     CPU_LDoubleU temp;
 795
 796     temp.d = ST0;
 797
 798     val = floatx80_to_int64(ST0, &env->fp_status);
 799     mem_ref = ptr;
 800     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
 801         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 802         while (mem_ref < ptr + 7) {
 803             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 804         }
 805         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
 806         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 807         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 808         merge_exception_flags(env, old_flags);
 809         return;
 810     }
 811     mem_end = mem_ref + 9;
 812     if (SIGND(temp)) {
 813         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
 814         val = -val;
 815     } else {
 816         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
 817     }
 818     while (mem_ref < mem_end) {
 819         if (val == 0) {
 820             break;
 821         }
 822         v = val % 100;
 823         val = val / 100;
 824         v = ((v / 10) << 4) | (v % 10);
 825         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
 826     }
 827     while (mem_ref < mem_end) {
 828         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 829     }
 830     merge_exception_flags(env, old_flags);
 831 }
 832
/*
 * 128-bit significand of log(2), split into its upper and lower 64-bit
 * halves.  The upper half equals the 64-bit significand used for
 * floatx80_ln2_d.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL
 836
/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * f2xm1_coeff_N is the coefficient of x^N.
 * NOTE(review): f2xm1_coeff_0_low has a far smaller exponent than
 * f2xm1_coeff_0 and looks like a low-order correction to it, so that the
 * pair carries more than 64 significand bits - confirm against the code
 * that evaluates the polynomial, which is outside this excerpt.
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
 850
/*
 * One entry of f2xm1_table: a sample point t together with precomputed
 * values of 2^t and 2^t - 1.  The table initializers are positional, so
 * the field order below must not change.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
 862
 863 static const struct f2xm1_data f2xm1_table[65] = {
 864     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
 865       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
 866       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
 867     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
 868       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
 869       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
 870     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
 871       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
 872       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
 873     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
 874       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
 875       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
 876     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
 877       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
 878       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
 879     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
 880       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
 881       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
 882     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
 883       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
 884       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
 885     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
 886       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
 887       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
 888     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
 889       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
 890       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
 891     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
 892       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
 893       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
 894     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
 895       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
 896       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
 897     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
 898       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
 899       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
 900     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
 901       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
 902       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
 903     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
 904       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
 905       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
 906     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
 907       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
 908       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
 909     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
 910       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
 911       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
 912     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
 913       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
 914       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
 915     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
 916       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
 917       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
 918     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
 919       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
 920       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
 921     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
 922       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
 923       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
 924     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
 925       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
 926       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
 927     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
 928       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
 929       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
 930     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
 931       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
 932       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
 933     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
 934       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
 935       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
 936     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
 937       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
 938       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
 939     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
 940       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
 941       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
 942     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
 943       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
 944       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
 945     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
 946       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
 947       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
 948     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
 949       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
 950       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
 951     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
 952       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
 953       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
 954     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
 955       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
 956       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
 957     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
 958       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
 959       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
 960     { floatx80_zero_init,
 961       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
 962       floatx80_zero_init },
 963     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
 964       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
 965       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
 966     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
 967       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
 968       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
 969     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
 970       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
 971       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
 972     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
 973       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
 974       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
 975     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
 976       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
 977       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
 978     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
 979       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
 980       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
 981     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
 982       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
 983       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
 984     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
 985       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
 986       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
 987     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
 988       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
 989       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
 990     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
 991       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
 992       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
 993     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
 994       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
 995       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
 996     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
 997       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
 998       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
 999     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1000       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1001       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1002     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1003       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1004       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1005     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1006       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1007       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1008     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1009       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1010       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1011     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1012       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1013       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1014     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1015       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1016       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1017     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1018       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1019       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1020     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1021       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1022       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1023     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1024       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1025       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1026     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1027       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1028       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1029     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1030       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1031       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1032     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1033       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1034       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1035     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1036       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1037       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1038     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1039       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1040       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1041     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1042       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1043       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1044     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1045       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1046       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1047     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1048       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1049       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1050     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1051       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1052       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1053     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1054       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1055       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1056     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1057       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1058       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1059 };
1060
1061 void helper_f2xm1(CPUX86State *env)
1062 {
1063     uint8_t old_flags = save_exception_flags(env);
1064     uint64_t sig = extractFloatx80Frac(ST0);
1065     int32_t exp = extractFloatx80Exp(ST0);
1066     bool sign = extractFloatx80Sign(ST0);
1067
1068     if (floatx80_invalid_encoding(ST0)) {
1069         float_raise(float_flag_invalid, &env->fp_status);
1070         ST0 = floatx80_default_nan(&env->fp_status);
1071     } else if (floatx80_is_any_nan(ST0)) {
1072         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1073             float_raise(float_flag_invalid, &env->fp_status);
1074             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1075         }
1076     } else if (exp > 0x3fff ||
1077                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1078         /* Out of range for the instruction, treat as invalid.  */
1079         float_raise(float_flag_invalid, &env->fp_status);
1080         ST0 = floatx80_default_nan(&env->fp_status);
1081     } else if (exp == 0x3fff) {
1082         /* Argument 1 or -1, exact result 1 or -0.5.  */
1083         if (sign) {
1084             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1085         }
1086     } else if (exp < 0x3fb0) {
1087         if (!floatx80_is_zero(ST0)) {
1088             /*
1089              * Multiplying the argument by an extra-precision version
1090              * of log(2) is sufficiently precise.  Zero arguments are
1091              * returned unchanged.
1092              */
1093             uint64_t sig0, sig1, sig2;
1094             if (exp == 0) {
1095                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1096             }
1097             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1098                             &sig2);
1099             /* This result is inexact.  */
1100             sig1 |= 1;
1101             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1102                                                 sign, exp, sig0, sig1,
1103                                                 &env->fp_status);
1104         }
1105     } else {
1106         floatx80 tmp, y, accum;
1107         bool asign, bsign;
1108         int32_t n, aexp, bexp;
1109         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1110         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1111         FloatX80RoundPrec save_prec =
1112             env->fp_status.floatx80_rounding_precision;
1113         env->fp_status.float_rounding_mode = float_round_nearest_even;
1114         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1115
1116         /* Find the nearest multiple of 1/32 to the argument.  */
1117         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1118         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1119         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1120
1121         if (floatx80_is_zero(y)) {
1122             /*
1123              * Use the value of 2^t - 1 from the table, to avoid
1124              * needing to special-case zero as a result of
1125              * multiplication below.
1126              */
1127             ST0 = f2xm1_table[n].t;
1128             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1129             env->fp_status.float_rounding_mode = save_mode;
1130         } else {
1131             /*
1132              * Compute the lower parts of a polynomial expansion for
1133              * (2^y - 1) / y.
1134              */
1135             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1136             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1137             accum = floatx80_mul(accum, y, &env->fp_status);
1138             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1139             accum = floatx80_mul(accum, y, &env->fp_status);
1140             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1141             accum = floatx80_mul(accum, y, &env->fp_status);
1142             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1143             accum = floatx80_mul(accum, y, &env->fp_status);
1144             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1145             accum = floatx80_mul(accum, y, &env->fp_status);
1146             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1147             accum = floatx80_mul(accum, y, &env->fp_status);
1148             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1149
1150             /*
1151              * The full polynomial expansion is f2xm1_coeff_0 + accum
1152              * (where accum has much lower magnitude, and so, in
1153              * particular, carry out of the addition is not possible).
1154              * (This expansion is only accurate to about 70 bits, not
1155              * 128 bits.)
1156              */
1157             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1158             asign = extractFloatx80Sign(f2xm1_coeff_0);
1159             shift128RightJamming(extractFloatx80Frac(accum), 0,
1160                                  aexp - extractFloatx80Exp(accum),
1161                                  &asig0, &asig1);
1162             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1163             bsig1 = 0;
1164             if (asign == extractFloatx80Sign(accum)) {
1165                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1166             } else {
1167                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1168             }
1169             /* And thus compute an approximation to 2^y - 1.  */
1170             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1171                             &asig0, &asig1, &asig2);
1172             aexp += extractFloatx80Exp(y) - 0x3ffe;
1173             asign ^= extractFloatx80Sign(y);
1174             if (n != 32) {
1175                 /*
1176                  * Multiply this by the precomputed value of 2^t and
1177                  * add that of 2^t - 1.
1178                  */
1179                 mul128By64To192(asig0, asig1,
1180                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1181                                 &asig0, &asig1, &asig2);
1182                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1183                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1184                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1185                 bsig1 = 0;
1186                 if (bexp < aexp) {
1187                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1188                                          &bsig0, &bsig1);
1189                 } else if (aexp < bexp) {
1190                     shift128RightJamming(asig0, asig1, bexp - aexp,
1191                                          &asig0, &asig1);
1192                     aexp = bexp;
1193                 }
1194                 /* The sign of 2^t - 1 is always that of the result.  */
1195                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1196                 if (asign == bsign) {
1197                     /* Avoid possible carry out of the addition.  */
1198                     shift128RightJamming(asig0, asig1, 1,
1199                                          &asig0, &asig1);
1200                     shift128RightJamming(bsig0, bsig1, 1,
1201                                          &bsig0, &bsig1);
1202                     ++aexp;
1203                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1204                 } else {
1205                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1206                     asign = bsign;
1207                 }
1208             }
1209             env->fp_status.float_rounding_mode = save_mode;
1210             /* This result is inexact.  */
1211             asig1 |= 1;
1212             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1213                                                 asign, aexp, asig0, asig1,
1214                                                 &env->fp_status);
1215         }
1216
1217         env->fp_status.floatx80_rounding_precision = save_prec;
1218     }
1219     merge_exception_flags(env, old_flags);
1220 }
1221
1222 void helper_fptan(CPUX86State *env)
1223 {
1224     double fptemp = floatx80_to_double(env, ST0);
1225
1226     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1227         env->fpus |= 0x400;
1228     } else {
1229         fptemp = tan(fptemp);
1230         ST0 = double_to_floatx80(env, fptemp);
1231         fpush(env);
1232         ST0 = floatx80_one;
1233         env->fpus &= ~0x400; /* C2 <-- 0 */
1234         /* the above code is for |arg| < 2**52 only */
1235     }
1236 }
1237
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision, each
 * given as a biased exponent plus the high and low 64 bits of the
 * significand.  pi/4, pi/2 and pi differ only by powers of two, so
 * they share one significand and differ in exponent alone.
 */
#define pi_exp 0x4000
#define pi_sig_high 0xc90fdaa22168c234ULL
#define pi_sig_low 0xc4c6628b80dc1cd1ULL
#define pi_2_exp 0x3fff
#define pi_2_sig_high pi_sig_high
#define pi_2_sig_low pi_sig_low
#define pi_4_exp 0x3ffe
#define pi_4_sig_high pi_sig_high
#define pi_4_sig_low pi_sig_low
#define pi_34_exp 0x4000
#define pi_34_sig_high 0x96cbe3f9990e91a7ULL
#define pi_34_sig_low 0x9394c9e8a0a5159dULL
1251
/*
 * Coefficients of a minimax polynomial approximating atan(x) for x in
 * [-1/16, 1/16].  Only odd powers of x appear: fpatan_coeff_k
 * multiplies x^(2k+1), and the values are roughly the Taylor
 * coefficients (-1)^k / (2k+1).
 *
 * The leading coefficient of this approximation is so close to
 * exactly 1 that, unlike some other approximations, it needs no
 * separate low part to reach sufficient accuracy.
 */
#define fpatan_coeff_0 \
    make_floatx80(0x3fff, 0x8000000000000000ULL) /* x^1 */
#define fpatan_coeff_1 \
    make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL) /* x^3 */
#define fpatan_coeff_2 \
    make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL) /* x^5 */
#define fpatan_coeff_3 \
    make_floatx80(0xbffc, 0x92492491fbab2e66ULL) /* x^7 */
#define fpatan_coeff_4 \
    make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL) /* x^9 */
#define fpatan_coeff_5 \
    make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL) /* x^11 */
#define fpatan_coeff_6 \
    make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL) /* x^13 */
1267
1268 struct fpatan_data {
1269     /* High and low parts of atan(x).  */
1270     floatx80 atan_high, atan_low;
1271 };
1272
1273 static const struct fpatan_data fpatan_table[9] = {
1274     { floatx80_zero_init,
1275       floatx80_zero_init },
1276     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1277       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1278     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1279       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1280     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1281       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1282     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1283       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1284     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1285       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1286     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1287       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1288     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1289       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1290     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1291       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1292 };
1293
1294 void helper_fpatan(CPUX86State *env)
1295 {
1296     uint8_t old_flags = save_exception_flags(env);
1297     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1298     int32_t arg0_exp = extractFloatx80Exp(ST0);
1299     bool arg0_sign = extractFloatx80Sign(ST0);
1300     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1301     int32_t arg1_exp = extractFloatx80Exp(ST1);
1302     bool arg1_sign = extractFloatx80Sign(ST1);
1303
1304     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1305         float_raise(float_flag_invalid, &env->fp_status);
1306         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1307     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1308         float_raise(float_flag_invalid, &env->fp_status);
1309         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1310     } else if (floatx80_invalid_encoding(ST0) ||
1311                floatx80_invalid_encoding(ST1)) {
1312         float_raise(float_flag_invalid, &env->fp_status);
1313         ST1 = floatx80_default_nan(&env->fp_status);
1314     } else if (floatx80_is_any_nan(ST0)) {
1315         ST1 = ST0;
1316     } else if (floatx80_is_any_nan(ST1)) {
1317         /* Pass this NaN through.  */
1318     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1319         /* Pass this zero through.  */
1320     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1321                  arg0_exp - arg1_exp >= 80) &&
1322                !arg0_sign) {
1323         /*
1324          * Dividing ST1 by ST0 gives the correct result up to
1325          * rounding, and avoids spurious underflow exceptions that
1326          * might result from passing some small values through the
1327          * polynomial approximation, but if a finite nonzero result of
1328          * division is exact, the result of fpatan is still inexact
1329          * (and underflowing where appropriate).
1330          */
1331         FloatX80RoundPrec save_prec =
1332             env->fp_status.floatx80_rounding_precision;
1333         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1334         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1335         env->fp_status.floatx80_rounding_precision = save_prec;
1336         if (!floatx80_is_zero(ST1) &&
1337             !(get_float_exception_flags(&env->fp_status) &
1338               float_flag_inexact)) {
1339             /*
1340              * The mathematical result is very slightly closer to zero
1341              * than this exact result.  Round a value with the
1342              * significand adjusted accordingly to get the correct
1343              * exceptions, and possibly an adjusted result depending
1344              * on the rounding mode.
1345              */
1346             uint64_t sig = extractFloatx80Frac(ST1);
1347             int32_t exp = extractFloatx80Exp(ST1);
1348             bool sign = extractFloatx80Sign(ST1);
1349             if (exp == 0) {
1350                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1351             }
1352             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1353                                                 sign, exp, sig - 1,
1354                                                 -1, &env->fp_status);
1355         }
1356     } else {
1357         /* The result is inexact.  */
1358         bool rsign = arg1_sign;
1359         int32_t rexp;
1360         uint64_t rsig0, rsig1;
1361         if (floatx80_is_zero(ST1)) {
1362             /*
1363              * ST0 is negative.  The result is pi with the sign of
1364              * ST1.
1365              */
1366             rexp = pi_exp;
1367             rsig0 = pi_sig_high;
1368             rsig1 = pi_sig_low;
1369         } else if (floatx80_is_infinity(ST1)) {
1370             if (floatx80_is_infinity(ST0)) {
1371                 if (arg0_sign) {
1372                     rexp = pi_34_exp;
1373                     rsig0 = pi_34_sig_high;
1374                     rsig1 = pi_34_sig_low;
1375                 } else {
1376                     rexp = pi_4_exp;
1377                     rsig0 = pi_4_sig_high;
1378                     rsig1 = pi_4_sig_low;
1379                 }
1380             } else {
1381                 rexp = pi_2_exp;
1382                 rsig0 = pi_2_sig_high;
1383                 rsig1 = pi_2_sig_low;
1384             }
1385         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1386             rexp = pi_2_exp;
1387             rsig0 = pi_2_sig_high;
1388             rsig1 = pi_2_sig_low;
1389         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1390             /* ST0 is negative.  */
1391             rexp = pi_exp;
1392             rsig0 = pi_sig_high;
1393             rsig1 = pi_sig_low;
1394         } else {
1395             /*
1396              * ST0 and ST1 are finite, nonzero and with exponents not
1397              * too far apart.
1398              */
1399             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1400             int32_t azexp, axexp;
1401             bool adj_sub, ysign, zsign;
1402             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1403             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1404             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1405             uint64_t azsig0, azsig1;
1406             uint64_t azsig2, azsig3, axsig0, axsig1;
1407             floatx80 x8;
1408             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1409             FloatX80RoundPrec save_prec =
1410                 env->fp_status.floatx80_rounding_precision;
1411             env->fp_status.float_rounding_mode = float_round_nearest_even;
1412             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1413
1414             if (arg0_exp == 0) {
1415                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1416             }
1417             if (arg1_exp == 0) {
1418                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1419             }
1420             if (arg0_exp > arg1_exp ||
1421                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1422                 /* Work with abs(ST1) / abs(ST0).  */
1423                 num_exp = arg1_exp;
1424                 num_sig = arg1_sig;
1425                 den_exp = arg0_exp;
1426                 den_sig = arg0_sig;
1427                 if (arg0_sign) {
1428                     /* The result is subtracted from pi.  */
1429                     adj_exp = pi_exp;
1430                     adj_sig0 = pi_sig_high;
1431                     adj_sig1 = pi_sig_low;
1432                     adj_sub = true;
1433                 } else {
1434                     /* The result is used as-is.  */
1435                     adj_exp = 0;
1436                     adj_sig0 = 0;
1437                     adj_sig1 = 0;
1438                     adj_sub = false;
1439                 }
1440             } else {
1441                 /* Work with abs(ST0) / abs(ST1).  */
1442                 num_exp = arg0_exp;
1443                 num_sig = arg0_sig;
1444                 den_exp = arg1_exp;
1445                 den_sig = arg1_sig;
1446                 /* The result is added to or subtracted from pi/2.  */
1447                 adj_exp = pi_2_exp;
1448                 adj_sig0 = pi_2_sig_high;
1449                 adj_sig1 = pi_2_sig_low;
1450                 adj_sub = !arg0_sign;
1451             }
1452
1453             /*
1454              * Compute x = num/den, where 0 < x <= 1 and x is not too
1455              * small.
1456              */
1457             xexp = num_exp - den_exp + 0x3ffe;
1458             remsig0 = num_sig;
1459             remsig1 = 0;
1460             if (den_sig <= remsig0) {
1461                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1462                 ++xexp;
1463             }
1464             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1465             mul64To128(den_sig, xsig0, &msig0, &msig1);
1466             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1467             while ((int64_t) remsig0 < 0) {
1468                 --xsig0;
1469                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1470             }
1471             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1472             /*
1473              * No need to correct any estimation error in xsig1; even
1474              * with such error, it is accurate enough.
1475              */
1476
1477             /*
1478              * Split x as x = t + y, where t = n/8 is the nearest
1479              * multiple of 1/8 to x.
1480              */
1481             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1482                                                false, xexp + 3, xsig0,
1483                                                xsig1, &env->fp_status);
1484             n = floatx80_to_int32(x8, &env->fp_status);
1485             if (n == 0) {
1486                 ysign = false;
1487                 yexp = xexp;
1488                 ysig0 = xsig0;
1489                 ysig1 = xsig1;
1490                 texp = 0;
1491                 tsig = 0;
1492             } else {
1493                 int shift = clz32(n) + 32;
1494                 texp = 0x403b - shift;
1495                 tsig = n;
1496                 tsig <<= shift;
1497                 if (texp == xexp) {
1498                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1499                     if ((int64_t) ysig0 >= 0) {
1500                         ysign = false;
1501                         if (ysig0 == 0) {
1502                             if (ysig1 == 0) {
1503                                 yexp = 0;
1504                             } else {
1505                                 shift = clz64(ysig1) + 64;
1506                                 yexp = xexp - shift;
1507                                 shift128Left(ysig0, ysig1, shift,
1508                                              &ysig0, &ysig1);
1509                             }
1510                         } else {
1511                             shift = clz64(ysig0);
1512                             yexp = xexp - shift;
1513                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1514                         }
1515                     } else {
1516                         ysign = true;
1517                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1518                         if (ysig0 == 0) {
1519                             shift = clz64(ysig1) + 64;
1520                         } else {
1521                             shift = clz64(ysig0);
1522                         }
1523                         yexp = xexp - shift;
1524                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1525                     }
1526                 } else {
1527                     /*
1528                      * t's exponent must be greater than x's because t
1529                      * is positive and the nearest multiple of 1/8 to
1530                      * x, and if x has a greater exponent, the power
1531                      * of 2 with that exponent is also a multiple of
1532                      * 1/8.
1533                      */
1534                     uint64_t usig0, usig1;
1535                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1536                                          &usig0, &usig1);
1537                     ysign = true;
1538                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1539                     if (ysig0 == 0) {
1540                         shift = clz64(ysig1) + 64;
1541                     } else {
1542                         shift = clz64(ysig0);
1543                     }
1544                     yexp = texp - shift;
1545                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1546                 }
1547             }
1548
1549             /*
1550              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1551              * arctan(z).
1552              */
1553             zsign = ysign;
1554             if (texp == 0 || yexp == 0) {
1555                 zexp = yexp;
1556                 zsig0 = ysig0;
1557                 zsig1 = ysig1;
1558             } else {
1559                 /*
1560                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1561                  */
1562                 int32_t dexp = texp + xexp - 0x3ffe;
1563                 uint64_t dsig0, dsig1, dsig2;
1564                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1565                 /*
1566                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1567                  * bit).  Add 1 to produce the denominator 1+tx.
1568                  */
1569                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1570                                      &dsig0, &dsig1);
1571                 dsig0 |= 0x8000000000000000ULL;
1572                 zexp = yexp - 1;
1573                 remsig0 = ysig0;
1574                 remsig1 = ysig1;
1575                 remsig2 = 0;
1576                 if (dsig0 <= remsig0) {
1577                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1578                     ++zexp;
1579                 }
1580                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1581                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1582                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1583                        &remsig0, &remsig1, &remsig2);
1584                 while ((int64_t) remsig0 < 0) {
1585                     --zsig0;
1586                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1587                            &remsig0, &remsig1, &remsig2);
1588                 }
1589                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1590                 /* No need to correct any estimation error in zsig1.  */
1591             }
1592
1593             if (zexp == 0) {
1594                 azexp = 0;
1595                 azsig0 = 0;
1596                 azsig1 = 0;
1597             } else {
1598                 floatx80 z2, accum;
1599                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1600                 /* Compute z^2.  */
1601                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1602                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1603                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1604                                                    zexp + zexp - 0x3ffe,
1605                                                    z2sig0, z2sig1,
1606                                                    &env->fp_status);
1607
1608                 /* Compute the lower parts of the polynomial expansion.  */
1609                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1610                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1611                 accum = floatx80_mul(accum, z2, &env->fp_status);
1612                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1613                 accum = floatx80_mul(accum, z2, &env->fp_status);
1614                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1615                 accum = floatx80_mul(accum, z2, &env->fp_status);
1616                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1617                 accum = floatx80_mul(accum, z2, &env->fp_status);
1618                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1619                 accum = floatx80_mul(accum, z2, &env->fp_status);
1620
1621                 /*
1622                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1623                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1624                  */
1625                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1626                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1627                                      aexp - extractFloatx80Exp(accum),
1628                                      &asig0, &asig1);
1629                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1630                        &asig0, &asig1);
1631                 /* Multiply by z to compute arctan(z).  */
1632                 azexp = aexp + zexp - 0x3ffe;
1633                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1634                             &azsig2, &azsig3);
1635             }
1636
1637             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1638             if (texp == 0) {
1639                 /* z is positive.  */
1640                 axexp = azexp;
1641                 axsig0 = azsig0;
1642                 axsig1 = azsig1;
1643             } else {
1644                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1645                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1646                 uint64_t low_sig0 =
1647                     extractFloatx80Frac(fpatan_table[n].atan_low);
1648                 uint64_t low_sig1 = 0;
1649                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1650                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1651                 axsig1 = 0;
1652                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1653                                      &low_sig0, &low_sig1);
1654                 if (low_sign) {
1655                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1656                            &axsig0, &axsig1);
1657                 } else {
1658                     add128(axsig0, axsig1, low_sig0, low_sig1,
1659                            &axsig0, &axsig1);
1660                 }
1661                 if (azexp >= axexp) {
1662                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1663                                          &axsig0, &axsig1);
1664                     axexp = azexp + 1;
1665                     shift128RightJamming(azsig0, azsig1, 1,
1666                                          &azsig0, &azsig1);
1667                 } else {
1668                     shift128RightJamming(axsig0, axsig1, 1,
1669                                          &axsig0, &axsig1);
1670                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1671                                          &azsig0, &azsig1);
1672                     ++axexp;
1673                 }
1674                 if (zsign) {
1675                     sub128(axsig0, axsig1, azsig0, azsig1,
1676                            &axsig0, &axsig1);
1677                 } else {
1678                     add128(axsig0, axsig1, azsig0, azsig1,
1679                            &axsig0, &axsig1);
1680                 }
1681             }
1682
1683             if (adj_exp == 0) {
1684                 rexp = axexp;
1685                 rsig0 = axsig0;
1686                 rsig1 = axsig1;
1687             } else {
1688                 /*
1689                  * Add or subtract arctan(x) (exponent axexp,
1690                  * significand axsig0 and axsig1, positive, not
1691                  * necessarily normalized) to the number given by
1692                  * adj_exp, adj_sig0 and adj_sig1, according to
1693                  * adj_sub.
1694                  */
1695                 if (adj_exp >= axexp) {
1696                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1697                                          &axsig0, &axsig1);
1698                     rexp = adj_exp + 1;
1699                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1700                                          &adj_sig0, &adj_sig1);
1701                 } else {
1702                     shift128RightJamming(axsig0, axsig1, 1,
1703                                          &axsig0, &axsig1);
1704                     shift128RightJamming(adj_sig0, adj_sig1,
1705                                          axexp - adj_exp + 1,
1706                                          &adj_sig0, &adj_sig1);
1707                     rexp = axexp + 1;
1708                 }
1709                 if (adj_sub) {
1710                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1711                            &rsig0, &rsig1);
1712                 } else {
1713                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1714                            &rsig0, &rsig1);
1715                 }
1716             }
1717
1718             env->fp_status.float_rounding_mode = save_mode;
1719             env->fp_status.floatx80_rounding_precision = save_prec;
1720         }
1721         /* This result is inexact.  */
1722         rsig1 |= 1;
1723         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1724                                             rsig0, rsig1, &env->fp_status);
1725     }
1726
1727     fpop(env);
1728     merge_exception_flags(env, old_flags);
1729 }
1730
1731 void helper_fxtract(CPUX86State *env)
1732 {
1733     uint8_t old_flags = save_exception_flags(env);
1734     CPU_LDoubleU temp;
1735
1736     temp.d = ST0;
1737
1738     if (floatx80_is_zero(ST0)) {
1739         /* Easy way to generate -inf and raising division by 0 exception */
1740         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1741                            &env->fp_status);
1742         fpush(env);
1743         ST0 = temp.d;
1744     } else if (floatx80_invalid_encoding(ST0)) {
1745         float_raise(float_flag_invalid, &env->fp_status);
1746         ST0 = floatx80_default_nan(&env->fp_status);
1747         fpush(env);
1748         ST0 = ST1;
1749     } else if (floatx80_is_any_nan(ST0)) {
1750         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1751             float_raise(float_flag_invalid, &env->fp_status);
1752             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1753         }
1754         fpush(env);
1755         ST0 = ST1;
1756     } else if (floatx80_is_infinity(ST0)) {
1757         fpush(env);
1758         ST0 = ST1;
1759         ST1 = floatx80_infinity;
1760     } else {
1761         int expdif;
1762
1763         if (EXPD(temp) == 0) {
1764             int shift = clz64(temp.l.lower);
1765             temp.l.lower <<= shift;
1766             expdif = 1 - EXPBIAS - shift;
1767             float_raise(float_flag_input_denormal, &env->fp_status);
1768         } else {
1769             expdif = EXPD(temp) - EXPBIAS;
1770         }
1771         /* DP exponent bias */
1772         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1773         fpush(env);
1774         BIASEXPONENT(temp);
1775         ST0 = temp.d;
1776     }
1777     merge_exception_flags(env, old_flags);
1778 }
1779
1780 static void helper_fprem_common(CPUX86State *env, bool mod)
1781 {
1782     uint8_t old_flags = save_exception_flags(env);
1783     uint64_t quotient;
1784     CPU_LDoubleU temp0, temp1;
1785     int exp0, exp1, expdiff;
1786
1787     temp0.d = ST0;
1788     temp1.d = ST1;
1789     exp0 = EXPD(temp0);
1790     exp1 = EXPD(temp1);
1791
1792     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1793     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1794         exp0 == 0x7fff || exp1 == 0x7fff ||
1795         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1796         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1797     } else {
1798         if (exp0 == 0) {
1799             exp0 = 1 - clz64(temp0.l.lower);
1800         }
1801         if (exp1 == 0) {
1802             exp1 = 1 - clz64(temp1.l.lower);
1803         }
1804         expdiff = exp0 - exp1;
1805         if (expdiff < 64) {
1806             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1807             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1808             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1809             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1810         } else {
1811             /*
1812              * Partial remainder.  This choice of how many bits to
1813              * process at once is specified in AMD instruction set
1814              * manuals, and empirically is followed by Intel
1815              * processors as well; it ensures that the final remainder
1816              * operation in a loop does produce the correct low three
1817              * bits of the quotient.  AMD manuals specify that the
1818              * flags other than C2 are cleared, and empirically Intel
1819              * processors clear them as well.
1820              */
1821             int n = 32 + (expdiff % 32);
1822             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1823             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1824             env->fpus |= 0x400;  /* C2 <-- 1 */
1825         }
1826     }
1827     merge_exception_flags(env, old_flags);
1828 }
1829
1830 void helper_fprem1(CPUX86State *env)
1831 {
1832     helper_fprem_common(env, false);
1833 }
1834
1835 void helper_fprem(CPUX86State *env)
1836 {
1837     helper_fprem_common(env, true);
1838 }
1839
/*
 * 128-bit significand of log2(e), split into its high and low 64-bit
 * halves.  helper_fyl2xp1() multiplies by this pair directly when ST0 is
 * small enough (exponent below 0x3fb0).
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * Each value is make_floatx80(sign-and-exponent word, 64-bit significand).
 * As evaluated by helper_fyl2x_common(), fyl2x_coeff_N multiplies
 * x^(2N+1).  fyl2x_coeff_0_low is a small correction that is added to the
 * higher-order terms before they are combined with fyl2x_coeff_0, so the
 * leading coefficient is effectively carried with more than 64 bits.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1861
1862 /*
1863  * Compute an approximation of log2(1+arg), where 1+arg is in the
1864  * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
1865  * function is called, rounding precision is set to 80 and the
1866  * round-to-nearest mode is in effect.  arg must not be exactly zero,
1867  * and must not be so close to zero that underflow might occur.
1868  */
1869 static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
1870                                 uint64_t *sig0, uint64_t *sig1)
1871 {
1872     uint64_t arg0_sig = extractFloatx80Frac(arg);
1873     int32_t arg0_exp = extractFloatx80Exp(arg);
1874     bool arg0_sign = extractFloatx80Sign(arg);
1875     bool asign;
1876     int32_t dexp, texp, aexp;
1877     uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
1878     uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
1879     uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
1880     floatx80 t2, accum;
1881
1882     /*
1883      * Compute an approximation of arg/(2+arg), with extra precision,
1884      * as the argument to a polynomial approximation.  The extra
1885      * precision is only needed for the first term of the
1886      * approximation, with subsequent terms being significantly
1887      * smaller; the approximation only uses odd exponents, and the
1888      * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
1889      */
1890     if (arg0_sign) {
1891         dexp = 0x3fff;
1892         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1893         sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
1894     } else {
1895         dexp = 0x4000;
1896         shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
1897         dsig0 |= 0x8000000000000000ULL;
1898     }
1899     texp = arg0_exp - dexp + 0x3ffe;
1900     rsig0 = arg0_sig;
1901     rsig1 = 0;
1902     rsig2 = 0;
1903     if (dsig0 <= rsig0) {
1904         shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
1905         ++texp;
1906     }
1907     tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
1908     mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
1909     sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
1910            &rsig0, &rsig1, &rsig2);
1911     while ((int64_t) rsig0 < 0) {
1912         --tsig0;
1913         add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
1914                &rsig0, &rsig1, &rsig2);
1915     }
1916     tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
1917     /*
1918      * No need to correct any estimation error in tsig1; even with
1919      * such error, it is accurate enough.  Now compute the square of
1920      * that approximation.
1921      */
1922     mul128To256(tsig0, tsig1, tsig0, tsig1,
1923                 &t2sig0, &t2sig1, &t2sig2, &t2sig3);
1924     t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1925                                        texp + texp - 0x3ffe,
1926                                        t2sig0, t2sig1, &env->fp_status);
1927
1928     /* Compute the lower parts of the polynomial expansion.  */
1929     accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
1930     accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
1931     accum = floatx80_mul(accum, t2, &env->fp_status);
1932     accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
1933     accum = floatx80_mul(accum, t2, &env->fp_status);
1934     accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
1935     accum = floatx80_mul(accum, t2, &env->fp_status);
1936     accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
1937     accum = floatx80_mul(accum, t2, &env->fp_status);
1938     accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
1939     accum = floatx80_mul(accum, t2, &env->fp_status);
1940     accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
1941     accum = floatx80_mul(accum, t2, &env->fp_status);
1942     accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
1943     accum = floatx80_mul(accum, t2, &env->fp_status);
1944     accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
1945     accum = floatx80_mul(accum, t2, &env->fp_status);
1946     accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
1947
1948     /*
1949      * The full polynomial expansion is fyl2x_coeff_0 + accum (where
1950      * accum has much lower magnitude, and so, in particular, carry
1951      * out of the addition is not possible), multiplied by t.  (This
1952      * expansion is only accurate to about 70 bits, not 128 bits.)
1953      */
1954     aexp = extractFloatx80Exp(fyl2x_coeff_0);
1955     asign = extractFloatx80Sign(fyl2x_coeff_0);
1956     shift128RightJamming(extractFloatx80Frac(accum), 0,
1957                          aexp - extractFloatx80Exp(accum),
1958                          &asig0, &asig1);
1959     bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
1960     bsig1 = 0;
1961     if (asign == extractFloatx80Sign(accum)) {
1962         add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1963     } else {
1964         sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1965     }
1966     /* Multiply by t to compute the required result.  */
1967     mul128To256(asig0, asig1, tsig0, tsig1,
1968                 &asig0, &asig1, &asig2, &asig3);
1969     aexp += texp - 0x3ffe;
1970     *exp = aexp;
1971     *sig0 = asig0;
1972     *sig1 = asig1;
1973 }
1974
1975 void helper_fyl2xp1(CPUX86State *env)
1976 {
1977     uint8_t old_flags = save_exception_flags(env);
1978     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1979     int32_t arg0_exp = extractFloatx80Exp(ST0);
1980     bool arg0_sign = extractFloatx80Sign(ST0);
1981     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1982     int32_t arg1_exp = extractFloatx80Exp(ST1);
1983     bool arg1_sign = extractFloatx80Sign(ST1);
1984
1985     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1986         float_raise(float_flag_invalid, &env->fp_status);
1987         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1988     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1989         float_raise(float_flag_invalid, &env->fp_status);
1990         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1991     } else if (floatx80_invalid_encoding(ST0) ||
1992                floatx80_invalid_encoding(ST1)) {
1993         float_raise(float_flag_invalid, &env->fp_status);
1994         ST1 = floatx80_default_nan(&env->fp_status);
1995     } else if (floatx80_is_any_nan(ST0)) {
1996         ST1 = ST0;
1997     } else if (floatx80_is_any_nan(ST1)) {
1998         /* Pass this NaN through.  */
1999     } else if (arg0_exp > 0x3ffd ||
2000                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2001                                                   0x95f619980c4336f7ULL :
2002                                                   0xd413cccfe7799211ULL))) {
2003         /*
2004          * Out of range for the instruction (ST0 must have absolute
2005          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2006          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2007          * to sqrt(2) - 1, which we allow here), treat as invalid.
2008          */
2009         float_raise(float_flag_invalid, &env->fp_status);
2010         ST1 = floatx80_default_nan(&env->fp_status);
2011     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2012                arg1_exp == 0x7fff) {
2013         /*
2014          * One argument is zero, or multiplying by infinity; correct
2015          * result is exact and can be obtained by multiplying the
2016          * arguments.
2017          */
2018         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2019     } else if (arg0_exp < 0x3fb0) {
2020         /*
2021          * Multiplying both arguments and an extra-precision version
2022          * of log2(e) is sufficiently precise.
2023          */
2024         uint64_t sig0, sig1, sig2;
2025         int32_t exp;
2026         if (arg0_exp == 0) {
2027             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2028         }
2029         if (arg1_exp == 0) {
2030             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2031         }
2032         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2033                         &sig0, &sig1, &sig2);
2034         exp = arg0_exp + 1;
2035         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2036         exp += arg1_exp - 0x3ffe;
2037         /* This result is inexact.  */
2038         sig1 |= 1;
2039         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2040                                             arg0_sign ^ arg1_sign, exp,
2041                                             sig0, sig1, &env->fp_status);
2042     } else {
2043         int32_t aexp;
2044         uint64_t asig0, asig1, asig2;
2045         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2046         FloatX80RoundPrec save_prec =
2047             env->fp_status.floatx80_rounding_precision;
2048         env->fp_status.float_rounding_mode = float_round_nearest_even;
2049         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2050
2051         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2052         /*
2053          * Multiply by the second argument to compute the required
2054          * result.
2055          */
2056         if (arg1_exp == 0) {
2057             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2058         }
2059         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2060         aexp += arg1_exp - 0x3ffe;
2061         /* This result is inexact.  */
2062         asig1 |= 1;
2063         env->fp_status.float_rounding_mode = save_mode;
2064         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2065                                             arg0_sign ^ arg1_sign, aexp,
2066                                             asig0, asig1, &env->fp_status);
2067         env->fp_status.floatx80_rounding_precision = save_prec;
2068     }
2069     fpop(env);
2070     merge_exception_flags(env, old_flags);
2071 }
2072
2073 void helper_fyl2x(CPUX86State *env)
2074 {
2075     uint8_t old_flags = save_exception_flags(env);
2076     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2077     int32_t arg0_exp = extractFloatx80Exp(ST0);
2078     bool arg0_sign = extractFloatx80Sign(ST0);
2079     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2080     int32_t arg1_exp = extractFloatx80Exp(ST1);
2081     bool arg1_sign = extractFloatx80Sign(ST1);
2082
2083     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2084         float_raise(float_flag_invalid, &env->fp_status);
2085         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2086     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2087         float_raise(float_flag_invalid, &env->fp_status);
2088         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2089     } else if (floatx80_invalid_encoding(ST0) ||
2090                floatx80_invalid_encoding(ST1)) {
2091         float_raise(float_flag_invalid, &env->fp_status);
2092         ST1 = floatx80_default_nan(&env->fp_status);
2093     } else if (floatx80_is_any_nan(ST0)) {
2094         ST1 = ST0;
2095     } else if (floatx80_is_any_nan(ST1)) {
2096         /* Pass this NaN through.  */
2097     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2098         float_raise(float_flag_invalid, &env->fp_status);
2099         ST1 = floatx80_default_nan(&env->fp_status);
2100     } else if (floatx80_is_infinity(ST1)) {
2101         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2102                                              &env->fp_status);
2103         switch (cmp) {
2104         case float_relation_less:
2105             ST1 = floatx80_chs(ST1);
2106             break;
2107         case float_relation_greater:
2108             /* Result is infinity of the same sign as ST1.  */
2109             break;
2110         default:
2111             float_raise(float_flag_invalid, &env->fp_status);
2112             ST1 = floatx80_default_nan(&env->fp_status);
2113             break;
2114         }
2115     } else if (floatx80_is_infinity(ST0)) {
2116         if (floatx80_is_zero(ST1)) {
2117             float_raise(float_flag_invalid, &env->fp_status);
2118             ST1 = floatx80_default_nan(&env->fp_status);
2119         } else if (arg1_sign) {
2120             ST1 = floatx80_chs(ST0);
2121         } else {
2122             ST1 = ST0;
2123         }
2124     } else if (floatx80_is_zero(ST0)) {
2125         if (floatx80_is_zero(ST1)) {
2126             float_raise(float_flag_invalid, &env->fp_status);
2127             ST1 = floatx80_default_nan(&env->fp_status);
2128         } else {
2129             /* Result is infinity with opposite sign to ST1.  */
2130             float_raise(float_flag_divbyzero, &env->fp_status);
2131             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2132                                 0x8000000000000000ULL);
2133         }
2134     } else if (floatx80_is_zero(ST1)) {
2135         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2136             ST1 = floatx80_chs(ST1);
2137         }
2138         /* Otherwise, ST1 is already the correct result.  */
2139     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2140         if (arg1_sign) {
2141             ST1 = floatx80_chs(floatx80_zero);
2142         } else {
2143             ST1 = floatx80_zero;
2144         }
2145     } else {
2146         int32_t int_exp;
2147         floatx80 arg0_m1;
2148         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2149         FloatX80RoundPrec save_prec =
2150             env->fp_status.floatx80_rounding_precision;
2151         env->fp_status.float_rounding_mode = float_round_nearest_even;
2152         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2153
2154         if (arg0_exp == 0) {
2155             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2156         }
2157         if (arg1_exp == 0) {
2158             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2159         }
2160         int_exp = arg0_exp - 0x3fff;
2161         if (arg0_sig > 0xb504f333f9de6484ULL) {
2162             ++int_exp;
2163         }
2164         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2165                                                &env->fp_status),
2166                                floatx80_one, &env->fp_status);
2167         if (floatx80_is_zero(arg0_m1)) {
2168             /* Exact power of 2; multiply by ST1.  */
2169             env->fp_status.float_rounding_mode = save_mode;
2170             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2171                                ST1, &env->fp_status);
2172         } else {
2173             bool asign = extractFloatx80Sign(arg0_m1);
2174             int32_t aexp;
2175             uint64_t asig0, asig1, asig2;
2176             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2177             if (int_exp != 0) {
2178                 bool isign = (int_exp < 0);
2179                 int32_t iexp;
2180                 uint64_t isig;
2181                 int shift;
2182                 int_exp = isign ? -int_exp : int_exp;
2183                 shift = clz32(int_exp) + 32;
2184                 isig = int_exp;
2185                 isig <<= shift;
2186                 iexp = 0x403e - shift;
2187                 shift128RightJamming(asig0, asig1, iexp - aexp,
2188                                      &asig0, &asig1);
2189                 if (asign == isign) {
2190                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2191                 } else {
2192                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2193                 }
2194                 aexp = iexp;
2195                 asign = isign;
2196             }
2197             /*
2198              * Multiply by the second argument to compute the required
2199              * result.
2200              */
2201             if (arg1_exp == 0) {
2202                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2203             }
2204             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2205             aexp += arg1_exp - 0x3ffe;
2206             /* This result is inexact.  */
2207             asig1 |= 1;
2208             env->fp_status.float_rounding_mode = save_mode;
2209             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2210                                                 asign ^ arg1_sign, aexp,
2211                                                 asig0, asig1, &env->fp_status);
2212         }
2213
2214         env->fp_status.floatx80_rounding_precision = save_prec;
2215     }
2216     fpop(env);
2217     merge_exception_flags(env, old_flags);
2218 }
2219
2220 void helper_fsqrt(CPUX86State *env)
2221 {
2222     uint8_t old_flags = save_exception_flags(env);
2223     if (floatx80_is_neg(ST0)) {
2224         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2225         env->fpus |= 0x400;
2226     }
2227     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2228     merge_exception_flags(env, old_flags);
2229 }
2230
2231 void helper_fsincos(CPUX86State *env)
2232 {
2233     double fptemp = floatx80_to_double(env, ST0);
2234
2235     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2236         env->fpus |= 0x400;
2237     } else {
2238         ST0 = double_to_floatx80(env, sin(fptemp));
2239         fpush(env);
2240         ST0 = double_to_floatx80(env, cos(fptemp));
2241         env->fpus &= ~0x400;  /* C2 <-- 0 */
2242         /* the above code is for |arg| < 2**63 only */
2243     }
2244 }
2245
2246 void helper_frndint(CPUX86State *env)
2247 {
2248     uint8_t old_flags = save_exception_flags(env);
2249     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2250     merge_exception_flags(env, old_flags);
2251 }
2252
/*
 * FSCALE: scale ST0 by a power of two taken from ST1.
 *
 * Special operands (invalid encodings, NaNs, an infinite ST1) are handled
 * explicitly; otherwise ST1 is truncated to a 32-bit integer n and ST0 is
 * replaced by floatx80_scalbn(ST0, n).  Exception flags are saved on entry
 * and merged back on exit.
 */
void helper_fscale(CPUX86State *env)
{
    uint8_t old_flags = save_exception_flags(env);
    if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
        /* Either operand has an invalid encoding: invalid, default NaN. */
        float_raise(float_flag_invalid, &env->fp_status);
        ST0 = floatx80_default_nan(&env->fp_status);
    } else if (floatx80_is_any_nan(ST1)) {
        /*
         * ST1 is a NaN: the result is ST1, silenced if it is signaling.
         * A signaling NaN in either operand raises invalid.
         */
        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
            float_raise(float_flag_invalid, &env->fp_status);
        }
        ST0 = ST1;
        if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
            float_raise(float_flag_invalid, &env->fp_status);
            ST0 = floatx80_silence_nan(ST0, &env->fp_status);
        }
    } else if (floatx80_is_infinity(ST1) &&
               !floatx80_invalid_encoding(ST0) &&
               !floatx80_is_any_nan(ST0)) {
        /* Infinite scale applied to a non-NaN ST0. */
        if (floatx80_is_neg(ST1)) {
            /* -inf scale: inf becomes invalid, anything else a signed zero. */
            if (floatx80_is_infinity(ST0)) {
                float_raise(float_flag_invalid, &env->fp_status);
                ST0 = floatx80_default_nan(&env->fp_status);
            } else {
                ST0 = (floatx80_is_neg(ST0) ?
                       floatx80_chs(floatx80_zero) :
                       floatx80_zero);
            }
        } else {
            /* +inf scale: zero becomes invalid, anything else a signed inf. */
            if (floatx80_is_zero(ST0)) {
                float_raise(float_flag_invalid, &env->fp_status);
                ST0 = floatx80_default_nan(&env->fp_status);
            } else {
                ST0 = (floatx80_is_neg(ST0) ?
                       floatx80_chs(floatx80_infinity) :
                       floatx80_infinity);
            }
        }
    } else {
        int n;
        FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
        uint8_t save_flags = get_float_exception_flags(&env->fp_status);
        /*
         * Convert ST1 to an integer with the flags cleared, then restore
         * them, so that flags raised by the conversion are discarded.
         */
        set_float_exception_flags(0, &env->fp_status);
        n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
        set_float_exception_flags(save_flags, &env->fp_status);
        /* Scale with floatx80_precision_x, then restore the precision. */
        env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
        ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
        env->fp_status.floatx80_rounding_precision = save;
    }
    merge_exception_flags(env, old_flags);
}
2303
2304 void helper_fsin(CPUX86State *env)
2305 {
2306     double fptemp = floatx80_to_double(env, ST0);
2307
2308     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2309         env->fpus |= 0x400;
2310     } else {
2311         ST0 = double_to_floatx80(env, sin(fptemp));
2312         env->fpus &= ~0x400;  /* C2 <-- 0 */
2313         /* the above code is for |arg| < 2**53 only */
2314     }
2315 }
2316
2317 void helper_fcos(CPUX86State *env)
2318 {
2319     double fptemp = floatx80_to_double(env, ST0);
2320
2321     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2322         env->fpus |= 0x400;
2323     } else {
2324         ST0 = double_to_floatx80(env, cos(fptemp));
2325         env->fpus &= ~0x400;  /* C2 <-- 0 */
2326         /* the above code is for |arg| < 2**63 only */
2327     }
2328 }
2329
2330 void helper_fxam_ST0(CPUX86State *env)
2331 {
2332     CPU_LDoubleU temp;
2333     int expdif;
2334
2335     temp.d = ST0;
2336
2337     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2338     if (SIGND(temp)) {
2339         env->fpus |= 0x200; /* C1 <-- 1 */
2340     }
2341
2342     if (env->fptags[env->fpstt]) {
2343         env->fpus |= 0x4100; /* Empty */
2344         return;
2345     }
2346
2347     expdif = EXPD(temp);
2348     if (expdif == MAXEXPD) {
2349         if (MANTD(temp) == 0x8000000000000000ULL) {
2350             env->fpus |= 0x500; /* Infinity */
2351         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2352             env->fpus |= 0x100; /* NaN */
2353         }
2354     } else if (expdif == 0) {
2355         if (MANTD(temp) == 0) {
2356             env->fpus |=  0x4000; /* Zero */
2357         } else {
2358             env->fpus |= 0x4400; /* Denormal */
2359         }
2360     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2361         env->fpus |= 0x400;
2362     }
2363 }
2364
2365 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2366                       uintptr_t retaddr)
2367 {
2368     int fpus, fptag, exp, i;
2369     uint64_t mant;
2370     CPU_LDoubleU tmp;
2371
2372     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2373     fptag = 0;
2374     for (i = 7; i >= 0; i--) {
2375         fptag <<= 2;
2376         if (env->fptags[i]) {
2377             fptag |= 3;
2378         } else {
2379             tmp.d = env->fpregs[i].d;
2380             exp = EXPD(tmp);
2381             mant = MANTD(tmp);
2382             if (exp == 0 && mant == 0) {
2383                 /* zero */
2384                 fptag |= 1;
2385             } else if (exp == 0 || exp == MAXEXPD
2386                        || (mant & (1LL << 63)) == 0) {
2387                 /* NaNs, infinity, denormal */
2388                 fptag |= 2;
2389             }
2390         }
2391     }
2392     if (data32) {
2393         /* 32 bit */
2394         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2395         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2396         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2397         cpu_stl_data_ra(env, ptr + 12, env->fpip, retaddr); /* fpip */
2398         cpu_stl_data_ra(env, ptr + 16, env->fpcs, retaddr); /* fpcs */
2399         cpu_stl_data_ra(env, ptr + 20, env->fpdp, retaddr); /* fpoo */
2400         cpu_stl_data_ra(env, ptr + 24, env->fpds, retaddr); /* fpos */
2401     } else {
2402         /* 16 bit */
2403         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2404         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2405         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2406         cpu_stw_data_ra(env, ptr + 6, env->fpip, retaddr);
2407         cpu_stw_data_ra(env, ptr + 8, env->fpcs, retaddr);
2408         cpu_stw_data_ra(env, ptr + 10, env->fpdp, retaddr);
2409         cpu_stw_data_ra(env, ptr + 12, env->fpds, retaddr);
2410     }
2411 }
2412
2413 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2414 {
2415     do_fstenv(env, ptr, data32, GETPC());
2416 }
2417
2418 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2419 {
2420     env->fpstt = (fpus >> 11) & 7;
2421     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2422     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2423 #if !defined(CONFIG_USER_ONLY)
2424     if (!(env->fpus & FPUS_SE)) {
2425         /*
2426          * Here the processor deasserts FERR#; in response, the chipset deasserts
2427          * IGNNE#.
2428          */
2429         cpu_clear_ignne();
2430     }
2431 #endif
2432 }
2433
2434 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2435                       uintptr_t retaddr)
2436 {
2437     int i, fpus, fptag;
2438
2439     if (data32) {
2440         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2441         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2442         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2443     } else {
2444         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2445         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2446         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2447     }
2448     cpu_set_fpus(env, fpus);
2449     for (i = 0; i < 8; i++) {
2450         env->fptags[i] = ((fptag & 3) == 3);
2451         fptag >>= 2;
2452     }
2453 }
2454
2455 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2456 {
2457     do_fldenv(env, ptr, data32, GETPC());
2458 }
2459
2460 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2461                      uintptr_t retaddr)
2462 {
2463     floatx80 tmp;
2464     int i;
2465
2466     do_fstenv(env, ptr, data32, retaddr);
2467
2468     ptr += (target_ulong)14 << data32;
2469     for (i = 0; i < 8; i++) {
2470         tmp = ST(i);
2471         do_fstt(env, tmp, ptr, retaddr);
2472         ptr += 10;
2473     }
2474
2475     do_fninit(env);
2476 }
2477
2478 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2479 {
2480     do_fsave(env, ptr, data32, GETPC());
2481 }
2482
2483 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2484                       uintptr_t retaddr)
2485 {
2486     floatx80 tmp;
2487     int i;
2488
2489     do_fldenv(env, ptr, data32, retaddr);
2490     ptr += (target_ulong)14 << data32;
2491
2492     for (i = 0; i < 8; i++) {
2493         tmp = do_fldt(env, ptr, retaddr);
2494         ST(i) = tmp;
2495         ptr += 10;
2496     }
2497 }
2498
2499 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2500 {
2501     do_frstor(env, ptr, data32, GETPC());
2502 }
2503
/* Byte offset of member X within X86XSaveArea. */
#define XO(X)  offsetof(X86XSaveArea, X)
2505
2506 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2507 {
2508     int fpus, fptag, i;
2509     target_ulong addr;
2510
2511     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2512     fptag = 0;
2513     for (i = 0; i < 8; i++) {
2514         fptag |= (env->fptags[i] << i);
2515     }
2516
2517     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2518     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2519     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2520
2521     /* In 32-bit mode this is eip, sel, dp, sel.
2522        In 64-bit mode this is rip, rdp.
2523        But in either case we don't write actual data, just zeros.  */
2524     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2525     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2526
2527     addr = ptr + XO(legacy.fpregs);
2528     for (i = 0; i < 8; i++) {
2529         floatx80 tmp = ST(i);
2530         do_fstt(env, tmp, addr, ra);
2531         addr += 16;
2532     }
2533 }
2534
2535 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2536 {
2537     update_mxcsr_from_sse_status(env);
2538     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2539     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2540 }
2541
2542 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2543 {
2544     int i, nb_xmm_regs;
2545     target_ulong addr;
2546
2547     if (env->hflags & HF_CS64_MASK) {
2548         nb_xmm_regs = 16;
2549     } else {
2550         nb_xmm_regs = 8;
2551     }
2552
2553     addr = ptr + XO(legacy.xmm_regs);
2554     for (i = 0; i < nb_xmm_regs; i++) {
2555         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2556         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2557         addr += 16;
2558     }
2559 }
2560
2561 static void do_xsave_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2562 {
2563     int i, nb_xmm_regs;
2564
2565     if (env->hflags & HF_CS64_MASK) {
2566         nb_xmm_regs = 16;
2567     } else {
2568         nb_xmm_regs = 8;
2569     }
2570
2571     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2572         cpu_stq_data_ra(env, ptr, env->xmm_regs[i].ZMM_Q(2), ra);
2573         cpu_stq_data_ra(env, ptr + 8, env->xmm_regs[i].ZMM_Q(3), ra);
2574     }
2575 }
2576
2577 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2578 {
2579     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2580     int i;
2581
2582     for (i = 0; i < 4; i++, addr += 16) {
2583         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2584         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2585     }
2586 }
2587
2588 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2589 {
2590     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2591                     env->bndcs_regs.cfgu, ra);
2592     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2593                     env->bndcs_regs.sts, ra);
2594 }
2595
2596 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2597 {
2598     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2599 }
2600
2601 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2602 {
2603     /* The operand must be 16 byte aligned */
2604     if (ptr & 0xf) {
2605         raise_exception_ra(env, EXCP0D_GPF, ra);
2606     }
2607
2608     do_xsave_fpu(env, ptr, ra);
2609
2610     if (env->cr[4] & CR4_OSFXSR_MASK) {
2611         do_xsave_mxcsr(env, ptr, ra);
2612         /* Fast FXSAVE leaves out the XMM registers */
2613         if (!(env->efer & MSR_EFER_FFXSR)
2614             || (env->hflags & HF_CPL_MASK)
2615             || !(env->hflags & HF_LMA_MASK)) {
2616             do_xsave_sse(env, ptr, ra);
2617         }
2618     }
2619 }
2620
2621 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2622 {
2623     do_fxsave(env, ptr, GETPC());
2624 }
2625
2626 static uint64_t get_xinuse(CPUX86State *env)
2627 {
2628     uint64_t inuse = -1;
2629
2630     /* For the most part, we don't track XINUSE.  We could calculate it
2631        here for all components, but it's probably less work to simply
2632        indicate in use.  That said, the state of BNDREGS is important
2633        enough to track in HFLAGS, so we might as well use that here.  */
2634     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2635        inuse &= ~XSTATE_BNDREGS_MASK;
2636     }
2637     return inuse;
2638 }
2639
2640 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2641                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2642 {
2643     uint64_t old_bv, new_bv;
2644
2645     /* The OS must have enabled XSAVE.  */
2646     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2647         raise_exception_ra(env, EXCP06_ILLOP, ra);
2648     }
2649
2650     /* The operand must be 64 byte aligned.  */
2651     if (ptr & 63) {
2652         raise_exception_ra(env, EXCP0D_GPF, ra);
2653     }
2654
2655     /* Never save anything not enabled by XCR0.  */
2656     rfbm &= env->xcr0;
2657     opt &= rfbm;
2658
2659     if (opt & XSTATE_FP_MASK) {
2660         do_xsave_fpu(env, ptr, ra);
2661     }
2662     if (rfbm & XSTATE_SSE_MASK) {
2663         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2664         do_xsave_mxcsr(env, ptr, ra);
2665     }
2666     if (opt & XSTATE_SSE_MASK) {
2667         do_xsave_sse(env, ptr, ra);
2668     }
2669     if (opt & XSTATE_YMM_MASK) {
2670         do_xsave_ymmh(env, ptr + XO(avx_state), ra);
2671     }
2672     if (opt & XSTATE_BNDREGS_MASK) {
2673         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2674     }
2675     if (opt & XSTATE_BNDCSR_MASK) {
2676         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2677     }
2678     if (opt & XSTATE_PKRU_MASK) {
2679         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2680     }
2681
2682     /* Update the XSTATE_BV field.  */
2683     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2684     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2685     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2686 }
2687
2688 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2689 {
2690     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2691 }
2692
2693 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2694 {
2695     uint64_t inuse = get_xinuse(env);
2696     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2697 }
2698
2699 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2700 {
2701     int i, fpuc, fpus, fptag;
2702     target_ulong addr;
2703
2704     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2705     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2706     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2707     cpu_set_fpuc(env, fpuc);
2708     cpu_set_fpus(env, fpus);
2709     fptag ^= 0xff;
2710     for (i = 0; i < 8; i++) {
2711         env->fptags[i] = ((fptag >> i) & 1);
2712     }
2713
2714     addr = ptr + XO(legacy.fpregs);
2715     for (i = 0; i < 8; i++) {
2716         floatx80 tmp = do_fldt(env, addr, ra);
2717         ST(i) = tmp;
2718         addr += 16;
2719     }
2720 }
2721
2722 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2723 {
2724     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2725 }
2726
2727 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2728 {
2729     int i, nb_xmm_regs;
2730     target_ulong addr;
2731
2732     if (env->hflags & HF_CS64_MASK) {
2733         nb_xmm_regs = 16;
2734     } else {
2735         nb_xmm_regs = 8;
2736     }
2737
2738     addr = ptr + XO(legacy.xmm_regs);
2739     for (i = 0; i < nb_xmm_regs; i++) {
2740         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2741         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2742         addr += 16;
2743     }
2744 }
2745
2746 static void do_clear_sse(CPUX86State *env)
2747 {
2748     int i, nb_xmm_regs;
2749
2750     if (env->hflags & HF_CS64_MASK) {
2751         nb_xmm_regs = 16;
2752     } else {
2753         nb_xmm_regs = 8;
2754     }
2755
2756     for (i = 0; i < nb_xmm_regs; i++) {
2757         env->xmm_regs[i].ZMM_Q(0) = 0;
2758         env->xmm_regs[i].ZMM_Q(1) = 0;
2759     }
2760 }
2761
2762 static void do_xrstor_ymmh(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2763 {
2764     int i, nb_xmm_regs;
2765
2766     if (env->hflags & HF_CS64_MASK) {
2767         nb_xmm_regs = 16;
2768     } else {
2769         nb_xmm_regs = 8;
2770     }
2771
2772     for (i = 0; i < nb_xmm_regs; i++, ptr += 16) {
2773         env->xmm_regs[i].ZMM_Q(2) = cpu_ldq_data_ra(env, ptr, ra);
2774         env->xmm_regs[i].ZMM_Q(3) = cpu_ldq_data_ra(env, ptr + 8, ra);
2775     }
2776 }
2777
2778 static void do_clear_ymmh(CPUX86State *env)
2779 {
2780     int i, nb_xmm_regs;
2781
2782     if (env->hflags & HF_CS64_MASK) {
2783         nb_xmm_regs = 16;
2784     } else {
2785         nb_xmm_regs = 8;
2786     }
2787
2788     for (i = 0; i < nb_xmm_regs; i++) {
2789         env->xmm_regs[i].ZMM_Q(2) = 0;
2790         env->xmm_regs[i].ZMM_Q(3) = 0;
2791     }
2792 }
2793
2794 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2795 {
2796     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2797     int i;
2798
2799     for (i = 0; i < 4; i++, addr += 16) {
2800         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2801         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2802     }
2803 }
2804
2805 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2806 {
2807     /* FIXME: Extend highest implemented bit of linear address.  */
2808     env->bndcs_regs.cfgu
2809         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2810     env->bndcs_regs.sts
2811         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2812 }
2813
2814 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2815 {
2816     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2817 }
2818
2819 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2820 {
2821     /* The operand must be 16 byte aligned */
2822     if (ptr & 0xf) {
2823         raise_exception_ra(env, EXCP0D_GPF, ra);
2824     }
2825
2826     do_xrstor_fpu(env, ptr, ra);
2827
2828     if (env->cr[4] & CR4_OSFXSR_MASK) {
2829         do_xrstor_mxcsr(env, ptr, ra);
2830         /* Fast FXRSTOR leaves out the XMM registers */
2831         if (!(env->efer & MSR_EFER_FFXSR)
2832             || (env->hflags & HF_CPL_MASK)
2833             || !(env->hflags & HF_LMA_MASK)) {
2834             do_xrstor_sse(env, ptr, ra);
2835         }
2836     }
2837 }
2838
2839 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2840 {
2841     do_fxrstor(env, ptr, GETPC());
2842 }
2843
/*
 * Common body of XRSTOR (standard form only).
 *
 * @rfbm: requested-feature bitmap; masked with XCR0 before use.
 *
 * After validating CR4.OSXSAVE, the alignment of @ptr and the XSAVE header,
 * each component selected by @rfbm is either loaded from the save area
 * (when its XSTATE_BV bit is set) or reset to its initial value.
 */
static void do_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm, uintptr_t ra)
{
    uint64_t xstate_bv, xcomp_bv, reserve0;

    rfbm &= env->xcr0;

    /* The OS must have enabled XSAVE.  */
    if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
        raise_exception_ra(env, EXCP06_ILLOP, ra);
    }

    /* The operand must be 64 byte aligned.  */
    if (ptr & 63) {
        raise_exception_ra(env, EXCP0D_GPF, ra);
    }

    xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);

    /* Bit 63 set is treated as the (unimplemented) compact form. */
    if ((int64_t)xstate_bv < 0) {
        /* FIXME: Compact form.  */
        raise_exception_ra(env, EXCP0D_GPF, ra);
    }

    /* Standard form.  */

    /* The XSTATE_BV field must not set bits not present in XCR0.  */
    if (xstate_bv & ~env->xcr0) {
        raise_exception_ra(env, EXCP0D_GPF, ra);
    }

    /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
       revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
       describes only XCOMP_BV, but the description of the standard form
       of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
       includes the next 64-bit field.  */
    xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
    reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
    if (xcomp_bv || reserve0) {
        raise_exception_ra(env, EXCP0D_GPF, ra);
    }

    /* x87: load from memory, or reinitialise and zero the registers. */
    if (rfbm & XSTATE_FP_MASK) {
        if (xstate_bv & XSTATE_FP_MASK) {
            do_xrstor_fpu(env, ptr, ra);
        } else {
            do_fninit(env);
            memset(env->fpregs, 0, sizeof(env->fpregs));
        }
    }
    if (rfbm & XSTATE_SSE_MASK) {
        /* Note that the standard form of XRSTOR loads MXCSR from memory
           whether or not the XSTATE_BV bit is set.  */
        do_xrstor_mxcsr(env, ptr, ra);
        if (xstate_bv & XSTATE_SSE_MASK) {
            do_xrstor_sse(env, ptr, ra);
        } else {
            do_clear_sse(env);
        }
    }
    if (rfbm & XSTATE_YMM_MASK) {
        if (xstate_bv & XSTATE_YMM_MASK) {
            do_xrstor_ymmh(env, ptr + XO(avx_state), ra);
        } else {
            do_clear_ymmh(env);
        }
    }
    /* BNDREGS: HF_MPX_IU_MASK tracks whether the bound registers are in use. */
    if (rfbm & XSTATE_BNDREGS_MASK) {
        if (xstate_bv & XSTATE_BNDREGS_MASK) {
            do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
            env->hflags |= HF_MPX_IU_MASK;
        } else {
            memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
            env->hflags &= ~HF_MPX_IU_MASK;
        }
    }
    if (rfbm & XSTATE_BNDCSR_MASK) {
        if (xstate_bv & XSTATE_BNDCSR_MASK) {
            do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
        } else {
            memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
        }
        cpu_sync_bndcs_hflags(env);
    }
    if (rfbm & XSTATE_PKRU_MASK) {
        uint64_t old_pkru = env->pkru;
        if (xstate_bv & XSTATE_PKRU_MASK) {
            do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
        } else {
            env->pkru = 0;
        }
        /* The TLB is flushed whenever the PKRU value changes. */
        if (env->pkru != old_pkru) {
            CPUState *cs = env_cpu(env);
            tlb_flush(cs);
        }
    }
}
2940
2941 #undef XO
2942
2943 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2944 {
2945     do_xrstor(env, ptr, rfbm, GETPC());
2946 }
2947
2948 #if defined(CONFIG_USER_ONLY)
/* User-only build: run do_fsave() with 0 as the return address. */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    do_fsave(env, ptr, data32, 0);
}
2953
/* User-only build: run do_frstor() with 0 as the return address. */
void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    do_frstor(env, ptr, data32, 0);
}
2958
/* User-only build: run do_fxsave() with 0 as the return address. */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    do_fxsave(env, ptr, 0);
}
2963
/* User-only build: run do_fxrstor() with 0 as the return address. */
void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    do_fxrstor(env, ptr, 0);
}
2968
/*
 * User-only build: run do_xsave() requesting every component (rfbm and opt
 * are all ones), with get_xinuse() as XINUSE and 0 as the return address.
 */
void cpu_x86_xsave(CPUX86State *env, target_ulong ptr)
{
    do_xsave(env, ptr, -1, get_xinuse(env), -1, 0);
}
2973
/*
 * User-only build: run do_xrstor() requesting every component (rfbm is all
 * ones), with 0 as the return address.
 */
void cpu_x86_xrstor(CPUX86State *env, target_ulong ptr)
{
    do_xrstor(env, ptr, -1, 0);
}
2978 #endif
2979
2980 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2981 {
2982     /* The OS must have enabled XSAVE.  */
2983     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2984         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2985     }
2986
2987     switch (ecx) {
2988     case 0:
2989         return env->xcr0;
2990     case 1:
2991         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2992             return env->xcr0 & get_xinuse(env);
2993         }
2994         break;
2995     }
2996     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2997 }
2998
2999 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
3000 {
3001     uint32_t dummy, ena_lo, ena_hi;
3002     uint64_t ena;
3003
3004     /* The OS must have enabled XSAVE.  */
3005     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
3006         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
3007     }
3008
3009     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
3010     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
3011         goto do_gpf;
3012     }
3013
3014     /* Disallow enabling unimplemented features.  */
3015     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
3016     ena = ((uint64_t)ena_hi << 32) | ena_lo;
3017     if (mask & ~ena) {
3018         goto do_gpf;
3019     }
3020
3021     /* Disallow enabling only half of MPX.  */
3022     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
3023         & XSTATE_BNDCSR_MASK) {
3024         goto do_gpf;
3025     }
3026
3027     env->xcr0 = mask;
3028     cpu_sync_bndcs_hflags(env);
3029     cpu_sync_avx_hflag(env);
3030     return;
3031
3032  do_gpf:
3033     raise_exception_ra(env, EXCP0D_GPF, GETPC());
3034 }
3035
3036 /* MMX/SSE */
/* XXX: optimize by storing fpstt and fptags in the static cpu state */
3038
/* MXCSR bit that enables flushing of denormal inputs to zero */
#define SSE_DAZ             0x0040
/* MXCSR rounding-control field (position and mask) */
#define SSE_RC_SHIFT        13
#define SSE_RC_MASK         (3 << SSE_RC_SHIFT)
/* MXCSR bit that enables flush-to-zero of results */
#define SSE_FZ              0x8000
3043
3044 void update_mxcsr_status(CPUX86State *env)
3045 {
3046     uint32_t mxcsr = env->mxcsr;
3047     int rnd_type;
3048
3049     /* set rounding mode */
3050     rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT;
3051     set_x86_rounding_mode(rnd_type, &env->sse_status);
3052
3053     /* Set exception flags.  */
3054     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
3055                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
3056                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
3057                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
3058                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
3059                               &env->sse_status);
3060
3061     /* set denormals are zero */
3062     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
3063
3064     /* set flush to zero */
3065     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
3066 }
3067
3068 void update_mxcsr_from_sse_status(CPUX86State *env)
3069 {
3070     uint8_t flags = get_float_exception_flags(&env->sse_status);
3071     /*
3072      * The MXCSR denormal flag has opposite semantics to
3073      * float_flag_input_denormal (the softfloat code sets that flag
3074      * only when flushing input denormals to zero, but SSE sets it
3075      * only when not flushing them to zero), so is not converted
3076      * here.
3077      */
3078     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3079                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3080                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3081                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3082                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3083                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3084                     0));
3085 }
3086
/* TCG helper: refresh env->mxcsr via update_mxcsr_from_sse_status(). */
void helper_update_mxcsr(CPUX86State *env)
{
    update_mxcsr_from_sse_status(env);
}
3091
/* TCG helper: install @val as the new MXCSR via cpu_set_mxcsr(). */
void helper_ldmxcsr(CPUX86State *env, uint32_t val)
{
    cpu_set_mxcsr(env, val);
}
3096
3097 void helper_enter_mmx(CPUX86State *env)
3098 {
3099     env->fpstt = 0;
3100     *(uint32_t *)(env->fptags) = 0;
3101     *(uint32_t *)(env->fptags + 4) = 0;
3102 }
3103
3104 void helper_emms(CPUX86State *env)
3105 {
3106     /* set to empty state */
3107     *(uint32_t *)(env->fptags) = 0x01010101;
3108     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3109 }
3110
3111 #define SHIFT 0
3112 #include "ops_sse.h"
3113
3114 #define SHIFT 1
3115 #include "ops_sse.h"
3116
3117 #define SHIFT 2
3118 #include "ops_sse.h"