/*---------------------------------------------------------------*/
/*--- begin                                 host_amd64_defs.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
#include "libvex_basictypes.h"
#include "libvex.h"
#include "libvex_trc_values.h"

#include "main_util.h"
#include "host_generic_regs.h"
#include "host_amd64_defs.h"
/* --------- Registers. --------- */

const RRegUniverse* getRRegUniverse_AMD64 ( void )
{
   /* The real-register universe is a big constant, so we just want to
      initialise it once. */
   static RRegUniverse rRegUniverse_AMD64;
   static Bool         rRegUniverse_AMD64_initted = False;

   /* Handy shorthand, nothing more */
   RRegUniverse* ru = &rRegUniverse_AMD64;

   /* This isn't thread-safe.  Sigh. */
   if (LIKELY(rRegUniverse_AMD64_initted))
      return ru;

   RRegUniverse__init(ru);

   /* Add the registers.  The initial segment of this array must be
      those available for allocation by reg-alloc, and those that
      follow are not available for allocation. */
   ru->allocable_start[HRcInt64] = ru->size;
   ru->regs[ru->size++] = hregAMD64_R12();
   ru->regs[ru->size++] = hregAMD64_R13();
   ru->regs[ru->size++] = hregAMD64_R14();
   ru->regs[ru->size++] = hregAMD64_R15();
   ru->regs[ru->size++] = hregAMD64_RBX();
   ru->regs[ru->size++] = hregAMD64_RSI();
   ru->regs[ru->size++] = hregAMD64_RDI();
   ru->regs[ru->size++] = hregAMD64_R8();
   ru->regs[ru->size++] = hregAMD64_R9();
   ru->regs[ru->size++] = hregAMD64_R10();
   ru->allocable_end[HRcInt64] = ru->size - 1;

   ru->allocable_start[HRcVec128] = ru->size;
   ru->regs[ru->size++] = hregAMD64_XMM3();
   ru->regs[ru->size++] = hregAMD64_XMM4();
   ru->regs[ru->size++] = hregAMD64_XMM5();
   ru->regs[ru->size++] = hregAMD64_XMM6();
   ru->regs[ru->size++] = hregAMD64_XMM7();
   ru->regs[ru->size++] = hregAMD64_XMM8();
   ru->regs[ru->size++] = hregAMD64_XMM9();
   ru->regs[ru->size++] = hregAMD64_XMM10();
   ru->regs[ru->size++] = hregAMD64_XMM11();
   ru->regs[ru->size++] = hregAMD64_XMM12();
   ru->allocable_end[HRcVec128] = ru->size - 1;
   ru->allocable = ru->size;

   /* And other regs, not available to the allocator. */
   ru->regs[ru->size++] = hregAMD64_RAX();
   ru->regs[ru->size++] = hregAMD64_RCX();
   ru->regs[ru->size++] = hregAMD64_RDX();
   ru->regs[ru->size++] = hregAMD64_RSP();
   ru->regs[ru->size++] = hregAMD64_RBP();
   ru->regs[ru->size++] = hregAMD64_R11();
   ru->regs[ru->size++] = hregAMD64_XMM0();
   ru->regs[ru->size++] = hregAMD64_XMM1();

   rRegUniverse_AMD64_initted = True;

   RRegUniverse__check_is_sane(ru);
   return ru;
}
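
/* Illustrative sketch, not part of the original file: one way a caller
   might walk the allocatable 64-bit integer registers of the universe
   built above.  Uses only fields and helpers already present in this
   file; kept under "#if 0" so it is not compiled. */
#if 0
static void showAllocableInt64Regs ( void )
{
   UInt k;
   const RRegUniverse* ru = getRRegUniverse_AMD64();
   for (k = ru->allocable_start[HRcInt64];
        k <= ru->allocable_end[HRcInt64]; k++) {
      ppHRegAMD64(ru->regs[k]);
      vex_printf("\n");
   }
}
#endif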

UInt ppHRegAMD64 ( HReg reg )
{
   Int r;
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      return ppHReg(reg);
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         return vex_printf("%s", ireg64_names[r]);
      case HRcVec128:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         return vex_printf("%%xmm%d", r);
      default:
         vpanic("ppHRegAMD64");
   }
}

static UInt ppHRegAMD64_lo32 ( HReg reg )
{
   Int r;
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx",  "%ebx",  "%esp",  "%ebp",  "%esi",  "%edi",
         "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
   /* Be generic for all virtual regs. */
   if (hregIsVirtual(reg)) {
      UInt written = ppHReg(reg);
      written += vex_printf("d");
      return written;
   }
   /* But specific for real regs. */
   switch (hregClass(reg)) {
      case HRcInt64:
         r = hregEncoding(reg);
         vassert(r >= 0 && r < 16);
         return vex_printf("%s", ireg32_names[r]);
      default:
         vpanic("ppHRegAMD64_lo32: invalid regclass");
   }
}

/* --------- Condition codes, Intel encoding. --------- */

const HChar* showAMD64CondCode ( AMD64CondCode cond )
{
   switch (cond) {
      case Acc_O:      return "o";
      case Acc_NO:     return "no";
      case Acc_B:      return "b";
      case Acc_NB:     return "nb";
      case Acc_Z:      return "z";
      case Acc_NZ:     return "nz";
      case Acc_BE:     return "be";
      case Acc_NBE:    return "nbe";
      case Acc_S:      return "s";
      case Acc_NS:     return "ns";
      case Acc_P:      return "p";
      case Acc_NP:     return "np";
      case Acc_L:      return "l";
      case Acc_NL:     return "nl";
      case Acc_LE:     return "le";
      case Acc_NLE:    return "nle";
      case Acc_ALWAYS: return "ALWAYS";
      default: vpanic("ppAMD64CondCode");
   }
}

/* --------- AMD64AMode: memory address expressions. --------- */

AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag        = Aam_IR;
   am->Aam.IR.imm = imm32;
   am->Aam.IR.reg = reg;
   return am;
}
AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
   AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
   am->tag            = Aam_IRRS;
   am->Aam.IRRS.imm   = imm32;
   am->Aam.IRRS.base  = base;
   am->Aam.IRRS.index = indEx;
   am->Aam.IRRS.shift = shift;
   vassert(shift >= 0 && shift <= 3);
   return am;
}

void ppAMD64AMode ( AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         if (am->Aam.IR.imm == 0)
            vex_printf("(");
         else
            vex_printf("0x%x(", am->Aam.IR.imm);
         ppHRegAMD64(am->Aam.IR.reg);
         vex_printf(")");
         return;
      case Aam_IRRS:
         vex_printf("0x%x(", am->Aam.IRRS.imm);
         ppHRegAMD64(am->Aam.IRRS.base);
         vex_printf(",");
         ppHRegAMD64(am->Aam.IRRS.index);
         vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
         return;
      default:
         vpanic("ppAMD64AMode");
   }
}

static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         addHRegUse(u, HRmRead, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         addHRegUse(u, HRmRead, am->Aam.IRRS.base);
         addHRegUse(u, HRmRead, am->Aam.IRRS.index);
         return;
      default:
         vpanic("addRegUsage_AMD64AMode");
   }
}

static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
   switch (am->tag) {
      case Aam_IR:
         am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
         return;
      case Aam_IRRS:
         am->Aam.IRRS.base  = lookupHRegRemap(m, am->Aam.IRRS.base);
         am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
         return;
      default:
         vpanic("mapRegs_AMD64AMode");
   }
}
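
/* Illustrative sketch, not part of the original file: constructing and
   printing the two amode forms defined above -- 16(%rbp) and
   8(%rdi,%r8,4) respectively.  Kept under "#if 0" so it is not compiled. */
#if 0
static void exampleAModes ( void )
{
   AMD64AMode* am1 = AMD64AMode_IR(16, hregAMD64_RBP());
   AMD64AMode* am2 = AMD64AMode_IRRS(8, hregAMD64_RDI(), hregAMD64_R8(), 2);
   ppAMD64AMode(am1); vex_printf("\n");   /* prints 0x10(%rbp)       */
   ppAMD64AMode(am2); vex_printf("\n");   /* prints 0x8(%rdi,%r8,4)  */
}
#endif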

/* --------- Operand, which can be reg, immediate or memory. --------- */

AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
   AMD64RMI* op       = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag            = Armi_Imm;
   op->Armi.Imm.imm32 = imm32;
   return op;
}
AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
   AMD64RMI* op     = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag          = Armi_Reg;
   op->Armi.Reg.reg = reg;
   return op;
}
AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
   AMD64RMI* op    = LibVEX_Alloc_inline(sizeof(AMD64RMI));
   op->tag         = Armi_Mem;
   op->Armi.Mem.am = am;
   return op;
}

static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
   switch (op->tag) {
      case Armi_Imm:
         vex_printf("$0x%x", op->Armi.Imm.imm32);
         return;
      case Armi_Reg:
         if (lo32)
            ppHRegAMD64_lo32(op->Armi.Reg.reg);
         else
            ppHRegAMD64(op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         ppAMD64AMode(op->Armi.Mem.am);
         return;
      default:
         vpanic("ppAMD64RMI");
   }
}
void ppAMD64RMI ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, False/*!lo32*/);
}
void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
   ppAMD64RMI_wrk(op, True/*lo32*/);
}

/* An AMD64RMI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         addHRegUse(u, HRmRead, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
         return;
      default:
         vpanic("addRegUsage_AMD64RMI");
   }
}

static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
   switch (op->tag) {
      case Armi_Imm:
         return;
      case Armi_Reg:
         op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
         return;
      case Armi_Mem:
         mapRegs_AMD64AMode(m, op->Armi.Mem.am);
         return;
      default:
         vpanic("mapRegs_AMD64RMI");
   }
}
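
/* Illustrative sketch, not part of the original file: the three AMD64RMI
   operand forms, printed with the helpers above.  The HReg and amode are
   assumed to come from the caller.  Kept under "#if 0". */
#if 0
static void exampleRMIs ( HReg someReg, AMD64AMode* someAM )
{
   ppAMD64RMI(AMD64RMI_Imm(0x2A));      /* $0x2a            */
   vex_printf("\n");
   ppAMD64RMI(AMD64RMI_Reg(someReg));   /* e.g. %r12        */
   vex_printf("\n");
   ppAMD64RMI(AMD64RMI_Mem(someAM));    /* e.g. 0x10(%rbp)  */
   vex_printf("\n");
}
#endif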

/* --------- Operand, which can be reg or immediate only. --------- */

AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
   AMD64RI* op       = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag           = Ari_Imm;
   op->Ari.Imm.imm32 = imm32;
   return op;
}
AMD64RI* AMD64RI_Reg ( HReg reg ) {
   AMD64RI* op     = LibVEX_Alloc_inline(sizeof(AMD64RI));
   op->tag         = Ari_Reg;
   op->Ari.Reg.reg = reg;
   return op;
}

void ppAMD64RI ( AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         vex_printf("$0x%x", op->Ari.Imm.imm32);
         return;
      case Ari_Reg:
         ppHRegAMD64(op->Ari.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RI");
   }
}

/* An AMD64RI can only be used in a "read" context (what would it mean
   to write or modify a literal?) and so we enumerate its registers
   accordingly. */
static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         addHRegUse(u, HRmRead, op->Ari.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RI");
   }
}

static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
   switch (op->tag) {
      case Ari_Imm:
         return;
      case Ari_Reg:
         op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RI");
   }
}

/* --------- Operand, which can be reg or memory only. --------- */

AMD64RM* AMD64RM_Reg ( HReg reg ) {
   AMD64RM* op     = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag         = Arm_Reg;
   op->Arm.Reg.reg = reg;
   return op;
}
AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
   AMD64RM* op    = LibVEX_Alloc_inline(sizeof(AMD64RM));
   op->tag        = Arm_Mem;
   op->Arm.Mem.am = am;
   return op;
}

void ppAMD64RM ( AMD64RM* op ) {
   switch (op->tag) {
      case Arm_Mem:
         ppAMD64AMode(op->Arm.Mem.am);
         return;
      case Arm_Reg:
         ppHRegAMD64(op->Arm.Reg.reg);
         return;
      default:
         vpanic("ppAMD64RM");
   }
}

/* Because an AMD64RM can be both a source or destination operand, we
   have to supply a mode -- pertaining to the operand as a whole --
   indicating how it's being used. */
static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
   switch (op->tag) {
      case Arm_Mem:
         /* Memory is read, written or modified.  So we just want to
            know the regs read by the amode. */
         addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         /* reg is read, written or modified.  Add it in the
            appropriate way. */
         addHRegUse(u, mode, op->Arm.Reg.reg);
         return;
      default:
         vpanic("addRegUsage_AMD64RM");
   }
}

static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
{
   switch (op->tag) {
      case Arm_Mem:
         mapRegs_AMD64AMode(m, op->Arm.Mem.am);
         return;
      case Arm_Reg:
         op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
         return;
      default:
         vpanic("mapRegs_AMD64RM");
   }
}

/* --------- Instructions. --------- */

static const HChar* showAMD64ScalarSz ( Int sz ) {
   switch (sz) {
      case 2: return "w";
      case 4: return "l";
      case 8: return "q";
      default: vpanic("showAMD64ScalarSz");
   }
}

const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
   switch (op) {
      case Aun_NOT: return "not";
      case Aun_NEG: return "neg";
      default: vpanic("showAMD64UnaryOp");
   }
}

const HChar* showAMD64AluOp ( AMD64AluOp op ) {
   switch (op) {
      case Aalu_MOV:  return "mov";
      case Aalu_CMP:  return "cmp";
      case Aalu_ADD:  return "add";
      case Aalu_SUB:  return "sub";
      case Aalu_ADC:  return "adc";
      case Aalu_SBB:  return "sbb";
      case Aalu_AND:  return "and";
      case Aalu_OR:   return "or";
      case Aalu_XOR:  return "xor";
      case Aalu_MUL:  return "imul";
      default: vpanic("showAMD64AluOp");
   }
}

const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
   switch (op) {
      case Ash_SHL: return "shl";
      case Ash_SHR: return "shr";
      case Ash_SAR: return "sar";
      default: vpanic("showAMD64ShiftOp");
   }
}

const HChar* showA87FpOp ( A87FpOp op ) {
   switch (op) {
      case Afp_SCALE:  return "scale";
      case Afp_ATAN:   return "atan";
      case Afp_YL2X:   return "yl2x";
      case Afp_YL2XP1: return "yl2xp1";
      case Afp_PREM:   return "prem";
      case Afp_PREM1:  return "prem1";
      case Afp_SQRT:   return "sqrt";
      case Afp_SIN:    return "sin";
      case Afp_COS:    return "cos";
      case Afp_TAN:    return "tan";
      case Afp_ROUND:  return "round";
      case Afp_2XM1:   return "2xm1";
      default: vpanic("showA87FpOp");
   }
}

const HChar* showAMD64SseOp ( AMD64SseOp op ) {
   switch (op) {
      case Asse_MOV:      return "movups";
      case Asse_ADDF:     return "add";
      case Asse_SUBF:     return "sub";
      case Asse_MULF:     return "mul";
      case Asse_DIVF:     return "div";
      case Asse_MAXF:     return "max";
      case Asse_MINF:     return "min";
      case Asse_CMPEQF:   return "cmpFeq";
      case Asse_CMPLTF:   return "cmpFlt";
      case Asse_CMPLEF:   return "cmpFle";
      case Asse_CMPUNF:   return "cmpFun";
      case Asse_RCPF:     return "rcp";
      case Asse_RSQRTF:   return "rsqrt";
      case Asse_SQRTF:    return "sqrt";
      case Asse_I2F:      return "cvtdq2ps.";
      case Asse_F2I:      return "cvtps2dq.";
      case Asse_AND:      return "and";
      case Asse_OR:       return "or";
      case Asse_XOR:      return "xor";
      case Asse_ANDN:     return "andn";
      case Asse_ADD8:     return "paddb";
      case Asse_ADD16:    return "paddw";
      case Asse_ADD32:    return "paddd";
      case Asse_ADD64:    return "paddq";
      case Asse_QADD8U:   return "paddusb";
      case Asse_QADD16U:  return "paddusw";
      case Asse_QADD8S:   return "paddsb";
      case Asse_QADD16S:  return "paddsw";
      case Asse_SUB8:     return "psubb";
      case Asse_SUB16:    return "psubw";
      case Asse_SUB32:    return "psubd";
      case Asse_SUB64:    return "psubq";
      case Asse_QSUB8U:   return "psubusb";
      case Asse_QSUB16U:  return "psubusw";
      case Asse_QSUB8S:   return "psubsb";
      case Asse_QSUB16S:  return "psubsw";
      case Asse_MUL16:    return "pmullw";
      case Asse_MULHI16U: return "pmulhuw";
      case Asse_MULHI16S: return "pmulhw";
      case Asse_AVG8U:    return "pavgb";
      case Asse_AVG16U:   return "pavgw";
      case Asse_MAX16S:   return "pmaxw";
      case Asse_MAX8U:    return "pmaxub";
      case Asse_MIN16S:   return "pminw";
      case Asse_MIN8U:    return "pminub";
      case Asse_CMPEQ8:   return "pcmpeqb";
      case Asse_CMPEQ16:  return "pcmpeqw";
      case Asse_CMPEQ32:  return "pcmpeqd";
      case Asse_CMPGT8S:  return "pcmpgtb";
      case Asse_CMPGT16S: return "pcmpgtw";
      case Asse_CMPGT32S: return "pcmpgtd";
      case Asse_SHL16:    return "psllw";
      case Asse_SHL32:    return "pslld";
      case Asse_SHL64:    return "psllq";
      case Asse_SHL128:   return "pslldq";
      case Asse_SHR16:    return "psrlw";
      case Asse_SHR32:    return "psrld";
      case Asse_SHR64:    return "psrlq";
      case Asse_SHR128:   return "psrldq";
      case Asse_SAR16:    return "psraw";
      case Asse_SAR32:    return "psrad";
      case Asse_PACKSSD:  return "packssdw";
      case Asse_PACKSSW:  return "packsswb";
      case Asse_PACKUSW:  return "packuswb";
      case Asse_UNPCKHB:  return "punpckhb";
      case Asse_UNPCKHW:  return "punpckhw";
      case Asse_UNPCKHD:  return "punpckhd";
      case Asse_UNPCKHQ:  return "punpckhq";
      case Asse_UNPCKLB:  return "punpcklb";
      case Asse_UNPCKLW:  return "punpcklw";
      case Asse_UNPCKLD:  return "punpckld";
      case Asse_UNPCKLQ:  return "punpcklq";
      case Asse_PSHUFB:   return "pshufb";
      case Asse_PMADDUBSW: return "pmaddubsw";
      case Asse_F32toF16: return "vcvtps2ph(rm_field=$0x4).";
      case Asse_F16toF32: return "vcvtph2ps.";
      default: vpanic("showAMD64SseOp");
   }
}

AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Imm64;
   i->Ain.Imm64.imm64 = imm64;
   i->Ain.Imm64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Alu64R;
   i->Ain.Alu64R.op = op;
   i->Ain.Alu64R.src = src;
   i->Ain.Alu64R.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Alu64M;
   i->Ain.Alu64M.op = op;
   i->Ain.Alu64M.src = src;
   i->Ain.Alu64M.dst = dst;
   vassert(op != Aalu_MUL);
   return i;
}
AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Sh64;
   i->Ain.Sh64.op = op;
   i->Ain.Sh64.src = src;
   i->Ain.Sh64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Test64;
   i->Ain.Test64.imm32 = imm32;
   i->Ain.Test64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Unary64;
   i->Ain.Unary64.op = op;
   i->Ain.Unary64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Lea64;
   i->Ain.Lea64.am = am;
   i->Ain.Lea64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Alu32R;
   i->Ain.Alu32R.op = op;
   i->Ain.Alu32R.src = src;
   i->Ain.Alu32R.dst = dst;
   switch (op) {
      case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
      case Aalu_AND: case Aalu_OR:  case Aalu_XOR: break;
      default: vassert(0);
   }
   return i;
}
AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_MulL;
   i->Ain.MulL.syned = syned;
   i->Ain.MulL.src = src;
   return i;
}
AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Div;
   i->Ain.Div.syned = syned;
   i->Ain.Div.sz = sz;
   i->Ain.Div.src = src;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Push;
   i->Ain.Push.src = src;
   return i;
}
AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
                              RetLoc rloc ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Call;
   i->Ain.Call.cond = cond;
   i->Ain.Call.target = target;
   i->Ain.Call.regparms = regparms;
   i->Ain.Call.rloc = rloc;
   vassert(regparms >= 0 && regparms <= 6);
   vassert(is_sane_RetLoc(rloc));
   return i;
}

AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
                                 AMD64CondCode cond, Bool toFastEP ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_XDirect;
   i->Ain.XDirect.dstGA = dstGA;
   i->Ain.XDirect.amRIP = amRIP;
   i->Ain.XDirect.cond = cond;
   i->Ain.XDirect.toFastEP = toFastEP;
   return i;
}
AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
                                AMD64CondCode cond ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_XIndir;
   i->Ain.XIndir.dstGA = dstGA;
   i->Ain.XIndir.amRIP = amRIP;
   i->Ain.XIndir.cond = cond;
   return i;
}
AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
                                   AMD64CondCode cond, IRJumpKind jk ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_XAssisted;
   i->Ain.XAssisted.dstGA = dstGA;
   i->Ain.XAssisted.amRIP = amRIP;
   i->Ain.XAssisted.cond = cond;
   i->Ain.XAssisted.jk = jk;
   return i;
}

AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_CMov64;
   i->Ain.CMov64.cond = cond;
   i->Ain.CMov64.src = src;
   i->Ain.CMov64.dst = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
                               AMD64AMode* addr, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_CLoad;
   i->Ain.CLoad.cond = cond;
   i->Ain.CLoad.szB = szB;
   i->Ain.CLoad.addr = addr;
   i->Ain.CLoad.dst = dst;
   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
   return i;
}
AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
                                HReg src, AMD64AMode* addr ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_CStore;
   i->Ain.CStore.cond = cond;
   i->Ain.CStore.szB = szB;
   i->Ain.CStore.src = src;
   i->Ain.CStore.addr = addr;
   vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
   return i;
}
AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_MovxLQ;
   i->Ain.MovxLQ.syned = syned;
   i->Ain.MovxLQ.src = src;
   i->Ain.MovxLQ.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
                                AMD64AMode* src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_LoadEX;
   i->Ain.LoadEX.szSmall = szSmall;
   i->Ain.LoadEX.syned = syned;
   i->Ain.LoadEX.src = src;
   i->Ain.LoadEX.dst = dst;
   vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
   return i;
}
AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Store;
   i->Ain.Store.sz = sz;
   i->Ain.Store.src = src;
   i->Ain.Store.dst = dst;
   vassert(sz == 1 || sz == 2 || sz == 4);
   return i;
}
AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Set64;
   i->Ain.Set64.cond = cond;
   i->Ain.Set64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Bsfr64;
   i->Ain.Bsfr64.isFwds = isFwds;
   i->Ain.Bsfr64.src = src;
   i->Ain.Bsfr64.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_MFence ( void ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_MFence;
   return i;
}
AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_ACAS;
   i->Ain.ACAS.addr = addr;
   i->Ain.ACAS.sz = sz;
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return i;
}
AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_DACAS;
   i->Ain.DACAS.addr = addr;
   i->Ain.DACAS.sz = sz;
   vassert(sz == 8 || sz == 4);
   return i;
}

AMD64Instr* AMD64Instr_A87Free ( Int nregs )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_A87Free;
   i->Ain.A87Free.nregs = nregs;
   vassert(nregs >= 1 && nregs <= 7);
   return i;
}
AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_A87PushPop;
   i->Ain.A87PushPop.addr = addr;
   i->Ain.A87PushPop.isPush = isPush;
   i->Ain.A87PushPop.szB = szB;
   vassert(szB == 8 || szB == 4);
   return i;
}
AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_A87FpOp;
   i->Ain.A87FpOp.op = op;
   return i;
}
AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_A87LdCW;
   i->Ain.A87LdCW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_A87StSW;
   i->Ain.A87StSW.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_LdMXCSR;
   i->Ain.LdMXCSR.addr = addr;
   return i;
}
AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseUComIS;
   i->Ain.SseUComIS.sz = toUChar(sz);
   i->Ain.SseUComIS.srcL = srcL;
   i->Ain.SseUComIS.srcR = srcR;
   i->Ain.SseUComIS.dst = dst;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseSI2SF;
   i->Ain.SseSI2SF.szS = toUChar(szS);
   i->Ain.SseSI2SF.szD = toUChar(szD);
   i->Ain.SseSI2SF.src = src;
   i->Ain.SseSI2SF.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseSF2SI;
   i->Ain.SseSF2SI.szS = toUChar(szS);
   i->Ain.SseSF2SI.szD = toUChar(szD);
   i->Ain.SseSF2SI.src = src;
   i->Ain.SseSF2SI.dst = dst;
   vassert(szS == 4 || szS == 8);
   vassert(szD == 4 || szD == 8);
   return i;
}
AMD64Instr* AMD64Instr_SseSDSS ( Bool from64, HReg src, HReg dst )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseSDSS;
   i->Ain.SseSDSS.from64 = from64;
   i->Ain.SseSDSS.src = src;
   i->Ain.SseSDSS.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
                                 HReg reg, AMD64AMode* addr ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseLdSt;
   i->Ain.SseLdSt.isLoad = isLoad;
   i->Ain.SseLdSt.sz = toUChar(sz);
   i->Ain.SseLdSt.reg = reg;
   i->Ain.SseLdSt.addr = addr;
   vassert(sz == 4 || sz == 8 || sz == 16);
   return i;
}
AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
                                   HReg src, AMD64AMode* addr )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseCStore;
   i->Ain.SseCStore.cond = cond;
   i->Ain.SseCStore.src = src;
   i->Ain.SseCStore.addr = addr;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
                                  AMD64AMode* addr, HReg dst )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseCLoad;
   i->Ain.SseCLoad.cond = cond;
   i->Ain.SseCLoad.addr = addr;
   i->Ain.SseCLoad.dst = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseLdzLO ( Int sz, HReg reg, AMD64AMode* addr )
{
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseLdzLO;
   i->Ain.SseLdzLO.sz = sz;
   i->Ain.SseLdzLO.reg = reg;
   i->Ain.SseLdzLO.addr = addr;
   vassert(sz == 4 || sz == 8);
   return i;
}
AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Sse32Fx4;
   i->Ain.Sse32Fx4.op = op;
   i->Ain.Sse32Fx4.src = src;
   i->Ain.Sse32Fx4.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Sse32FLo;
   i->Ain.Sse32FLo.op = op;
   i->Ain.Sse32FLo.src = src;
   i->Ain.Sse32FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Sse64Fx2;
   i->Ain.Sse64Fx2.op = op;
   i->Ain.Sse64Fx2.src = src;
   i->Ain.Sse64Fx2.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_Sse64FLo;
   i->Ain.Sse64FLo.op = op;
   i->Ain.Sse64FLo.src = src;
   i->Ain.Sse64FLo.dst = dst;
   vassert(op != Asse_MOV);
   return i;
}
AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseReRg;
   i->Ain.SseReRg.op = op;
   i->Ain.SseReRg.src = re;
   i->Ain.SseReRg.dst = rg;
   return i;
}
AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseCMov;
   i->Ain.SseCMov.cond = cond;
   i->Ain.SseCMov.src = src;
   i->Ain.SseCMov.dst = dst;
   vassert(cond != Acc_ALWAYS);
   return i;
}
AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseShuf;
   i->Ain.SseShuf.order = order;
   i->Ain.SseShuf.src = src;
   i->Ain.SseShuf.dst = dst;
   vassert(order >= 0 && order <= 0xFF);
   return i;
}
AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op,
                                   UInt shiftBits, HReg dst ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseShiftN;
   i->Ain.SseShiftN.op = op;
   i->Ain.SseShiftN.shiftBits = shiftBits;
   i->Ain.SseShiftN.dst = dst;
   return i;
}
AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_SseMOVQ;
   i->Ain.SseMOVQ.gpr = gpr;
   i->Ain.SseMOVQ.xmm = xmm;
   i->Ain.SseMOVQ.toXMM = toXMM;
   vassert(hregClass(gpr) == HRcInt64);
   vassert(hregClass(xmm) == HRcVec128);
   return i;
}
//uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
//uu                                  HReg reg, AMD64AMode* addr ) {
//uu    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
//uu    i->tag = Ain_AvxLdSt;
//uu    i->Ain.AvxLdSt.isLoad = isLoad;
//uu    i->Ain.AvxLdSt.reg = reg;
//uu    i->Ain.AvxLdSt.addr = addr;
//uu    return i;
//uu }
//uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
//uu    AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
//uu    i->tag = Ain_AvxReRg;
//uu    i->Ain.AvxReRg.op = op;
//uu    i->Ain.AvxReRg.src = re;
//uu    i->Ain.AvxReRg.dst = rg;
//uu    return i;
//uu }
AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
                                 AMD64AMode* amFailAddr ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_EvCheck;
   i->Ain.EvCheck.amCounter = amCounter;
   i->Ain.EvCheck.amFailAddr = amFailAddr;
   return i;
}
AMD64Instr* AMD64Instr_ProfInc ( void ) {
   AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
   i->tag = Ain_ProfInc;
   return i;
}
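
/* Illustrative sketch, not part of the original file: how a client of the
   constructors above might build "movabsq $0x1234,dst; addq src,dst".
   The two HRegs are assumed to come from the surrounding instruction
   selector.  Kept under "#if 0" so it is not compiled. */
#if 0
static void exampleInstrs ( HReg src, HReg dst )
{
   AMD64Instr* i1 = AMD64Instr_Imm64(0x1234ULL, dst);
   AMD64Instr* i2 = AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Reg(src), dst);
   ppAMD64Instr(i1, True/*mode64*/); vex_printf("\n");
   ppAMD64Instr(i2, True/*mode64*/); vex_printf("\n");
}
#endif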
1065 void ppAMD64Instr ( const AMD64Instr
* i
, Bool mode64
)
1067 vassert(mode64
== True
);
1070 vex_printf("movabsq $0x%llx,", i
->Ain
.Imm64
.imm64
);
1071 ppHRegAMD64(i
->Ain
.Imm64
.dst
);
1074 vex_printf("%sq ", showAMD64AluOp(i
->Ain
.Alu64R
.op
));
1075 ppAMD64RMI(i
->Ain
.Alu64R
.src
);
1077 ppHRegAMD64(i
->Ain
.Alu64R
.dst
);
1080 vex_printf("%sq ", showAMD64AluOp(i
->Ain
.Alu64M
.op
));
1081 ppAMD64RI(i
->Ain
.Alu64M
.src
);
1083 ppAMD64AMode(i
->Ain
.Alu64M
.dst
);
1086 vex_printf("%sq ", showAMD64ShiftOp(i
->Ain
.Sh64
.op
));
1087 if (i
->Ain
.Sh64
.src
== 0)
1088 vex_printf("%%cl,");
1090 vex_printf("$%d,", (Int
)i
->Ain
.Sh64
.src
);
1091 ppHRegAMD64(i
->Ain
.Sh64
.dst
);
1094 vex_printf("testq $%d,", (Int
)i
->Ain
.Test64
.imm32
);
1095 ppHRegAMD64(i
->Ain
.Test64
.dst
);
1098 vex_printf("%sq ", showAMD64UnaryOp(i
->Ain
.Unary64
.op
));
1099 ppHRegAMD64(i
->Ain
.Unary64
.dst
);
1102 vex_printf("leaq ");
1103 ppAMD64AMode(i
->Ain
.Lea64
.am
);
1105 ppHRegAMD64(i
->Ain
.Lea64
.dst
);
1108 vex_printf("%sl ", showAMD64AluOp(i
->Ain
.Alu32R
.op
));
1109 ppAMD64RMI_lo32(i
->Ain
.Alu32R
.src
);
1111 ppHRegAMD64_lo32(i
->Ain
.Alu32R
.dst
);
1114 vex_printf("%cmulq ", i
->Ain
.MulL
.syned
? 's' : 'u');
1115 ppAMD64RM(i
->Ain
.MulL
.src
);
1118 vex_printf("%cdiv%s ",
1119 i
->Ain
.Div
.syned
? 's' : 'u',
1120 showAMD64ScalarSz(i
->Ain
.Div
.sz
));
1121 ppAMD64RM(i
->Ain
.Div
.src
);
1124 vex_printf("pushq ");
1125 ppAMD64RMI(i
->Ain
.Push
.src
);
1128 vex_printf("call%s[%d,",
1129 i
->Ain
.Call
.cond
==Acc_ALWAYS
1130 ? "" : showAMD64CondCode(i
->Ain
.Call
.cond
),
1131 i
->Ain
.Call
.regparms
);
1132 ppRetLoc(i
->Ain
.Call
.rloc
);
1133 vex_printf("] 0x%llx", i
->Ain
.Call
.target
);
1137 vex_printf("(xDirect) ");
1138 vex_printf("if (%%rflags.%s) { ",
1139 showAMD64CondCode(i
->Ain
.XDirect
.cond
));
1140 vex_printf("movabsq $0x%llx,%%r11; ", i
->Ain
.XDirect
.dstGA
);
1141 vex_printf("movq %%r11,");
1142 ppAMD64AMode(i
->Ain
.XDirect
.amRIP
);
1144 vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
1145 i
->Ain
.XDirect
.toFastEP
? "fast" : "slow");
1148 vex_printf("(xIndir) ");
1149 vex_printf("if (%%rflags.%s) { ",
1150 showAMD64CondCode(i
->Ain
.XIndir
.cond
));
1151 vex_printf("movq ");
1152 ppHRegAMD64(i
->Ain
.XIndir
.dstGA
);
1154 ppAMD64AMode(i
->Ain
.XIndir
.amRIP
);
1155 vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
1158 vex_printf("(xAssisted) ");
1159 vex_printf("if (%%rflags.%s) { ",
1160 showAMD64CondCode(i
->Ain
.XAssisted
.cond
));
1161 vex_printf("movq ");
1162 ppHRegAMD64(i
->Ain
.XAssisted
.dstGA
);
1164 ppAMD64AMode(i
->Ain
.XAssisted
.amRIP
);
1165 vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
1166 (Int
)i
->Ain
.XAssisted
.jk
);
1167 vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
1171 vex_printf("cmov%s ", showAMD64CondCode(i
->Ain
.CMov64
.cond
));
1172 ppHRegAMD64(i
->Ain
.CMov64
.src
);
1174 ppHRegAMD64(i
->Ain
.CMov64
.dst
);
1177 vex_printf("if (%%rflags.%s) { ",
1178 showAMD64CondCode(i
->Ain
.CLoad
.cond
));
1179 vex_printf("mov%c ", i
->Ain
.CLoad
.szB
== 4 ? 'l' : 'q');
1180 ppAMD64AMode(i
->Ain
.CLoad
.addr
);
1182 (i
->Ain
.CLoad
.szB
== 4 ? ppHRegAMD64_lo32
: ppHRegAMD64
)
1187 vex_printf("if (%%rflags.%s) { ",
1188 showAMD64CondCode(i
->Ain
.CStore
.cond
));
1189 vex_printf("mov%c ", i
->Ain
.CStore
.szB
== 4 ? 'l' : 'q');
1190 (i
->Ain
.CStore
.szB
== 4 ? ppHRegAMD64_lo32
: ppHRegAMD64
)
1191 (i
->Ain
.CStore
.src
);
1193 ppAMD64AMode(i
->Ain
.CStore
.addr
);
1198 vex_printf("mov%clq ", i
->Ain
.MovxLQ
.syned
? 's' : 'z');
1199 ppHRegAMD64_lo32(i
->Ain
.MovxLQ
.src
);
1201 ppHRegAMD64(i
->Ain
.MovxLQ
.dst
);
1204 if (i
->Ain
.LoadEX
.szSmall
==4 && !i
->Ain
.LoadEX
.syned
) {
1205 vex_printf("movl ");
1206 ppAMD64AMode(i
->Ain
.LoadEX
.src
);
1208 ppHRegAMD64_lo32(i
->Ain
.LoadEX
.dst
);
1210 vex_printf("mov%c%cq ",
1211 i
->Ain
.LoadEX
.syned
? 's' : 'z',
1212 i
->Ain
.LoadEX
.szSmall
==1
1214 : (i
->Ain
.LoadEX
.szSmall
==2 ? 'w' : 'l'));
1215 ppAMD64AMode(i
->Ain
.LoadEX
.src
);
1217 ppHRegAMD64(i
->Ain
.LoadEX
.dst
);
1221 vex_printf("mov%c ", i
->Ain
.Store
.sz
==1 ? 'b'
1222 : (i
->Ain
.Store
.sz
==2 ? 'w' : 'l'));
1223 ppHRegAMD64(i
->Ain
.Store
.src
);
1225 ppAMD64AMode(i
->Ain
.Store
.dst
);
1228 vex_printf("setq%s ", showAMD64CondCode(i
->Ain
.Set64
.cond
));
1229 ppHRegAMD64(i
->Ain
.Set64
.dst
);
1232 vex_printf("bs%cq ", i
->Ain
.Bsfr64
.isFwds
? 'f' : 'r');
1233 ppHRegAMD64(i
->Ain
.Bsfr64
.src
);
1235 ppHRegAMD64(i
->Ain
.Bsfr64
.dst
);
1238 vex_printf("mfence" );
1241 vex_printf("lock cmpxchg%c ",
1242 i
->Ain
.ACAS
.sz
==1 ? 'b' : i
->Ain
.ACAS
.sz
==2 ? 'w'
1243 : i
->Ain
.ACAS
.sz
==4 ? 'l' : 'q' );
1244 vex_printf("{%%rax->%%rbx},");
1245 ppAMD64AMode(i
->Ain
.ACAS
.addr
);
1248 vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
1249 (Int
)(2 * i
->Ain
.DACAS
.sz
));
1250 ppAMD64AMode(i
->Ain
.DACAS
.addr
);
1253 vex_printf("ffree %%st(7..%d)", 8 - i
->Ain
.A87Free
.nregs
);
1255 case Ain_A87PushPop
:
1256 vex_printf(i
->Ain
.A87PushPop
.isPush
? "fld%c " : "fstp%c ",
1257 i
->Ain
.A87PushPop
.szB
== 4 ? 's' : 'l');
1258 ppAMD64AMode(i
->Ain
.A87PushPop
.addr
);
1261 vex_printf("f%s", showA87FpOp(i
->Ain
.A87FpOp
.op
));
1264 vex_printf("fldcw ");
1265 ppAMD64AMode(i
->Ain
.A87LdCW
.addr
);
1268 vex_printf("fstsw ");
1269 ppAMD64AMode(i
->Ain
.A87StSW
.addr
);
1272 vex_printf("ldmxcsr ");
1273 ppAMD64AMode(i
->Ain
.LdMXCSR
.addr
);
1276 vex_printf("ucomis%s ", i
->Ain
.SseUComIS
.sz
==4 ? "s" : "d");
1277 ppHRegAMD64(i
->Ain
.SseUComIS
.srcL
);
1279 ppHRegAMD64(i
->Ain
.SseUComIS
.srcR
);
1280 vex_printf(" ; pushfq ; popq ");
1281 ppHRegAMD64(i
->Ain
.SseUComIS
.dst
);
1284 vex_printf("cvtsi2s%s ", i
->Ain
.SseSI2SF
.szD
==4 ? "s" : "d");
1285 (i
->Ain
.SseSI2SF
.szS
==4 ? ppHRegAMD64_lo32
: ppHRegAMD64
)
1286 (i
->Ain
.SseSI2SF
.src
);
1288 ppHRegAMD64(i
->Ain
.SseSI2SF
.dst
);
1291 vex_printf("cvts%s2si ", i
->Ain
.SseSF2SI
.szS
==4 ? "s" : "d");
1292 ppHRegAMD64(i
->Ain
.SseSF2SI
.src
);
1294 (i
->Ain
.SseSF2SI
.szD
==4 ? ppHRegAMD64_lo32
: ppHRegAMD64
)
1295 (i
->Ain
.SseSF2SI
.dst
);
1298 vex_printf(i
->Ain
.SseSDSS
.from64
? "cvtsd2ss " : "cvtss2sd ");
1299 ppHRegAMD64(i
->Ain
.SseSDSS
.src
);
1301 ppHRegAMD64(i
->Ain
.SseSDSS
.dst
);
1304 switch (i
->Ain
.SseLdSt
.sz
) {
1305 case 4: vex_printf("movss "); break;
1306 case 8: vex_printf("movsd "); break;
1307 case 16: vex_printf("movups "); break;
1308 default: vassert(0);
1310 if (i
->Ain
.SseLdSt
.isLoad
) {
1311 ppAMD64AMode(i
->Ain
.SseLdSt
.addr
);
1313 ppHRegAMD64(i
->Ain
.SseLdSt
.reg
);
1315 ppHRegAMD64(i
->Ain
.SseLdSt
.reg
);
1317 ppAMD64AMode(i
->Ain
.SseLdSt
.addr
);
1321 vex_printf("if (%%rflags.%s) { ",
1322 showAMD64CondCode(i
->Ain
.SseCStore
.cond
));
1323 vex_printf("movups ");
1324 ppHRegAMD64(i
->Ain
.SseCStore
.src
);
1326 ppAMD64AMode(i
->Ain
.SseCStore
.addr
);
1330 vex_printf("if (%%rflags.%s) { ",
1331 showAMD64CondCode(i
->Ain
.SseCLoad
.cond
));
1332 vex_printf("movups ");
1333 ppAMD64AMode(i
->Ain
.SseCLoad
.addr
);
1335 ppHRegAMD64(i
->Ain
.SseCLoad
.dst
);
1339 vex_printf("movs%s ", i
->Ain
.SseLdzLO
.sz
==4 ? "s" : "d");
1340 ppAMD64AMode(i
->Ain
.SseLdzLO
.addr
);
1342 ppHRegAMD64(i
->Ain
.SseLdzLO
.reg
);
1345 vex_printf("%sps ", showAMD64SseOp(i
->Ain
.Sse32Fx4
.op
));
1346 ppHRegAMD64(i
->Ain
.Sse32Fx4
.src
);
1348 ppHRegAMD64(i
->Ain
.Sse32Fx4
.dst
);
1351 vex_printf("%sss ", showAMD64SseOp(i
->Ain
.Sse32FLo
.op
));
1352 ppHRegAMD64(i
->Ain
.Sse32FLo
.src
);
1354 ppHRegAMD64(i
->Ain
.Sse32FLo
.dst
);
1357 vex_printf("%spd ", showAMD64SseOp(i
->Ain
.Sse64Fx2
.op
));
1358 ppHRegAMD64(i
->Ain
.Sse64Fx2
.src
);
1360 ppHRegAMD64(i
->Ain
.Sse64Fx2
.dst
);
1363 vex_printf("%ssd ", showAMD64SseOp(i
->Ain
.Sse64FLo
.op
));
1364 ppHRegAMD64(i
->Ain
.Sse64FLo
.src
);
1366 ppHRegAMD64(i
->Ain
.Sse64FLo
.dst
);
1369 vex_printf("%s ", showAMD64SseOp(i
->Ain
.SseReRg
.op
));
1370 ppHRegAMD64(i
->Ain
.SseReRg
.src
);
1372 ppHRegAMD64(i
->Ain
.SseReRg
.dst
);
1375 vex_printf("cmov%s ", showAMD64CondCode(i
->Ain
.SseCMov
.cond
));
1376 ppHRegAMD64(i
->Ain
.SseCMov
.src
);
1378 ppHRegAMD64(i
->Ain
.SseCMov
.dst
);
1381 vex_printf("pshufd $0x%x,", (UInt
)i
->Ain
.SseShuf
.order
);
1382 ppHRegAMD64(i
->Ain
.SseShuf
.src
);
1384 ppHRegAMD64(i
->Ain
.SseShuf
.dst
);
1387 vex_printf("%s $%u, ", showAMD64SseOp(i
->Ain
.SseShiftN
.op
),
1388 i
->Ain
.SseShiftN
.shiftBits
);
1389 ppHRegAMD64(i
->Ain
.SseShiftN
.dst
);
1392 vex_printf("movq ");
1393 if (i
->Ain
.SseMOVQ
.toXMM
) {
1394 ppHRegAMD64(i
->Ain
.SseMOVQ
.gpr
);
1396 ppHRegAMD64(i
->Ain
.SseMOVQ
.xmm
);
1398 ppHRegAMD64(i
->Ain
.SseMOVQ
.xmm
);
1400 ppHRegAMD64(i
->Ain
.SseMOVQ
.gpr
);
1403 //uu case Ain_AvxLdSt:
1404 //uu vex_printf("vmovups ");
1405 //uu if (i->Ain.AvxLdSt.isLoad) {
1406 //uu ppAMD64AMode(i->Ain.AvxLdSt.addr);
1407 //uu vex_printf(",");
1408 //uu ppHRegAMD64(i->Ain.AvxLdSt.reg);
1410 //uu ppHRegAMD64(i->Ain.AvxLdSt.reg);
1411 //uu vex_printf(",");
1412 //uu ppAMD64AMode(i->Ain.AvxLdSt.addr);
1415 //uu case Ain_AvxReRg:
1416 //uu vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1417 //uu ppHRegAMD64(i->Ain.AvxReRg.src);
1418 //uu vex_printf(",");
1419 //uu ppHRegAMD64(i->Ain.AvxReRg.dst);
1422 vex_printf("(evCheck) decl ");
1423 ppAMD64AMode(i
->Ain
.EvCheck
.amCounter
);
1424 vex_printf("; jns nofail; jmp *");
1425 ppAMD64AMode(i
->Ain
.EvCheck
.amFailAddr
);
1426 vex_printf("; nofail:");
1429 vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
1432 vpanic("ppAMD64Instr");
1436 /* --------- Helpers for register allocation. --------- */
1438 void getRegUsage_AMD64Instr ( HRegUsage
* u
, const AMD64Instr
* i
, Bool mode64
)
1441 vassert(mode64
== True
);
1445 addHRegUse(u
, HRmWrite
, i
->Ain
.Imm64
.dst
);
1448 addRegUsage_AMD64RMI(u
, i
->Ain
.Alu64R
.src
);
1449 if (i
->Ain
.Alu64R
.op
== Aalu_MOV
) {
1450 addHRegUse(u
, HRmWrite
, i
->Ain
.Alu64R
.dst
);
1452 if (i
->Ain
.Alu64R
.src
->tag
== Armi_Reg
) {
1453 u
->isRegRegMove
= True
;
1454 u
->regMoveSrc
= i
->Ain
.Alu64R
.src
->Armi
.Reg
.reg
;
1455 u
->regMoveDst
= i
->Ain
.Alu64R
.dst
;
1459 if (i
->Ain
.Alu64R
.op
== Aalu_CMP
) {
1460 addHRegUse(u
, HRmRead
, i
->Ain
.Alu64R
.dst
);
1463 addHRegUse(u
, HRmModify
, i
->Ain
.Alu64R
.dst
);
1466 addRegUsage_AMD64RI(u
, i
->Ain
.Alu64M
.src
);
1467 addRegUsage_AMD64AMode(u
, i
->Ain
.Alu64M
.dst
);
1470 addHRegUse(u
, HRmModify
, i
->Ain
.Sh64
.dst
);
1471 if (i
->Ain
.Sh64
.src
== 0)
1472 addHRegUse(u
, HRmRead
, hregAMD64_RCX());
1475 addHRegUse(u
, HRmRead
, i
->Ain
.Test64
.dst
);
1478 addHRegUse(u
, HRmModify
, i
->Ain
.Unary64
.dst
);
1481 addRegUsage_AMD64AMode(u
, i
->Ain
.Lea64
.am
);
1482 addHRegUse(u
, HRmWrite
, i
->Ain
.Lea64
.dst
);
1485 vassert(i
->Ain
.Alu32R
.op
!= Aalu_MOV
);
1486 addRegUsage_AMD64RMI(u
, i
->Ain
.Alu32R
.src
);
1487 if (i
->Ain
.Alu32R
.op
== Aalu_CMP
) {
1488 addHRegUse(u
, HRmRead
, i
->Ain
.Alu32R
.dst
);
1491 addHRegUse(u
, HRmModify
, i
->Ain
.Alu32R
.dst
);
1494 addRegUsage_AMD64RM(u
, i
->Ain
.MulL
.src
, HRmRead
);
1495 addHRegUse(u
, HRmModify
, hregAMD64_RAX());
1496 addHRegUse(u
, HRmWrite
, hregAMD64_RDX());
1499 addRegUsage_AMD64RM(u
, i
->Ain
.Div
.src
, HRmRead
);
1500 addHRegUse(u
, HRmModify
, hregAMD64_RAX());
1501 addHRegUse(u
, HRmModify
, hregAMD64_RDX());
1504 addRegUsage_AMD64RMI(u
, i
->Ain
.Push
.src
);
1505 addHRegUse(u
, HRmModify
, hregAMD64_RSP());
1508 /* This is a bit subtle. */
1509 /* First off, claim it trashes all the caller-saved regs
1510 which fall within the register allocator's jurisdiction.
1511 These I believe to be: rax rcx rdx rdi rsi r8 r9 r10
1512 and all the xmm registers. */
1513 addHRegUse(u
, HRmWrite
, hregAMD64_RAX());
1514 addHRegUse(u
, HRmWrite
, hregAMD64_RCX());
1515 addHRegUse(u
, HRmWrite
, hregAMD64_RDX());
1516 addHRegUse(u
, HRmWrite
, hregAMD64_RDI());
1517 addHRegUse(u
, HRmWrite
, hregAMD64_RSI());
1518 addHRegUse(u
, HRmWrite
, hregAMD64_R8());
1519 addHRegUse(u
, HRmWrite
, hregAMD64_R9());
1520 addHRegUse(u
, HRmWrite
, hregAMD64_R10());
1521 addHRegUse(u
, HRmWrite
, hregAMD64_XMM0());
1522 addHRegUse(u
, HRmWrite
, hregAMD64_XMM1());
1523 addHRegUse(u
, HRmWrite
, hregAMD64_XMM3());
1524 addHRegUse(u
, HRmWrite
, hregAMD64_XMM4());
1525 addHRegUse(u
, HRmWrite
, hregAMD64_XMM5());
1526 addHRegUse(u
, HRmWrite
, hregAMD64_XMM6());
1527 addHRegUse(u
, HRmWrite
, hregAMD64_XMM7());
1528 addHRegUse(u
, HRmWrite
, hregAMD64_XMM8());
1529 addHRegUse(u
, HRmWrite
, hregAMD64_XMM9());
1530 addHRegUse(u
, HRmWrite
, hregAMD64_XMM10());
1531 addHRegUse(u
, HRmWrite
, hregAMD64_XMM11());
1532 addHRegUse(u
, HRmWrite
, hregAMD64_XMM12());
1534 /* Now we have to state any parameter-carrying registers
1535 which might be read. This depends on the regparmness. */
1536 switch (i
->Ain
.Call
.regparms
) {
1537 case 6: addHRegUse(u
, HRmRead
, hregAMD64_R9()); /*fallthru*/
1538 case 5: addHRegUse(u
, HRmRead
, hregAMD64_R8()); /*fallthru*/
1539 case 4: addHRegUse(u
, HRmRead
, hregAMD64_RCX()); /*fallthru*/
1540 case 3: addHRegUse(u
, HRmRead
, hregAMD64_RDX()); /*fallthru*/
1541 case 2: addHRegUse(u
, HRmRead
, hregAMD64_RSI()); /*fallthru*/
1542 case 1: addHRegUse(u
, HRmRead
, hregAMD64_RDI()); break;
1544 default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
1546 /* Finally, there is the issue that the insn trashes a
1547 register because the literal target address has to be
1548 loaded into a register. Fortunately, r11 is stated in the
1549 ABI as a scratch register, and so seems a suitable victim. */
1550 addHRegUse(u
, HRmWrite
, hregAMD64_R11());
1551 /* Upshot of this is that the assembler really must use r11,
1552 and no other, as a destination temporary. */
1554 /* XDirect/XIndir/XAssisted are also a bit subtle. They
1555 conditionally exit the block. Hence we only need to list (1)
1556 the registers that they read, and (2) the registers that they
1557 write in the case where the block is not exited. (2) is
1558 empty, hence only (1) is relevant here. */
1560 /* Don't bother to mention the write to %r11, since it is not
1561 available to the allocator. */
1562 addRegUsage_AMD64AMode(u
, i
->Ain
.XDirect
.amRIP
);
1566 addHRegUse(u
, HRmRead
, i
->Ain
.XIndir
.dstGA
);
1567 addRegUsage_AMD64AMode(u
, i
->Ain
.XIndir
.amRIP
);
1570 /* Ditto re %r11 and %rbp (the baseblock ptr) */
1571 addHRegUse(u
, HRmRead
, i
->Ain
.XAssisted
.dstGA
);
1572 addRegUsage_AMD64AMode(u
, i
->Ain
.XAssisted
.amRIP
);
1575 addHRegUse(u
, HRmRead
, i
->Ain
.CMov64
.src
);
1576 addHRegUse(u
, HRmModify
, i
->Ain
.CMov64
.dst
);
1579 addRegUsage_AMD64AMode(u
, i
->Ain
.CLoad
.addr
);
1580 addHRegUse(u
, HRmModify
, i
->Ain
.CLoad
.dst
);
1583 addRegUsage_AMD64AMode(u
, i
->Ain
.CStore
.addr
);
1584 addHRegUse(u
, HRmRead
, i
->Ain
.CStore
.src
);
1587 addHRegUse(u
, HRmRead
, i
->Ain
.MovxLQ
.src
);
1588 addHRegUse(u
, HRmWrite
, i
->Ain
.MovxLQ
.dst
);
1591 addRegUsage_AMD64AMode(u
, i
->Ain
.LoadEX
.src
);
1592 addHRegUse(u
, HRmWrite
, i
->Ain
.LoadEX
.dst
);
1595 addHRegUse(u
, HRmRead
, i
->Ain
.Store
.src
);
1596 addRegUsage_AMD64AMode(u
, i
->Ain
.Store
.dst
);
1599 addHRegUse(u
, HRmWrite
, i
->Ain
.Set64
.dst
);
1602 addHRegUse(u
, HRmRead
, i
->Ain
.Bsfr64
.src
);
1603 addHRegUse(u
, HRmWrite
, i
->Ain
.Bsfr64
.dst
);
1608 addRegUsage_AMD64AMode(u
, i
->Ain
.ACAS
.addr
);
1609 addHRegUse(u
, HRmRead
, hregAMD64_RBX());
1610 addHRegUse(u
, HRmModify
, hregAMD64_RAX());
1613 addRegUsage_AMD64AMode(u
, i
->Ain
.DACAS
.addr
);
1614 addHRegUse(u
, HRmRead
, hregAMD64_RCX());
1615 addHRegUse(u
, HRmRead
, hregAMD64_RBX());
1616 addHRegUse(u
, HRmModify
, hregAMD64_RDX());
1617 addHRegUse(u
, HRmModify
, hregAMD64_RAX());
1621 case Ain_A87PushPop
:
1622 addRegUsage_AMD64AMode(u
, i
->Ain
.A87PushPop
.addr
);
1627 addRegUsage_AMD64AMode(u
, i
->Ain
.A87LdCW
.addr
);
1630 addRegUsage_AMD64AMode(u
, i
->Ain
.A87StSW
.addr
);
1633 addRegUsage_AMD64AMode(u
, i
->Ain
.LdMXCSR
.addr
);
1636 addHRegUse(u
, HRmRead
, i
->Ain
.SseUComIS
.srcL
);
1637 addHRegUse(u
, HRmRead
, i
->Ain
.SseUComIS
.srcR
);
1638 addHRegUse(u
, HRmWrite
, i
->Ain
.SseUComIS
.dst
);
1641 addHRegUse(u
, HRmRead
, i
->Ain
.SseSI2SF
.src
);
1642 addHRegUse(u
, HRmWrite
, i
->Ain
.SseSI2SF
.dst
);
1645 addHRegUse(u
, HRmRead
, i
->Ain
.SseSF2SI
.src
);
1646 addHRegUse(u
, HRmWrite
, i
->Ain
.SseSF2SI
.dst
);
1649 addHRegUse(u
, HRmRead
, i
->Ain
.SseSDSS
.src
);
1650 addHRegUse(u
, HRmWrite
, i
->Ain
.SseSDSS
.dst
);
1653 addRegUsage_AMD64AMode(u
, i
->Ain
.SseLdSt
.addr
);
1654 addHRegUse(u
, i
->Ain
.SseLdSt
.isLoad
? HRmWrite
: HRmRead
,
1655 i
->Ain
.SseLdSt
.reg
);
1658 addRegUsage_AMD64AMode(u
, i
->Ain
.SseCStore
.addr
);
1659 addHRegUse(u
, HRmRead
, i
->Ain
.SseCStore
.src
);
1662 addRegUsage_AMD64AMode(u
, i
->Ain
.SseCLoad
.addr
);
1663 addHRegUse(u
, HRmModify
, i
->Ain
.SseCLoad
.dst
);
1666 addRegUsage_AMD64AMode(u
, i
->Ain
.SseLdzLO
.addr
);
1667 addHRegUse(u
, HRmWrite
, i
->Ain
.SseLdzLO
.reg
);
1670 vassert(i
->Ain
.Sse32Fx4
.op
!= Asse_MOV
);
1671 unary
= toBool( i
->Ain
.Sse32Fx4
.op
== Asse_RCPF
1672 || i
->Ain
.Sse32Fx4
.op
== Asse_RSQRTF
1673 || i
->Ain
.Sse32Fx4
.op
== Asse_SQRTF
1674 || i
->Ain
.Sse32Fx4
.op
== Asse_I2F
1675 || i
->Ain
.Sse32Fx4
.op
== Asse_F2I
1676 || i
->Ain
.Sse32Fx4
.op
== Asse_F32toF16
1677 || i
->Ain
.Sse32Fx4
.op
== Asse_F16toF32
);
1678 addHRegUse(u
, HRmRead
, i
->Ain
.Sse32Fx4
.src
);
1679 addHRegUse(u
, unary
? HRmWrite
: HRmModify
,
1680 i
->Ain
.Sse32Fx4
.dst
);
1683 vassert(i
->Ain
.Sse32FLo
.op
!= Asse_MOV
);
1684 unary
= toBool( i
->Ain
.Sse32FLo
.op
== Asse_RCPF
1685 || i
->Ain
.Sse32FLo
.op
== Asse_RSQRTF
1686 || i
->Ain
.Sse32FLo
.op
== Asse_SQRTF
);
1687 addHRegUse(u
, HRmRead
, i
->Ain
.Sse32FLo
.src
);
1688 addHRegUse(u
, unary
? HRmWrite
: HRmModify
,
1689 i
->Ain
.Sse32FLo
.dst
);
1692 vassert(i
->Ain
.Sse64Fx2
.op
!= Asse_MOV
);
1693 unary
= toBool( i
->Ain
.Sse64Fx2
.op
== Asse_RCPF
1694 || i
->Ain
.Sse64Fx2
.op
== Asse_RSQRTF
1695 || i
->Ain
.Sse64Fx2
.op
== Asse_SQRTF
);
1696 addHRegUse(u
, HRmRead
, i
->Ain
.Sse64Fx2
.src
);
1697 addHRegUse(u
, unary
? HRmWrite
: HRmModify
,
1698 i
->Ain
.Sse64Fx2
.dst
);
1701 vassert(i
->Ain
.Sse64FLo
.op
!= Asse_MOV
);
1702 unary
= toBool( i
->Ain
.Sse64FLo
.op
== Asse_RCPF
1703 || i
->Ain
.Sse64FLo
.op
== Asse_RSQRTF
1704 || i
->Ain
.Sse64FLo
.op
== Asse_SQRTF
);
1705 addHRegUse(u
, HRmRead
, i
->Ain
.Sse64FLo
.src
);
1706 addHRegUse(u
, unary
? HRmWrite
: HRmModify
,
1707 i
->Ain
.Sse64FLo
.dst
);
1710 if ( (i
->Ain
.SseReRg
.op
== Asse_XOR
1711 || i
->Ain
.SseReRg
.op
== Asse_CMPEQ32
)
1712 && sameHReg(i
->Ain
.SseReRg
.src
, i
->Ain
.SseReRg
.dst
)) {
1713 /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1714 r,r' as a write of a value to r, and independent of any
1715 previous value in r */
1716 /* (as opposed to a rite of passage :-) */
1717 addHRegUse(u
, HRmWrite
, i
->Ain
.SseReRg
.dst
);
1719 addHRegUse(u
, HRmRead
, i
->Ain
.SseReRg
.src
);
1720 addHRegUse(u
, i
->Ain
.SseReRg
.op
== Asse_MOV
1721 ? HRmWrite
: HRmModify
,
1722 i
->Ain
.SseReRg
.dst
);
1724 if (i
->Ain
.SseReRg
.op
== Asse_MOV
) {
1725 u
->isRegRegMove
= True
;
1726 u
->regMoveSrc
= i
->Ain
.SseReRg
.src
;
1727 u
->regMoveDst
= i
->Ain
.SseReRg
.dst
;
1732 addHRegUse(u
, HRmRead
, i
->Ain
.SseCMov
.src
);
1733 addHRegUse(u
, HRmModify
, i
->Ain
.SseCMov
.dst
);
1736 addHRegUse(u
, HRmRead
, i
->Ain
.SseShuf
.src
);
1737 addHRegUse(u
, HRmWrite
, i
->Ain
.SseShuf
.dst
);
1740 addHRegUse(u
, HRmModify
, i
->Ain
.SseShiftN
.dst
);
1743 addHRegUse(u
, i
->Ain
.SseMOVQ
.toXMM
? HRmRead
: HRmWrite
,
1744 i
->Ain
.SseMOVQ
.gpr
);
1745 addHRegUse(u
, i
->Ain
.SseMOVQ
.toXMM
? HRmWrite
: HRmRead
,
1746 i
->Ain
.SseMOVQ
.xmm
);
1748 //uu case Ain_AvxLdSt:
1749 //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1750 //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1751 //uu i->Ain.AvxLdSt.reg);
1753 //uu case Ain_AvxReRg:
1754 //uu if ( (i->Ain.AvxReRg.op == Asse_XOR
1755 //uu || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1756 //uu && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1757 //uu /* See comments on the case for Ain_SseReRg. */
1758 //uu addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1760 //uu addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1761 //uu addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1762 //uu ? HRmWrite : HRmModify,
1763 //uu i->Ain.AvxReRg.dst);
1765 //uu if (i->Ain.AvxReRg.op == Asse_MOV) {
1766 //uu u->isRegRegMove = True;
1767 //uu u->regMoveSrc = i->Ain.AvxReRg.src;
1768 //uu u->regMoveDst = i->Ain.AvxReRg.dst;
1773 /* We expect both amodes only to mention %rbp, so this is in
1774 fact pointless, since %rbp isn't allocatable, but anyway.. */
1775 addRegUsage_AMD64AMode(u
, i
->Ain
.EvCheck
.amCounter
);
1776 addRegUsage_AMD64AMode(u
, i
->Ain
.EvCheck
.amFailAddr
);
1779 addHRegUse(u
, HRmWrite
, hregAMD64_R11());
1782 ppAMD64Instr(i
, mode64
);
1783 vpanic("getRegUsage_AMD64Instr");
1788 static inline void mapReg(HRegRemap
* m
, HReg
* r
)
1790 *r
= lookupHRegRemap(m
, *r
);
1793 void mapRegs_AMD64Instr ( HRegRemap
* m
, AMD64Instr
* i
, Bool mode64
)
1795 vassert(mode64
== True
);
1798 mapReg(m
, &i
->Ain
.Imm64
.dst
);
1801 mapRegs_AMD64RMI(m
, i
->Ain
.Alu64R
.src
);
1802 mapReg(m
, &i
->Ain
.Alu64R
.dst
);
1805 mapRegs_AMD64RI(m
, i
->Ain
.Alu64M
.src
);
1806 mapRegs_AMD64AMode(m
, i
->Ain
.Alu64M
.dst
);
1809 mapReg(m
, &i
->Ain
.Sh64
.dst
);
1812 mapReg(m
, &i
->Ain
.Test64
.dst
);
1815 mapReg(m
, &i
->Ain
.Unary64
.dst
);
1818 mapRegs_AMD64AMode(m
, i
->Ain
.Lea64
.am
);
1819 mapReg(m
, &i
->Ain
.Lea64
.dst
);
1822 mapRegs_AMD64RMI(m
, i
->Ain
.Alu32R
.src
);
1823 mapReg(m
, &i
->Ain
.Alu32R
.dst
);
1826 mapRegs_AMD64RM(m
, i
->Ain
.MulL
.src
);
1829 mapRegs_AMD64RM(m
, i
->Ain
.Div
.src
);
1832 mapRegs_AMD64RMI(m
, i
->Ain
.Push
.src
);
1837 mapRegs_AMD64AMode(m
, i
->Ain
.XDirect
.amRIP
);
1840 mapReg(m
, &i
->Ain
.XIndir
.dstGA
);
1841 mapRegs_AMD64AMode(m
, i
->Ain
.XIndir
.amRIP
);
1844 mapReg(m
, &i
->Ain
.XAssisted
.dstGA
);
1845 mapRegs_AMD64AMode(m
, i
->Ain
.XAssisted
.amRIP
);
1848 mapReg(m
, &i
->Ain
.CMov64
.src
);
1849 mapReg(m
, &i
->Ain
.CMov64
.dst
);
1852 mapRegs_AMD64AMode(m
, i
->Ain
.CLoad
.addr
);
1853 mapReg(m
, &i
->Ain
.CLoad
.dst
);
1856 mapRegs_AMD64AMode(m
, i
->Ain
.CStore
.addr
);
1857 mapReg(m
, &i
->Ain
.CStore
.src
);
1860 mapReg(m
, &i
->Ain
.MovxLQ
.src
);
1861 mapReg(m
, &i
->Ain
.MovxLQ
.dst
);
1864 mapRegs_AMD64AMode(m
, i
->Ain
.LoadEX
.src
);
1865 mapReg(m
, &i
->Ain
.LoadEX
.dst
);
1868 mapReg(m
, &i
->Ain
.Store
.src
);
1869 mapRegs_AMD64AMode(m
, i
->Ain
.Store
.dst
);
1872 mapReg(m
, &i
->Ain
.Set64
.dst
);
1875 mapReg(m
, &i
->Ain
.Bsfr64
.src
);
1876 mapReg(m
, &i
->Ain
.Bsfr64
.dst
);
1881 mapRegs_AMD64AMode(m
, i
->Ain
.ACAS
.addr
);
1884 mapRegs_AMD64AMode(m
, i
->Ain
.DACAS
.addr
);
1888 case Ain_A87PushPop
:
1889 mapRegs_AMD64AMode(m
, i
->Ain
.A87PushPop
.addr
);
1894 mapRegs_AMD64AMode(m
, i
->Ain
.A87LdCW
.addr
);
1897 mapRegs_AMD64AMode(m
, i
->Ain
.A87StSW
.addr
);
1900 mapRegs_AMD64AMode(m
, i
->Ain
.LdMXCSR
.addr
);
1903 mapReg(m
, &i
->Ain
.SseUComIS
.srcL
);
1904 mapReg(m
, &i
->Ain
.SseUComIS
.srcR
);
1905 mapReg(m
, &i
->Ain
.SseUComIS
.dst
);
1908 mapReg(m
, &i
->Ain
.SseSI2SF
.src
);
1909 mapReg(m
, &i
->Ain
.SseSI2SF
.dst
);
1912 mapReg(m
, &i
->Ain
.SseSF2SI
.src
);
1913 mapReg(m
, &i
->Ain
.SseSF2SI
.dst
);
1916 mapReg(m
, &i
->Ain
.SseSDSS
.src
);
1917 mapReg(m
, &i
->Ain
.SseSDSS
.dst
);
1920 mapReg(m
, &i
->Ain
.SseLdSt
.reg
);
1921 mapRegs_AMD64AMode(m
, i
->Ain
.SseLdSt
.addr
);
1924 mapRegs_AMD64AMode(m
, i
->Ain
.SseCStore
.addr
);
1925 mapReg(m
, &i
->Ain
.SseCStore
.src
);
1928 mapRegs_AMD64AMode(m
, i
->Ain
.SseCLoad
.addr
);
1929 mapReg(m
, &i
->Ain
.SseCLoad
.dst
);
1932 mapReg(m
, &i
->Ain
.SseLdzLO
.reg
);
1933 mapRegs_AMD64AMode(m
, i
->Ain
.SseLdzLO
.addr
);
1936 mapReg(m
, &i
->Ain
.Sse32Fx4
.src
);
1937 mapReg(m
, &i
->Ain
.Sse32Fx4
.dst
);
1940 mapReg(m
, &i
->Ain
.Sse32FLo
.src
);
1941 mapReg(m
, &i
->Ain
.Sse32FLo
.dst
);
1944 mapReg(m
, &i
->Ain
.Sse64Fx2
.src
);
1945 mapReg(m
, &i
->Ain
.Sse64Fx2
.dst
);
1948 mapReg(m
, &i
->Ain
.Sse64FLo
.src
);
1949 mapReg(m
, &i
->Ain
.Sse64FLo
.dst
);
1952 mapReg(m
, &i
->Ain
.SseReRg
.src
);
1953 mapReg(m
, &i
->Ain
.SseReRg
.dst
);
1956 mapReg(m
, &i
->Ain
.SseCMov
.src
);
1957 mapReg(m
, &i
->Ain
.SseCMov
.dst
);
1960 mapReg(m
, &i
->Ain
.SseShuf
.src
);
1961 mapReg(m
, &i
->Ain
.SseShuf
.dst
);
1964 mapReg(m
, &i
->Ain
.SseShiftN
.dst
);
1967 mapReg(m
, &i
->Ain
.SseMOVQ
.gpr
);
1968 mapReg(m
, &i
->Ain
.SseMOVQ
.xmm
);
1970 //uu case Ain_AvxLdSt:
1971 //uu mapReg(m, &i->Ain.AvxLdSt.reg);
1972 //uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
1974 //uu case Ain_AvxReRg:
1975 //uu mapReg(m, &i->Ain.AvxReRg.src);
1976 //uu mapReg(m, &i->Ain.AvxReRg.dst);
1979 /* We expect both amodes only to mention %rbp, so this is in
1980 fact pointless, since %rbp isn't allocatable, but anyway.. */
1981 mapRegs_AMD64AMode(m
, i
->Ain
.EvCheck
.amCounter
);
1982 mapRegs_AMD64AMode(m
, i
->Ain
.EvCheck
.amFailAddr
);
1985 /* hardwires r11 -- nothing to modify. */
1988 ppAMD64Instr(i
, mode64
);
1989 vpanic("mapRegs_AMD64Instr");
/* Generate amd64 spill/reload instructions under the direction of the
   register allocator.  Note it's critical these don't write the
   condition codes. */

void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
                      HReg rreg, Int offsetB, Bool mode64 )
{
   AMD64AMode* am;
   vassert(offsetB >= 0);
   vassert(!hregIsVirtual(rreg));
   vassert(mode64 == True);
   *i1 = *i2 = NULL;
   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   switch (hregClass(rreg)) {
      case HRcInt64:
         *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
         return;
      case HRcVec128:
         *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
         return;
      default:
         ppHRegClass(hregClass(rreg));
         vpanic("genSpill_AMD64: unimplemented regclass");
   }
}

void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
                       HReg rreg, Int offsetB, Bool mode64 )
{
   AMD64AMode* am;
   vassert(offsetB >= 0);
   vassert(!hregIsVirtual(rreg));
   vassert(mode64 == True);
   *i1 = *i2 = NULL;
   am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
   switch (hregClass(rreg)) {
      case HRcInt64:
         *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
         return;
      case HRcVec128:
         *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
         return;
      default:
         ppHRegClass(hregClass(rreg));
         vpanic("genReload_AMD64: unimplemented regclass");
   }
}
* genMove_AMD64(HReg from
, HReg to
, Bool mode64
)
2043 switch (hregClass(from
)) {
2045 return AMD64Instr_Alu64R(Aalu_MOV
, AMD64RMI_Reg(from
), to
);
2047 return AMD64Instr_SseReRg(Asse_MOV
, from
, to
);
2049 ppHRegClass(hregClass(from
));
2050 vpanic("genMove_AMD64: unimplemented regclass");
AMD64Instr* directReload_AMD64( AMD64Instr* i, HReg vreg, Short spill_off )
   vassert(spill_off >= 0 && spill_off < 10000); /* let's say */

   /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
      Convert to: src=RMI_Mem, dst=Reg
   */
   if (i->tag == Ain_Alu64R
       && (i->Ain.Alu64R.op == Aalu_MOV || i->Ain.Alu64R.op == Aalu_OR
           || i->Ain.Alu64R.op == Aalu_XOR)
       && i->Ain.Alu64R.src->tag == Armi_Reg
       && sameHReg(i->Ain.Alu64R.src->Armi.Reg.reg, vreg)) {
      vassert(! sameHReg(i->Ain.Alu64R.dst, vreg));
      return AMD64Instr_Alu64R(
                AMD64RMI_Mem( AMD64AMode_IR( spill_off, hregAMD64_RBP())),

   /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
      Convert to: src=RI_Imm, dst=Mem
   */
   if (i->tag == Ain_Alu64R
       && (i->Ain.Alu64R.op == Aalu_CMP)
       && i->Ain.Alu64R.src->tag == Armi_Imm
       && sameHReg(i->Ain.Alu64R.dst, vreg)) {
      return AMD64Instr_Alu64M(
                AMD64RI_Imm( i->Ain.Alu64R.src->Armi.Imm.imm32 ),
                AMD64AMode_IR( spill_off, hregAMD64_RBP())
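/* Illustrative note, not part of the original source: as a concrete
   example of what directReload_AMD64 does, an instruction of the form
   "orq %vreg, %dst" where vreg has been spilled at offset 32 from %rbp
   can be rewritten as "orq 32(%rbp), %dst", avoiding a separate reload
   into a scratch register. */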
/* --------- The amd64 assembler (bleh.) --------- */

/* Produce the low three bits of an integer register number. */
inline static UInt iregEnc210 ( HReg r )
   vassert(hregClass(r) == HRcInt64);
   vassert(!hregIsVirtual(r));
   n = hregEncoding(r);

/* Produce bit 3 of an integer register number. */
inline static UInt iregEnc3 ( HReg r )
   vassert(hregClass(r) == HRcInt64);
   vassert(!hregIsVirtual(r));
   n = hregEncoding(r);
   return (n >> 3) & 1;

/* Produce a complete 4-bit integer register number. */
inline static UInt iregEnc3210 ( HReg r )
   vassert(hregClass(r) == HRcInt64);
   vassert(!hregIsVirtual(r));
   n = hregEncoding(r);

/* Produce a complete 4-bit integer register number. */
inline static UInt vregEnc3210 ( HReg r )
   vassert(hregClass(r) == HRcVec128);
   vassert(!hregIsVirtual(r));
   n = hregEncoding(r);

inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
   vassert((reg|regmem) < 8);
   return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );

inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
   vassert((regindex|regbase) < 8);
   return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
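/* Worked example (illustrative only, not from the original source):
   mkModRegRM(1, 0/*rax*/, 3/*rbx*/) packs mod=01, reg=000, rm=011 into
   the single byte 0x43, i.e. the ModRM byte of "movq %rax, d8(%rbx)".
   Likewise mkSIB(0, 1/*rcx*/, 4/*rsp*/) gives 0x0C, the SIB byte for
   an address of the form (%rsp,%rcx,1). */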
static UChar* emit32 ( UChar* p, UInt w32 )
   *p++ = toUChar((w32)       & 0x000000FF);
   *p++ = toUChar((w32 >>  8) & 0x000000FF);
   *p++ = toUChar((w32 >> 16) & 0x000000FF);
   *p++ = toUChar((w32 >> 24) & 0x000000FF);

static UChar* emit64 ( UChar* p, ULong w64 )
   p = emit32(p, toUInt(w64         & 0xFFFFFFFF));
   p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
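/* Illustrative example, not from the original: emit32(p, 0x11223344)
   stores the bytes 44 33 22 11 at p[0..3], since the target is
   little-endian; emit64 simply emits the low then the high 32-bit
   half in the same way. */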
/* Does a sign-extend of the lowest 8 bits give
   the original number? */
static Bool fits8bits ( UInt w32 )
   Int i32 = (Int)w32;
   return toBool(i32 == ((Int)(w32 << 24) >> 24));

/* Can the lower 32 bits be signedly widened to produce the whole
   64-bit value?  In other words, are the top 33 bits either all 0 or
   all 1 ? */
static Bool fitsIn32Bits ( ULong x )
   return toBool(x == y1);
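/* Illustrative values, not from the original: fits8bits(0x7F) and
   fits8bits(0xFFFFFF80) (-128 viewed as a signed 32-bit value) are
   True, while fits8bits(0x80) is False.  Similarly fitsIn32Bits
   accepts 0x000000007FFFFFFFULL and 0xFFFFFFFF80000000ULL but rejects
   0x0000000080000000ULL, whose top 33 bits are neither all 0 nor
   all 1. */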
/* Forming mod-reg-rm bytes and scale-index-base bytes.

     greg,  0(ereg)    |  ereg is not any of: RSP RBP R12 R13
                       =  00 greg ereg

     greg,  d8(ereg)   |  ereg is neither of: RSP R12
                       =  01 greg ereg, d8

     greg,  d32(ereg)  |  ereg is neither of: RSP R12
                       =  10 greg ereg, d32

     greg,  d8(ereg)   |  ereg is either: RSP R12
                       =  01 greg 100, 0x24, d8
                       (lowest bit of rex distinguishes R12/RSP)

     greg,  d32(ereg)  |  ereg is either: RSP R12
                       =  10 greg 100, 0x24, d32
                       (lowest bit of rex distinguishes R12/RSP)

     -----------------------------------------------

     greg,  d8(base,index,scale)
                       =  01 greg 100, scale index base, d8

     greg,  d32(base,index,scale)
                       =  10 greg 100, scale index base, d32
*/
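/* Worked example (illustrative, not from the original): for greg = %rax
   and am = 8(%r12), the IR case below must take the SIB path, because
   R12 shares its low three encoding bits with RSP.  The bytes produced
   are mod=01 greg=000 rm=100 (0x44), the placeholder SIB 0x24, and the
   displacement 0x08; the REX.B bit, supplied by rexAMode_M, is what
   distinguishes %r12 from %rsp. */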
static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
   UInt gregEnc210 = gregEnc3210 & 7;
   if (am->tag == Aam_IR) {
      if (am->Aam.IR.imm == 0
          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
         *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
      if (fits8bits(am->Aam.IR.imm)
          && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
         *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
      if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
          && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
         *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
         p = emit32(p, am->Aam.IR.imm);
      if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
           || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
          && fits8bits(am->Aam.IR.imm)) {
         *p++ = mkModRegRM(1, gregEnc210, 4);
         *p++ = toUChar(am->Aam.IR.imm & 0xFF);
      if (/* (sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
             || wait for test case for RSP case */
          sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
         *p++ = mkModRegRM(2, gregEnc210, 4);
         p = emit32(p, am->Aam.IR.imm);
      vpanic("doAMode_M: can't emit amode IR");
   if (am->tag == Aam_IRRS) {
      if (fits8bits(am->Aam.IRRS.imm)
          && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
         *p++ = mkModRegRM(1, gregEnc210, 4);
         *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
                      iregEnc210(am->Aam.IRRS.base));
         *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
      if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
         *p++ = mkModRegRM(2, gregEnc210, 4);
         *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
                      iregEnc210(am->Aam.IRRS.base));
         p = emit32(p, am->Aam.IRRS.imm);
      vpanic("doAMode_M: can't emit amode IRRS");
   vpanic("doAMode_M: unknown amode");

static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
   return doAMode_M__wrk(p, iregEnc3210(greg), am);

static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
   vassert(gregEnc3210 < 16);
   return doAMode_M__wrk(p, gregEnc3210, am);
/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */

static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
   *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);

static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
   return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));

static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
   vassert(gregEnc3210 < 16);
   return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));

static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
   vassert(eregEnc3210 < 16);
   return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);

static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
   vassert( (gregEnc3210|eregEnc3210) < 16);
   return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
/* Clear the W bit on a REX byte, thereby changing the operand size
   back to whatever that instruction's default operand size is. */
static inline UChar clearWBit ( UChar rex )
   return rex & ~(1<<3);

static inline UChar setWBit ( UChar rex )
   return rex | (1<<3);
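/* Illustrative note, not from the original: a REX prefix has the form
   0100WRXB, so bit 3 is W.  clearWBit(0x48) gives 0x40 (the "no-op"
   REX), which several cases below then skip emitting entirely when a
   32-bit operand size and low register numbers make it redundant. */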
/* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
   if (am->tag == Aam_IR) {
      UChar W = 1;  /* we want 64-bit mode */
      UChar R = (gregEnc3210 >> 3) & 1;
      UChar X = 0; /* not relevant */
      UChar B = iregEnc3(am->Aam.IR.reg);
      return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   if (am->tag == Aam_IRRS) {
      UChar W = 1;  /* we want 64-bit mode */
      UChar R = (gregEnc3210 >> 3) & 1;
      UChar X = iregEnc3(am->Aam.IRRS.index);
      UChar B = iregEnc3(am->Aam.IRRS.base);
      return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
   return 0; /*NOTREACHED*/

static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
   return rexAMode_M__wrk(iregEnc3210(greg), am);

static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
   vassert(gregEnc3210 < 16);
   return rexAMode_M__wrk(gregEnc3210, am);

/* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
   UChar W = 1;  /* we want 64-bit mode */
   UChar R = (gregEnc3210 >> 3) & 1;
   UChar X = 0; /* not relevant */
   UChar B = (eregEnc3210 >> 3) & 1;
   return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));

static UChar rexAMode_R ( HReg greg, HReg ereg )
   return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));

static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
   vassert(gregEnc3210 < 16);
   return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));

static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
   vassert(eregEnc3210 < 16);
   return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);

static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
   vassert((gregEnc3210|eregEnc3210) < 16);
   return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
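/* A minimal sketch (illustrative only, not part of the original
   emitter) of how the REX/ModRM helpers above combine to emit one
   complete instruction, here "movq %greg, am", the same pattern used
   by the Ain_Alu64M MOV case further down. */
#if 0
static UChar* example_emit_movq_to_mem ( UChar* p, HReg greg, AMD64AMode* am )
{
   *p++ = rexAMode_M(greg, am);   /* REX.W plus R/X/B as needed */
   *p++ = 0x89;                   /* MOV r/m64, r64 */
   p = doAMode_M(p, greg, am);    /* ModRM, optional SIB, displacement */
   return p;
}
#endif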
//uu /* May 2012: this VEX prefix stuff is currently unused, but has been
//uu    verified correct (I reckon).  Certainly it has been known to
//uu    produce correct VEX prefixes during testing. */

//uu /* Assemble a 2 or 3 byte VEX prefix from parts.  rexR, rexX, rexB and
//uu    notVvvvv need to be not-ed before packing.  mmmmm, rexW, L and pp go
//uu    in verbatim.  There's no range checking on the bits. */
//uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
//uu                             UInt mmmmm, UInt rexW, UInt notVvvv,
//uu                             UInt L, UInt pp )
//uu    UChar byte0 = 0;
//uu    UChar byte1 = 0;
//uu    UChar byte2 = 0;
//uu    if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
//uu       /* 2 byte encoding is possible. */
//uu       byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
//uu               | (L << 2) | pp;
//uu       /* 3 byte encoding is needed. */
//uu       byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
//uu               | ((rexB ^ 1) << 5) | mmmmm;
//uu       byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
//uu    return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);

//uu /* Make up a VEX prefix for a (greg,amode) pair.  First byte in bits
//uu    7:0 of result, second in 15:8, third (for a 3 byte prefix) in
//uu    23:16.  Has m-mmmm set to indicate a prefix of 0F, pp set to
//uu    indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
//uu    vvvv=1111 (unused 3rd reg). */
//uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
//uu    UChar L       = 1; /* size = 256 */
//uu    UChar pp      = 0; /* no SIMD prefix */
//uu    UChar mmmmm   = 1; /* 0F */
//uu    UChar notVvvv = 0; /* unused */
//uu    UChar rexW    = 0;
//uu    UChar rexR    = 0;
//uu    UChar rexX    = 0;
//uu    UChar rexB    = 0;
//uu    /* Same logic as in rexAMode_M. */
//uu    if (am->tag == Aam_IR) {
//uu       rexR = iregEnc3(greg);
//uu       rexX = 0; /* not relevant */
//uu       rexB = iregEnc3(am->Aam.IR.reg);
//uu    else if (am->tag == Aam_IRRS) {
//uu       rexR = iregEnc3(greg);
//uu       rexX = iregEnc3(am->Aam.IRRS.index);
//uu       rexB = iregEnc3(am->Aam.IRRS.base);
//uu    return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );

//uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
//uu    switch (vex & 0xFF) {
//uu       *p++ = (vex >> 8) & 0xFF;
//uu       vassert(0 == (vex >> 16));
//uu       *p++ = (vex >> 8) & 0xFF;
//uu       *p++ = (vex >> 16) & 0xFF;
//uu       vassert(0 == (vex >> 24));
/* Emit ffree %st(N) */
static UChar* do_ffree_st ( UChar* p, Int n )
   vassert(n >= 0 && n <= 7);
   *p++ = toUChar(0xC0 + n);
/* Emit an instruction into buf and return the number of bytes used.
   Note that buf is not the insn's final place, and therefore it is
   imperative to emit position-independent code.  If the emitted
   instruction was a profiler inc, set *is_profInc to True, else
   leave it unchanged. */

Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
                      UChar* buf, Int nbuf, const AMD64Instr* i,
                      Bool mode64, VexEndness endness_host,
                      const void* disp_cp_chain_me_to_slowEP,
                      const void* disp_cp_chain_me_to_fastEP,
                      const void* disp_cp_xindir,
                      const void* disp_cp_xassisted )
   UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
   vassert(nbuf >= 64);
   vassert(mode64 == True);

   /* vex_printf("asm  "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
      if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
         /* Use the short form (load into 32 bit reg, + default
            widening rule) for constants under 1 million.  We could
            use this form for the range 0 to 0x7FFFFFFF inclusive, but
            limit it to a smaller range for verifiability purposes. */
         if (1 & iregEnc3(i->Ain.Imm64.dst))
         *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
         p = emit32(p, (UInt)i->Ain.Imm64.imm64);
         *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
         *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
         p = emit64(p, i->Ain.Imm64.imm64);
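      /* Illustrative example, not from the original: Ain_Imm64 with
         imm64 = 0x1234 and dst = %rcx takes the short form and emits
         B9 34 12 00 00 ("movl $0x1234, %ecx"), relying on the implicit
         zero-extension; a full 64-bit constant into %rcx would instead
         be 48 B9 followed by the eight immediate bytes ("movabsq"). */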
      /* Deal specially with MOV */
      if (i->Ain.Alu64R.op == Aalu_MOV) {
         switch (i->Ain.Alu64R.src->tag) {
               if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
                  /* Actually we could use this form for constants in
                     the range 0 through 0x7FFFFFFF inclusive, but
                     limit it to a small range for verifiability
                     purposes. */
                  /* Generate "movl $imm32, 32-bit-register" and let
                     the default zero-extend rule cause the upper half
                     of the dst to be zeroed out too.  This saves 1
                     and sometimes 2 bytes compared to the more
                     obvious encoding in the 'else' branch. */
                  if (1 & iregEnc3(i->Ain.Alu64R.dst))
                  *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
                  *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
                  *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
               *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
                                  i->Ain.Alu64R.dst );
               p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
                                 i->Ain.Alu64R.src->Armi.Mem.am);
               p = doAMode_M(p, i->Ain.Alu64R.dst,
                             i->Ain.Alu64R.src->Armi.Mem.am);
      if (i->Ain.Alu64R.op == Aalu_MUL) {
         switch (i->Ain.Alu64R.src->tag) {
               *p++ = rexAMode_R( i->Ain.Alu64R.dst,
                                  i->Ain.Alu64R.src->Armi.Reg.reg);
               p = doAMode_R(p, i->Ain.Alu64R.dst,
                             i->Ain.Alu64R.src->Armi.Reg.reg);
               *p++ = rexAMode_M(i->Ain.Alu64R.dst,
                                 i->Ain.Alu64R.src->Armi.Mem.am);
               p = doAMode_M(p, i->Ain.Alu64R.dst,
                             i->Ain.Alu64R.src->Armi.Mem.am);
               if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
                  *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
                  *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
                  p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
                  p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
      opc = opc_rr = subopc_imm = opc_imma = 0;
      switch (i->Ain.Alu64R.op) {
         case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
                        subopc_imm = 2; opc_imma = 0x15; break;
         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
                        subopc_imm = 0; opc_imma = 0x05; break;
         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
                        subopc_imm = 5; opc_imma = 0x2D; break;
         case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
                        subopc_imm = 3; opc_imma = 0x1D; break;
         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
                        subopc_imm = 4; opc_imma = 0x25; break;
         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
                        subopc_imm = 6; opc_imma = 0x35; break;
         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
                        subopc_imm = 1; opc_imma = 0x0D; break;
         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
                        subopc_imm = 7; opc_imma = 0x3D; break;
      switch (i->Ain.Alu64R.src->tag) {
            if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
                && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
               goto bad; /* FIXME: awaiting test case */
               *p++ = toUChar(opc_imma);
               p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
            if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
               *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
               p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
               *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
               *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
               p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
               p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
            *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
            *p++ = toUChar(opc_rr);
            p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
            *p++ = rexAMode_M( i->Ain.Alu64R.dst,
                               i->Ain.Alu64R.src->Armi.Mem.am);
            *p++ = toUChar(opc);
            p = doAMode_M(p, i->Ain.Alu64R.dst,
                          i->Ain.Alu64R.src->Armi.Mem.am);
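      /* Illustrative note, not from the original: the four values
         selected in the opcode table above follow the usual x86
         pattern.  For Aalu_ADD, opc 0x03 is "add reg <- reg/mem",
         opc_rr 0x01 is the reverse direction, subopc_imm 0 is the /0
         extension used with the 0x83/0x81 immediate group, and
         opc_imma 0x05 is the short "add $imm32, %rax" form (currently
         routed to 'bad' pending a test case). */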
      /* Deal specially with MOV */
      if (i->Ain.Alu64M.op == Aalu_MOV) {
         switch (i->Ain.Alu64M.src->tag) {
               *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
               p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
               *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
               p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
               p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
      /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP.  MUL is not
         allowed here. (This is derived from the x86 version of same). */
      opc = subopc_imm = opc_imma = 0;
      switch (i->Ain.Alu64M.op) {
         case Aalu_CMP: opc = 0x39; subopc_imm = 7; break;
      switch (i->Ain.Alu64M.src->tag) {
            *p++ = toUChar(opc);
            p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
            if (fits8bits(i->Ain.Alu64M.src->Ari.Imm.imm32)) {
               *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
               p = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
               *p++ = toUChar(0xFF & i->Ain.Alu64M.src->Ari.Imm.imm32);
               *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
               p = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
               p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
      opc_cl = opc_imm = subopc = 0;
      switch (i->Ain.Sh64.op) {
         case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
         case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
         case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
      if (i->Ain.Sh64.src == 0) {
         *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
         *p++ = toUChar(opc_cl);
         p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
         *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
         *p++ = toUChar(opc_imm);
         p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
         *p++ = (UChar)(i->Ain.Sh64.src);
      /* testq sign-extend($imm32), %reg */
      *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
      p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
      p = emit32(p, i->Ain.Test64.imm32);
      if (i->Ain.Unary64.op == Aun_NOT) {
         *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
         p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
      if (i->Ain.Unary64.op == Aun_NEG) {
         *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
         p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
      *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
      p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
      /* ADD/SUB/AND/OR/XOR/CMP */
      opc = opc_rr = subopc_imm = opc_imma = 0;
      switch (i->Ain.Alu32R.op) {
         case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
                        subopc_imm = 0; opc_imma = 0x05; break;
         case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
                        subopc_imm = 5; opc_imma = 0x2D; break;
         case Aalu_AND: opc = 0x23; opc_rr = 0x21;
                        subopc_imm = 4; opc_imma = 0x25; break;
         case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
                        subopc_imm = 6; opc_imma = 0x35; break;
         case Aalu_OR:  opc = 0x0B; opc_rr = 0x09;
                        subopc_imm = 1; opc_imma = 0x0D; break;
         case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
                        subopc_imm = 7; opc_imma = 0x3D; break;
      switch (i->Ain.Alu32R.src->tag) {
            if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
                && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
               goto bad; /* FIXME: awaiting test case */
               *p++ = toUChar(opc_imma);
               p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
            if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
               rex = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
               if (rex != 0x40) *p++ = rex;
               p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
               *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
               rex = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
               if (rex != 0x40) *p++ = rex;
               p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
               p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
                     rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
                                 i->Ain.Alu32R.dst) );
            if (rex != 0x40) *p++ = rex;
            *p++ = toUChar(opc_rr);
            p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
                     rexAMode_M( i->Ain.Alu32R.dst,
                                 i->Ain.Alu32R.src->Armi.Mem.am) );
            if (rex != 0x40) *p++ = rex;
            *p++ = toUChar(opc);
            p = doAMode_M(p, i->Ain.Alu32R.dst,
                          i->Ain.Alu32R.src->Armi.Mem.am);
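      /* Illustrative note, not from the original: Ain_Alu32R reuses the
         64-bit encodings but passes every REX byte through clearWBit,
         and drops the prefix altogether when it collapses to the no-op
         0x40.  For example "addl %ecx, %eax" needs no REX byte at all,
         whereas "addl %r9d, %eax" still needs 0x44 for the REX.R
         bit. */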
      subopc = i->Ain.MulL.syned ? 5 : 4;
      switch (i->Ain.MulL.src->tag) {
            *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
            p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
            *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
            p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
      subopc = i->Ain.Div.syned ? 7 : 6;
      if (i->Ain.Div.sz == 4) {
         switch (i->Ain.Div.src->tag) {
               p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
                        rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
               p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
      if (i->Ain.Div.sz == 8) {
         switch (i->Ain.Div.src->tag) {
               *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
               p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
               *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
               p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
      switch (i->Ain.Push.src->tag) {
                     rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
            p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
            p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
            *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
            *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
      /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
         above, %r11 is used as an address temporary. */
      /* If we don't need to do any fixup actions in the case that the
         call doesn't happen, just do the simple thing and emit
         straight-line code.  This is usually the case. */
      if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
          || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
         /* jump over the following two insns if the condition does
            not hold */
         Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
         if (i->Ain.Call.cond != Acc_ALWAYS) {
            *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
            *p++ = shortImm ? 10 : 13;
            /* 10 or 13 bytes in the next two insns */
         /* 7 bytes: movl sign-extend(imm32), %r11 */
         p = emit32(p, (UInt)i->Ain.Call.target);
         /* 10 bytes: movabsq $target, %r11 */
         p = emit64(p, i->Ain.Call.target);
         /* 3 bytes: call *%r11 */
         /* Complex case.  We have to generate an if-then-else diamond. */
         // movabsq $target, %r11
         // movabsq $0x5555555555555555, %rax  // possibly
         // movq %rax, %rdx  // possibly
         *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
         // movabsq $target, %r11
         p = emit64(p, i->Ain.Call.target);
         UChar* pPreElse = p;
         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
         /* Do the 'else' actions */
         switch (i->Ain.Call.rloc.pri) {
               // movabsq $0x5555555555555555, %rax
               *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
               // movabsq $0x5555555555555555, %rax
               *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
               *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
            case RLPri_V128SpRel:
               if (i->Ain.Call.rloc.spOff == 0) {
                  // We could accept any |spOff| here, but that's more
                  // hassle and the only value we're ever going to get
                  // is zero (I believe.)  Hence take the easy path :)
                  // We need a scag register -- r11 can be it.
                  // movabsq $0x5555555555555555, %r11
                  *p++ = 0x49; *p++ = 0xBB;
                  p = emit64(p, 0x5555555555555555ULL);
                  // movq %r11, 0(%rsp)
                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
                  // movq %r11, 8(%rsp)
                  *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
               goto bad; //ATC for all other spOff values
            case RLPri_V256SpRel:
            case RLPri_None: case RLPri_INVALID: default:
               vassert(0); // should never get here
         // Fix up the branch offsets.  The +2s in the offset
         // calculations are there because x86 requires conditional
         // branches to have their offset stated relative to the
         // instruction immediately following the branch insn.  And in
         // both cases the branch insns are 2 bytes long.
         // First, the "j{!cond} else:" at pBefore.
         delta = (Int)(Long)(pElse - (pBefore + 2));
         vassert(delta >= 0 && delta < 100/*arbitrary*/);
         *(pBefore+1) = (UChar)delta;
         // And secondly, the "jmp after:" at pPreElse.
         delta = (Int)(Long)(pAfter - (pPreElse + 2));
         vassert(delta >= 0 && delta < 100/*arbitrary*/);
         *(pPreElse+1) = (UChar)delta;
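      /* Illustrative note, not from the original: in the simple case
         the two instructions referred to above are 49 C7 C3 <imm32>,
         the 7-byte sign-extending "movq $imm32, %r11", or 49 BB
         <imm64>, the 10-byte movabsq, followed in either case by the
         3-byte "call *%r11" (41 FF D3) -- hence the skip distance of
         10 or 13 bytes written into the conditional jump. */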
      /* NB: what goes on here has to be very closely coordinated with the
         chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
      /* We're generating chain-me requests here, so we need to be
         sure this is actually allowed -- no-redir translations can't
         use chain-me's.  Hence: */
      vassert(disp_cp_chain_me_to_slowEP != NULL);
      vassert(disp_cp_chain_me_to_fastEP != NULL);
      HReg r11 = hregAMD64_R11();
      /* Use ptmp for backpatching conditional jumps. */
      /* First off, if this is conditional, create a conditional
         jump over the rest of it. */
      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
         /* jmp fwds if !condition */
         *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
         ptmp = p; /* fill in this bit later */
         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
      /* Update the guest RIP. */
      if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
         /* use a shorter encoding */
         /* movl sign-extend(dstGA), %r11 */
         p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
         /* movabsq $dstGA, %r11 */
         p = emit64(p, i->Ain.XDirect.dstGA);
      /* movq %r11, amRIP */
      *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
      p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
      /* --- FIRST PATCHABLE BYTE follows --- */
      /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
         to) backs up the return address, so as to find the address of
         the first patchable byte.  So: don't change the length of the
         two instructions below. */
      /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
      const void* disp_cp_chain_me
         = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
                                   : disp_cp_chain_me_to_slowEP;
      p = emit64(p, (Addr)disp_cp_chain_me);
      /* --- END of PATCHABLE BYTES --- */
      /* Fix up the conditional jump, if there was one. */
      if (i->Ain.XDirect.cond != Acc_ALWAYS) {
         Int delta = p - ptmp;
         vassert(delta > 0 && delta < 40);
         *ptmp = toUChar(delta-1);
      /* We're generating transfers that could lead indirectly to a
         chain-me, so we need to be sure this is actually allowed --
         no-redir translations are not allowed to reach normal
         translations without going through the scheduler.  That means
         no XDirects or XIndirs out from no-redir translations.
         Hence: */
      vassert(disp_cp_xindir != NULL);
      /* Use ptmp for backpatching conditional jumps. */
      /* First off, if this is conditional, create a conditional
         jump over the rest of it. */
      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
         /* jmp fwds if !condition */
         *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
         ptmp = p; /* fill in this bit later */
         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
      *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
      p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
      /* get $disp_cp_xindir into %r11 */
      if (fitsIn32Bits((Addr)disp_cp_xindir)) {
         /* use a shorter encoding */
         /* movl sign-extend(disp_cp_xindir), %r11 */
         p = emit32(p, (UInt)(Addr)disp_cp_xindir);
         /* movabsq $disp_cp_xindir, %r11 */
         p = emit64(p, (Addr)disp_cp_xindir);
      /* Fix up the conditional jump, if there was one. */
      if (i->Ain.XIndir.cond != Acc_ALWAYS) {
         Int delta = p - ptmp;
         vassert(delta > 0 && delta < 40);
         *ptmp = toUChar(delta-1);
   case Ain_XAssisted: {
      /* Use ptmp for backpatching conditional jumps. */
      /* First off, if this is conditional, create a conditional
         jump over the rest of it. */
      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
         /* jmp fwds if !condition */
         *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
         ptmp = p; /* fill in this bit later */
         *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
      /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
      *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
      p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
      /* movl $magic_number, %ebp.  Since these numbers are all small positive
         integers, we can get away with "movl $N, %ebp" rather than
         the longer "movq $N, %rbp". */
      switch (i->Ain.XAssisted.jk) {
         case Ijk_ClientReq:   trcval = VEX_TRC_JMP_CLIENTREQ;   break;
         case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
         case Ijk_Sys_int32:   trcval = VEX_TRC_JMP_SYS_INT32;   break;
         case Ijk_Sys_int210:  trcval = VEX_TRC_JMP_SYS_INT210;  break;
         case Ijk_Yield:       trcval = VEX_TRC_JMP_YIELD;       break;
         case Ijk_EmWarn:      trcval = VEX_TRC_JMP_EMWARN;      break;
         case Ijk_MapFail:     trcval = VEX_TRC_JMP_MAPFAIL;     break;
         case Ijk_NoDecode:    trcval = VEX_TRC_JMP_NODECODE;    break;
         case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
         case Ijk_NoRedir:     trcval = VEX_TRC_JMP_NOREDIR;     break;
         case Ijk_SigTRAP:     trcval = VEX_TRC_JMP_SIGTRAP;     break;
         case Ijk_SigSEGV:     trcval = VEX_TRC_JMP_SIGSEGV;     break;
         case Ijk_Boring:      trcval = VEX_TRC_JMP_BORING;      break;
         /* We don't expect to see the following being assisted. */
            ppIRJumpKind(i->Ain.XAssisted.jk);
            vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
      vassert(trcval != 0);
      p = emit32(p, trcval);
      /* movabsq $disp_assisted, %r11 */
      p = emit64(p, (Addr)disp_cp_xassisted);
      /* Fix up the conditional jump, if there was one. */
      if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
         Int delta = p - ptmp;
         vassert(delta > 0 && delta < 40);
         *ptmp = toUChar(delta-1);
      vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
      *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
      *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
      p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
      vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
      /* Only 32- or 64-bit variants are allowed. */
      vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
      /* Use ptmp for backpatching conditional jumps. */
      /* jmp fwds if !condition */
      *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
      ptmp = p; /* fill in this bit later */
      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
      /* Now the load.  Either a normal 64 bit load or a normal 32 bit
         load, which, by the default zero-extension rule, zeroes out
         the upper half of the destination, as required. */
      rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
      *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
      p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
      /* Fix up the conditional branch */
      Int delta = p - ptmp;
      vassert(delta > 0 && delta < 40);
      *ptmp = toUChar(delta-1);
      /* AFAICS this is identical to Ain_CLoad except that the opcode
         is 0x89 instead of 0x8B. */
      vassert(i->Ain.CStore.cond != Acc_ALWAYS);
      /* Only 32- or 64-bit variants are allowed. */
      vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
      /* Use ptmp for backpatching conditional jumps. */
      /* jmp fwds if !condition */
      *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
      ptmp = p; /* fill in this bit later */
      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
      /* Now the store. */
      rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
      *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
      p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
      /* Fix up the conditional branch */
      Int delta = p - ptmp;
      vassert(delta > 0 && delta < 40);
      *ptmp = toUChar(delta-1);
      /* No, _don't_ ask me why the sense of the args has to be
         different in the S vs Z case.  I don't know. */
      if (i->Ain.MovxLQ.syned) {
         /* Need REX.W = 1 here, but rexAMode_R does that for us. */
         *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
         p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
         /* Produce a 32-bit reg-reg move, since the implicit
            zero-extend does what we want. */
                  rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
         p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
      if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
      if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
         *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
      if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
         /* This isn't really an existing AMD64 instruction per se.
            Rather, we have to do a 32-bit load.  Because a 32-bit
            write implicitly clears the upper 32 bits of the target
            register, we get what we want. */
                  rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
         p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
      /* Make the destination register be 1 or 0, depending on whether
         the relevant condition holds.  Complication: the top 56 bits
         of the destination should be forced to zero, but doing 'xorq
         %r,%r' kills the flag(s) we are about to read.  Sigh.  So
         start off by moving $0 into the dest. */
      reg = iregEnc3210(i->Ain.Set64.dst);
      *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
      *p++ = toUChar(0xC0 + (reg & 7));
      /* setb lo8(%dst) */
      /* note, 8-bit register rex trickyness.  Be careful here. */
      *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
      *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
      *p++ = toUChar(0xC0 + (reg & 7));
      *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
      if (i->Ain.Bsfr64.isFwds) {
      p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
      *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
      if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
      /* cmpxchg{b,w,l,q} %rbx,mem.  Expected-value in %rax, new value
         in %rbx.  The new-value register is hardwired to be %rbx
         since dealing with byte integer registers is too much hassle,
         so we force the register operand to %rbx (could equally be
         %rcx or %rdx). */
      rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
      if (i->Ain.ACAS.sz != 8)
         rex = clearWBit(rex);
      *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
      if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
      p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
      /* cmpxchg{8,16}b m{64,128}.  Expected-value in %rdx:%rax, new
         value in %rcx:%rbx.  All 4 regs are hardwired in the ISA, so
         aren't encoded in the insn. */
      rex = rexAMode_M_enc(1, i->Ain.DACAS.addr);
      if (i->Ain.DACAS.sz != 8)
         rex = clearWBit(rex);
      p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
      vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
      for (j = 0; j < i->Ain.A87Free.nregs; j++) {
         p = do_ffree_st(p, 7-j);
   case Ain_A87PushPop:
      vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
      if (i->Ain.A87PushPop.isPush) {
         /* Load from memory into %st(0): flds/fldl amode */
                  rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
         p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
         /* Dump %st(0) to memory: fstps/fstpl amode */
                  rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
         *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
         p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
      switch (i->Ain.A87FpOp.op) {
         case Afp_SQRT:   *p++ = 0xD9; *p++ = 0xFA; break;
         case Afp_SIN:    *p++ = 0xD9; *p++ = 0xFE; break;
         case Afp_COS:    *p++ = 0xD9; *p++ = 0xFF; break;
         case Afp_ROUND:  *p++ = 0xD9; *p++ = 0xFC; break;
         case Afp_2XM1:   *p++ = 0xD9; *p++ = 0xF0; break;
         case Afp_SCALE:  *p++ = 0xD9; *p++ = 0xFD; break;
         case Afp_ATAN:   *p++ = 0xD9; *p++ = 0xF3; break;
         case Afp_YL2X:   *p++ = 0xD9; *p++ = 0xF1; break;
         case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
         case Afp_PREM:   *p++ = 0xD9; *p++ = 0xF8; break;
         case Afp_PREM1:  *p++ = 0xD9; *p++ = 0xF5; break;
            /* fptan pushes 1.0 on the FP stack, except when the
               argument is out of range.  Hence we have to do the
               instruction, then inspect C2 to see if there is an out
               of range condition.  If there is, we skip the fincstp
               that is used by the in-range case to get rid of this
               extra 1.0 value. */
            *p++ = 0xD9; *p++ = 0xF2; // fptan
            *p++ = 0x50;              // pushq %rax
            *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
            *p++ = 0x66; *p++ = 0xA9;
            *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
            *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
            *p++ = 0xD9; *p++ = 0xF7; // fincstp
            *p++ = 0x58;              // after_fincstp: popq %rax
               rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
      p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
               rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
      p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
      if (i->Ain.Store.sz == 2) {
         /* This just goes to show the craziness of the instruction
            set encoding.  We have to insert two prefix bytes, but be
            careful to avoid a conflict in what the size should be, by
            ensuring that REX.W = 0. */
         *p++ = 0x66; /* override to 16-bits */
         *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
      if (i->Ain.Store.sz == 4) {
         *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
      if (i->Ain.Store.sz == 1) {
         /* This is one place where it would be wrong to skip emitting
            a rex byte of 0x40, since the mere presence of rex changes
            the meaning of the byte register access.  Be careful. */
         *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
         p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
      *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
      p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
      /* ucomi[sd] %srcL, %srcR ;  pushfq ; popq %dst */
      /* ucomi[sd] %srcL, %srcR */
      if (i->Ain.SseUComIS.sz == 8) {
      vassert(i->Ain.SseUComIS.sz == 4);
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
                                   vregEnc3210(i->Ain.SseUComIS.srcR) ));
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
                            vregEnc3210(i->Ain.SseUComIS.srcR) );
      *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
      *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
      /* cvtsi2s[sd] %src, %dst */
      rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
                                i->Ain.SseSI2SF.src );
      *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
      *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
      p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
                             i->Ain.SseSI2SF.src );
      /* cvts[sd]2si %src, %dst */
      rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
                                vregEnc3210(i->Ain.SseSF2SI.src) );
      *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
      *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
      p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
                             vregEnc3210(i->Ain.SseSF2SI.src) );
      /* cvtsd2ss/cvtss2sd %src, %dst */
      *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
                                   vregEnc3210(i->Ain.SseSDSS.src) ));
      p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
                             vregEnc3210(i->Ain.SseSDSS.src) );
      if (i->Ain.SseLdSt.sz == 8) {
      if (i->Ain.SseLdSt.sz == 4) {
      if (i->Ain.SseLdSt.sz != 16) {
               rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
                              i->Ain.SseLdSt.addr));
      *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
                        i->Ain.SseLdSt.addr);
   case Ain_SseCStore: {
      vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
      /* Use ptmp for backpatching conditional jumps. */
      /* jmp fwds if !condition */
      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
      ptmp = p; /* fill in this bit later */
      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
      /* Now the store. */
               rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
                              i->Ain.SseCStore.addr));
      *p++ = toUChar(0x11);
      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
                        i->Ain.SseCStore.addr);
      /* Fix up the conditional branch */
      Int delta = p - ptmp;
      vassert(delta > 0 && delta < 40);
      *ptmp = toUChar(delta-1);

   case Ain_SseCLoad: {
      vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
      /* Use ptmp for backpatching conditional jumps. */
      /* jmp fwds if !condition */
      *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
      ptmp = p; /* fill in this bit later */
      *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
               rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
                              i->Ain.SseCLoad.addr));
      *p++ = toUChar(0x10);
      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
                        i->Ain.SseCLoad.addr);
      /* Fix up the conditional branch */
      Int delta = p - ptmp;
      vassert(delta > 0 && delta < 40);
      *ptmp = toUChar(delta-1);
      vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
      /* movs[sd] amode, %xmm-dst */
      *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
               rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
                              i->Ain.SseLdzLO.addr));
      p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
                        i->Ain.SseLdzLO.addr);
   case Ain_Sse32Fx4: {
      UInt srcRegNo = vregEnc3210(i->Ain.Sse32Fx4.src);
      UInt dstRegNo = vregEnc3210(i->Ain.Sse32Fx4.dst);
      // VEX encoded cases
      switch (i->Ain.Sse32Fx4.op) {
         case Asse_F16toF32: { // vcvtph2ps %xmmS, %xmmD
            // VCVTPH2PS %xmmS, %xmmD (s and d are both xmm regs, range 0 .. 15)
            // 0xC4 : ~d3 1 ~s3 0 0 0 1 0 : 0x79 : 0x13 : 1 1 d2 d1 d0 s2 s1 s0
            UInt byte2 = ((((~d)>>3)&1)<<7) | (1<<6)
                         | ((((~s)>>3)&1)<<5) | (1<<1);
            UInt byte5 = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
         case Asse_F32toF16: { // vcvtps2ph $4, %xmmS, %xmmD
            // VCVTPS2PH $4, %xmmS, %xmmD (s and d both xmm regs, range 0 .. 15)
            // 0xC4 : ~s3 1 ~d3 0 0 0 1 1 : 0x79
            //      : 0x1D : 11 s2 s1 s0 d2 d1 d0 : 0x4
            UInt byte2 = ((((~s)>>3)&1)<<7) | (1<<6)
                         | ((((~d)>>3)&1)<<5) | (1<<1) | (1 << 0);
            UInt byte5 = (1<<7) | (1<<6) | ((s&7) << 3) | ((d&7) << 0);
      // After this point, REX encoded cases only
      switch (i->Ain.Sse32Fx4.op) {
         case Asse_F2I: *p++ = 0x66; break;
      *p++ = clearWBit(rexAMode_R_enc_enc(dstRegNo, srcRegNo));
      switch (i->Ain.Sse32Fx4.op) {
         case Asse_ADDF:   *p++ = 0x58; break;
         case Asse_DIVF:   *p++ = 0x5E; break;
         case Asse_MAXF:   *p++ = 0x5F; break;
         case Asse_MINF:   *p++ = 0x5D; break;
         case Asse_MULF:   *p++ = 0x59; break;
         case Asse_RCPF:   *p++ = 0x53; break;
         case Asse_RSQRTF: *p++ = 0x52; break;
         case Asse_SQRTF:  *p++ = 0x51; break;
         case Asse_I2F:    *p++ = 0x5B; break; // cvtdq2ps; no 0x66 pfx
         case Asse_F2I:    *p++ = 0x5B; break; // cvtps2dq; with 0x66 pfx
         case Asse_SUBF:   *p++ = 0x5C; break;
         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
      p = doAMode_R_enc_enc(p, dstRegNo, srcRegNo);
      *p++ = toUChar(xtra & 0xFF);
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
                                   vregEnc3210(i->Ain.Sse64Fx2.src) ));
      switch (i->Ain.Sse64Fx2.op) {
         case Asse_ADDF:   *p++ = 0x58; break;
         case Asse_DIVF:   *p++ = 0x5E; break;
         case Asse_MAXF:   *p++ = 0x5F; break;
         case Asse_MINF:   *p++ = 0x5D; break;
         case Asse_MULF:   *p++ = 0x59; break;
         case Asse_SQRTF:  *p++ = 0x51; break;
         case Asse_SUBF:   *p++ = 0x5C; break;
         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
                            vregEnc3210(i->Ain.Sse64Fx2.src) );
      *p++ = toUChar(xtra & 0xFF);
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
                                   vregEnc3210(i->Ain.Sse32FLo.src) ));
      switch (i->Ain.Sse32FLo.op) {
         case Asse_ADDF:   *p++ = 0x58; break;
         case Asse_DIVF:   *p++ = 0x5E; break;
         case Asse_MAXF:   *p++ = 0x5F; break;
         case Asse_MINF:   *p++ = 0x5D; break;
         case Asse_MULF:   *p++ = 0x59; break;
         case Asse_RCPF:   *p++ = 0x53; break;
         case Asse_RSQRTF: *p++ = 0x52; break;
         case Asse_SQRTF:  *p++ = 0x51; break;
         case Asse_SUBF:   *p++ = 0x5C; break;
         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
                            vregEnc3210(i->Ain.Sse32FLo.src) );
      *p++ = toUChar(xtra & 0xFF);
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
                                   vregEnc3210(i->Ain.Sse64FLo.src) ));
      switch (i->Ain.Sse64FLo.op) {
         case Asse_ADDF:   *p++ = 0x58; break;
         case Asse_DIVF:   *p++ = 0x5E; break;
         case Asse_MAXF:   *p++ = 0x5F; break;
         case Asse_MINF:   *p++ = 0x5D; break;
         case Asse_MULF:   *p++ = 0x59; break;
         case Asse_SQRTF:  *p++ = 0x51; break;
         case Asse_SUBF:   *p++ = 0x5C; break;
         case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
         case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
         case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
         case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
                            vregEnc3210(i->Ain.Sse64FLo.src) );
      *p++ = toUChar(xtra & 0xFF);
#     define XX(_n) *p++ = (_n)
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
                                   vregEnc3210(i->Ain.SseReRg.src) ));
      switch (i->Ain.SseReRg.op) {
         case Asse_MOV:      /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
         case Asse_OR:                  XX(rex); XX(0x0F); XX(0x56); break;
         case Asse_XOR:                 XX(rex); XX(0x0F); XX(0x57); break;
         case Asse_AND:                 XX(rex); XX(0x0F); XX(0x54); break;
         case Asse_ANDN:                XX(rex); XX(0x0F); XX(0x55); break;
         case Asse_PACKSSD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
         case Asse_PACKSSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
         case Asse_PACKUSW:  XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
         case Asse_ADD8:     XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
         case Asse_ADD16:    XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
         case Asse_ADD32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
         case Asse_ADD64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
         case Asse_QADD8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
         case Asse_QADD16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
         case Asse_QADD8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
         case Asse_QADD16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
         case Asse_AVG8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
         case Asse_AVG16U:   XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
         case Asse_CMPEQ8:   XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
         case Asse_CMPEQ16:  XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
         case Asse_CMPEQ32:  XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
         case Asse_CMPGT8S:  XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
         case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
         case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
         case Asse_MAX16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
         case Asse_MAX8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
         case Asse_MIN16S:   XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
         case Asse_MIN8U:    XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
         case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
         case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
         case Asse_MUL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
         case Asse_SHL16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
         case Asse_SHL32:    XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
         case Asse_SHL64:    XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
         case Asse_SAR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
         case Asse_SAR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
         case Asse_SHR16:    XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
         case Asse_SHR32:    XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
         case Asse_SHR64:    XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
         case Asse_SUB8:     XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
         case Asse_SUB16:    XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
         case Asse_SUB32:    XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
         case Asse_SUB64:    XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
         case Asse_QSUB8S:   XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
         case Asse_QSUB16S:  XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
         case Asse_QSUB8U:   XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
         case Asse_QSUB16U:  XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
         case Asse_UNPCKHB:  XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
         case Asse_UNPCKHW:  XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
         case Asse_UNPCKHD:  XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
         case Asse_UNPCKHQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
         case Asse_UNPCKLB:  XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
         case Asse_UNPCKLW:  XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
         case Asse_UNPCKLD:  XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
         case Asse_UNPCKLQ:  XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
         case Asse_PSHUFB:   XX(0x66); XX(rex);
                             XX(0x0F); XX(0x38); XX(0x00); break;
         case Asse_PMADDUBSW:XX(0x66); XX(rex);
                             XX(0x0F); XX(0x38); XX(0x04); break;
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
                            vregEnc3210(i->Ain.SseReRg.src) );
      /* jmp fwds if !condition */
      *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
      *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
      /* movaps %src, %dst */
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
                                   vregEnc3210(i->Ain.SseCMov.src) ));
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
                            vregEnc3210(i->Ain.SseCMov.src) );
      /* Fill in the jump offset. */
      *(ptmp-1) = toUChar(p - ptmp);
               rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
                                   vregEnc3210(i->Ain.SseShuf.src) ));
      p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
                            vregEnc3210(i->Ain.SseShuf.src) );
      *p++ = (UChar)(i->Ain.SseShuf.order);
   case Ain_SseShiftN: {
      UInt shiftImm = i->Ain.SseShiftN.shiftBits;
      switch (i->Ain.SseShiftN.op) {
         case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
         case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
         case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
         case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
         case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
         case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
         case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
         case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
            if ((shiftImm & 7) != 0) goto bad;
            limit = 15; opc = 0x73; subopc_imm = 7;
            if ((shiftImm & 7) != 0) goto bad;
            limit = 15; opc = 0x73; subopc_imm = 3;
            // This should never happen .. SSE2 only offers the above 10 insns
            // for the "shift with immediate" case
      vassert(limit > 0 && opc > 0 && subopc_imm > 0);
      if (shiftImm > limit) goto bad;
               rexAMode_R_enc_enc( subopc_imm,
                                   vregEnc3210(i->Ain.SseShiftN.dst) ));
      p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
      Bool toXMM = i->Ain.SseMOVQ.toXMM;
      HReg gpr = i->Ain.SseMOVQ.gpr;
      HReg xmm = i->Ain.SseMOVQ.xmm;
      *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) );
      *p++ = toXMM ? 0x6E : 0x7E;
      p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) );
   //uu case Ain_AvxLdSt: {
   //uu    UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
   //uu                           i->Ain.AvxLdSt.addr );
   //uu    p = emitVexPrefix(p, vex);
   //uu    *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
   //uu    p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
              (3 bytes)  decl 8(%rbp)    8 == offsetof(host_EvC_COUNTER)
              (2 bytes)  jns  nofail     expected taken
              (3 bytes)  jmp* 0(%rbp)    0 == offsetof(host_EvC_FAILADDR)
         */
         /* This is heavily asserted re instruction lengths.  It needs to
            be.  If we get given unexpected forms of .amCounter or
            .amFailAddr -- basically, anything that's not of the form
            uimm7(%rbp) -- they are likely to fail. */
         /* Note also that after the decl we must be very careful not to
            read the carry flag, else we get a partial flags stall.
            js/jns avoids that, though. */
         /* --- decl 8(%rbp) --- */
         /* Need to compute the REX byte for the decl in order to prove
            that we don't need it, since this is a 32-bit decl and all
            registers involved in the amode are < r8.  "1" because
            there's no register in this encoding; instead the register
            field is used as a sub opcode.  The encoding for "decl r/m32"
            is FF /1, hence the "1". */
         rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
         if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
         *p++ = 0xFF; /* decl r/m32 == FF /1 */
         p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
         vassert(p - p0 == 3);
         /* --- jns nofail --- */
         *p++ = 0x79; /* jns rel8 */
         *p++ = 0x03; /* need to check this 0x03 after the next insn */
         vassert(p - p0 == 5);
         /* --- jmp* 0(%rbp) --- */
         /* Once again, verify we don't need REX.  The encoding is FF /4.
            We don't need REX.W since by default FF /4 in 64-bit mode
            implies a 64 bit load. */
         rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
         if (rex != 0x40) goto bad;
         *p++ = 0xFF; /* jmp r/m64 == FF /4 */
         p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
         vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
         /* And crosscheck .. */
         vassert(evCheckSzB_AMD64() == 8);
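         /* For orientation: with the usual amodes 8(%rbp) and 0(%rbp),
            the 8 bytes laid down above are
               FF 4D 08     decl 8(%rbp)
               79 03        jns  nofail
               FF 65 00     jmp* 0(%rbp)
            (illustrative only; the asserts above are what actually pin
            the layout down). */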
         /* We generate   movabsq $0, %r11
                          incq (%r11)
            in the expectation that a later call to LibVEX_PatchProfInc
            will be used to fill in the immediate field once the right
            value is known.
              49 BB 00 00 00 00 00 00 00 00
              49 FF 03
         */
         *p++ = 0x49; *p++ = 0xBB;
         *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
         *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
         *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
         /* Tell the caller .. */
         vassert(!(*is_profInc));
         *is_profInc = True;
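         /* The eight 0x00 bytes are the movabsq immediate (p[2..9]); they
            stay zero until patchProfInc_AMD64 below overwrites them with
            the real counter address. */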
      ppAMD64Instr(i, mode64);
      vpanic("emit_AMD64Instr");

   vassert(p - &buf[0] <= 64);
/* How big is an event check?  See case for Ain_EvCheck in
   emit_AMD64Instr just above.  That crosschecks what this returns, so
   we can tell if we're inconsistent. */
Int evCheckSzB_AMD64 (void)
/* NB: what goes on here has to be very closely coordinated with the
   emitInstr case for XDirect, above. */
VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
                                   void* place_to_chain,
                                   const void* disp_cp_chain_me_EXPECTED,
                                   const void* place_to_jump_to )
{
   vassert(endness_host == VexEndnessLE);
   /* What we're expecting to see is:
        movabsq $disp_cp_chain_me_EXPECTED, %r11
        call *%r11
      viz
        49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
        41 FF D3
   */
   UChar* p = (UChar*)place_to_chain;
   vassert(p[0] == 0x49);
   vassert(p[1] == 0xBB);
   vassert(read_misaligned_ULong_LE(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
   vassert(p[10] == 0x41);
   vassert(p[11] == 0xFF);
   vassert(p[12] == 0xD3);
   /* And what we want to change it to is either:
        movabsq $place_to_jump_to, %r11
        jmpq *%r11
      viz
        49 BB <8 bytes value == place_to_jump_to>
        41 FF E3
      So it's the same length (convenient, huh) and we don't
      need to change all the bits.
      ---OR---
      in the case where the displacement falls within 32 bits
        jmpq disp32   where disp32 is relative to the next insn
        ud2; ud2; ud2; ud2
      viz
        E9 <4 bytes == disp32>
        0F 0B 0F 0B 0F 0B 0F 0B
      In both cases the replacement has the same length as the original.
      To remain sane & verifiable,
      (1) limit the displacement for the short form to
          (say) +/- one billion, so as to avoid wraparound
          off-by-ones
      (2) even if the short form is applicable, once every (say)
          1024 times use the long form anyway, so as to maintain
          verifiability
   */
   /* This is the delta we need to put into a JMP d32 insn.  It's
      relative to the start of the next insn, hence the -5. */
   Long delta   = (Long)((const UChar*)place_to_jump_to - (const UChar*)p) - 5;
   Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
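   /* Worked example (illustrative only): if place_to_jump_to sits
      0x1000 bytes beyond p, then delta = 0x1000 - 5 = 0xFFB, well inside
      +/- one billion, so the short form is eligible and the site would
      become E9 FB 0F 00 00 followed by four ud2s (0F 0B pairs) -- unless
      the 1-in-1024 long-form override below kicks in. */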
   static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
      shortCTR++; // thread safety bleh
      if (0 == (shortCTR & 0x3FF)) {
            vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
                       "using long jmp\n", shortCTR);
   /* And make the modifications. */
      write_misaligned_UInt_LE(&p[1], (UInt)(Int)delta);
      p[5]  = 0x0F; p[6]  = 0x0B;
      p[7]  = 0x0F; p[8]  = 0x0B;
      p[9]  = 0x0F; p[10] = 0x0B;
      p[11] = 0x0F; p[12] = 0x0B;
      /* sanity check on the delta -- top 32 are all 0 or all 1 */
      delta >>= 32;
      vassert(delta == 0LL || delta == -1LL);
      /* Minimal modifications from the starting sequence. */
      write_misaligned_ULong_LE(&p[2], (ULong)(Addr)place_to_jump_to);
   VexInvalRange vir = { (HWord)place_to_chain, 13 };
/* NB: what goes on here has to be very closely coordinated with the
   emitInstr case for XDirect, above. */
VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
                                     void* place_to_unchain,
                                     const void* place_to_jump_to_EXPECTED,
                                     const void* disp_cp_chain_me )
{
   vassert(endness_host == VexEndnessLE);
   /* What we're expecting to see is either:
        movabsq $place_to_jump_to_EXPECTED, %r11
        jmpq *%r11
      viz
        49 BB <8 bytes value == place_to_jump_to_EXPECTED>
        41 FF E3
      ---OR---
      in the case where the displacement falls within 32 bits
        jmpq disp32
        ud2; ud2; ud2; ud2
      viz
        E9 <4 bytes == disp32>
        0F 0B 0F 0B 0F 0B 0F 0B
   */
   UChar* p = (UChar*)place_to_unchain;
   if (p[0] == 0x49 && p[1] == 0xBB
       && read_misaligned_ULong_LE(&p[2])
          == (ULong)(Addr)place_to_jump_to_EXPECTED
       && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
      /* it's the long form */
       && p[5]  == 0x0F && p[6]  == 0x0B
       && p[7]  == 0x0F && p[8]  == 0x0B
       && p[9]  == 0x0F && p[10] == 0x0B
       && p[11] == 0x0F && p[12] == 0x0B) {
      /* It's the short form.  Check the offset is right. */
      Int  s32 = (Int)read_misaligned_UInt_LE(&p[1]);
      Long s64 = (Long)s32;
      if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
            vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
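         /* Illustrative check: a short-form site with disp32 =
            0xFFFFF000 (s64 = -0x1000) jumps to p + 5 - 0x1000, so only
            that address is accepted as place_to_jump_to_EXPECTED. */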
   /* And what we want to change it to is:
        movabsq $disp_cp_chain_me, %r11
        call *%r11
      viz
        49 BB <8 bytes value == disp_cp_chain_me>
        41 FF D3
      So it's the same length (convenient, huh).
   */
   write_misaligned_ULong_LE(&p[2], (ULong)(Addr)disp_cp_chain_me);
   VexInvalRange vir = { (HWord)place_to_unchain, 13 };
/* Patch the counter address into a profile inc point, as previously
   created by the Ain_ProfInc case for emit_AMD64Instr. */
VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
                                   void*  place_to_patch,
                                   const ULong* location_of_counter )
{
   vassert(endness_host == VexEndnessLE);
   vassert(sizeof(ULong*) == 8);
   UChar* p = (UChar*)place_to_patch;
   vassert(p[0]  == 0x49);
   vassert(p[1]  == 0xBB);
   vassert(p[2]  == 0x00);
   vassert(p[3]  == 0x00);
   vassert(p[4]  == 0x00);
   vassert(p[5]  == 0x00);
   vassert(p[6]  == 0x00);
   vassert(p[7]  == 0x00);
   vassert(p[8]  == 0x00);
   vassert(p[9]  == 0x00);
   vassert(p[10] == 0x49);
   vassert(p[11] == 0xFF);
   vassert(p[12] == 0x03);
   ULong imm64 = (ULong)(Addr)location_of_counter;
   p[2] = imm64 & 0xFF; imm64 >>= 8;
   p[3] = imm64 & 0xFF; imm64 >>= 8;
   p[4] = imm64 & 0xFF; imm64 >>= 8;
   p[5] = imm64 & 0xFF; imm64 >>= 8;
   p[6] = imm64 & 0xFF; imm64 >>= 8;
   p[7] = imm64 & 0xFF; imm64 >>= 8;
   p[8] = imm64 & 0xFF; imm64 >>= 8;
   p[9] = imm64 & 0xFF; imm64 >>= 8;
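   /* The eight stores above write imm64 into p[2..9] little-endian,
      i.e. they fill in the movabsq immediate; the effect is the same as
      write_misaligned_ULong_LE(&p[2], imm64), just spelled out byte by
      byte.  For example, a counter at 0x1122334455667788 leaves
      p[2..9] = 88 77 66 55 44 33 22 11. */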
   VexInvalRange vir = { (HWord)place_to_patch, 13 };


/*---------------------------------------------------------------*/
/*--- end                                   host_amd64_defs.c ---*/
/*---------------------------------------------------------------*/