[valgrind.git] / VEX / priv / host_amd64_defs.c
blob 253ed65150e5365a3b23f4006ca537150d77b4a7
2 /*---------------------------------------------------------------*/
3 /*--- begin host_amd64_defs.c ---*/
4 /*---------------------------------------------------------------*/
6 /*
7 This file is part of Valgrind, a dynamic binary instrumentation
8 framework.
10 Copyright (C) 2004-2017 OpenWorks LLP
11 info@open-works.net
13 This program is free software; you can redistribute it and/or
14 modify it under the terms of the GNU General Public License as
15 published by the Free Software Foundation; either version 2 of the
16 License, or (at your option) any later version.
18 This program is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with this program; if not, see <http://www.gnu.org/licenses/>.
26 The GNU General Public License is contained in the file COPYING.
28 Neither the names of the U.S. Department of Energy nor the
29 University of California nor the names of its contributors may be
30 used to endorse or promote products derived from this software
31 without prior written permission.
34 #include "libvex_basictypes.h"
35 #include "libvex.h"
36 #include "libvex_trc_values.h"
38 #include "main_util.h"
39 #include "host_generic_regs.h"
40 #include "host_amd64_defs.h"
43 /* --------- Registers. --------- */
45 const RRegUniverse* getRRegUniverse_AMD64 ( void )
47 /* The real-register universe is a big constant, so we just want to
48 initialise it once. */
49 static RRegUniverse rRegUniverse_AMD64;
50 static Bool rRegUniverse_AMD64_initted = False;
52 /* Handy shorthand, nothing more */
53 RRegUniverse* ru = &rRegUniverse_AMD64;
55 /* This isn't thread-safe. Sigh. */
56 if (LIKELY(rRegUniverse_AMD64_initted))
57 return ru;
59 RRegUniverse__init(ru);
61 /* Add the registers. The initial segment of this array must be
62 those available for allocation by reg-alloc, and those that
63 follow are not available for allocation. */
64 ru->allocable_start[HRcInt64] = ru->size;
65 ru->regs[ru->size++] = hregAMD64_R12();
66 ru->regs[ru->size++] = hregAMD64_R13();
67 ru->regs[ru->size++] = hregAMD64_R14();
68 ru->regs[ru->size++] = hregAMD64_R15();
69 ru->regs[ru->size++] = hregAMD64_RBX();
70 ru->regs[ru->size++] = hregAMD64_RSI();
71 ru->regs[ru->size++] = hregAMD64_RDI();
72 ru->regs[ru->size++] = hregAMD64_R8();
73 ru->regs[ru->size++] = hregAMD64_R9();
74 ru->regs[ru->size++] = hregAMD64_R10();
75 ru->allocable_end[HRcInt64] = ru->size - 1;
77 ru->allocable_start[HRcVec128] = ru->size;
78 ru->regs[ru->size++] = hregAMD64_XMM3();
79 ru->regs[ru->size++] = hregAMD64_XMM4();
80 ru->regs[ru->size++] = hregAMD64_XMM5();
81 ru->regs[ru->size++] = hregAMD64_XMM6();
82 ru->regs[ru->size++] = hregAMD64_XMM7();
83 ru->regs[ru->size++] = hregAMD64_XMM8();
84 ru->regs[ru->size++] = hregAMD64_XMM9();
85 ru->regs[ru->size++] = hregAMD64_XMM10();
86 ru->regs[ru->size++] = hregAMD64_XMM11();
87 ru->regs[ru->size++] = hregAMD64_XMM12();
88 ru->allocable_end[HRcVec128] = ru->size - 1;
89 ru->allocable = ru->size;
91 /* And other regs, not available to the allocator. */
92 ru->regs[ru->size++] = hregAMD64_RAX();
93 ru->regs[ru->size++] = hregAMD64_RCX();
94 ru->regs[ru->size++] = hregAMD64_RDX();
95 ru->regs[ru->size++] = hregAMD64_RSP();
96 ru->regs[ru->size++] = hregAMD64_RBP();
97 ru->regs[ru->size++] = hregAMD64_R11();
98 ru->regs[ru->size++] = hregAMD64_XMM0();
99 ru->regs[ru->size++] = hregAMD64_XMM1();
101 rRegUniverse_AMD64_initted = True;
103 RRegUniverse__check_is_sane(ru);
104 return ru;
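/* Editor's illustration (not part of VEX): a minimal sketch of how the
   layout built above could be consumed.  The allocatable registers of
   each class form one contiguous run bounded by allocable_start[] and
   allocable_end[]; entries at index 'allocable' and beyond are visible
   to the backend but not to the register allocator.  The helper name
   is hypothetical; the fields and functions it uses all appear in this
   file or its headers. */
static void ppAllocatableInt64Regs_EXAMPLE ( void )
{
   const RRegUniverse* univ = getRRegUniverse_AMD64();
   UInt k;
   /* Walk the Int64 run: r12 r13 r14 r15 rbx rsi rdi r8 r9 r10. */
   for (k = univ->allocable_start[HRcInt64];
        k <= univ->allocable_end[HRcInt64]; k++) {
      ppHRegAMD64(univ->regs[k]);
      vex_printf(" ");
   }
   vex_printf("\n");
}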
108 UInt ppHRegAMD64 ( HReg reg )
110 Int r;
111 static const HChar* ireg64_names[16]
112 = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
113 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
114 /* Be generic for all virtual regs. */
115 if (hregIsVirtual(reg)) {
116 return ppHReg(reg);
118 /* But specific for real regs. */
119 switch (hregClass(reg)) {
120 case HRcInt64:
121 r = hregEncoding(reg);
122 vassert(r >= 0 && r < 16);
123 return vex_printf("%s", ireg64_names[r]);
124 case HRcVec128:
125 r = hregEncoding(reg);
126 vassert(r >= 0 && r < 16);
127 return vex_printf("%%xmm%d", r);
128 default:
129 vpanic("ppHRegAMD64");
133 static UInt ppHRegAMD64_lo32 ( HReg reg )
135 Int r;
136 static const HChar* ireg32_names[16]
137 = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
138 "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
139 /* Be generic for all virtual regs. */
140 if (hregIsVirtual(reg)) {
141 UInt written = ppHReg(reg);
142 written += vex_printf("d");
143 return written;
145 /* But specific for real regs. */
146 switch (hregClass(reg)) {
147 case HRcInt64:
148 r = hregEncoding(reg);
149 vassert(r >= 0 && r < 16);
150 return vex_printf("%s", ireg32_names[r]);
151 default:
152 vpanic("ppHRegAMD64_lo32: invalid regclass");
157 /* --------- Condition codes, Intel encoding. --------- */
159 const HChar* showAMD64CondCode ( AMD64CondCode cond )
161 switch (cond) {
162 case Acc_O: return "o";
163 case Acc_NO: return "no";
164 case Acc_B: return "b";
165 case Acc_NB: return "nb";
166 case Acc_Z: return "z";
167 case Acc_NZ: return "nz";
168 case Acc_BE: return "be";
169 case Acc_NBE: return "nbe";
170 case Acc_S: return "s";
171 case Acc_NS: return "ns";
172 case Acc_P: return "p";
173 case Acc_NP: return "np";
174 case Acc_L: return "l";
175 case Acc_NL: return "nl";
176 case Acc_LE: return "le";
177 case Acc_NLE: return "nle";
178 case Acc_ALWAYS: return "ALWAYS";
179       default: vpanic("showAMD64CondCode");
184 /* --------- AMD64AMode: memory address expressions. --------- */
186 AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
187 AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
188 am->tag = Aam_IR;
189 am->Aam.IR.imm = imm32;
190 am->Aam.IR.reg = reg;
191 return am;
193 AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
194 AMD64AMode* am = LibVEX_Alloc_inline(sizeof(AMD64AMode));
195 am->tag = Aam_IRRS;
196 am->Aam.IRRS.imm = imm32;
197 am->Aam.IRRS.base = base;
198 am->Aam.IRRS.index = indEx;
199 am->Aam.IRRS.shift = shift;
200 vassert(shift >= 0 && shift <= 3);
201 return am;
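/* Editor's illustration (not part of VEX): how a concrete amode maps
   onto the constructors above.  The IRRS form describes
   imm(base,index,1<<shift), so the x86 operand 0x18(%rbp,%r10,8) --
   that is, rbp + r10*8 + 0x18 -- is built with shift == 3.  The helper
   name is hypothetical. */
static AMD64AMode* mk_amode_EXAMPLE ( void )
{
   return AMD64AMode_IRRS( 0x18, hregAMD64_RBP(), hregAMD64_R10(), 3 );
}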
204 void ppAMD64AMode ( AMD64AMode* am ) {
205 switch (am->tag) {
206 case Aam_IR:
207 if (am->Aam.IR.imm == 0)
208 vex_printf("(");
209 else
210 vex_printf("0x%x(", am->Aam.IR.imm);
211 ppHRegAMD64(am->Aam.IR.reg);
212 vex_printf(")");
213 return;
214 case Aam_IRRS:
215 vex_printf("0x%x(", am->Aam.IRRS.imm);
216 ppHRegAMD64(am->Aam.IRRS.base);
217 vex_printf(",");
218 ppHRegAMD64(am->Aam.IRRS.index);
219 vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
220 return;
221 default:
222 vpanic("ppAMD64AMode");
226 static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
227 switch (am->tag) {
228 case Aam_IR:
229 addHRegUse(u, HRmRead, am->Aam.IR.reg);
230 return;
231 case Aam_IRRS:
232 addHRegUse(u, HRmRead, am->Aam.IRRS.base);
233 addHRegUse(u, HRmRead, am->Aam.IRRS.index);
234 return;
235 default:
236 vpanic("addRegUsage_AMD64AMode");
240 static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
241 switch (am->tag) {
242 case Aam_IR:
243 am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
244 return;
245 case Aam_IRRS:
246 am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
247 am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
248 return;
249 default:
250 vpanic("mapRegs_AMD64AMode");
254 /* --------- Operand, which can be reg, immediate or memory. --------- */
256 AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
257 AMD64RMI* op = LibVEX_Alloc_inline(sizeof(AMD64RMI));
258 op->tag = Armi_Imm;
259 op->Armi.Imm.imm32 = imm32;
260 return op;
262 AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
263 AMD64RMI* op = LibVEX_Alloc_inline(sizeof(AMD64RMI));
264 op->tag = Armi_Reg;
265 op->Armi.Reg.reg = reg;
266 return op;
268 AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
269 AMD64RMI* op = LibVEX_Alloc_inline(sizeof(AMD64RMI));
270 op->tag = Armi_Mem;
271 op->Armi.Mem.am = am;
272 return op;
275 static void ppAMD64RMI_wrk ( AMD64RMI* op, Bool lo32 ) {
276 switch (op->tag) {
277 case Armi_Imm:
278 vex_printf("$0x%x", op->Armi.Imm.imm32);
279 return;
280 case Armi_Reg:
281 if (lo32)
282 ppHRegAMD64_lo32(op->Armi.Reg.reg);
283 else
284 ppHRegAMD64(op->Armi.Reg.reg);
285 return;
286 case Armi_Mem:
287 ppAMD64AMode(op->Armi.Mem.am);
288 return;
289 default:
290 vpanic("ppAMD64RMI");
293 void ppAMD64RMI ( AMD64RMI* op ) {
294 ppAMD64RMI_wrk(op, False/*!lo32*/);
296 void ppAMD64RMI_lo32 ( AMD64RMI* op ) {
297 ppAMD64RMI_wrk(op, True/*lo32*/);
300 /* An AMD64RMI can only be used in a "read" context (what would it mean
301 to write or modify a literal?) and so we enumerate its registers
302 accordingly. */
303 static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
304 switch (op->tag) {
305 case Armi_Imm:
306 return;
307 case Armi_Reg:
308 addHRegUse(u, HRmRead, op->Armi.Reg.reg);
309 return;
310 case Armi_Mem:
311 addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
312 return;
313 default:
314 vpanic("addRegUsage_AMD64RMI");
318 static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
319 switch (op->tag) {
320 case Armi_Imm:
321 return;
322 case Armi_Reg:
323 op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
324 return;
325 case Armi_Mem:
326 mapRegs_AMD64AMode(m, op->Armi.Mem.am);
327 return;
328 default:
329 vpanic("mapRegs_AMD64RMI");
334 /* --------- Operand, which can be reg or immediate only. --------- */
336 AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
337 AMD64RI* op = LibVEX_Alloc_inline(sizeof(AMD64RI));
338 op->tag = Ari_Imm;
339 op->Ari.Imm.imm32 = imm32;
340 return op;
342 AMD64RI* AMD64RI_Reg ( HReg reg ) {
343 AMD64RI* op = LibVEX_Alloc_inline(sizeof(AMD64RI));
344 op->tag = Ari_Reg;
345 op->Ari.Reg.reg = reg;
346 return op;
349 void ppAMD64RI ( AMD64RI* op ) {
350 switch (op->tag) {
351 case Ari_Imm:
352 vex_printf("$0x%x", op->Ari.Imm.imm32);
353 return;
354 case Ari_Reg:
355 ppHRegAMD64(op->Ari.Reg.reg);
356 return;
357 default:
358 vpanic("ppAMD64RI");
362 /* An AMD64RI can only be used in a "read" context (what would it mean
363 to write or modify a literal?) and so we enumerate its registers
364 accordingly. */
365 static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
366 switch (op->tag) {
367 case Ari_Imm:
368 return;
369 case Ari_Reg:
370 addHRegUse(u, HRmRead, op->Ari.Reg.reg);
371 return;
372 default:
373 vpanic("addRegUsage_AMD64RI");
377 static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
378 switch (op->tag) {
379 case Ari_Imm:
380 return;
381 case Ari_Reg:
382 op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
383 return;
384 default:
385 vpanic("mapRegs_AMD64RI");
390 /* --------- Operand, which can be reg or memory only. --------- */
392 AMD64RM* AMD64RM_Reg ( HReg reg ) {
393 AMD64RM* op = LibVEX_Alloc_inline(sizeof(AMD64RM));
394 op->tag = Arm_Reg;
395 op->Arm.Reg.reg = reg;
396 return op;
398 AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
399 AMD64RM* op = LibVEX_Alloc_inline(sizeof(AMD64RM));
400 op->tag = Arm_Mem;
401 op->Arm.Mem.am = am;
402 return op;
405 void ppAMD64RM ( AMD64RM* op ) {
406 switch (op->tag) {
407 case Arm_Mem:
408 ppAMD64AMode(op->Arm.Mem.am);
409 return;
410 case Arm_Reg:
411 ppHRegAMD64(op->Arm.Reg.reg);
412 return;
413 default:
414 vpanic("ppAMD64RM");
418 /* Because an AMD64RM can be both a source or destination operand, we
419 have to supply a mode -- pertaining to the operand as a whole --
420 indicating how it's being used. */
421 static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
422 switch (op->tag) {
423 case Arm_Mem:
424 /* Memory is read, written or modified. So we just want to
425 know the regs read by the amode. */
426 addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
427 return;
428 case Arm_Reg:
429 /* reg is read, written or modified. Add it in the
430 appropriate way. */
431 addHRegUse(u, mode, op->Arm.Reg.reg);
432 return;
433 default:
434 vpanic("addRegUsage_AMD64RM");
438 static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
440 switch (op->tag) {
441 case Arm_Mem:
442 mapRegs_AMD64AMode(m, op->Arm.Mem.am);
443 return;
444 case Arm_Reg:
445 op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
446 return;
447 default:
448 vpanic("mapRegs_AMD64RM");
453 /* --------- Instructions. --------- */
455 static const HChar* showAMD64ScalarSz ( Int sz ) {
456 switch (sz) {
457 case 2: return "w";
458 case 4: return "l";
459 case 8: return "q";
460 default: vpanic("showAMD64ScalarSz");
464 const HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
465 switch (op) {
466 case Aun_NOT: return "not";
467 case Aun_NEG: return "neg";
468 default: vpanic("showAMD64UnaryOp");
472 const HChar* showAMD64AluOp ( AMD64AluOp op ) {
473 switch (op) {
474 case Aalu_MOV: return "mov";
475 case Aalu_CMP: return "cmp";
476 case Aalu_ADD: return "add";
477 case Aalu_SUB: return "sub";
478 case Aalu_ADC: return "adc";
479 case Aalu_SBB: return "sbb";
480 case Aalu_AND: return "and";
481 case Aalu_OR: return "or";
482 case Aalu_XOR: return "xor";
483 case Aalu_MUL: return "imul";
484 default: vpanic("showAMD64AluOp");
488 const HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
489 switch (op) {
490 case Ash_SHL: return "shl";
491 case Ash_SHR: return "shr";
492 case Ash_SAR: return "sar";
493 default: vpanic("showAMD64ShiftOp");
497 const HChar* showA87FpOp ( A87FpOp op ) {
498 switch (op) {
499 case Afp_SCALE: return "scale";
500 case Afp_ATAN: return "atan";
501 case Afp_YL2X: return "yl2x";
502 case Afp_YL2XP1: return "yl2xp1";
503 case Afp_PREM: return "prem";
504 case Afp_PREM1: return "prem1";
505 case Afp_SQRT: return "sqrt";
506 case Afp_SIN: return "sin";
507 case Afp_COS: return "cos";
508 case Afp_TAN: return "tan";
509 case Afp_ROUND: return "round";
510 case Afp_2XM1: return "2xm1";
511 default: vpanic("showA87FpOp");
515 const HChar* showAMD64SseOp ( AMD64SseOp op ) {
516 switch (op) {
517 case Asse_MOV: return "movups";
518 case Asse_ADDF: return "add";
519 case Asse_SUBF: return "sub";
520 case Asse_MULF: return "mul";
521 case Asse_DIVF: return "div";
522 case Asse_MAXF: return "max";
523 case Asse_MINF: return "min";
524 case Asse_CMPEQF: return "cmpFeq";
525 case Asse_CMPLTF: return "cmpFlt";
526 case Asse_CMPLEF: return "cmpFle";
527 case Asse_CMPUNF: return "cmpFun";
528 case Asse_RCPF: return "rcp";
529 case Asse_RSQRTF: return "rsqrt";
530 case Asse_SQRTF: return "sqrt";
531 case Asse_I2F: return "cvtdq2ps.";
532 case Asse_F2I: return "cvtps2dq.";
533 case Asse_AND: return "and";
534 case Asse_OR: return "or";
535 case Asse_XOR: return "xor";
536 case Asse_ANDN: return "andn";
537 case Asse_ADD8: return "paddb";
538 case Asse_ADD16: return "paddw";
539 case Asse_ADD32: return "paddd";
540 case Asse_ADD64: return "paddq";
541 case Asse_QADD8U: return "paddusb";
542 case Asse_QADD16U: return "paddusw";
543 case Asse_QADD8S: return "paddsb";
544 case Asse_QADD16S: return "paddsw";
545 case Asse_SUB8: return "psubb";
546 case Asse_SUB16: return "psubw";
547 case Asse_SUB32: return "psubd";
548 case Asse_SUB64: return "psubq";
549 case Asse_QSUB8U: return "psubusb";
550 case Asse_QSUB16U: return "psubusw";
551 case Asse_QSUB8S: return "psubsb";
552 case Asse_QSUB16S: return "psubsw";
553 case Asse_MUL16: return "pmullw";
554 case Asse_MULHI16U: return "pmulhuw";
555 case Asse_MULHI16S: return "pmulhw";
556 case Asse_AVG8U: return "pavgb";
557 case Asse_AVG16U: return "pavgw";
558 case Asse_MAX16S: return "pmaxw";
559 case Asse_MAX8U: return "pmaxub";
560 case Asse_MIN16S: return "pminw";
561 case Asse_MIN8U: return "pminub";
562 case Asse_CMPEQ8: return "pcmpeqb";
563 case Asse_CMPEQ16: return "pcmpeqw";
564 case Asse_CMPEQ32: return "pcmpeqd";
565 case Asse_CMPGT8S: return "pcmpgtb";
566 case Asse_CMPGT16S: return "pcmpgtw";
567 case Asse_CMPGT32S: return "pcmpgtd";
568 case Asse_SHL16: return "psllw";
569 case Asse_SHL32: return "pslld";
570 case Asse_SHL64: return "psllq";
571 case Asse_SHL128: return "pslldq";
572 case Asse_SHR16: return "psrlw";
573 case Asse_SHR32: return "psrld";
574 case Asse_SHR64: return "psrlq";
575 case Asse_SHR128: return "psrldq";
576 case Asse_SAR16: return "psraw";
577 case Asse_SAR32: return "psrad";
578 case Asse_PACKSSD: return "packssdw";
579 case Asse_PACKSSW: return "packsswb";
580 case Asse_PACKUSW: return "packuswb";
581 case Asse_UNPCKHB: return "punpckhb";
582 case Asse_UNPCKHW: return "punpckhw";
583 case Asse_UNPCKHD: return "punpckhd";
584 case Asse_UNPCKHQ: return "punpckhq";
585 case Asse_UNPCKLB: return "punpcklb";
586 case Asse_UNPCKLW: return "punpcklw";
587 case Asse_UNPCKLD: return "punpckld";
588 case Asse_UNPCKLQ: return "punpcklq";
589 case Asse_PSHUFB: return "pshufb";
590 case Asse_PMADDUBSW: return "pmaddubsw";
591 case Asse_F32toF16: return "vcvtps2ph(rm_field=$0x4).";
592 case Asse_F16toF32: return "vcvtph2ps.";
593 case Asse_VFMADD213: return "vfmadd213";
594 default: vpanic("showAMD64SseOp");
598 AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
599 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
600 i->tag = Ain_Imm64;
601 i->Ain.Imm64.imm64 = imm64;
602 i->Ain.Imm64.dst = dst;
603 return i;
605 AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
606 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
607 i->tag = Ain_Alu64R;
608 i->Ain.Alu64R.op = op;
609 i->Ain.Alu64R.src = src;
610 i->Ain.Alu64R.dst = dst;
611 return i;
613 AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
614 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
615 i->tag = Ain_Alu64M;
616 i->Ain.Alu64M.op = op;
617 i->Ain.Alu64M.src = src;
618 i->Ain.Alu64M.dst = dst;
619 vassert(op != Aalu_MUL);
620 return i;
622 AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
623 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
624 i->tag = Ain_Sh64;
625 i->Ain.Sh64.op = op;
626 i->Ain.Sh64.src = src;
627 i->Ain.Sh64.dst = dst;
628 return i;
630 AMD64Instr* AMD64Instr_Sh32 ( AMD64ShiftOp op, UInt src, HReg dst ) {
631 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
632 i->tag = Ain_Sh32;
633 i->Ain.Sh32.op = op;
634 i->Ain.Sh32.src = src;
635 i->Ain.Sh32.dst = dst;
636 return i;
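/* Editor's note (illustration only): for both Sh64 and Sh32 the 'src'
   field is the immediate shift amount, with the special value 0
   meaning "shift by %cl" -- see the handling of Sh64.src/Sh32.src in
   ppAMD64Instr and getRegUsage_AMD64Instr below, where src == 0 prints
   as %cl and adds a read of %rcx.  A hypothetical sketch: */
static void mk_shift_EXAMPLES ( HReg dst,
                                AMD64Instr** byImm, AMD64Instr** byCL )
{
   *byImm = AMD64Instr_Sh64( Ash_SHL, 13, dst ); /* shlq $13, dst */
   *byCL  = AMD64Instr_Sh64( Ash_SHL, 0,  dst ); /* shlq %cl, dst */
}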
638 AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
639 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
640 i->tag = Ain_Test64;
641 i->Ain.Test64.imm32 = imm32;
642 i->Ain.Test64.dst = dst;
643 return i;
645 AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
646 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
647 i->tag = Ain_Unary64;
648 i->Ain.Unary64.op = op;
649 i->Ain.Unary64.dst = dst;
650 return i;
652 AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
653 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
654 i->tag = Ain_Lea64;
655 i->Ain.Lea64.am = am;
656 i->Ain.Lea64.dst = dst;
657 return i;
659 AMD64Instr* AMD64Instr_Alu32R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
660 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
661 i->tag = Ain_Alu32R;
662 i->Ain.Alu32R.op = op;
663 i->Ain.Alu32R.src = src;
664 i->Ain.Alu32R.dst = dst;
665 switch (op) {
666 case Aalu_ADD: case Aalu_SUB: case Aalu_CMP:
667 case Aalu_AND: case Aalu_OR: case Aalu_XOR: break;
668 default: vassert(0);
670 return i;
672 AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
673 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
674 i->tag = Ain_MulL;
675 i->Ain.MulL.syned = syned;
676 i->Ain.MulL.src = src;
677 return i;
679 AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
680 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
681 i->tag = Ain_Div;
682 i->Ain.Div.syned = syned;
683 i->Ain.Div.sz = sz;
684 i->Ain.Div.src = src;
685 vassert(sz == 4 || sz == 8);
686 return i;
688 AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
689 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
690 i->tag = Ain_Push;
691 i->Ain.Push.src = src;
692 return i;
694 AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms,
695 RetLoc rloc ) {
696 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
697 i->tag = Ain_Call;
698 i->Ain.Call.cond = cond;
699 i->Ain.Call.target = target;
700 i->Ain.Call.regparms = regparms;
701 i->Ain.Call.rloc = rloc;
702 vassert(regparms >= 0 && regparms <= 6);
703 vassert(is_sane_RetLoc(rloc));
704 return i;
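/* Editor's illustration (not part of VEX): 'regparms' counts how many
   of the System V integer argument registers (%rdi, %rsi, %rdx, %rcx,
   %r8, %r9, in that order) carry live arguments -- see the Ain_Call
   case of getRegUsage_AMD64Instr below.  A hypothetical unconditional
   call to a two-argument helper returning an integer result, assuming
   mk_RetLoc_simple/RLPri_Int from host_generic_regs.h: */
static AMD64Instr* mk_call_EXAMPLE ( Addr64 helper_addr )
{
   return AMD64Instr_Call( Acc_ALWAYS, helper_addr, 2/*regparms*/,
                           mk_RetLoc_simple(RLPri_Int) );
}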
707 AMD64Instr* AMD64Instr_XDirect ( Addr64 dstGA, AMD64AMode* amRIP,
708 AMD64CondCode cond, Bool toFastEP ) {
709 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
710 i->tag = Ain_XDirect;
711 i->Ain.XDirect.dstGA = dstGA;
712 i->Ain.XDirect.amRIP = amRIP;
713 i->Ain.XDirect.cond = cond;
714 i->Ain.XDirect.toFastEP = toFastEP;
715 return i;
717 AMD64Instr* AMD64Instr_XIndir ( HReg dstGA, AMD64AMode* amRIP,
718 AMD64CondCode cond ) {
719 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
720 i->tag = Ain_XIndir;
721 i->Ain.XIndir.dstGA = dstGA;
722 i->Ain.XIndir.amRIP = amRIP;
723 i->Ain.XIndir.cond = cond;
724 return i;
726 AMD64Instr* AMD64Instr_XAssisted ( HReg dstGA, AMD64AMode* amRIP,
727 AMD64CondCode cond, IRJumpKind jk ) {
728 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
729 i->tag = Ain_XAssisted;
730 i->Ain.XAssisted.dstGA = dstGA;
731 i->Ain.XAssisted.amRIP = amRIP;
732 i->Ain.XAssisted.cond = cond;
733 i->Ain.XAssisted.jk = jk;
734 return i;
737 AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, HReg src, HReg dst ) {
738 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
739 i->tag = Ain_CMov64;
740 i->Ain.CMov64.cond = cond;
741 i->Ain.CMov64.src = src;
742 i->Ain.CMov64.dst = dst;
743 vassert(cond != Acc_ALWAYS);
744 return i;
746 AMD64Instr* AMD64Instr_CLoad ( AMD64CondCode cond, UChar szB,
747 AMD64AMode* addr, HReg dst ) {
748 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
749 i->tag = Ain_CLoad;
750 i->Ain.CLoad.cond = cond;
751 i->Ain.CLoad.szB = szB;
752 i->Ain.CLoad.addr = addr;
753 i->Ain.CLoad.dst = dst;
754 vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
755 return i;
757 AMD64Instr* AMD64Instr_CStore ( AMD64CondCode cond, UChar szB,
758 HReg src, AMD64AMode* addr ) {
759 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
760 i->tag = Ain_CStore;
761 i->Ain.CStore.cond = cond;
762 i->Ain.CStore.szB = szB;
763 i->Ain.CStore.src = src;
764 i->Ain.CStore.addr = addr;
765 vassert(cond != Acc_ALWAYS && (szB == 4 || szB == 8));
766 return i;
768 AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
769 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
770 i->tag = Ain_MovxLQ;
771 i->Ain.MovxLQ.syned = syned;
772 i->Ain.MovxLQ.src = src;
773 i->Ain.MovxLQ.dst = dst;
774 return i;
776 AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
777 AMD64AMode* src, HReg dst ) {
778 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
779 i->tag = Ain_LoadEX;
780 i->Ain.LoadEX.szSmall = szSmall;
781 i->Ain.LoadEX.syned = syned;
782 i->Ain.LoadEX.src = src;
783 i->Ain.LoadEX.dst = dst;
784 vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
785 return i;
787 AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
788 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
789 i->tag = Ain_Store;
790 i->Ain.Store.sz = sz;
791 i->Ain.Store.src = src;
792 i->Ain.Store.dst = dst;
793 vassert(sz == 1 || sz == 2 || sz == 4);
794 return i;
796 AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
797 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
798 i->tag = Ain_Set64;
799 i->Ain.Set64.cond = cond;
800 i->Ain.Set64.dst = dst;
801 return i;
803 AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
804 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
805 i->tag = Ain_Bsfr64;
806 i->Ain.Bsfr64.isFwds = isFwds;
807 i->Ain.Bsfr64.src = src;
808 i->Ain.Bsfr64.dst = dst;
809 return i;
811 AMD64Instr* AMD64Instr_MFence ( void ) {
812 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
813 i->tag = Ain_MFence;
814 return i;
816 AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
817 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
818 i->tag = Ain_ACAS;
819 i->Ain.ACAS.addr = addr;
820 i->Ain.ACAS.sz = sz;
821 vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
822 return i;
824 AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
825 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
826 i->tag = Ain_DACAS;
827 i->Ain.DACAS.addr = addr;
828 i->Ain.DACAS.sz = sz;
829 vassert(sz == 8 || sz == 4);
830 return i;
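/* Editor's note (illustration only): ACAS and DACAS carry no explicit
   register operands; they follow the hardware convention that the
   expected value sits in %rax (DACAS: %rdx:%rax) and the proposed new
   value in %rbx (DACAS: %rcx:%rbx), which is why the Ain_ACAS and
   Ain_DACAS cases of getRegUsage_AMD64Instr below claim those
   registers implicitly.  A hypothetical 8-byte CAS on the word at
   0x30(%rbp): */
static AMD64Instr* mk_cas_EXAMPLE ( void )
{
   return AMD64Instr_ACAS( AMD64AMode_IR( 0x30, hregAMD64_RBP() ), 8 );
}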
833 AMD64Instr* AMD64Instr_A87Free ( Int nregs )
835 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
836 i->tag = Ain_A87Free;
837 i->Ain.A87Free.nregs = nregs;
838 vassert(nregs >= 1 && nregs <= 7);
839 return i;
841 AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
843 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
844 i->tag = Ain_A87PushPop;
845 i->Ain.A87PushPop.addr = addr;
846 i->Ain.A87PushPop.isPush = isPush;
847 i->Ain.A87PushPop.szB = szB;
848 vassert(szB == 8 || szB == 4);
849 return i;
851 AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
853 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
854 i->tag = Ain_A87FpOp;
855 i->Ain.A87FpOp.op = op;
856 return i;
858 AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
860 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
861 i->tag = Ain_A87LdCW;
862 i->Ain.A87LdCW.addr = addr;
863 return i;
865 AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
867 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
868 i->tag = Ain_A87StSW;
869 i->Ain.A87StSW.addr = addr;
870 return i;
872 AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
873 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
874 i->tag = Ain_LdMXCSR;
875 i->Ain.LdMXCSR.addr = addr;
876 return i;
878 AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
879 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
880 i->tag = Ain_SseUComIS;
881 i->Ain.SseUComIS.sz = toUChar(sz);
882 i->Ain.SseUComIS.srcL = srcL;
883 i->Ain.SseUComIS.srcR = srcR;
884 i->Ain.SseUComIS.dst = dst;
885 vassert(sz == 4 || sz == 8);
886 return i;
888 AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
889 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
890 i->tag = Ain_SseSI2SF;
891 i->Ain.SseSI2SF.szS = toUChar(szS);
892 i->Ain.SseSI2SF.szD = toUChar(szD);
893 i->Ain.SseSI2SF.src = src;
894 i->Ain.SseSI2SF.dst = dst;
895 vassert(szS == 4 || szS == 8);
896 vassert(szD == 4 || szD == 8);
897 return i;
899 AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
900 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
901 i->tag = Ain_SseSF2SI;
902 i->Ain.SseSF2SI.szS = toUChar(szS);
903 i->Ain.SseSF2SI.szD = toUChar(szD);
904 i->Ain.SseSF2SI.src = src;
905 i->Ain.SseSF2SI.dst = dst;
906 vassert(szS == 4 || szS == 8);
907 vassert(szD == 4 || szD == 8);
908 return i;
910 AMD64Instr* AMD64Instr_SseSDSS ( Bool from64, HReg src, HReg dst )
912 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
913 i->tag = Ain_SseSDSS;
914 i->Ain.SseSDSS.from64 = from64;
915 i->Ain.SseSDSS.src = src;
916 i->Ain.SseSDSS.dst = dst;
917 return i;
919 AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
920 HReg reg, AMD64AMode* addr ) {
921 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
922 i->tag = Ain_SseLdSt;
923 i->Ain.SseLdSt.isLoad = isLoad;
924 i->Ain.SseLdSt.sz = toUChar(sz);
925 i->Ain.SseLdSt.reg = reg;
926 i->Ain.SseLdSt.addr = addr;
927 vassert(sz == 4 || sz == 8 || sz == 16);
928 return i;
930 AMD64Instr* AMD64Instr_SseCStore ( AMD64CondCode cond,
931 HReg src, AMD64AMode* addr )
933 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
934 i->tag = Ain_SseCStore;
935 i->Ain.SseCStore.cond = cond;
936 i->Ain.SseCStore.src = src;
937 i->Ain.SseCStore.addr = addr;
938 vassert(cond != Acc_ALWAYS);
939 return i;
941 AMD64Instr* AMD64Instr_SseCLoad ( AMD64CondCode cond,
942 AMD64AMode* addr, HReg dst )
944 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
945 i->tag = Ain_SseCLoad;
946 i->Ain.SseCLoad.cond = cond;
947 i->Ain.SseCLoad.addr = addr;
948 i->Ain.SseCLoad.dst = dst;
949 vassert(cond != Acc_ALWAYS);
950 return i;
952 AMD64Instr* AMD64Instr_SseLdzLO ( Int sz, HReg reg, AMD64AMode* addr )
954 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
955 i->tag = Ain_SseLdzLO;
956 i->Ain.SseLdzLO.sz = sz;
957 i->Ain.SseLdzLO.reg = reg;
958 i->Ain.SseLdzLO.addr = addr;
959 vassert(sz == 4 || sz == 8);
960 return i;
962 AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
963 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
964 i->tag = Ain_Sse32Fx4;
965 i->Ain.Sse32Fx4.op = op;
966 i->Ain.Sse32Fx4.src = src;
967 i->Ain.Sse32Fx4.dst = dst;
968 vassert(op != Asse_MOV);
969 return i;
971 AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
972 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
973 i->tag = Ain_Sse32FLo;
974 i->Ain.Sse32FLo.op = op;
975 i->Ain.Sse32FLo.src = src;
976 i->Ain.Sse32FLo.dst = dst;
977 vassert(op != Asse_MOV);
978 return i;
980 AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
981 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
982 i->tag = Ain_Sse64Fx2;
983 i->Ain.Sse64Fx2.op = op;
984 i->Ain.Sse64Fx2.src = src;
985 i->Ain.Sse64Fx2.dst = dst;
986 vassert(op != Asse_MOV);
987 return i;
989 AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
990 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
991 i->tag = Ain_Sse64FLo;
992 i->Ain.Sse64FLo.op = op;
993 i->Ain.Sse64FLo.src = src;
994 i->Ain.Sse64FLo.dst = dst;
995 vassert(op != Asse_MOV);
996 return i;
998 AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
999 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1000 i->tag = Ain_SseReRg;
1001 i->Ain.SseReRg.op = op;
1002 i->Ain.SseReRg.src = re;
1003 i->Ain.SseReRg.dst = rg;
1004 return i;
1006 AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
1007 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1008 i->tag = Ain_SseCMov;
1009 i->Ain.SseCMov.cond = cond;
1010 i->Ain.SseCMov.src = src;
1011 i->Ain.SseCMov.dst = dst;
1012 vassert(cond != Acc_ALWAYS);
1013 return i;
1015 AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
1016 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1017 i->tag = Ain_SseShuf;
1018 i->Ain.SseShuf.order = order;
1019 i->Ain.SseShuf.src = src;
1020 i->Ain.SseShuf.dst = dst;
1021 vassert(order >= 0 && order <= 0xFF);
1022 return i;
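/* Editor's illustration (not part of VEX): 'order' is the pshufd
   immediate, where each 2-bit field selects a source dword.  With the
   usual encoding, order == 0x4E (binary 01 00 11 10) selects dwords
   2,3,0,1 and so swaps the upper and lower 64-bit halves of the
   vector.  The helper name is hypothetical. */
static AMD64Instr* mk_swap_halves_EXAMPLE ( HReg src, HReg dst )
{
   return AMD64Instr_SseShuf( 0x4E, src, dst );
}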
1024 AMD64Instr* AMD64Instr_SseShiftN ( AMD64SseOp op,
1025 UInt shiftBits, HReg dst ) {
1026 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1027 i->tag = Ain_SseShiftN;
1028 i->Ain.SseShiftN.op = op;
1029 i->Ain.SseShiftN.shiftBits = shiftBits;
1030 i->Ain.SseShiftN.dst = dst;
1031 return i;
1033 AMD64Instr* AMD64Instr_SseMOVQ ( HReg gpr, HReg xmm, Bool toXMM ) {
1034 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1035 i->tag = Ain_SseMOVQ;
1036 i->Ain.SseMOVQ.gpr = gpr;
1037 i->Ain.SseMOVQ.xmm = xmm;
1038 i->Ain.SseMOVQ.toXMM = toXMM;
1039 vassert(hregClass(gpr) == HRcInt64);
1040 vassert(hregClass(xmm) == HRcVec128);
1041 return i;
1043 //uu AMD64Instr* AMD64Instr_AvxLdSt ( Bool isLoad,
1044 //uu HReg reg, AMD64AMode* addr ) {
1045 //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1046 //uu i->tag = Ain_AvxLdSt;
1047 //uu i->Ain.AvxLdSt.isLoad = isLoad;
1048 //uu i->Ain.AvxLdSt.reg = reg;
1049 //uu i->Ain.AvxLdSt.addr = addr;
1050 //uu return i;
1051 //uu }
1052 //uu AMD64Instr* AMD64Instr_AvxReRg ( AMD64SseOp op, HReg re, HReg rg ) {
1053 //uu AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1054 //uu i->tag = Ain_AvxReRg;
1055 //uu i->Ain.AvxReRg.op = op;
1056 //uu i->Ain.AvxReRg.src = re;
1057 //uu i->Ain.AvxReRg.dst = rg;
1058 //uu return i;
1059 //uu }
1060 AMD64Instr* AMD64Instr_Avx32FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
1061 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1062 i->tag = Ain_Avx32FLo;
1063 i->Ain.Avx32FLo.op = op;
1064 i->Ain.Avx32FLo.src1 = src1;
1065 i->Ain.Avx32FLo.src2 = src2;
1066 i->Ain.Avx32FLo.dst = dst;
1067 vassert(op != Asse_MOV);
1068 return i;
1071 AMD64Instr* AMD64Instr_Avx64FLo ( AMD64SseOp op, HReg src1, HReg src2, HReg dst ) {
1072 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1073 i->tag = Ain_Avx64FLo;
1074 i->Ain.Avx64FLo.op = op;
1075 i->Ain.Avx64FLo.src1 = src1;
1076 i->Ain.Avx64FLo.src2 = src2;
1077 i->Ain.Avx64FLo.dst = dst;
1078 vassert(op != Asse_MOV);
1079 return i;
1082 AMD64Instr* AMD64Instr_EvCheck ( AMD64AMode* amCounter,
1083 AMD64AMode* amFailAddr ) {
1084 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1085 i->tag = Ain_EvCheck;
1086 i->Ain.EvCheck.amCounter = amCounter;
1087 i->Ain.EvCheck.amFailAddr = amFailAddr;
1088 return i;
1090 AMD64Instr* AMD64Instr_ProfInc ( void ) {
1091 AMD64Instr* i = LibVEX_Alloc_inline(sizeof(AMD64Instr));
1092 i->tag = Ain_ProfInc;
1093 return i;
1096 void ppAMD64Instr ( const AMD64Instr* i, Bool mode64 )
1098 vassert(mode64 == True);
1099 switch (i->tag) {
1100 case Ain_Imm64:
1101 vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
1102 ppHRegAMD64(i->Ain.Imm64.dst);
1103 return;
1104 case Ain_Alu64R:
1105 vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
1106 ppAMD64RMI(i->Ain.Alu64R.src);
1107 vex_printf(",");
1108 ppHRegAMD64(i->Ain.Alu64R.dst);
1109 return;
1110 case Ain_Alu64M:
1111 vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
1112 ppAMD64RI(i->Ain.Alu64M.src);
1113 vex_printf(",");
1114 ppAMD64AMode(i->Ain.Alu64M.dst);
1115 return;
1116 case Ain_Sh64:
1117 vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
1118 if (i->Ain.Sh64.src == 0)
1119 vex_printf("%%cl,");
1120 else
1121 vex_printf("$%d,", (Int)i->Ain.Sh64.src);
1122 ppHRegAMD64(i->Ain.Sh64.dst);
1123 return;
1124 case Ain_Sh32:
1125 vex_printf("%sl ", showAMD64ShiftOp(i->Ain.Sh32.op));
1126 if (i->Ain.Sh32.src == 0)
1127 vex_printf("%%cl,");
1128 else
1129 vex_printf("$%d,", (Int)i->Ain.Sh32.src);
1130 ppHRegAMD64_lo32(i->Ain.Sh32.dst);
1131 return;
1132 case Ain_Test64:
1133 vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
1134 ppHRegAMD64(i->Ain.Test64.dst);
1135 return;
1136 case Ain_Unary64:
1137 vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
1138 ppHRegAMD64(i->Ain.Unary64.dst);
1139 return;
1140 case Ain_Lea64:
1141 vex_printf("leaq ");
1142 ppAMD64AMode(i->Ain.Lea64.am);
1143 vex_printf(",");
1144 ppHRegAMD64(i->Ain.Lea64.dst);
1145 return;
1146 case Ain_Alu32R:
1147 vex_printf("%sl ", showAMD64AluOp(i->Ain.Alu32R.op));
1148 ppAMD64RMI_lo32(i->Ain.Alu32R.src);
1149 vex_printf(",");
1150 ppHRegAMD64_lo32(i->Ain.Alu32R.dst);
1151 return;
1152 case Ain_MulL:
1153 vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
1154 ppAMD64RM(i->Ain.MulL.src);
1155 return;
1156 case Ain_Div:
1157 vex_printf("%cdiv%s ",
1158 i->Ain.Div.syned ? 's' : 'u',
1159 showAMD64ScalarSz(i->Ain.Div.sz));
1160 ppAMD64RM(i->Ain.Div.src);
1161 return;
1162 case Ain_Push:
1163 vex_printf("pushq ");
1164 ppAMD64RMI(i->Ain.Push.src);
1165 return;
1166 case Ain_Call:
1167 vex_printf("call%s[%d,",
1168 i->Ain.Call.cond==Acc_ALWAYS
1169 ? "" : showAMD64CondCode(i->Ain.Call.cond),
1170 i->Ain.Call.regparms );
1171 ppRetLoc(i->Ain.Call.rloc);
1172 vex_printf("] 0x%llx", i->Ain.Call.target);
1173 break;
1175 case Ain_XDirect:
1176 vex_printf("(xDirect) ");
1177 vex_printf("if (%%rflags.%s) { ",
1178 showAMD64CondCode(i->Ain.XDirect.cond));
1179 vex_printf("movabsq $0x%llx,%%r11; ", i->Ain.XDirect.dstGA);
1180 vex_printf("movq %%r11,");
1181 ppAMD64AMode(i->Ain.XDirect.amRIP);
1182 vex_printf("; ");
1183 vex_printf("movabsq $disp_cp_chain_me_to_%sEP,%%r11; call *%%r11 }",
1184 i->Ain.XDirect.toFastEP ? "fast" : "slow");
1185 return;
1186 case Ain_XIndir:
1187 vex_printf("(xIndir) ");
1188 vex_printf("if (%%rflags.%s) { ",
1189 showAMD64CondCode(i->Ain.XIndir.cond));
1190 vex_printf("movq ");
1191 ppHRegAMD64(i->Ain.XIndir.dstGA);
1192 vex_printf(",");
1193 ppAMD64AMode(i->Ain.XIndir.amRIP);
1194 vex_printf("; movabsq $disp_indir,%%r11; jmp *%%r11 }");
1195 return;
1196 case Ain_XAssisted:
1197 vex_printf("(xAssisted) ");
1198 vex_printf("if (%%rflags.%s) { ",
1199 showAMD64CondCode(i->Ain.XAssisted.cond));
1200 vex_printf("movq ");
1201 ppHRegAMD64(i->Ain.XAssisted.dstGA);
1202 vex_printf(",");
1203 ppAMD64AMode(i->Ain.XAssisted.amRIP);
1204 vex_printf("; movl $IRJumpKind_to_TRCVAL(%d),%%rbp",
1205 (Int)i->Ain.XAssisted.jk);
1206 vex_printf("; movabsq $disp_assisted,%%r11; jmp *%%r11 }");
1207 return;
1209 case Ain_CMov64:
1210 vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
1211 ppHRegAMD64(i->Ain.CMov64.src);
1212 vex_printf(",");
1213 ppHRegAMD64(i->Ain.CMov64.dst);
1214 return;
1215 case Ain_CLoad:
1216 vex_printf("if (%%rflags.%s) { ",
1217 showAMD64CondCode(i->Ain.CLoad.cond));
1218 vex_printf("mov%c ", i->Ain.CLoad.szB == 4 ? 'l' : 'q');
1219 ppAMD64AMode(i->Ain.CLoad.addr);
1220 vex_printf(", ");
1221 (i->Ain.CLoad.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1222 (i->Ain.CLoad.dst);
1223 vex_printf(" }");
1224 return;
1225 case Ain_CStore:
1226 vex_printf("if (%%rflags.%s) { ",
1227 showAMD64CondCode(i->Ain.CStore.cond));
1228 vex_printf("mov%c ", i->Ain.CStore.szB == 4 ? 'l' : 'q');
1229 (i->Ain.CStore.szB == 4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1230 (i->Ain.CStore.src);
1231 vex_printf(", ");
1232 ppAMD64AMode(i->Ain.CStore.addr);
1233 vex_printf(" }");
1234 return;
1236 case Ain_MovxLQ:
1237 vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
1238 ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
1239 vex_printf(",");
1240 ppHRegAMD64(i->Ain.MovxLQ.dst);
1241 return;
1242 case Ain_LoadEX:
1243 if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
1244 vex_printf("movl ");
1245 ppAMD64AMode(i->Ain.LoadEX.src);
1246 vex_printf(",");
1247 ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
1248 } else {
1249 vex_printf("mov%c%cq ",
1250 i->Ain.LoadEX.syned ? 's' : 'z',
1251 i->Ain.LoadEX.szSmall==1
1252 ? 'b'
1253 : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
1254 ppAMD64AMode(i->Ain.LoadEX.src);
1255 vex_printf(",");
1256 ppHRegAMD64(i->Ain.LoadEX.dst);
1258 return;
1259 case Ain_Store:
1260 vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
1261 : (i->Ain.Store.sz==2 ? 'w' : 'l'));
1262 ppHRegAMD64(i->Ain.Store.src);
1263 vex_printf(",");
1264 ppAMD64AMode(i->Ain.Store.dst);
1265 return;
1266 case Ain_Set64:
1267 vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
1268 ppHRegAMD64(i->Ain.Set64.dst);
1269 return;
1270 case Ain_Bsfr64:
1271 vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
1272 ppHRegAMD64(i->Ain.Bsfr64.src);
1273 vex_printf(",");
1274 ppHRegAMD64(i->Ain.Bsfr64.dst);
1275 return;
1276 case Ain_MFence:
1277 vex_printf("mfence" );
1278 return;
1279 case Ain_ACAS:
1280 vex_printf("lock cmpxchg%c ",
1281 i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
1282 : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
1283 vex_printf("{%%rax->%%rbx},");
1284 ppAMD64AMode(i->Ain.ACAS.addr);
1285 return;
1286 case Ain_DACAS:
1287 vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
1288 (Int)(2 * i->Ain.DACAS.sz));
1289 ppAMD64AMode(i->Ain.DACAS.addr);
1290 return;
1291 case Ain_A87Free:
1292 vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
1293 break;
1294 case Ain_A87PushPop:
1295 vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
1296 i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
1297 ppAMD64AMode(i->Ain.A87PushPop.addr);
1298 break;
1299 case Ain_A87FpOp:
1300 vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
1301 break;
1302 case Ain_A87LdCW:
1303 vex_printf("fldcw ");
1304 ppAMD64AMode(i->Ain.A87LdCW.addr);
1305 break;
1306 case Ain_A87StSW:
1307 vex_printf("fstsw ");
1308 ppAMD64AMode(i->Ain.A87StSW.addr);
1309 break;
1310 case Ain_LdMXCSR:
1311 vex_printf("ldmxcsr ");
1312 ppAMD64AMode(i->Ain.LdMXCSR.addr);
1313 break;
1314 case Ain_SseUComIS:
1315 vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
1316 ppHRegAMD64(i->Ain.SseUComIS.srcL);
1317 vex_printf(",");
1318 ppHRegAMD64(i->Ain.SseUComIS.srcR);
1319 vex_printf(" ; pushfq ; popq ");
1320 ppHRegAMD64(i->Ain.SseUComIS.dst);
1321 break;
1322 case Ain_SseSI2SF:
1323 vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
1324 (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1325 (i->Ain.SseSI2SF.src);
1326 vex_printf(",");
1327 ppHRegAMD64(i->Ain.SseSI2SF.dst);
1328 break;
1329 case Ain_SseSF2SI:
1330 vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
1331 ppHRegAMD64(i->Ain.SseSF2SI.src);
1332 vex_printf(",");
1333 (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
1334 (i->Ain.SseSF2SI.dst);
1335 break;
1336 case Ain_SseSDSS:
1337 vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
1338 ppHRegAMD64(i->Ain.SseSDSS.src);
1339 vex_printf(",");
1340 ppHRegAMD64(i->Ain.SseSDSS.dst);
1341 break;
1342 case Ain_SseLdSt:
1343 switch (i->Ain.SseLdSt.sz) {
1344 case 4: vex_printf("movss "); break;
1345 case 8: vex_printf("movsd "); break;
1346 case 16: vex_printf("movups "); break;
1347 default: vassert(0);
1349 if (i->Ain.SseLdSt.isLoad) {
1350 ppAMD64AMode(i->Ain.SseLdSt.addr);
1351 vex_printf(",");
1352 ppHRegAMD64(i->Ain.SseLdSt.reg);
1353 } else {
1354 ppHRegAMD64(i->Ain.SseLdSt.reg);
1355 vex_printf(",");
1356 ppAMD64AMode(i->Ain.SseLdSt.addr);
1358 return;
1359 case Ain_SseCStore:
1360 vex_printf("if (%%rflags.%s) { ",
1361 showAMD64CondCode(i->Ain.SseCStore.cond));
1362 vex_printf("movups ");
1363 ppHRegAMD64(i->Ain.SseCStore.src);
1364 vex_printf(", ");
1365 ppAMD64AMode(i->Ain.SseCStore.addr);
1366 vex_printf(" }");
1367 return;
1368 case Ain_SseCLoad:
1369 vex_printf("if (%%rflags.%s) { ",
1370 showAMD64CondCode(i->Ain.SseCLoad.cond));
1371 vex_printf("movups ");
1372 ppAMD64AMode(i->Ain.SseCLoad.addr);
1373 vex_printf(", ");
1374 ppHRegAMD64(i->Ain.SseCLoad.dst);
1375 vex_printf(" }");
1376 return;
1377 case Ain_SseLdzLO:
1378 vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
1379 ppAMD64AMode(i->Ain.SseLdzLO.addr);
1380 vex_printf(",");
1381 ppHRegAMD64(i->Ain.SseLdzLO.reg);
1382 return;
1383 case Ain_Sse32Fx4:
1384 vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
1385 ppHRegAMD64(i->Ain.Sse32Fx4.src);
1386 vex_printf(",");
1387 ppHRegAMD64(i->Ain.Sse32Fx4.dst);
1388 return;
1389 case Ain_Sse32FLo:
1390 vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
1391 ppHRegAMD64(i->Ain.Sse32FLo.src);
1392 vex_printf(",");
1393 ppHRegAMD64(i->Ain.Sse32FLo.dst);
1394 return;
1395 case Ain_Sse64Fx2:
1396 vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
1397 ppHRegAMD64(i->Ain.Sse64Fx2.src);
1398 vex_printf(",");
1399 ppHRegAMD64(i->Ain.Sse64Fx2.dst);
1400 return;
1401 case Ain_Sse64FLo:
1402 vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
1403 ppHRegAMD64(i->Ain.Sse64FLo.src);
1404 vex_printf(",");
1405 ppHRegAMD64(i->Ain.Sse64FLo.dst);
1406 return;
1407 case Ain_SseReRg:
1408 vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1409 ppHRegAMD64(i->Ain.SseReRg.src);
1410 vex_printf(",");
1411 ppHRegAMD64(i->Ain.SseReRg.dst);
1412 return;
1413 case Ain_SseCMov:
1414 vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
1415 ppHRegAMD64(i->Ain.SseCMov.src);
1416 vex_printf(",");
1417 ppHRegAMD64(i->Ain.SseCMov.dst);
1418 return;
1419 case Ain_SseShuf:
1420 vex_printf("pshufd $0x%x,", (UInt)i->Ain.SseShuf.order);
1421 ppHRegAMD64(i->Ain.SseShuf.src);
1422 vex_printf(",");
1423 ppHRegAMD64(i->Ain.SseShuf.dst);
1424 return;
1425 case Ain_SseShiftN:
1426 vex_printf("%s $%u, ", showAMD64SseOp(i->Ain.SseShiftN.op),
1427 i->Ain.SseShiftN.shiftBits);
1428 ppHRegAMD64(i->Ain.SseShiftN.dst);
1429 return;
1430 case Ain_SseMOVQ:
1431 vex_printf("movq ");
1432 if (i->Ain.SseMOVQ.toXMM) {
1433 ppHRegAMD64(i->Ain.SseMOVQ.gpr);
1434 vex_printf(",");
1435 ppHRegAMD64(i->Ain.SseMOVQ.xmm);
1436 } else {
1437 ppHRegAMD64(i->Ain.SseMOVQ.xmm);
1438 vex_printf(",");
1439 ppHRegAMD64(i->Ain.SseMOVQ.gpr);
1441 return;
1442 //uu case Ain_AvxLdSt:
1443 //uu vex_printf("vmovups ");
1444 //uu if (i->Ain.AvxLdSt.isLoad) {
1445 //uu ppAMD64AMode(i->Ain.AvxLdSt.addr);
1446 //uu vex_printf(",");
1447 //uu ppHRegAMD64(i->Ain.AvxLdSt.reg);
1448 //uu } else {
1449 //uu ppHRegAMD64(i->Ain.AvxLdSt.reg);
1450 //uu vex_printf(",");
1451 //uu ppAMD64AMode(i->Ain.AvxLdSt.addr);
1452 //uu }
1453 //uu return;
1454 //uu case Ain_AvxReRg:
1455 //uu vex_printf("v%s ", showAMD64SseOp(i->Ain.SseReRg.op));
1456 //uu ppHRegAMD64(i->Ain.AvxReRg.src);
1457 //uu vex_printf(",");
1458 //uu ppHRegAMD64(i->Ain.AvxReRg.dst);
1459 //uu return;
1460 case Ain_Avx32FLo:
1461 vex_printf("%sss ", showAMD64SseOp(i->Ain.Avx32FLo.op));
1462 ppHRegAMD64(i->Ain.Avx32FLo.src2);
1463 vex_printf(",");
1464 ppHRegAMD64(i->Ain.Avx32FLo.src1);
1465 vex_printf(",");
1466 ppHRegAMD64(i->Ain.Avx32FLo.dst);
1467 return;
1468 case Ain_Avx64FLo:
1469 vex_printf("%ssd ", showAMD64SseOp(i->Ain.Avx64FLo.op));
1470 ppHRegAMD64(i->Ain.Avx64FLo.src2);
1471 vex_printf(",");
1472 ppHRegAMD64(i->Ain.Avx64FLo.src1);
1473 vex_printf(",");
1474 ppHRegAMD64(i->Ain.Avx64FLo.dst);
1475 return;
1476 case Ain_EvCheck:
1477 vex_printf("(evCheck) decl ");
1478 ppAMD64AMode(i->Ain.EvCheck.amCounter);
1479 vex_printf("; jns nofail; jmp *");
1480 ppAMD64AMode(i->Ain.EvCheck.amFailAddr);
1481 vex_printf("; nofail:");
1482 return;
1483 case Ain_ProfInc:
1484 vex_printf("(profInc) movabsq $NotKnownYet, %%r11; incq (%%r11)");
1485 return;
1486 default:
1487 vpanic("ppAMD64Instr");
1491 /* --------- Helpers for register allocation. --------- */
1493 void getRegUsage_AMD64Instr ( HRegUsage* u, const AMD64Instr* i, Bool mode64 )
1495 Bool unary;
1496 vassert(mode64 == True);
1497 initHRegUsage(u);
1498 switch (i->tag) {
1499 case Ain_Imm64:
1500 addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
1501 return;
1502 case Ain_Alu64R:
1503 addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
1504 if (i->Ain.Alu64R.op == Aalu_MOV) {
1505 addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
1507 if (i->Ain.Alu64R.src->tag == Armi_Reg) {
1508 u->isRegRegMove = True;
1509 u->regMoveSrc = i->Ain.Alu64R.src->Armi.Reg.reg;
1510 u->regMoveDst = i->Ain.Alu64R.dst;
1512 return;
1514 if (i->Ain.Alu64R.op == Aalu_CMP) {
1515 addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
1516 return;
1518 addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
1519 return;
1520 case Ain_Alu64M:
1521 addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
1522 addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
1523 return;
1524 case Ain_Sh64:
1525 addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
1526 if (i->Ain.Sh64.src == 0)
1527 addHRegUse(u, HRmRead, hregAMD64_RCX());
1528 return;
1529 case Ain_Sh32:
1530 addHRegUse(u, HRmModify, i->Ain.Sh32.dst);
1531 if (i->Ain.Sh32.src == 0)
1532 addHRegUse(u, HRmRead, hregAMD64_RCX());
1533 return;
1534 case Ain_Test64:
1535 addHRegUse(u, HRmRead, i->Ain.Test64.dst);
1536 return;
1537 case Ain_Unary64:
1538 addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
1539 return;
1540 case Ain_Lea64:
1541 addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
1542 addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
1543 return;
1544 case Ain_Alu32R:
1545 vassert(i->Ain.Alu32R.op != Aalu_MOV);
1546 addRegUsage_AMD64RMI(u, i->Ain.Alu32R.src);
1547 if (i->Ain.Alu32R.op == Aalu_CMP) {
1548 addHRegUse(u, HRmRead, i->Ain.Alu32R.dst);
1549 return;
1551 addHRegUse(u, HRmModify, i->Ain.Alu32R.dst);
1552 return;
1553 case Ain_MulL:
1554 addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
1555 addHRegUse(u, HRmModify, hregAMD64_RAX());
1556 addHRegUse(u, HRmWrite, hregAMD64_RDX());
1557 return;
1558 case Ain_Div:
1559 addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
1560 addHRegUse(u, HRmModify, hregAMD64_RAX());
1561 addHRegUse(u, HRmModify, hregAMD64_RDX());
1562 return;
1563 case Ain_Push:
1564 addRegUsage_AMD64RMI(u, i->Ain.Push.src);
1565 addHRegUse(u, HRmModify, hregAMD64_RSP());
1566 return;
1567 case Ain_Call:
1568 /* This is a bit subtle. */
1569 /* First off, claim it trashes all the caller-saved regs
1570 which fall within the register allocator's jurisdiction.
1571 These I believe to be: rax rcx rdx rdi rsi r8 r9 r10
1572 and all the xmm registers. */
1573 addHRegUse(u, HRmWrite, hregAMD64_RAX());
1574 addHRegUse(u, HRmWrite, hregAMD64_RCX());
1575 addHRegUse(u, HRmWrite, hregAMD64_RDX());
1576 addHRegUse(u, HRmWrite, hregAMD64_RDI());
1577 addHRegUse(u, HRmWrite, hregAMD64_RSI());
1578 addHRegUse(u, HRmWrite, hregAMD64_R8());
1579 addHRegUse(u, HRmWrite, hregAMD64_R9());
1580 addHRegUse(u, HRmWrite, hregAMD64_R10());
1581 addHRegUse(u, HRmWrite, hregAMD64_XMM0());
1582 addHRegUse(u, HRmWrite, hregAMD64_XMM1());
1583 addHRegUse(u, HRmWrite, hregAMD64_XMM3());
1584 addHRegUse(u, HRmWrite, hregAMD64_XMM4());
1585 addHRegUse(u, HRmWrite, hregAMD64_XMM5());
1586 addHRegUse(u, HRmWrite, hregAMD64_XMM6());
1587 addHRegUse(u, HRmWrite, hregAMD64_XMM7());
1588 addHRegUse(u, HRmWrite, hregAMD64_XMM8());
1589 addHRegUse(u, HRmWrite, hregAMD64_XMM9());
1590 addHRegUse(u, HRmWrite, hregAMD64_XMM10());
1591 addHRegUse(u, HRmWrite, hregAMD64_XMM11());
1592 addHRegUse(u, HRmWrite, hregAMD64_XMM12());
1594 /* Now we have to state any parameter-carrying registers
1595 which might be read. This depends on the regparmness. */
1596 switch (i->Ain.Call.regparms) {
1597 case 6: addHRegUse(u, HRmRead, hregAMD64_R9()); /*fallthru*/
1598 case 5: addHRegUse(u, HRmRead, hregAMD64_R8()); /*fallthru*/
1599 case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
1600 case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
1601 case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
1602 case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
1603 case 0: break;
1604 default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
1606 /* Finally, there is the issue that the insn trashes a
1607 register because the literal target address has to be
1608 loaded into a register. Fortunately, r11 is stated in the
1609 ABI as a scratch register, and so seems a suitable victim. */
1610 addHRegUse(u, HRmWrite, hregAMD64_R11());
1611 /* Upshot of this is that the assembler really must use r11,
1612 and no other, as a destination temporary. */
1613 return;
1614 /* XDirect/XIndir/XAssisted are also a bit subtle. They
1615 conditionally exit the block. Hence we only need to list (1)
1616 the registers that they read, and (2) the registers that they
1617 write in the case where the block is not exited. (2) is
1618 empty, hence only (1) is relevant here. */
1619 case Ain_XDirect:
1620 /* Don't bother to mention the write to %r11, since it is not
1621 available to the allocator. */
1622 addRegUsage_AMD64AMode(u, i->Ain.XDirect.amRIP);
1623 return;
1624 case Ain_XIndir:
1625 /* Ditto re %r11 */
1626 addHRegUse(u, HRmRead, i->Ain.XIndir.dstGA);
1627 addRegUsage_AMD64AMode(u, i->Ain.XIndir.amRIP);
1628 return;
1629 case Ain_XAssisted:
1630 /* Ditto re %r11 and %rbp (the baseblock ptr) */
1631 addHRegUse(u, HRmRead, i->Ain.XAssisted.dstGA);
1632 addRegUsage_AMD64AMode(u, i->Ain.XAssisted.amRIP);
1633 return;
1634 case Ain_CMov64:
1635 addHRegUse(u, HRmRead, i->Ain.CMov64.src);
1636 addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
1637 return;
1638 case Ain_CLoad:
1639 addRegUsage_AMD64AMode(u, i->Ain.CLoad.addr);
1640 addHRegUse(u, HRmModify, i->Ain.CLoad.dst);
1641 return;
1642 case Ain_CStore:
1643 addRegUsage_AMD64AMode(u, i->Ain.CStore.addr);
1644 addHRegUse(u, HRmRead, i->Ain.CStore.src);
1645 return;
1646 case Ain_MovxLQ:
1647 addHRegUse(u, HRmRead, i->Ain.MovxLQ.src);
1648 addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
1649 return;
1650 case Ain_LoadEX:
1651 addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
1652 addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
1653 return;
1654 case Ain_Store:
1655 addHRegUse(u, HRmRead, i->Ain.Store.src);
1656 addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
1657 return;
1658 case Ain_Set64:
1659 addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
1660 return;
1661 case Ain_Bsfr64:
1662 addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
1663 addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
1664 return;
1665 case Ain_MFence:
1666 return;
1667 case Ain_ACAS:
1668 addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
1669 addHRegUse(u, HRmRead, hregAMD64_RBX());
1670 addHRegUse(u, HRmModify, hregAMD64_RAX());
1671 return;
1672 case Ain_DACAS:
1673 addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
1674 addHRegUse(u, HRmRead, hregAMD64_RCX());
1675 addHRegUse(u, HRmRead, hregAMD64_RBX());
1676 addHRegUse(u, HRmModify, hregAMD64_RDX());
1677 addHRegUse(u, HRmModify, hregAMD64_RAX());
1678 return;
1679 case Ain_A87Free:
1680 return;
1681 case Ain_A87PushPop:
1682 addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
1683 return;
1684 case Ain_A87FpOp:
1685 return;
1686 case Ain_A87LdCW:
1687 addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
1688 return;
1689 case Ain_A87StSW:
1690 addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
1691 return;
1692 case Ain_LdMXCSR:
1693 addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
1694 return;
1695 case Ain_SseUComIS:
1696 addHRegUse(u, HRmRead, i->Ain.SseUComIS.srcL);
1697 addHRegUse(u, HRmRead, i->Ain.SseUComIS.srcR);
1698 addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
1699 return;
1700 case Ain_SseSI2SF:
1701 addHRegUse(u, HRmRead, i->Ain.SseSI2SF.src);
1702 addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
1703 return;
1704 case Ain_SseSF2SI:
1705 addHRegUse(u, HRmRead, i->Ain.SseSF2SI.src);
1706 addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
1707 return;
1708 case Ain_SseSDSS:
1709 addHRegUse(u, HRmRead, i->Ain.SseSDSS.src);
1710 addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
1711 return;
1712 case Ain_SseLdSt:
1713 addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
1714 addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
1715 i->Ain.SseLdSt.reg);
1716 return;
1717 case Ain_SseCStore:
1718 addRegUsage_AMD64AMode(u, i->Ain.SseCStore.addr);
1719 addHRegUse(u, HRmRead, i->Ain.SseCStore.src);
1720 return;
1721 case Ain_SseCLoad:
1722 addRegUsage_AMD64AMode(u, i->Ain.SseCLoad.addr);
1723 addHRegUse(u, HRmModify, i->Ain.SseCLoad.dst);
1724 return;
1725 case Ain_SseLdzLO:
1726 addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
1727 addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
1728 return;
1729 case Ain_Sse32Fx4:
1730 vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
1731 unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
1732 || i->Ain.Sse32Fx4.op == Asse_RSQRTF
1733 || i->Ain.Sse32Fx4.op == Asse_SQRTF
1734 || i->Ain.Sse32Fx4.op == Asse_I2F
1735 || i->Ain.Sse32Fx4.op == Asse_F2I
1736 || i->Ain.Sse32Fx4.op == Asse_F32toF16
1737 || i->Ain.Sse32Fx4.op == Asse_F16toF32 );
1738 addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
1739 addHRegUse(u, unary ? HRmWrite : HRmModify,
1740 i->Ain.Sse32Fx4.dst);
1741 return;
1742 case Ain_Sse32FLo:
1743 vassert(i->Ain.Sse32FLo.op != Asse_MOV);
1744 unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
1745 || i->Ain.Sse32FLo.op == Asse_RSQRTF
1746 || i->Ain.Sse32FLo.op == Asse_SQRTF );
1747 addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
1748 addHRegUse(u, unary ? HRmWrite : HRmModify,
1749 i->Ain.Sse32FLo.dst);
1750 return;
1751 case Ain_Sse64Fx2:
1752 vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
1753 unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
1754 || i->Ain.Sse64Fx2.op == Asse_RSQRTF
1755 || i->Ain.Sse64Fx2.op == Asse_SQRTF );
1756 addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
1757 addHRegUse(u, unary ? HRmWrite : HRmModify,
1758 i->Ain.Sse64Fx2.dst);
1759 return;
1760 case Ain_Sse64FLo:
1761 vassert(i->Ain.Sse64FLo.op != Asse_MOV);
1762 unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
1763 || i->Ain.Sse64FLo.op == Asse_RSQRTF
1764 || i->Ain.Sse64FLo.op == Asse_SQRTF );
1765 addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
1766 addHRegUse(u, unary ? HRmWrite : HRmModify,
1767 i->Ain.Sse64FLo.dst);
1768 return;
1769 case Ain_SseReRg:
1770 if ( (i->Ain.SseReRg.op == Asse_XOR
1771 || i->Ain.SseReRg.op == Asse_CMPEQ32)
1772 && sameHReg(i->Ain.SseReRg.src, i->Ain.SseReRg.dst)) {
1773 /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
1774 r,r' as a write of a value to r, and independent of any
1775 previous value in r */
1776 /* (as opposed to a rite of passage :-) */
1777 addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
1778 } else {
1779 addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
1780 addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
1781 ? HRmWrite : HRmModify,
1782 i->Ain.SseReRg.dst);
1784 if (i->Ain.SseReRg.op == Asse_MOV) {
1785 u->isRegRegMove = True;
1786 u->regMoveSrc = i->Ain.SseReRg.src;
1787 u->regMoveDst = i->Ain.SseReRg.dst;
1790 return;
1791 case Ain_SseCMov:
1792 addHRegUse(u, HRmRead, i->Ain.SseCMov.src);
1793 addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
1794 return;
1795 case Ain_SseShuf:
1796 addHRegUse(u, HRmRead, i->Ain.SseShuf.src);
1797 addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
1798 return;
1799 case Ain_SseShiftN:
1800 addHRegUse(u, HRmModify, i->Ain.SseShiftN.dst);
1801 return;
1802 case Ain_SseMOVQ:
1803 addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmRead : HRmWrite,
1804 i->Ain.SseMOVQ.gpr);
1805 addHRegUse(u, i->Ain.SseMOVQ.toXMM ? HRmWrite : HRmRead,
1806 i->Ain.SseMOVQ.xmm);
1807 return;
1808 //uu case Ain_AvxLdSt:
1809 //uu addRegUsage_AMD64AMode(u, i->Ain.AvxLdSt.addr);
1810 //uu addHRegUse(u, i->Ain.AvxLdSt.isLoad ? HRmWrite : HRmRead,
1811 //uu i->Ain.AvxLdSt.reg);
1812 //uu return;
1813 //uu case Ain_AvxReRg:
1814 //uu if ( (i->Ain.AvxReRg.op == Asse_XOR
1815 //uu || i->Ain.AvxReRg.op == Asse_CMPEQ32)
1816 //uu && i->Ain.AvxReRg.src == i->Ain.AvxReRg.dst) {
1817 //uu /* See comments on the case for Ain_SseReRg. */
1818 //uu addHRegUse(u, HRmWrite, i->Ain.AvxReRg.dst);
1819 //uu } else {
1820 //uu addHRegUse(u, HRmRead, i->Ain.AvxReRg.src);
1821 //uu addHRegUse(u, i->Ain.AvxReRg.op == Asse_MOV
1822 //uu ? HRmWrite : HRmModify,
1823 //uu i->Ain.AvxReRg.dst);
1824 //uu
1825 //uu if (i->Ain.AvxReRg.op == Asse_MOV) {
1826 //uu u->isRegRegMove = True;
1827 //uu u->regMoveSrc = i->Ain.AvxReRg.src;
1828 //uu u->regMoveDst = i->Ain.AvxReRg.dst;
1829 //uu }
1830 //uu }
1831 //uu return;
1832 case Ain_Avx32FLo:
1833 vassert(i->Ain.Avx32FLo.op != Asse_MOV);
1834 addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src1);
1835 addHRegUse(u, HRmRead, i->Ain.Avx32FLo.src2);
1836 addHRegUse(u, HRmModify, i->Ain.Avx32FLo.dst);
1837 return;
1838 case Ain_Avx64FLo:
1839 vassert(i->Ain.Avx64FLo.op != Asse_MOV);
1840 addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src1);
1841 addHRegUse(u, HRmRead, i->Ain.Avx64FLo.src2);
1842 addHRegUse(u, HRmModify, i->Ain.Avx64FLo.dst);
1843 return;
1844 case Ain_EvCheck:
1845 /* We expect both amodes only to mention %rbp, so this is in
1846 fact pointless, since %rbp isn't allocatable, but anyway.. */
1847 addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amCounter);
1848 addRegUsage_AMD64AMode(u, i->Ain.EvCheck.amFailAddr);
1849 return;
1850 case Ain_ProfInc:
1851 addHRegUse(u, HRmWrite, hregAMD64_R11());
1852 return;
1853 default:
1854 ppAMD64Instr(i, mode64);
1855 vpanic("getRegUsage_AMD64Instr");
1859 /* local helper */
1860 static inline void mapReg(HRegRemap* m, HReg* r)
1862 *r = lookupHRegRemap(m, *r);
1865 void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
1867 vassert(mode64 == True);
1868 switch (i->tag) {
1869 case Ain_Imm64:
1870 mapReg(m, &i->Ain.Imm64.dst);
1871 return;
1872 case Ain_Alu64R:
1873 mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
1874 mapReg(m, &i->Ain.Alu64R.dst);
1875 return;
1876 case Ain_Alu64M:
1877 mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
1878 mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
1879 return;
1880 case Ain_Sh64:
1881 mapReg(m, &i->Ain.Sh64.dst);
1882 return;
1883 case Ain_Sh32:
1884 mapReg(m, &i->Ain.Sh32.dst);
1885 return;
1886 case Ain_Test64:
1887 mapReg(m, &i->Ain.Test64.dst);
1888 return;
1889 case Ain_Unary64:
1890 mapReg(m, &i->Ain.Unary64.dst);
1891 return;
1892 case Ain_Lea64:
1893 mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
1894 mapReg(m, &i->Ain.Lea64.dst);
1895 return;
1896 case Ain_Alu32R:
1897 mapRegs_AMD64RMI(m, i->Ain.Alu32R.src);
1898 mapReg(m, &i->Ain.Alu32R.dst);
1899 return;
1900 case Ain_MulL:
1901 mapRegs_AMD64RM(m, i->Ain.MulL.src);
1902 return;
1903 case Ain_Div:
1904 mapRegs_AMD64RM(m, i->Ain.Div.src);
1905 return;
1906 case Ain_Push:
1907 mapRegs_AMD64RMI(m, i->Ain.Push.src);
1908 return;
1909 case Ain_Call:
1910 return;
1911 case Ain_XDirect:
1912 mapRegs_AMD64AMode(m, i->Ain.XDirect.amRIP);
1913 return;
1914 case Ain_XIndir:
1915 mapReg(m, &i->Ain.XIndir.dstGA);
1916 mapRegs_AMD64AMode(m, i->Ain.XIndir.amRIP);
1917 return;
1918 case Ain_XAssisted:
1919 mapReg(m, &i->Ain.XAssisted.dstGA);
1920 mapRegs_AMD64AMode(m, i->Ain.XAssisted.amRIP);
1921 return;
1922 case Ain_CMov64:
1923 mapReg(m, &i->Ain.CMov64.src);
1924 mapReg(m, &i->Ain.CMov64.dst);
1925 return;
1926 case Ain_CLoad:
1927 mapRegs_AMD64AMode(m, i->Ain.CLoad.addr);
1928 mapReg(m, &i->Ain.CLoad.dst);
1929 return;
1930 case Ain_CStore:
1931 mapRegs_AMD64AMode(m, i->Ain.CStore.addr);
1932 mapReg(m, &i->Ain.CStore.src);
1933 return;
1934 case Ain_MovxLQ:
1935 mapReg(m, &i->Ain.MovxLQ.src);
1936 mapReg(m, &i->Ain.MovxLQ.dst);
1937 return;
1938 case Ain_LoadEX:
1939 mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
1940 mapReg(m, &i->Ain.LoadEX.dst);
1941 return;
1942 case Ain_Store:
1943 mapReg(m, &i->Ain.Store.src);
1944 mapRegs_AMD64AMode(m, i->Ain.Store.dst);
1945 return;
1946 case Ain_Set64:
1947 mapReg(m, &i->Ain.Set64.dst);
1948 return;
1949 case Ain_Bsfr64:
1950 mapReg(m, &i->Ain.Bsfr64.src);
1951 mapReg(m, &i->Ain.Bsfr64.dst);
1952 return;
1953 case Ain_MFence:
1954 return;
1955 case Ain_ACAS:
1956 mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
1957 return;
1958 case Ain_DACAS:
1959 mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
1960 return;
1961 case Ain_A87Free:
1962 return;
1963 case Ain_A87PushPop:
1964 mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
1965 return;
1966 case Ain_A87FpOp:
1967 return;
1968 case Ain_A87LdCW:
1969 mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
1970 return;
1971 case Ain_A87StSW:
1972 mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
1973 return;
1974 case Ain_LdMXCSR:
1975 mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
1976 return;
1977 case Ain_SseUComIS:
1978 mapReg(m, &i->Ain.SseUComIS.srcL);
1979 mapReg(m, &i->Ain.SseUComIS.srcR);
1980 mapReg(m, &i->Ain.SseUComIS.dst);
1981 return;
1982 case Ain_SseSI2SF:
1983 mapReg(m, &i->Ain.SseSI2SF.src);
1984 mapReg(m, &i->Ain.SseSI2SF.dst);
1985 return;
1986 case Ain_SseSF2SI:
1987 mapReg(m, &i->Ain.SseSF2SI.src);
1988 mapReg(m, &i->Ain.SseSF2SI.dst);
1989 return;
1990 case Ain_SseSDSS:
1991 mapReg(m, &i->Ain.SseSDSS.src);
1992 mapReg(m, &i->Ain.SseSDSS.dst);
1993 return;
1994 case Ain_SseLdSt:
1995 mapReg(m, &i->Ain.SseLdSt.reg);
1996 mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
1997 break;
1998 case Ain_SseCStore:
1999 mapRegs_AMD64AMode(m, i->Ain.SseCStore.addr);
2000 mapReg(m, &i->Ain.SseCStore.src);
2001 return;
2002 case Ain_SseCLoad:
2003 mapRegs_AMD64AMode(m, i->Ain.SseCLoad.addr);
2004 mapReg(m, &i->Ain.SseCLoad.dst);
2005 return;
2006 case Ain_SseLdzLO:
2007 mapReg(m, &i->Ain.SseLdzLO.reg);
2008 mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
2009 break;
2010 case Ain_Sse32Fx4:
2011 mapReg(m, &i->Ain.Sse32Fx4.src);
2012 mapReg(m, &i->Ain.Sse32Fx4.dst);
2013 return;
2014 case Ain_Sse32FLo:
2015 mapReg(m, &i->Ain.Sse32FLo.src);
2016 mapReg(m, &i->Ain.Sse32FLo.dst);
2017 return;
2018 case Ain_Sse64Fx2:
2019 mapReg(m, &i->Ain.Sse64Fx2.src);
2020 mapReg(m, &i->Ain.Sse64Fx2.dst);
2021 return;
2022 case Ain_Sse64FLo:
2023 mapReg(m, &i->Ain.Sse64FLo.src);
2024 mapReg(m, &i->Ain.Sse64FLo.dst);
2025 return;
2026 case Ain_SseReRg:
2027 mapReg(m, &i->Ain.SseReRg.src);
2028 mapReg(m, &i->Ain.SseReRg.dst);
2029 return;
2030 case Ain_SseCMov:
2031 mapReg(m, &i->Ain.SseCMov.src);
2032 mapReg(m, &i->Ain.SseCMov.dst);
2033 return;
2034 case Ain_SseShuf:
2035 mapReg(m, &i->Ain.SseShuf.src);
2036 mapReg(m, &i->Ain.SseShuf.dst);
2037 return;
2038 case Ain_SseShiftN:
2039 mapReg(m, &i->Ain.SseShiftN.dst);
2040 return;
2041 case Ain_SseMOVQ:
2042 mapReg(m, &i->Ain.SseMOVQ.gpr);
2043 mapReg(m, &i->Ain.SseMOVQ.xmm);
2044 return;
2045 //uu case Ain_AvxLdSt:
2046 //uu mapReg(m, &i->Ain.AvxLdSt.reg);
2047 //uu mapRegs_AMD64AMode(m, i->Ain.AvxLdSt.addr);
2048 //uu break;
2049 //uu case Ain_AvxReRg:
2050 //uu mapReg(m, &i->Ain.AvxReRg.src);
2051 //uu mapReg(m, &i->Ain.AvxReRg.dst);
2052 //uu return;
2053 case Ain_Avx32FLo:
2054 mapReg(m, &i->Ain.Avx32FLo.src1);
2055 mapReg(m, &i->Ain.Avx32FLo.src2);
2056 mapReg(m, &i->Ain.Avx32FLo.dst);
2057 return;
2058 case Ain_Avx64FLo:
2059 mapReg(m, &i->Ain.Avx64FLo.src1);
2060 mapReg(m, &i->Ain.Avx64FLo.src2);
2061 mapReg(m, &i->Ain.Avx64FLo.dst);
2062 return;
2063 case Ain_EvCheck:
2064 /* We expect both amodes only to mention %rbp, so this is in
2065 fact pointless, since %rbp isn't allocatable, but anyway.. */
2066 mapRegs_AMD64AMode(m, i->Ain.EvCheck.amCounter);
2067 mapRegs_AMD64AMode(m, i->Ain.EvCheck.amFailAddr);
2068 return;
2069 case Ain_ProfInc:
2070 /* hardwires r11 -- nothing to modify. */
2071 return;
2072 default:
2073 ppAMD64Instr(i, mode64);
2074 vpanic("mapRegs_AMD64Instr");
2078 /* Generate amd64 spill/reload instructions under the direction of the
2079 register allocator. Note it's critical these don't write the
2080 condition codes. */
2082 void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
2083 HReg rreg, Int offsetB, Bool mode64 )
2085 AMD64AMode* am;
2086 vassert(offsetB >= 0);
2087 vassert(!hregIsVirtual(rreg));
2088 vassert(mode64 == True);
2089 *i1 = *i2 = NULL;
2090 am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
2091 switch (hregClass(rreg)) {
2092 case HRcInt64:
2093 *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
2094 return;
2095 case HRcVec128:
2096 *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
2097 return;
2098 default:
2099 ppHRegClass(hregClass(rreg));
2100 vpanic("genSpill_AMD64: unimplemented regclass");
2104 void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
2105 HReg rreg, Int offsetB, Bool mode64 )
2107 AMD64AMode* am;
2108 vassert(offsetB >= 0);
2109 vassert(!hregIsVirtual(rreg));
2110 vassert(mode64 == True);
2111 *i1 = *i2 = NULL;
2112 am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
2113 switch (hregClass(rreg)) {
2114 case HRcInt64:
2115 *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
2116 return;
2117 case HRcVec128:
2118 *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
2119 return;
2120 default:
2121 ppHRegClass(hregClass(rreg));
2122 vpanic("genReload_AMD64: unimplemented regclass");
2126 AMD64Instr* genMove_AMD64(HReg from, HReg to, Bool mode64)
2128 switch (hregClass(from)) {
2129 case HRcInt64:
2130 return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(from), to);
2131 case HRcVec128:
2132 return AMD64Instr_SseReRg(Asse_MOV, from, to);
2133 default:
2134 ppHRegClass(hregClass(from));
2135 vpanic("genMove_AMD64: unimplemented regclass");
2139 AMD64Instr* directReload_AMD64( AMD64Instr* i, HReg vreg, Short spill_off )
2141 vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
2143 /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
2144 Convert to: src=RMI_Mem, dst=Reg
2146 if (i->tag == Ain_Alu64R
2147 && (i->Ain.Alu64R.op == Aalu_MOV || i->Ain.Alu64R.op == Aalu_OR
2148 || i->Ain.Alu64R.op == Aalu_XOR)
2149 && i->Ain.Alu64R.src->tag == Armi_Reg
2150 && sameHReg(i->Ain.Alu64R.src->Armi.Reg.reg, vreg)) {
2151 vassert(! sameHReg(i->Ain.Alu64R.dst, vreg));
2152 return AMD64Instr_Alu64R(
2153 i->Ain.Alu64R.op,
2154 AMD64RMI_Mem( AMD64AMode_IR( spill_off, hregAMD64_RBP())),
2155 i->Ain.Alu64R.dst
2159 /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
2160 Convert to: src=RI_Imm, dst=Mem
2162 if (i->tag == Ain_Alu64R
2163 && (i->Ain.Alu64R.op == Aalu_CMP)
2164 && i->Ain.Alu64R.src->tag == Armi_Imm
2165 && sameHReg(i->Ain.Alu64R.dst, vreg)) {
2166 return AMD64Instr_Alu64M(
2167 i->Ain.Alu64R.op,
2168 AMD64RI_Imm( i->Ain.Alu64R.src->Armi.Imm.imm32 ),
2169 AMD64AMode_IR( spill_off, hregAMD64_RBP())
2173 return NULL;
2177 /* --------- The amd64 assembler (bleh.) --------- */
2179 /* Produce the low three bits of an integer register number. */
2180 inline static UInt iregEnc210 ( HReg r )
2182 UInt n;
2183 vassert(hregClass(r) == HRcInt64);
2184 vassert(!hregIsVirtual(r));
2185 n = hregEncoding(r);
2186 vassert(n <= 15);
2187 return n & 7;
2190 /* Produce bit 3 of an integer register number. */
2191 inline static UInt iregEnc3 ( HReg r )
2193 UInt n;
2194 vassert(hregClass(r) == HRcInt64);
2195 vassert(!hregIsVirtual(r));
2196 n = hregEncoding(r);
2197 vassert(n <= 15);
2198 return (n >> 3) & 1;
2201 /* Produce a complete 4-bit integer register number. */
2202 inline static UInt iregEnc3210 ( HReg r )
2204 UInt n;
2205 vassert(hregClass(r) == HRcInt64);
2206 vassert(!hregIsVirtual(r));
2207 n = hregEncoding(r);
2208 vassert(n <= 15);
2209 return n;
2212 /* Produce a complete 4-bit integer register number. */
2213 inline static UInt vregEnc3210 ( HReg r )
2215 UInt n;
2216 vassert(hregClass(r) == HRcVec128);
2217 vassert(!hregIsVirtual(r));
2218 n = hregEncoding(r);
2219 vassert(n <= 15);
2220 return n;
2223 inline static UChar mkModRegRM ( UInt mod, UInt reg, UInt regmem )
2225 vassert(mod < 4);
2226 vassert((reg|regmem) < 8);
2227 return (UChar)( ((mod & 3) << 6) | ((reg & 7) << 3) | (regmem & 7) );
2230 inline static UChar mkSIB ( UInt shift, UInt regindex, UInt regbase )
2232 vassert(shift < 4);
2233 vassert((regindex|regbase) < 8);
2234 return (UChar)( ((shift & 3) << 6) | ((regindex & 7) << 3) | (regbase & 7) );
2237 static UChar* emit32 ( UChar* p, UInt w32 )
2239 *p++ = toUChar((w32) & 0x000000FF);
2240 *p++ = toUChar((w32 >> 8) & 0x000000FF);
2241 *p++ = toUChar((w32 >> 16) & 0x000000FF);
2242 *p++ = toUChar((w32 >> 24) & 0x000000FF);
2243 return p;
2246 static UChar* emit64 ( UChar* p, ULong w64 )
2248 p = emit32(p, toUInt(w64 & 0xFFFFFFFF));
2249 p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
2250 return p;
2253 /* Does a sign-extend of the lowest 8 bits give
2254 the original number? */
2255 static Bool fits8bits ( UInt w32 )
2257 Int i32 = (Int)w32;
2258 return toBool(i32 == ((Int)(w32 << 24) >> 24));
2260 /* Can the lower 32 bits be signedly widened to produce the whole
2261 64-bit value? In other words, are the top 33 bits either all 0 or
2262 all 1 ? */
2263 static Bool fitsIn32Bits ( ULong x )
2265 Long y1;
2266 y1 = x << 32;
2267 y1 >>=/*s*/ 32;
2268 return toBool(x == y1);
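/* Editor's illustration (not in the original): a few sanity examples
   for the two range predicates above. */
static inline void example_immediate_range_checks ( void )
{
   vassert(   fits8bits(0x7Fu) );   /* 0x7F sign-extends to itself */
   vassert( ! fits8bits(0x80u) );   /* 0x80 sign-extends to 0xFFFFFF80 */
   vassert(   fitsIn32Bits(0xFFFFFFFF80000000ULL) );  /* top 33 bits all 1 */
   vassert( ! fitsIn32Bits(0x0000000080000000ULL) );  /* top 33 bits mixed */
}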
2272 /* Forming mod-reg-rm bytes and scale-index-base bytes.
2274 greg, 0(ereg) | ereg is not any of: RSP RBP R12 R13
2275 = 00 greg ereg
2277 greg, d8(ereg) | ereg is neither of: RSP R12
2278 = 01 greg ereg, d8
2280 greg, d32(ereg) | ereg is neither of: RSP R12
2281 = 10 greg ereg, d32
2283 greg, d8(ereg) | ereg is either: RSP R12
2284 = 01 greg 100, 0x24, d8
2285 (lowest bit of rex distinguishes R12/RSP)
2287 greg, d32(ereg) | ereg is either: RSP R12
2288 = 10 greg 100, 0x24, d32
2289 (lowest bit of rex distinguishes R12/RSP)
2291 -----------------------------------------------
2293 greg, d8(base,index,scale)
2294 | index != RSP
2295 = 01 greg 100, scale index base, d8
2297 greg, d32(base,index,scale)
2298 | index != RSP
2299 = 10 greg 100, scale index base, d32
2301 static UChar* doAMode_M__wrk ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2303 UInt gregEnc210 = gregEnc3210 & 7;
2304 if (am->tag == Aam_IR) {
2305 if (am->Aam.IR.imm == 0
2306 && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2307 && ! sameHReg(am->Aam.IR.reg, hregAMD64_RBP())
2308 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2309 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R13())
2311 *p++ = mkModRegRM(0, gregEnc210, iregEnc210(am->Aam.IR.reg));
2312 return p;
2314 if (fits8bits(am->Aam.IR.imm)
2315 && ! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2316 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2318 *p++ = mkModRegRM(1, gregEnc210, iregEnc210(am->Aam.IR.reg));
2319 *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2320 return p;
2322 if (! sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2323 && ! sameHReg(am->Aam.IR.reg, hregAMD64_R12())
2325 *p++ = mkModRegRM(2, gregEnc210, iregEnc210(am->Aam.IR.reg));
2326 p = emit32(p, am->Aam.IR.imm);
2327 return p;
2329 if ((sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2330 || sameHReg(am->Aam.IR.reg, hregAMD64_R12()))
2331 && fits8bits(am->Aam.IR.imm)) {
2332 *p++ = mkModRegRM(1, gregEnc210, 4);
2333 *p++ = 0x24;
2334 *p++ = toUChar(am->Aam.IR.imm & 0xFF);
2335 return p;
2337 if (/* (sameHReg(am->Aam.IR.reg, hregAMD64_RSP())
2338 || wait for test case for RSP case */
2339 sameHReg(am->Aam.IR.reg, hregAMD64_R12())) {
2340 *p++ = mkModRegRM(2, gregEnc210, 4);
2341 *p++ = 0x24;
2342 p = emit32(p, am->Aam.IR.imm);
2343 return p;
2345 ppAMD64AMode(am);
2346 vpanic("doAMode_M: can't emit amode IR");
2347 /*NOTREACHED*/
2349 if (am->tag == Aam_IRRS) {
2350 if (fits8bits(am->Aam.IRRS.imm)
2351 && ! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2352 *p++ = mkModRegRM(1, gregEnc210, 4);
2353 *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2354 iregEnc210(am->Aam.IRRS.base));
2355 *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
2356 return p;
2358 if (! sameHReg(am->Aam.IRRS.index, hregAMD64_RSP())) {
2359 *p++ = mkModRegRM(2, gregEnc210, 4);
2360 *p++ = mkSIB(am->Aam.IRRS.shift, iregEnc210(am->Aam.IRRS.index),
2361 iregEnc210(am->Aam.IRRS.base));
2362 p = emit32(p, am->Aam.IRRS.imm);
2363 return p;
2365 ppAMD64AMode(am);
2366 vpanic("doAMode_M: can't emit amode IRRS");
2367 /*NOTREACHED*/
2369 vpanic("doAMode_M: unknown amode");
2370 /*NOTREACHED*/
2373 static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
2375 return doAMode_M__wrk(p, iregEnc3210(greg), am);
2378 static UChar* doAMode_M_enc ( UChar* p, UInt gregEnc3210, AMD64AMode* am )
2380 vassert(gregEnc3210 < 16);
2381 return doAMode_M__wrk(p, gregEnc3210, am);
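/* Editor's worked example (not in the original): with greg = %rax
   (encoding 0) and am = 8(%rdi), the "01 greg ereg, d8" row of the
   table above applies and doAMode_M emits 0x47 0x08.  With am =
   8(%rsp) the SIB escape is needed instead, giving 0x44 0x24 0x08. */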
2385 /* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
2386 inline
2387 static UChar* doAMode_R__wrk ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2389 *p++ = mkModRegRM(3, gregEnc3210 & 7, eregEnc3210 & 7);
2390 return p;
2393 static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
2395 return doAMode_R__wrk(p, iregEnc3210(greg), iregEnc3210(ereg));
2398 static UChar* doAMode_R_enc_reg ( UChar* p, UInt gregEnc3210, HReg ereg )
2400 vassert(gregEnc3210 < 16);
2401 return doAMode_R__wrk(p, gregEnc3210, iregEnc3210(ereg));
2404 static UChar* doAMode_R_reg_enc ( UChar* p, HReg greg, UInt eregEnc3210 )
2406 vassert(eregEnc3210 < 16);
2407 return doAMode_R__wrk(p, iregEnc3210(greg), eregEnc3210);
2410 static UChar* doAMode_R_enc_enc ( UChar* p, UInt gregEnc3210, UInt eregEnc3210 )
2412 vassert( (gregEnc3210|eregEnc3210) < 16);
2413 return doAMode_R__wrk(p, gregEnc3210, eregEnc3210);
2417 /* Clear the W bit on a REX byte, thereby changing the operand size
2418 back to whatever that instruction's default operand size is. */
2419 static inline UChar clearWBit ( UChar rex )
2421 return rex & ~(1<<3);
2424 static inline UChar setWBit ( UChar rex )
2426 return rex | (1<<3);
2430 /* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
2431 inline static UChar rexAMode_M__wrk ( UInt gregEnc3210, AMD64AMode* am )
2433 if (am->tag == Aam_IR) {
2434 UChar W = 1; /* we want 64-bit mode */
2435 UChar R = (gregEnc3210 >> 3) & 1;
2436 UChar X = 0; /* not relevant */
2437 UChar B = iregEnc3(am->Aam.IR.reg);
2438 return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2440 if (am->tag == Aam_IRRS) {
2441 UChar W = 1; /* we want 64-bit mode */
2442 UChar R = (gregEnc3210 >> 3) & 1;
2443 UChar X = iregEnc3(am->Aam.IRRS.index);
2444 UChar B = iregEnc3(am->Aam.IRRS.base);
2445 return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2447 vassert(0);
2448 return 0; /*NOTREACHED*/
2451 static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
2453 return rexAMode_M__wrk(iregEnc3210(greg), am);
2456 static UChar rexAMode_M_enc ( UInt gregEnc3210, AMD64AMode* am )
2458 vassert(gregEnc3210 < 16);
2459 return rexAMode_M__wrk(gregEnc3210, am);
2463 /* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
2464 inline static UChar rexAMode_R__wrk ( UInt gregEnc3210, UInt eregEnc3210 )
2466 UChar W = 1; /* we want 64-bit mode */
2467 UChar R = (gregEnc3210 >> 3) & 1;
2468 UChar X = 0; /* not relevant */
2469 UChar B = (eregEnc3210 >> 3) & 1;
2470 return 0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0));
2473 static UChar rexAMode_R ( HReg greg, HReg ereg )
2475 return rexAMode_R__wrk(iregEnc3210(greg), iregEnc3210(ereg));
2478 static UChar rexAMode_R_enc_reg ( UInt gregEnc3210, HReg ereg )
2480 vassert(gregEnc3210 < 16);
2481 return rexAMode_R__wrk(gregEnc3210, iregEnc3210(ereg));
2484 static UChar rexAMode_R_reg_enc ( HReg greg, UInt eregEnc3210 )
2486 vassert(eregEnc3210 < 16);
2487 return rexAMode_R__wrk(iregEnc3210(greg), eregEnc3210);
2490 static UChar rexAMode_R_enc_enc ( UInt gregEnc3210, UInt eregEnc3210 )
2492 vassert((gregEnc3210|eregEnc3210) < 16);
2493 return rexAMode_R__wrk(gregEnc3210, eregEnc3210);
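/* Editor's sketch (not part of the original file): combining the REX
   and ModRM helpers above.  Encoding "addq %rsi, %rdi" (opcode 0x01,
   ADD r/m64,r64) should produce the bytes 48 01 F7. */
static inline UChar* example_encode_addq_rsi_rdi ( UChar* p )
{
   *p++ = rexAMode_R( hregAMD64_RSI(), hregAMD64_RDI() );  /* 0x48 */
   *p++ = 0x01;                                            /* ADD r/m64, r64 */
   p = doAMode_R( p, hregAMD64_RSI(), hregAMD64_RDI() );   /* ModRM 0xF7 */
   return p;
}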
2497 //uu /* May 2012: this VEX prefix stuff is currently unused, but has
2498 //uu been verified correct (I reckon). Certainly it has been known to
2499 //uu produce correct VEX prefixes during testing. */
2500 //uu
2501 //uu /* Assemble a 2 or 3 byte VEX prefix from parts. rexR, rexX, rexB and
2502 //uu notVvvvv need to be not-ed before packing. mmmmm, rexW, L and pp go
2503 //uu in verbatim. There's no range checking on the bits. */
2504 //uu static UInt packVexPrefix ( UInt rexR, UInt rexX, UInt rexB,
2505 //uu UInt mmmmm, UInt rexW, UInt notVvvv,
2506 //uu UInt L, UInt pp )
2507 //uu {
2508 //uu UChar byte0 = 0;
2509 //uu UChar byte1 = 0;
2510 //uu UChar byte2 = 0;
2511 //uu if (rexX == 0 && rexB == 0 && mmmmm == 1 && rexW == 0) {
2512 //uu /* 2 byte encoding is possible. */
2513 //uu byte0 = 0xC5;
2514 //uu byte1 = ((rexR ^ 1) << 7) | ((notVvvv ^ 0xF) << 3)
2515 //uu | (L << 2) | pp;
2516 //uu } else {
2517 //uu /* 3 byte encoding is needed. */
2518 //uu byte0 = 0xC4;
2519 //uu byte1 = ((rexR ^ 1) << 7) | ((rexX ^ 1) << 6)
2520 //uu | ((rexB ^ 1) << 5) | mmmmm;
2521 //uu byte2 = (rexW << 7) | ((notVvvv ^ 0xF) << 3) | (L << 2) | pp;
2522 //uu }
2523 //uu return (((UInt)byte2) << 16) | (((UInt)byte1) << 8) | ((UInt)byte0);
2524 //uu }
2525 //uu
2526 //uu /* Make up a VEX prefix for a (greg,amode) pair. First byte in bits
2527 //uu 7:0 of result, second in 15:8, third (for a 3 byte prefix) in
2528 //uu 23:16. Has m-mmmm set to indicate a prefix of 0F, pp set to
2529 //uu indicate no SIMD prefix, W=0 (ignore), L=1 (size=256), and
2530 //uu vvvv=1111 (unused 3rd reg). */
2531 //uu static UInt vexAMode_M ( HReg greg, AMD64AMode* am )
2532 //uu {
2533 //uu UChar L = 1; /* size = 256 */
2534 //uu UChar pp = 0; /* no SIMD prefix */
2535 //uu UChar mmmmm = 1; /* 0F */
2536 //uu UChar notVvvv = 0; /* unused */
2537 //uu UChar rexW = 0;
2538 //uu UChar rexR = 0;
2539 //uu UChar rexX = 0;
2540 //uu UChar rexB = 0;
2541 //uu /* Same logic as in rexAMode_M. */
2542 //uu if (am->tag == Aam_IR) {
2543 //uu rexR = iregEnc3(greg);
2544 //uu rexX = 0; /* not relevant */
2545 //uu rexB = iregEnc3(am->Aam.IR.reg);
2546 //uu }
2547 //uu else if (am->tag == Aam_IRRS) {
2548 //uu rexR = iregEnc3(greg);
2549 //uu rexX = iregEnc3(am->Aam.IRRS.index);
2550 //uu rexB = iregEnc3(am->Aam.IRRS.base);
2551 //uu } else {
2552 //uu vassert(0);
2553 //uu }
2554 //uu return packVexPrefix( rexR, rexX, rexB, mmmmm, rexW, notVvvv, L, pp );
2555 //uu }
2556 //uu
2557 //uu static UChar* emitVexPrefix ( UChar* p, UInt vex )
2558 //uu {
2559 //uu switch (vex & 0xFF) {
2560 //uu case 0xC5:
2561 //uu *p++ = 0xC5;
2562 //uu *p++ = (vex >> 8) & 0xFF;
2563 //uu vassert(0 == (vex >> 16));
2564 //uu break;
2565 //uu case 0xC4:
2566 //uu *p++ = 0xC4;
2567 //uu *p++ = (vex >> 8) & 0xFF;
2568 //uu *p++ = (vex >> 16) & 0xFF;
2569 //uu vassert(0 == (vex >> 24));
2570 //uu break;
2571 //uu default:
2572 //uu vassert(0);
2573 //uu }
2574 //uu return p;
2575 //uu }
2578 /* Emit ffree %st(N) */
2579 static UChar* do_ffree_st ( UChar* p, Int n )
2581 vassert(n >= 0 && n <= 7);
2582 *p++ = 0xDD;
2583 *p++ = toUChar(0xC0 + n);
2584 return p;
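/* Editor's note (not in the original): for example, do_ffree_st(p, 3)
   emits DD C3, i.e. "ffree %st(3)". */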
2587 /* Emit an instruction into buf and return the number of bytes used.
2588 Note that buf is not the insn's final place, and therefore it is
2589 imperative to emit position-independent code. If the emitted
2590 instruction was a profiler inc, set *is_profInc to True, else
2591 leave it unchanged. */
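/* Editor's usage sketch (not in the original): a caller typically does

      UChar buf[64];
      Bool  isProfInc = False;
      Int   n = emit_AMD64Instr( &isProfInc, buf, sizeof(buf), insn,
                                 True, endness_host,
                                 chain_slow, chain_fast,
                                 xindir, xassisted );

   and then copies buf[0 .. n-1] to the instruction's final location.
   'insn', 'endness_host' and the four dispatcher pointers here are
   placeholders supplied by the caller. */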
2593 Int emit_AMD64Instr ( /*MB_MOD*/Bool* is_profInc,
2594 UChar* buf, Int nbuf, const AMD64Instr* i,
2595 Bool mode64, VexEndness endness_host,
2596 const void* disp_cp_chain_me_to_slowEP,
2597 const void* disp_cp_chain_me_to_fastEP,
2598 const void* disp_cp_xindir,
2599 const void* disp_cp_xassisted )
2601 UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
2602 UInt xtra;
2603 UInt reg;
2604 UChar rex;
2605 UChar* p = &buf[0];
2606 UChar* ptmp;
2607 Int j;
2608 vassert(nbuf >= 64);
2609 vassert(mode64 == True);
2611 /* vex_printf("asm "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
2613 switch (i->tag) {
2615 case Ain_Imm64:
2616 if (i->Ain.Imm64.imm64 <= 0xFFFFFULL) {
2617 /* Use the short form (load into 32 bit reg, + default
2618 widening rule) for constants under 1 million. We could
2619 use this form for the range 0 to 0x7FFFFFFF inclusive, but
2620 limit it to a smaller range for verifiability purposes. */
2621 if (1 & iregEnc3(i->Ain.Imm64.dst))
2622 *p++ = 0x41;
2623 *p++ = 0xB8 + iregEnc210(i->Ain.Imm64.dst);
2624 p = emit32(p, (UInt)i->Ain.Imm64.imm64);
2625 } else {
2626 *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Imm64.dst)));
2627 *p++ = toUChar(0xB8 + iregEnc210(i->Ain.Imm64.dst));
2628 p = emit64(p, i->Ain.Imm64.imm64);
2630 goto done;
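/* Editor's worked example (not in the original): an Imm64 of 0x1234
   into %rcx takes the short form above and emits B9 34 12 00 00
   ("movl $0x1234, %ecx"), relying on the implicit zero-extension of
   32-bit writes; a constant needing all 64 bits would instead emit
   48 B9 followed by the eight immediate bytes ("movabsq"). */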
2632 case Ain_Alu64R:
2633 /* Deal specially with MOV */
2634 if (i->Ain.Alu64R.op == Aalu_MOV) {
2635 switch (i->Ain.Alu64R.src->tag) {
2636 case Armi_Imm:
2637 if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFFFF)) {
2638 /* Actually we could use this form for constants in
2639 the range 0 through 0x7FFFFFFF inclusive, but
2640 limit it to a small range for verifiability
2641 purposes. */
2642 /* Generate "movl $imm32, 32-bit-register" and let
2643 the default zero-extend rule cause the upper half
2644 of the dst to be zeroed out too. This saves 1
2645 and sometimes 2 bytes compared to the more
2646 obvious encoding in the 'else' branch. */
2647 if (1 & iregEnc3(i->Ain.Alu64R.dst))
2648 *p++ = 0x41;
2649 *p++ = 0xB8 + iregEnc210(i->Ain.Alu64R.dst);
2650 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2651 } else {
2652 *p++ = toUChar(0x48 + (1 & iregEnc3(i->Ain.Alu64R.dst)));
2653 *p++ = 0xC7;
2654 *p++ = toUChar(0xC0 + iregEnc210(i->Ain.Alu64R.dst));
2655 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2657 goto done;
2658 case Armi_Reg:
2659 *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2660 i->Ain.Alu64R.dst );
2661 *p++ = 0x89;
2662 p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2663 i->Ain.Alu64R.dst);
2664 goto done;
2665 case Armi_Mem:
2666 *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2667 i->Ain.Alu64R.src->Armi.Mem.am);
2668 *p++ = 0x8B;
2669 p = doAMode_M(p, i->Ain.Alu64R.dst,
2670 i->Ain.Alu64R.src->Armi.Mem.am);
2671 goto done;
2672 default:
2673 goto bad;
2676 /* MUL */
2677 if (i->Ain.Alu64R.op == Aalu_MUL) {
2678 switch (i->Ain.Alu64R.src->tag) {
2679 case Armi_Reg:
2680 *p++ = rexAMode_R( i->Ain.Alu64R.dst,
2681 i->Ain.Alu64R.src->Armi.Reg.reg);
2682 *p++ = 0x0F;
2683 *p++ = 0xAF;
2684 p = doAMode_R(p, i->Ain.Alu64R.dst,
2685 i->Ain.Alu64R.src->Armi.Reg.reg);
2686 goto done;
2687 case Armi_Mem:
2688 *p++ = rexAMode_M(i->Ain.Alu64R.dst,
2689 i->Ain.Alu64R.src->Armi.Mem.am);
2690 *p++ = 0x0F;
2691 *p++ = 0xAF;
2692 p = doAMode_M(p, i->Ain.Alu64R.dst,
2693 i->Ain.Alu64R.src->Armi.Mem.am);
2694 goto done;
2695 case Armi_Imm:
2696 if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2697 *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2698 *p++ = 0x6B;
2699 p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2700 *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2701 } else {
2702 *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2703 *p++ = 0x69;
2704 p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
2705 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2707 goto done;
2708 default:
2709 goto bad;
2712 /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
2713 opc = opc_rr = subopc_imm = opc_imma = 0;
2714 switch (i->Ain.Alu64R.op) {
2715 case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
2716 subopc_imm = 2; opc_imma = 0x15; break;
2717 case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2718 subopc_imm = 0; opc_imma = 0x05; break;
2719 case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2720 subopc_imm = 5; opc_imma = 0x2D; break;
2721 case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
2722 subopc_imm = 3; opc_imma = 0x1D; break;
2723 case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2724 subopc_imm = 4; opc_imma = 0x25; break;
2725 case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2726 subopc_imm = 6; opc_imma = 0x35; break;
2727 case Aalu_OR: opc = 0x0B; opc_rr = 0x09;
2728 subopc_imm = 1; opc_imma = 0x0D; break;
2729 case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2730 subopc_imm = 7; opc_imma = 0x3D; break;
2731 default: goto bad;
2733 switch (i->Ain.Alu64R.src->tag) {
2734 case Armi_Imm:
2735 if (sameHReg(i->Ain.Alu64R.dst, hregAMD64_RAX())
2736 && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2737 goto bad; /* FIXME: awaiting test case */
2738 *p++ = toUChar(opc_imma);
2739 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2740 } else
2741 if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
2742 *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst );
2743 *p++ = 0x83;
2744 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2745 *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
2746 } else {
2747 *p++ = rexAMode_R_enc_reg( 0, i->Ain.Alu64R.dst);
2748 *p++ = 0x81;
2749 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu64R.dst);
2750 p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
2752 goto done;
2753 case Armi_Reg:
2754 *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
2755 i->Ain.Alu64R.dst);
2756 *p++ = toUChar(opc_rr);
2757 p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
2758 i->Ain.Alu64R.dst);
2759 goto done;
2760 case Armi_Mem:
2761 *p++ = rexAMode_M( i->Ain.Alu64R.dst,
2762 i->Ain.Alu64R.src->Armi.Mem.am);
2763 *p++ = toUChar(opc);
2764 p = doAMode_M(p, i->Ain.Alu64R.dst,
2765 i->Ain.Alu64R.src->Armi.Mem.am);
2766 goto done;
2767 default:
2768 goto bad;
2770 break;
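/* Editor's worked examples (not in the original) for the table above:
   "addq $1, %rbx" takes the fits8bits path and emits 48 83 C3 01,
   while "xorq %rax, %rbx" takes the Armi_Reg path (opc_rr 0x31) and
   emits 48 31 C3. */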
2772 case Ain_Alu64M:
2773 /* Deal specially with MOV */
2774 if (i->Ain.Alu64M.op == Aalu_MOV) {
2775 switch (i->Ain.Alu64M.src->tag) {
2776 case Ari_Reg:
2777 *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
2778 i->Ain.Alu64M.dst);
2779 *p++ = 0x89;
2780 p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
2781 i->Ain.Alu64M.dst);
2782 goto done;
2783 case Ari_Imm:
2784 *p++ = rexAMode_M_enc(0, i->Ain.Alu64M.dst);
2785 *p++ = 0xC7;
2786 p = doAMode_M_enc(p, 0, i->Ain.Alu64M.dst);
2787 p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2788 goto done;
2789 default:
2790 goto bad;
2793 /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP. MUL is not
2794 allowed here. (This is derived from the x86 version of same). */
2795 opc = subopc_imm = opc_imma = 0;
2796 switch (i->Ain.Alu64M.op) {
2797 case Aalu_CMP: opc = 0x39; subopc_imm = 7; break;
2798 default: goto bad;
2800 switch (i->Ain.Alu64M.src->tag) {
2801 /*FIXME
2802 case Xri_Reg:
2803 *p++ = toUChar(opc);
2804 p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
2805 i->Xin.Alu32M.dst);
2806 goto done;
2807 */
2808 case Ari_Imm:
2809 if (fits8bits(i->Ain.Alu64M.src->Ari.Imm.imm32)) {
2810 *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2811 *p++ = 0x83;
2812 p = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2813 *p++ = toUChar(0xFF & i->Ain.Alu64M.src->Ari.Imm.imm32);
2814 goto done;
2815 } else {
2816 *p++ = rexAMode_M_enc(subopc_imm, i->Ain.Alu64M.dst);
2817 *p++ = 0x81;
2818 p = doAMode_M_enc(p, subopc_imm, i->Ain.Alu64M.dst);
2819 p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
2820 goto done;
2822 default:
2823 goto bad;
2826 break;
2828 case Ain_Sh64:
2829 opc_cl = opc_imm = subopc = 0;
2830 switch (i->Ain.Sh64.op) {
2831 case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2832 case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2833 case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2834 default: goto bad;
2836 if (i->Ain.Sh64.src == 0) {
2837 *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2838 *p++ = toUChar(opc_cl);
2839 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2840 goto done;
2841 } else {
2842 *p++ = rexAMode_R_enc_reg(0, i->Ain.Sh64.dst);
2843 *p++ = toUChar(opc_imm);
2844 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh64.dst);
2845 *p++ = (UChar)(i->Ain.Sh64.src);
2846 goto done;
2848 break;
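/* Editor's worked example (not in the original): "shlq $3, %rcx" uses
   the immediate form above (subopc 4) and emits 48 C1 E1 03, while
   the %cl-count form "shlq %cl, %rcx" emits 48 D3 E1. */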
2850 case Ain_Sh32:
2851 opc_cl = opc_imm = subopc = 0;
2852 switch (i->Ain.Sh32.op) {
2853 case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
2854 case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
2855 case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
2856 default: goto bad;
2858 if (i->Ain.Sh32.src == 0) {
2859 rex = clearWBit( rexAMode_R_enc_reg(0, i->Ain.Sh32.dst) );
2860 if (rex != 0x40) *p++ = rex;
2861 *p++ = toUChar(opc_cl);
2862 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh32.dst);
2863 goto done;
2864 } else {
2865 rex = clearWBit( rexAMode_R_enc_reg(0, i->Ain.Sh32.dst) );
2866 if (rex != 0x40) *p++ = rex;
2867 *p++ = toUChar(opc_imm);
2868 p = doAMode_R_enc_reg(p, subopc, i->Ain.Sh32.dst);
2869 *p++ = (UChar)(i->Ain.Sh32.src);
2870 goto done;
2872 break;
2874 case Ain_Test64:
2875 /* testq sign-extend($imm32), %reg */
2876 *p++ = rexAMode_R_enc_reg(0, i->Ain.Test64.dst);
2877 *p++ = 0xF7;
2878 p = doAMode_R_enc_reg(p, 0, i->Ain.Test64.dst);
2879 p = emit32(p, i->Ain.Test64.imm32);
2880 goto done;
2882 case Ain_Unary64:
2883 if (i->Ain.Unary64.op == Aun_NOT) {
2884 *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2885 *p++ = 0xF7;
2886 p = doAMode_R_enc_reg(p, 2, i->Ain.Unary64.dst);
2887 goto done;
2889 if (i->Ain.Unary64.op == Aun_NEG) {
2890 *p++ = rexAMode_R_enc_reg(0, i->Ain.Unary64.dst);
2891 *p++ = 0xF7;
2892 p = doAMode_R_enc_reg(p, 3, i->Ain.Unary64.dst);
2893 goto done;
2895 break;
2897 case Ain_Lea64:
2898 *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
2899 *p++ = 0x8D;
2900 p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
2901 goto done;
2903 case Ain_Alu32R:
2904 /* ADD/SUB/AND/OR/XOR/CMP */
2905 opc = opc_rr = subopc_imm = opc_imma = 0;
2906 switch (i->Ain.Alu32R.op) {
2907 case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
2908 subopc_imm = 0; opc_imma = 0x05; break;
2909 case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
2910 subopc_imm = 5; opc_imma = 0x2D; break;
2911 case Aalu_AND: opc = 0x23; opc_rr = 0x21;
2912 subopc_imm = 4; opc_imma = 0x25; break;
2913 case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
2914 subopc_imm = 6; opc_imma = 0x35; break;
2915 case Aalu_OR: opc = 0x0B; opc_rr = 0x09;
2916 subopc_imm = 1; opc_imma = 0x0D; break;
2917 case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
2918 subopc_imm = 7; opc_imma = 0x3D; break;
2919 default: goto bad;
2921 switch (i->Ain.Alu32R.src->tag) {
2922 case Armi_Imm:
2923 if (sameHReg(i->Ain.Alu32R.dst, hregAMD64_RAX())
2924 && !fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2925 goto bad; /* FIXME: awaiting test case */
2926 *p++ = toUChar(opc_imma);
2927 p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2928 } else
2929 if (fits8bits(i->Ain.Alu32R.src->Armi.Imm.imm32)) {
2930 rex = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst ) );
2931 if (rex != 0x40) *p++ = rex;
2932 *p++ = 0x83;
2933 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2934 *p++ = toUChar(0xFF & i->Ain.Alu32R.src->Armi.Imm.imm32);
2935 } else {
2936 rex = clearWBit( rexAMode_R_enc_reg( 0, i->Ain.Alu32R.dst) );
2937 if (rex != 0x40) *p++ = rex;
2938 *p++ = 0x81;
2939 p = doAMode_R_enc_reg(p, subopc_imm, i->Ain.Alu32R.dst);
2940 p = emit32(p, i->Ain.Alu32R.src->Armi.Imm.imm32);
2942 goto done;
2943 case Armi_Reg:
2944 rex = clearWBit(
2945 rexAMode_R( i->Ain.Alu32R.src->Armi.Reg.reg,
2946 i->Ain.Alu32R.dst) );
2947 if (rex != 0x40) *p++ = rex;
2948 *p++ = toUChar(opc_rr);
2949 p = doAMode_R(p, i->Ain.Alu32R.src->Armi.Reg.reg,
2950 i->Ain.Alu32R.dst);
2951 goto done;
2952 case Armi_Mem:
2953 rex = clearWBit(
2954 rexAMode_M( i->Ain.Alu32R.dst,
2955 i->Ain.Alu32R.src->Armi.Mem.am) );
2956 if (rex != 0x40) *p++ = rex;
2957 *p++ = toUChar(opc);
2958 p = doAMode_M(p, i->Ain.Alu32R.dst,
2959 i->Ain.Alu32R.src->Armi.Mem.am);
2960 goto done;
2961 default:
2962 goto bad;
2964 break;
2966 case Ain_MulL:
2967 subopc = i->Ain.MulL.syned ? 5 : 4;
2968 switch (i->Ain.MulL.src->tag) {
2969 case Arm_Mem:
2970 *p++ = rexAMode_M_enc(0, i->Ain.MulL.src->Arm.Mem.am);
2971 *p++ = 0xF7;
2972 p = doAMode_M_enc(p, subopc, i->Ain.MulL.src->Arm.Mem.am);
2973 goto done;
2974 case Arm_Reg:
2975 *p++ = rexAMode_R_enc_reg(0, i->Ain.MulL.src->Arm.Reg.reg);
2976 *p++ = 0xF7;
2977 p = doAMode_R_enc_reg(p, subopc, i->Ain.MulL.src->Arm.Reg.reg);
2978 goto done;
2979 default:
2980 goto bad;
2982 break;
2984 case Ain_Div:
2985 subopc = i->Ain.Div.syned ? 7 : 6;
2986 if (i->Ain.Div.sz == 4) {
2987 switch (i->Ain.Div.src->tag) {
2988 case Arm_Mem:
2989 goto bad;
2990 /*FIXME*/
2991 *p++ = 0xF7;
2992 p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
2993 goto done;
2994 case Arm_Reg:
2995 *p++ = clearWBit(
2996 rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg));
2997 *p++ = 0xF7;
2998 p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
2999 goto done;
3000 default:
3001 goto bad;
3004 if (i->Ain.Div.sz == 8) {
3005 switch (i->Ain.Div.src->tag) {
3006 case Arm_Mem:
3007 *p++ = rexAMode_M_enc(0, i->Ain.Div.src->Arm.Mem.am);
3008 *p++ = 0xF7;
3009 p = doAMode_M_enc(p, subopc, i->Ain.Div.src->Arm.Mem.am);
3010 goto done;
3011 case Arm_Reg:
3012 *p++ = rexAMode_R_enc_reg(0, i->Ain.Div.src->Arm.Reg.reg);
3013 *p++ = 0xF7;
3014 p = doAMode_R_enc_reg(p, subopc, i->Ain.Div.src->Arm.Reg.reg);
3015 goto done;
3016 default:
3017 goto bad;
3020 break;
3022 case Ain_Push:
3023 switch (i->Ain.Push.src->tag) {
3024 case Armi_Mem:
3025 *p++ = clearWBit(
3026 rexAMode_M_enc(0, i->Ain.Push.src->Armi.Mem.am));
3027 *p++ = 0xFF;
3028 p = doAMode_M_enc(p, 6, i->Ain.Push.src->Armi.Mem.am);
3029 goto done;
3030 case Armi_Imm:
3031 *p++ = 0x68;
3032 p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
3033 goto done;
3034 case Armi_Reg:
3035 *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.Push.src->Armi.Reg.reg)));
3036 *p++ = toUChar(0x50 + iregEnc210(i->Ain.Push.src->Armi.Reg.reg));
3037 goto done;
3038 default:
3039 goto bad;
3042 case Ain_Call: {
3043 /* As per detailed comment for Ain_Call in getRegUsage_AMD64Instr
3044 above, %r11 is used as an address temporary. */
3045 /* If we don't need to do any fixup actions in the case that the
3046 call doesn't happen, just do the simple thing and emit
3047 straight-line code. This is usually the case. */
3048 if (i->Ain.Call.cond == Acc_ALWAYS/*call always happens*/
3049 || i->Ain.Call.rloc.pri == RLPri_None/*no fixup action*/) {
3050 /* jump over the following two insns if the condition does
3051 not hold */
3052 Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
3053 if (i->Ain.Call.cond != Acc_ALWAYS) {
3054 *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
3055 *p++ = shortImm ? 10 : 13;
3056 /* 10 or 13 bytes in the next two insns */
3058 if (shortImm) {
3059 /* 7 bytes: movl sign-extend(imm32), %r11 */
3060 *p++ = 0x49;
3061 *p++ = 0xC7;
3062 *p++ = 0xC3;
3063 p = emit32(p, (UInt)i->Ain.Call.target);
3064 } else {
3065 /* 10 bytes: movabsq $target, %r11 */
3066 *p++ = 0x49;
3067 *p++ = 0xBB;
3068 p = emit64(p, i->Ain.Call.target);
3070 /* 3 bytes: call *%r11 */
3071 *p++ = 0x41;
3072 *p++ = 0xFF;
3073 *p++ = 0xD3;
3074 } else {
3075 Int delta;
3076 /* Complex case. We have to generate an if-then-else diamond. */
3077 // before:
3078 // j{!cond} else:
3079 // movabsq $target, %r11
3080 // call* %r11
3081 // preElse:
3082 // jmp after:
3083 // else:
3084 // movabsq $0x5555555555555555, %rax // possibly
3085 // movq %rax, %rdx // possibly
3086 // after:
3088 // before:
3089 UChar* pBefore = p;
3091 // j{!cond} else:
3092 *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
3093 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3095 // movabsq $target, %r11
3096 *p++ = 0x49;
3097 *p++ = 0xBB;
3098 p = emit64(p, i->Ain.Call.target);
3100 // call* %r11
3101 *p++ = 0x41;
3102 *p++ = 0xFF;
3103 *p++ = 0xD3;
3105 // preElse:
3106 UChar* pPreElse = p;
3108 // jmp after:
3109 *p++ = 0xEB;
3110 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3112 // else:
3113 UChar* pElse = p;
3115 /* Do the 'else' actions */
3116 switch (i->Ain.Call.rloc.pri) {
3117 case RLPri_Int:
3118 // movabsq $0x5555555555555555, %rax
3119 *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
3120 break;
3121 case RLPri_2Int:
3122 goto bad; //ATC
3123 // movabsq $0x5555555555555555, %rax
3124 *p++ = 0x48; *p++ = 0xB8; p = emit64(p, 0x5555555555555555ULL);
3125 // movq %rax, %rdx
3126 *p++ = 0x48; *p++ = 0x89; *p++ = 0xC2;
3127 break;
3128 case RLPri_V128SpRel:
3129 if (i->Ain.Call.rloc.spOff == 0) {
3130 // We could accept any |spOff| here, but that's more
3131 // hassle and the only value we're ever going to get
3132 // is zero (I believe.) Hence take the easy path :)
3133 // We need a scratch register -- r11 can be it.
3134 // movabsq $0x5555555555555555, %r11
3135 *p++ = 0x49; *p++ = 0xBB;
3136 p = emit64(p, 0x5555555555555555ULL);
3137 // movq %r11, 0(%rsp)
3138 *p++ = 0x4C; *p++ = 0x89; *p++ = 0x1C; *p++ = 0x24;
3139 // movq %r11, 8(%rsp)
3140 *p++ = 0x4C; *p++ = 0x89; *p++ = 0x5C; *p++ = 0x24;
3141 *p++ = 0x08;
3142 break;
3144 goto bad; //ATC for all other spOff values
3145 case RLPri_V256SpRel:
3146 goto bad; //ATC
3147 case RLPri_None: case RLPri_INVALID: default:
3148 vassert(0); // should never get here
3151 // after:
3152 UChar* pAfter = p;
3154 // Fix up the branch offsets. The +2s in the offset
3155 // calculations are there because x86 requires conditional
3156 // branches to have their offset stated relative to the
3157 // instruction immediately following the branch insn. And in
3158 // both cases the branch insns are 2 bytes long.
3160 // First, the "j{!cond} else:" at pBefore.
3161 delta = (Int)(Long)(pElse - (pBefore + 2));
3162 vassert(delta >= 0 && delta < 100/*arbitrary*/);
3163 *(pBefore+1) = (UChar)delta;
3165 // And secondly, the "jmp after:" at pPreElse.
3166 delta = (Int)(Long)(pAfter - (pPreElse + 2));
3167 vassert(delta >= 0 && delta < 100/*arbitrary*/);
3168 *(pPreElse+1) = (UChar)delta;
3170 goto done;
3173 case Ain_XDirect: {
3174 /* NB: what goes on here has to be very closely coordinated with the
3175 chainXDirect_AMD64 and unchainXDirect_AMD64 below. */
3176 /* We're generating chain-me requests here, so we need to be
3177 sure this is actually allowed -- no-redir translations can't
3178 use chain-me's. Hence: */
3179 vassert(disp_cp_chain_me_to_slowEP != NULL);
3180 vassert(disp_cp_chain_me_to_fastEP != NULL);
3182 HReg r11 = hregAMD64_R11();
3184 /* Use ptmp for backpatching conditional jumps. */
3185 ptmp = NULL;
3187 /* First off, if this is conditional, create a conditional
3188 jump over the rest of it. */
3189 if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3190 /* jmp fwds if !condition */
3191 *p++ = toUChar(0x70 + (0xF & (i->Ain.XDirect.cond ^ 1)));
3192 ptmp = p; /* fill in this bit later */
3193 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3196 /* Update the guest RIP. */
3197 if (fitsIn32Bits(i->Ain.XDirect.dstGA)) {
3198 /* use a shorter encoding */
3199 /* movl sign-extend(dstGA), %r11 */
3200 *p++ = 0x49;
3201 *p++ = 0xC7;
3202 *p++ = 0xC3;
3203 p = emit32(p, (UInt)i->Ain.XDirect.dstGA);
3204 } else {
3205 /* movabsq $dstGA, %r11 */
3206 *p++ = 0x49;
3207 *p++ = 0xBB;
3208 p = emit64(p, i->Ain.XDirect.dstGA);
3211 /* movq %r11, amRIP */
3212 *p++ = rexAMode_M(r11, i->Ain.XDirect.amRIP);
3213 *p++ = 0x89;
3214 p = doAMode_M(p, r11, i->Ain.XDirect.amRIP);
3216 /* --- FIRST PATCHABLE BYTE follows --- */
3217 /* VG_(disp_cp_chain_me_to_{slowEP,fastEP}) (where we're calling
3218 to) backs up the return address, so as to find the address of
3219 the first patchable byte. So: don't change the length of the
3220 two instructions below. */
3221 /* movabsq $disp_cp_chain_me_to_{slow,fast}EP,%r11; */
3222 *p++ = 0x49;
3223 *p++ = 0xBB;
3224 const void* disp_cp_chain_me
3225 = i->Ain.XDirect.toFastEP ? disp_cp_chain_me_to_fastEP
3226 : disp_cp_chain_me_to_slowEP;
3227 p = emit64(p, (Addr)disp_cp_chain_me);
3228 /* call *%r11 */
3229 *p++ = 0x41;
3230 *p++ = 0xFF;
3231 *p++ = 0xD3;
3232 /* --- END of PATCHABLE BYTES --- */
3234 /* Fix up the conditional jump, if there was one. */
3235 if (i->Ain.XDirect.cond != Acc_ALWAYS) {
3236 Int delta = p - ptmp;
3237 vassert(delta > 0 && delta < 40);
3238 *ptmp = toUChar(delta-1);
3240 goto done;
3243 case Ain_XIndir: {
3244 /* We're generating transfers that could lead indirectly to a
3245 chain-me, so we need to be sure this is actually allowed --
3246 no-redir translations are not allowed to reach normal
3247 translations without going through the scheduler. That means
3248 no XDirects or XIndirs out from no-redir translations.
3249 Hence: */
3250 vassert(disp_cp_xindir != NULL);
3252 /* Use ptmp for backpatching conditional jumps. */
3253 ptmp = NULL;
3255 /* First off, if this is conditional, create a conditional
3256 jump over the rest of it. */
3257 if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3258 /* jmp fwds if !condition */
3259 *p++ = toUChar(0x70 + (0xF & (i->Ain.XIndir.cond ^ 1)));
3260 ptmp = p; /* fill in this bit later */
3261 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3264 /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3265 *p++ = rexAMode_M(i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3266 *p++ = 0x89;
3267 p = doAMode_M(p, i->Ain.XIndir.dstGA, i->Ain.XIndir.amRIP);
3269 /* get $disp_cp_xindir into %r11 */
3270 if (fitsIn32Bits((Addr)disp_cp_xindir)) {
3271 /* use a shorter encoding */
3272 /* movl sign-extend(disp_cp_xindir), %r11 */
3273 *p++ = 0x49;
3274 *p++ = 0xC7;
3275 *p++ = 0xC3;
3276 p = emit32(p, (UInt)(Addr)disp_cp_xindir);
3277 } else {
3278 /* movabsq $disp_cp_xindir, %r11 */
3279 *p++ = 0x49;
3280 *p++ = 0xBB;
3281 p = emit64(p, (Addr)disp_cp_xindir);
3284 /* jmp *%r11 */
3285 *p++ = 0x41;
3286 *p++ = 0xFF;
3287 *p++ = 0xE3;
3289 /* Fix up the conditional jump, if there was one. */
3290 if (i->Ain.XIndir.cond != Acc_ALWAYS) {
3291 Int delta = p - ptmp;
3292 vassert(delta > 0 && delta < 40);
3293 *ptmp = toUChar(delta-1);
3295 goto done;
3298 case Ain_XAssisted: {
3299 /* Use ptmp for backpatching conditional jumps. */
3300 ptmp = NULL;
3302 /* First off, if this is conditional, create a conditional
3303 jump over the rest of it. */
3304 if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3305 /* jmp fwds if !condition */
3306 *p++ = toUChar(0x70 + (0xF & (i->Ain.XAssisted.cond ^ 1)));
3307 ptmp = p; /* fill in this bit later */
3308 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3311 /* movq dstGA(a reg), amRIP -- copied from Alu64M MOV case */
3312 *p++ = rexAMode_M(i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3313 *p++ = 0x89;
3314 p = doAMode_M(p, i->Ain.XAssisted.dstGA, i->Ain.XAssisted.amRIP);
3315 /* movl $magic_number, %ebp. Since these numbers are all small positive
3316 integers, we can get away with "movl $N, %ebp" rather than
3317 the longer "movq $N, %rbp". */
3318 UInt trcval = 0;
3319 switch (i->Ain.XAssisted.jk) {
3320 case Ijk_ClientReq: trcval = VEX_TRC_JMP_CLIENTREQ; break;
3321 case Ijk_Sys_syscall: trcval = VEX_TRC_JMP_SYS_SYSCALL; break;
3322 case Ijk_Sys_int32: trcval = VEX_TRC_JMP_SYS_INT32; break;
3323 case Ijk_Sys_int210: trcval = VEX_TRC_JMP_SYS_INT210; break;
3324 case Ijk_Yield: trcval = VEX_TRC_JMP_YIELD; break;
3325 case Ijk_EmWarn: trcval = VEX_TRC_JMP_EMWARN; break;
3326 case Ijk_MapFail: trcval = VEX_TRC_JMP_MAPFAIL; break;
3327 case Ijk_NoDecode: trcval = VEX_TRC_JMP_NODECODE; break;
3328 case Ijk_InvalICache: trcval = VEX_TRC_JMP_INVALICACHE; break;
3329 case Ijk_NoRedir: trcval = VEX_TRC_JMP_NOREDIR; break;
3330 case Ijk_SigTRAP: trcval = VEX_TRC_JMP_SIGTRAP; break;
3331 case Ijk_SigSEGV: trcval = VEX_TRC_JMP_SIGSEGV; break;
3332 case Ijk_SigBUS: trcval = VEX_TRC_JMP_SIGBUS; break;
3333 case Ijk_Boring: trcval = VEX_TRC_JMP_BORING; break;
3334 /* We don't expect to see the following being assisted. */
3335 case Ijk_Ret:
3336 case Ijk_Call:
3337 /* fallthrough */
3338 default:
3339 ppIRJumpKind(i->Ain.XAssisted.jk);
3340 vpanic("emit_AMD64Instr.Ain_XAssisted: unexpected jump kind");
3342 vassert(trcval != 0);
3343 *p++ = 0xBD;
3344 p = emit32(p, trcval);
3345 /* movabsq $disp_assisted, %r11 */
3346 *p++ = 0x49;
3347 *p++ = 0xBB;
3348 p = emit64(p, (Addr)disp_cp_xassisted);
3349 /* jmp *%r11 */
3350 *p++ = 0x41;
3351 *p++ = 0xFF;
3352 *p++ = 0xE3;
3354 /* Fix up the conditional jump, if there was one. */
3355 if (i->Ain.XAssisted.cond != Acc_ALWAYS) {
3356 Int delta = p - ptmp;
3357 vassert(delta > 0 && delta < 40);
3358 *ptmp = toUChar(delta-1);
3360 goto done;
3363 case Ain_CMov64:
3364 vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
3365 *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src);
3366 *p++ = 0x0F;
3367 *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
3368 p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src);
3369 goto done;
3371 case Ain_CLoad: {
3372 vassert(i->Ain.CLoad.cond != Acc_ALWAYS);
3374 /* Only 32- or 64-bit variants are allowed. */
3375 vassert(i->Ain.CLoad.szB == 4 || i->Ain.CLoad.szB == 8);
3377 /* Use ptmp for backpatching conditional jumps. */
3378 ptmp = NULL;
3380 /* jmp fwds if !condition */
3381 *p++ = toUChar(0x70 + (0xF & (i->Ain.CLoad.cond ^ 1)));
3382 ptmp = p; /* fill in this bit later */
3383 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3385 /* Now the load. Either a normal 64 bit load or a normal 32 bit
3386 load, which, by the default zero-extension rule, zeroes out
3387 the upper half of the destination, as required. */
3388 rex = rexAMode_M(i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3389 *p++ = i->Ain.CLoad.szB == 4 ? clearWBit(rex) : rex;
3390 *p++ = 0x8B;
3391 p = doAMode_M(p, i->Ain.CLoad.dst, i->Ain.CLoad.addr);
3393 /* Fix up the conditional branch */
3394 Int delta = p - ptmp;
3395 vassert(delta > 0 && delta < 40);
3396 *ptmp = toUChar(delta-1);
3397 goto done;
3400 case Ain_CStore: {
3401 /* AFAICS this is identical to Ain_CLoad except that the opcode
3402 is 0x89 instead of 0x8B. */
3403 vassert(i->Ain.CStore.cond != Acc_ALWAYS);
3405 /* Only 32- or 64-bit variants are allowed. */
3406 vassert(i->Ain.CStore.szB == 4 || i->Ain.CStore.szB == 8);
3408 /* Use ptmp for backpatching conditional jumps. */
3409 ptmp = NULL;
3411 /* jmp fwds if !condition */
3412 *p++ = toUChar(0x70 + (0xF & (i->Ain.CStore.cond ^ 1)));
3413 ptmp = p; /* fill in this bit later */
3414 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3416 /* Now the store. */
3417 rex = rexAMode_M(i->Ain.CStore.src, i->Ain.CStore.addr);
3418 *p++ = i->Ain.CStore.szB == 4 ? clearWBit(rex) : rex;
3419 *p++ = 0x89;
3420 p = doAMode_M(p, i->Ain.CStore.src, i->Ain.CStore.addr);
3422 /* Fix up the conditional branch */
3423 Int delta = p - ptmp;
3424 vassert(delta > 0 && delta < 40);
3425 *ptmp = toUChar(delta-1);
3426 goto done;
3429 case Ain_MovxLQ:
3430 /* No, _don't_ ask me why the sense of the args has to be
3431 different in the S vs Z case. I don't know. */
3432 if (i->Ain.MovxLQ.syned) {
3433 /* Need REX.W = 1 here, but rexAMode_R does that for us. */
3434 *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3435 *p++ = 0x63;
3436 p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
3437 } else {
3438 /* Produce a 32-bit reg-reg move, since the implicit
3439 zero-extend does what we want. */
3440 *p++ = clearWBit (
3441 rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
3442 *p++ = 0x89;
3443 p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
3445 goto done;
3447 case Ain_LoadEX:
3448 if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
3449 /* movzbq */
3450 *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3451 *p++ = 0x0F;
3452 *p++ = 0xB6;
3453 p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3454 goto done;
3456 if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
3457 /* movzwq */
3458 *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3459 *p++ = 0x0F;
3460 *p++ = 0xB7;
3461 p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3462 goto done;
3464 if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
3465 /* movzlq */
3466 /* This isn't really an existing AMD64 instruction per se.
3467 Rather, we have to do a 32-bit load. Because a 32-bit
3468 write implicitly clears the upper 32 bits of the target
3469 register, we get what we want. */
3470 *p++ = clearWBit(
3471 rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
3472 *p++ = 0x8B;
3473 p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
3474 goto done;
3476 break;
3478 case Ain_Set64:
3479 /* Make the destination register be 1 or 0, depending on whether
3480 the relevant condition holds. Complication: the top 56 bits
3481 of the destination should be forced to zero, but doing 'xorq
3482 %r,%r' kills the flag(s) we are about to read. Sigh. So
3483 start off my moving $0 into the dest. */
3484 reg = iregEnc3210(i->Ain.Set64.dst);
3485 vassert(reg < 16);
3487 /* movq $0, %dst */
3488 *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
3489 *p++ = 0xC7;
3490 *p++ = toUChar(0xC0 + (reg & 7));
3491 p = emit32(p, 0);
3493 /* setb lo8(%dst) */
3494 /* note, 8-bit register rex trickiness. Be careful here. */
3495 *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
3496 *p++ = 0x0F;
3497 *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
3498 *p++ = toUChar(0xC0 + (reg & 7));
3499 goto done;
3501 case Ain_Bsfr64:
3502 *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3503 *p++ = 0x0F;
3504 if (i->Ain.Bsfr64.isFwds) {
3505 *p++ = 0xBC;
3506 } else {
3507 *p++ = 0xBD;
3509 p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
3510 goto done;
3512 case Ain_MFence:
3513 /* mfence */
3514 *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
3515 goto done;
3517 case Ain_ACAS:
3518 /* lock */
3519 *p++ = 0xF0;
3520 if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
3521 /* cmpxchg{b,w,l,q} %rbx,mem. Expected-value in %rax, new value
3522 in %rbx. The new-value register is hardwired to be %rbx
3523 since dealing with byte integer registers is too much hassle,
3524 so we force the register operand to %rbx (could equally be
3525 %rcx or %rdx). */
3526 rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
3527 if (i->Ain.ACAS.sz != 8)
3528 rex = clearWBit(rex);
3530 *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
3531 *p++ = 0x0F;
3532 if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
3533 p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
3534 goto done;
3536 case Ain_DACAS:
3537 /* lock */
3538 *p++ = 0xF0;
3539 /* cmpxchg{8,16}b m{64,128}. Expected-value in %rdx:%rax, new
3540 value in %rcx:%rbx. All 4 regs are hardwired in the ISA, so
3541 aren't encoded in the insn. */
3542 rex = rexAMode_M_enc(1, i->Ain.ACAS.addr );
3543 if (i->Ain.ACAS.sz != 8)
3544 rex = clearWBit(rex);
3545 *p++ = rex;
3546 *p++ = 0x0F;
3547 *p++ = 0xC7;
3548 p = doAMode_M_enc(p, 1, i->Ain.DACAS.addr);
3549 goto done;
3551 case Ain_A87Free:
3552 vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
3553 for (j = 0; j < i->Ain.A87Free.nregs; j++) {
3554 p = do_ffree_st(p, 7-j);
3556 goto done;
3558 case Ain_A87PushPop:
3559 vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
3560 if (i->Ain.A87PushPop.isPush) {
3561 /* Load from memory into %st(0): flds/fldl amode */
3562 *p++ = clearWBit(
3563 rexAMode_M_enc(0, i->Ain.A87PushPop.addr) );
3564 *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3565 p = doAMode_M_enc(p, 0/*subopcode*/, i->Ain.A87PushPop.addr);
3566 } else {
3567 /* Dump %st(0) to memory: fstps/fstpl amode */
3568 *p++ = clearWBit(
3569 rexAMode_M_enc(3, i->Ain.A87PushPop.addr) );
3570 *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
3571 p = doAMode_M_enc(p, 3/*subopcode*/, i->Ain.A87PushPop.addr);
3572 goto done;
3574 goto done;
3576 case Ain_A87FpOp:
3577 switch (i->Ain.A87FpOp.op) {
3578 case Afp_SQRT: *p++ = 0xD9; *p++ = 0xFA; break;
3579 case Afp_SIN: *p++ = 0xD9; *p++ = 0xFE; break;
3580 case Afp_COS: *p++ = 0xD9; *p++ = 0xFF; break;
3581 case Afp_ROUND: *p++ = 0xD9; *p++ = 0xFC; break;
3582 case Afp_2XM1: *p++ = 0xD9; *p++ = 0xF0; break;
3583 case Afp_SCALE: *p++ = 0xD9; *p++ = 0xFD; break;
3584 case Afp_ATAN: *p++ = 0xD9; *p++ = 0xF3; break;
3585 case Afp_YL2X: *p++ = 0xD9; *p++ = 0xF1; break;
3586 case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
3587 case Afp_PREM: *p++ = 0xD9; *p++ = 0xF8; break;
3588 case Afp_PREM1: *p++ = 0xD9; *p++ = 0xF5; break;
3589 case Afp_TAN:
3590 /* fptan pushes 1.0 on the FP stack, except when the
3591 argument is out of range. Hence we have to do the
3592 instruction, then inspect C2 to see if there is an out
3593 of range condition. If there is, we skip the fincstp
3594 that is used by the in-range case to get rid of this
3595 extra 1.0 value. */
3596 *p++ = 0xD9; *p++ = 0xF2; // fptan
3597 *p++ = 0x50; // pushq %rax
3598 *p++ = 0xDF; *p++ = 0xE0; // fnstsw %ax
3599 *p++ = 0x66; *p++ = 0xA9;
3600 *p++ = 0x00; *p++ = 0x04; // testw $0x400,%ax
3601 *p++ = 0x75; *p++ = 0x02; // jnz after_fincstp
3602 *p++ = 0xD9; *p++ = 0xF7; // fincstp
3603 *p++ = 0x58; // after_fincstp: popq %rax
3604 break;
3605 default:
3606               goto bad;
3607         }
3608 goto done;
3610 case Ain_A87LdCW:
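         /* For reference: this is "fldcw amode", encoded as D9 /5. */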
3611 *p++ = clearWBit(
3612 rexAMode_M_enc(5, i->Ain.A87LdCW.addr) );
3613 *p++ = 0xD9;
3614 p = doAMode_M_enc(p, 5/*subopcode*/, i->Ain.A87LdCW.addr);
3615 goto done;
3617 case Ain_A87StSW:
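         /* For reference: this is "fnstsw amode", encoded as DD /7. */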
3618 *p++ = clearWBit(
3619 rexAMode_M_enc(7, i->Ain.A87StSW.addr) );
3620 *p++ = 0xDD;
3621 p = doAMode_M_enc(p, 7/*subopcode*/, i->Ain.A87StSW.addr);
3622 goto done;
3624 case Ain_Store:
3625 if (i->Ain.Store.sz == 2) {
3626         /* This just goes to show the craziness of the instruction
3627 set encoding. We have to insert two prefix bytes, but be
3628 careful to avoid a conflict in what the size should be, by
3629 ensuring that REX.W = 0. */
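         /* Worked example (illustrative, assuming dst is (%rdi)): storing
            %si emits roughly 66 40 89 37, i.e. movw %si,(%rdi); the 0x40
            REX left by clearWBit is redundant but harmless. */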
3630 *p++ = 0x66; /* override to 16-bits */
3631 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3632 *p++ = 0x89;
3633 p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3634            goto done;
3635         }
3636 if (i->Ain.Store.sz == 4) {
3637 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3638 *p++ = 0x89;
3639 p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3640            goto done;
3641         }
3642 if (i->Ain.Store.sz == 1) {
3643 /* This is one place where it would be wrong to skip emitting
3644 a rex byte of 0x40, since the mere presence of rex changes
3645 the meaning of the byte register access. Be careful. */
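         /* Illustrative: with ModRM reg field 6, "40 88 37" means
            movb %sil,(%rdi), whereas a bare "88 37" would mean
            movb %dh,(%rdi) -- the mere presence of a REX prefix switches
            the byte register set from AH/CH/DH/BH to SPL/BPL/SIL/DIL. */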
3646 *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
3647 *p++ = 0x88;
3648 p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
3649            goto done;
3650         }
3651 break;
3653 case Ain_LdMXCSR:
3654 *p++ = clearWBit(rexAMode_M_enc(0, i->Ain.LdMXCSR.addr));
3655 *p++ = 0x0F;
3656 *p++ = 0xAE;
3657 p = doAMode_M_enc(p, 2/*subopcode*/, i->Ain.LdMXCSR.addr);
3658 goto done;
3660 case Ain_SseUComIS:
3661 /* ucomi[sd] %srcL, %srcR ; pushfq ; popq %dst */
3662 /* ucomi[sd] %srcL, %srcR */
3663 if (i->Ain.SseUComIS.sz == 8) {
3664 *p++ = 0x66;
3665 } else {
3666 goto bad;
3667            vassert(i->Ain.SseUComIS.sz == 4);
3668         }
3669 *p++ = clearWBit (
3670 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseUComIS.srcL),
3671 vregEnc3210(i->Ain.SseUComIS.srcR) ));
3672 *p++ = 0x0F;
3673 *p++ = 0x2E;
3674 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseUComIS.srcL),
3675 vregEnc3210(i->Ain.SseUComIS.srcR) );
3676 /* pushfq */
3677 *p++ = 0x9C;
3678 /* popq %dst */
3679 *p++ = toUChar(0x40 + (1 & iregEnc3(i->Ain.SseUComIS.dst)));
3680 *p++ = toUChar(0x58 + iregEnc210(i->Ain.SseUComIS.dst));
3681 goto done;
3683 case Ain_SseSI2SF:
3684         /* cvtsi2s[sd] %src, %dst */
3685 rex = rexAMode_R_enc_reg( vregEnc3210(i->Ain.SseSI2SF.dst),
3686 i->Ain.SseSI2SF.src );
3687 *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
3688 *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
3689 *p++ = 0x0F;
3690 *p++ = 0x2A;
3691 p = doAMode_R_enc_reg( p, vregEnc3210(i->Ain.SseSI2SF.dst),
3692 i->Ain.SseSI2SF.src );
3693 goto done;
3695 case Ain_SseSF2SI:
3696         /* cvts[sd]2si %src, %dst */
3697 rex = rexAMode_R_reg_enc( i->Ain.SseSF2SI.dst,
3698 vregEnc3210(i->Ain.SseSF2SI.src) );
3699 *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
3700 *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
3701 *p++ = 0x0F;
3702 *p++ = 0x2D;
3703 p = doAMode_R_reg_enc( p, i->Ain.SseSF2SI.dst,
3704 vregEnc3210(i->Ain.SseSF2SI.src) );
3705 goto done;
3707 case Ain_SseSDSS:
3708 /* cvtsd2ss/cvtss2sd %src, %dst */
3709 *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
3710 *p++ = clearWBit(
3711 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseSDSS.dst),
3712 vregEnc3210(i->Ain.SseSDSS.src) ));
3713 *p++ = 0x0F;
3714 *p++ = 0x5A;
3715 p = doAMode_R_enc_enc( p, vregEnc3210(i->Ain.SseSDSS.dst),
3716 vregEnc3210(i->Ain.SseSDSS.src) );
3717 goto done;
3719 case Ain_SseLdSt:
3720 if (i->Ain.SseLdSt.sz == 8) {
3721 *p++ = 0xF2;
3722 } else
3723 if (i->Ain.SseLdSt.sz == 4) {
3724 *p++ = 0xF3;
3725 } else
3726 if (i->Ain.SseLdSt.sz != 16) {
3727            vassert(0);
3728         }
3729 *p++ = clearWBit(
3730 rexAMode_M_enc(vregEnc3210(i->Ain.SseLdSt.reg),
3731 i->Ain.SseLdSt.addr));
3732 *p++ = 0x0F;
3733 *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
3734 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdSt.reg),
3735 i->Ain.SseLdSt.addr);
3736 goto done;
3738 case Ain_SseCStore: {
3739 vassert(i->Ain.SseCStore.cond != Acc_ALWAYS);
3741 /* Use ptmp for backpatching conditional jumps. */
3742 ptmp = NULL;
3744 /* jmp fwds if !condition */
3745 *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCStore.cond ^ 1)));
3746 ptmp = p; /* fill in this bit later */
3747 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3749 /* Now the store. */
3750 *p++ = clearWBit(
3751 rexAMode_M_enc(vregEnc3210(i->Ain.SseCStore.src),
3752 i->Ain.SseCStore.addr));
3753 *p++ = 0x0F;
3754 *p++ = toUChar(0x11);
3755 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCStore.src),
3756 i->Ain.SseCStore.addr);
3758 /* Fix up the conditional branch */
3759 Int delta = p - ptmp;
3760 vassert(delta > 0 && delta < 40);
3761 *ptmp = toUChar(delta-1);
3762         goto done;
3763      }
3765 case Ain_SseCLoad: {
3766 vassert(i->Ain.SseCLoad.cond != Acc_ALWAYS);
3768 /* Use ptmp for backpatching conditional jumps. */
3769 ptmp = NULL;
3771 /* jmp fwds if !condition */
3772 *p++ = toUChar(0x70 + (0xF & (i->Ain.SseCLoad.cond ^ 1)));
3773 ptmp = p; /* fill in this bit later */
3774 *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
3776 /* Now the load. */
3777 *p++ = clearWBit(
3778 rexAMode_M_enc(vregEnc3210(i->Ain.SseCLoad.dst),
3779 i->Ain.SseCLoad.addr));
3780 *p++ = 0x0F;
3781 *p++ = toUChar(0x10);
3782 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseCLoad.dst),
3783 i->Ain.SseCLoad.addr);
3785 /* Fix up the conditional branch */
3786 Int delta = p - ptmp;
3787 vassert(delta > 0 && delta < 40);
3788 *ptmp = toUChar(delta-1);
3789         goto done;
3790      }
3792 case Ain_SseLdzLO:
3793 vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
3794 /* movs[sd] amode, %xmm-dst */
3795 *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
3796 *p++ = clearWBit(
3797 rexAMode_M_enc(vregEnc3210(i->Ain.SseLdzLO.reg),
3798 i->Ain.SseLdzLO.addr));
3799 *p++ = 0x0F;
3800 *p++ = 0x10;
3801 p = doAMode_M_enc(p, vregEnc3210(i->Ain.SseLdzLO.reg),
3802 i->Ain.SseLdzLO.addr);
3803 goto done;
3805 case Ain_Sse32Fx4: {
3806 UInt srcRegNo = vregEnc3210(i->Ain.Sse32Fx4.src);
3807 UInt dstRegNo = vregEnc3210(i->Ain.Sse32Fx4.dst);
3808 // VEX encoded cases
3809 switch (i->Ain.Sse32Fx4.op) {
3810 case Asse_F16toF32: { // vcvtph2ps %xmmS, %xmmD
3811 UInt s = srcRegNo;
3812 UInt d = dstRegNo;
3813 // VCVTPH2PS %xmmS, %xmmD (s and d are both xmm regs, range 0 .. 15)
3814 // 0xC4 : ~d3 1 ~s3 0 0 0 1 0 : 0x79 : 0x13 : 1 1 d2 d1 d0 s2 s1 s0
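            // Worked example (illustrative): for s=1, d=2 the bytes below
            // come out as C4 E2 79 13 D1, i.e. vcvtph2ps %xmm1, %xmm2.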
3815 UInt byte2 = ((((~d)>>3)&1)<<7) | (1<<6)
3816 | ((((~s)>>3)&1)<<5) | (1<<1);
3817 UInt byte5 = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
3818 *p++ = 0xC4;
3819 *p++ = byte2;
3820 *p++ = 0x79;
3821 *p++ = 0x13;
3822 *p++ = byte5;
3823            goto done;
3824         }
3825 case Asse_F32toF16: { // vcvtps2ph $4, %xmmS, %xmmD
3826 UInt s = srcRegNo;
3827 UInt d = dstRegNo;
3828 // VCVTPS2PH $4, %xmmS, %xmmD (s and d both xmm regs, range 0 .. 15)
3829 // 0xC4 : ~s3 1 ~d3 0 0 0 1 1 : 0x79
3830 // : 0x1D : 11 s2 s1 s0 d2 d1 d0 : 0x4
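            // Worked example (illustrative): for s=2, d=1 this emits
            // C4 E3 79 1D D1 04, i.e. vcvtps2ph $4, %xmm2, %xmm1.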
3831 UInt byte2 = ((((~s)>>3)&1)<<7) | (1<<6)
3832 | ((((~d)>>3)&1)<<5) | (1<<1) | (1 << 0);
3833 UInt byte5 = (1<<7) | (1<<6) | ((s&7) << 3) | ((d&7) << 0);
3834 *p++ = 0xC4;
3835 *p++ = byte2;
3836 *p++ = 0x79;
3837 *p++ = 0x1D;
3838 *p++ = byte5;
3839 *p++ = 0x04;
3840            goto done;
3841         }
3842            default: break;
3843         }
3844 // After this point, REX encoded cases only
3845 xtra = 0;
3846 switch (i->Ain.Sse32Fx4.op) {
3847 case Asse_F2I: *p++ = 0x66; break;
3848            default: break;
3849         }
3850 *p++ = clearWBit(rexAMode_R_enc_enc(dstRegNo, srcRegNo));
3851 *p++ = 0x0F;
3852 switch (i->Ain.Sse32Fx4.op) {
3853 case Asse_ADDF: *p++ = 0x58; break;
3854 case Asse_DIVF: *p++ = 0x5E; break;
3855 case Asse_MAXF: *p++ = 0x5F; break;
3856 case Asse_MINF: *p++ = 0x5D; break;
3857 case Asse_MULF: *p++ = 0x59; break;
3858 case Asse_RCPF: *p++ = 0x53; break;
3859 case Asse_RSQRTF: *p++ = 0x52; break;
3860 case Asse_SQRTF: *p++ = 0x51; break;
3861 case Asse_I2F: *p++ = 0x5B; break; // cvtdq2ps; no 0x66 pfx
3862 case Asse_F2I: *p++ = 0x5B; break; // cvtps2dq; with 0x66 pfx
3863 case Asse_SUBF: *p++ = 0x5C; break;
3864 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3865 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3866 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3867 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3868            default: goto bad;
3869         }
3870 p = doAMode_R_enc_enc(p, dstRegNo, srcRegNo);
3871 if (xtra & 0x100)
3872 *p++ = toUChar(xtra & 0xFF);
3873         goto done;
3874      }
3876 case Ain_Sse64Fx2:
3877 xtra = 0;
3878 *p++ = 0x66;
3879 *p++ = clearWBit(
3880 rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64Fx2.dst),
3881 vregEnc3210(i->Ain.Sse64Fx2.src) ));
3882 *p++ = 0x0F;
3883 switch (i->Ain.Sse64Fx2.op) {
3884 case Asse_ADDF: *p++ = 0x58; break;
3885 case Asse_DIVF: *p++ = 0x5E; break;
3886 case Asse_MAXF: *p++ = 0x5F; break;
3887 case Asse_MINF: *p++ = 0x5D; break;
3888 case Asse_MULF: *p++ = 0x59; break;
3889 case Asse_SQRTF: *p++ = 0x51; break;
3890 case Asse_SUBF: *p++ = 0x5C; break;
3891 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3892 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3893 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3894 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3895            default: goto bad;
3896         }
3897 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64Fx2.dst),
3898 vregEnc3210(i->Ain.Sse64Fx2.src) );
3899 if (xtra & 0x100)
3900 *p++ = toUChar(xtra & 0xFF);
3901 goto done;
3903 case Ain_Sse32FLo:
3904 xtra = 0;
3905 *p++ = 0xF3;
3906 *p++ = clearWBit(
3907 rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse32FLo.dst),
3908 vregEnc3210(i->Ain.Sse32FLo.src) ));
3909 *p++ = 0x0F;
3910 switch (i->Ain.Sse32FLo.op) {
3911 case Asse_ADDF: *p++ = 0x58; break;
3912 case Asse_DIVF: *p++ = 0x5E; break;
3913 case Asse_MAXF: *p++ = 0x5F; break;
3914 case Asse_MINF: *p++ = 0x5D; break;
3915 case Asse_MULF: *p++ = 0x59; break;
3916 case Asse_RCPF: *p++ = 0x53; break;
3917 case Asse_RSQRTF: *p++ = 0x52; break;
3918 case Asse_SQRTF: *p++ = 0x51; break;
3919 case Asse_SUBF: *p++ = 0x5C; break;
3920 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3921 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3922 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3923 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3924            default: goto bad;
3925         }
3926 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse32FLo.dst),
3927 vregEnc3210(i->Ain.Sse32FLo.src) );
3928 if (xtra & 0x100)
3929 *p++ = toUChar(xtra & 0xFF);
3930 goto done;
3932 case Ain_Sse64FLo:
3933 xtra = 0;
3934 *p++ = 0xF2;
3935 *p++ = clearWBit(
3936 rexAMode_R_enc_enc( vregEnc3210(i->Ain.Sse64FLo.dst),
3937 vregEnc3210(i->Ain.Sse64FLo.src) ));
3938 *p++ = 0x0F;
3939 switch (i->Ain.Sse64FLo.op) {
3940 case Asse_ADDF: *p++ = 0x58; break;
3941 case Asse_DIVF: *p++ = 0x5E; break;
3942 case Asse_MAXF: *p++ = 0x5F; break;
3943 case Asse_MINF: *p++ = 0x5D; break;
3944 case Asse_MULF: *p++ = 0x59; break;
3945 case Asse_SQRTF: *p++ = 0x51; break;
3946 case Asse_SUBF: *p++ = 0x5C; break;
3947 case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
3948 case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
3949 case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
3950 case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
3951            default: goto bad;
3952         }
3953 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.Sse64FLo.dst),
3954 vregEnc3210(i->Ain.Sse64FLo.src) );
3955 if (xtra & 0x100)
3956 *p++ = toUChar(xtra & 0xFF);
3957 goto done;
3959 case Ain_SseReRg:
3960 # define XX(_n) *p++ = (_n)
3962 rex = clearWBit(
3963 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseReRg.dst),
3964 vregEnc3210(i->Ain.SseReRg.src) ));
3966 switch (i->Ain.SseReRg.op) {
3967 case Asse_MOV: /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
3968 case Asse_OR: XX(rex); XX(0x0F); XX(0x56); break;
3969 case Asse_XOR: XX(rex); XX(0x0F); XX(0x57); break;
3970 case Asse_AND: XX(rex); XX(0x0F); XX(0x54); break;
3971 case Asse_ANDN: XX(rex); XX(0x0F); XX(0x55); break;
3972 case Asse_PACKSSD: XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
3973 case Asse_PACKSSW: XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
3974 case Asse_PACKUSW: XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
3975 case Asse_ADD8: XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
3976 case Asse_ADD16: XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
3977 case Asse_ADD32: XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
3978 case Asse_ADD64: XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
3979 case Asse_QADD8S: XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
3980 case Asse_QADD16S: XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
3981 case Asse_QADD8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
3982 case Asse_QADD16U: XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
3983 case Asse_AVG8U: XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
3984 case Asse_AVG16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
3985 case Asse_CMPEQ8: XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
3986 case Asse_CMPEQ16: XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
3987 case Asse_CMPEQ32: XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
3988 case Asse_CMPGT8S: XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
3989 case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
3990 case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
3991 case Asse_MAX16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
3992 case Asse_MAX8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
3993 case Asse_MIN16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
3994 case Asse_MIN8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
3995 case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
3996 case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
3997 case Asse_MUL16: XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
3998 case Asse_SHL16: XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
3999 case Asse_SHL32: XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
4000 case Asse_SHL64: XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
4001 case Asse_SAR16: XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
4002 case Asse_SAR32: XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
4003 case Asse_SHR16: XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
4004 case Asse_SHR32: XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
4005 case Asse_SHR64: XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
4006 case Asse_SUB8: XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
4007 case Asse_SUB16: XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
4008 case Asse_SUB32: XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
4009 case Asse_SUB64: XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
4010 case Asse_QSUB8S: XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
4011 case Asse_QSUB16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
4012 case Asse_QSUB8U: XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
4013 case Asse_QSUB16U: XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
4014 case Asse_UNPCKHB: XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
4015 case Asse_UNPCKHW: XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
4016 case Asse_UNPCKHD: XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
4017 case Asse_UNPCKHQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
4018 case Asse_UNPCKLB: XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
4019 case Asse_UNPCKLW: XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
4020 case Asse_UNPCKLD: XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
4021 case Asse_UNPCKLQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
4022 case Asse_PSHUFB: XX(0x66); XX(rex);
4023 XX(0x0F); XX(0x38); XX(0x00); break;
4024 case Asse_PMADDUBSW:XX(0x66); XX(rex);
4025 XX(0x0F); XX(0x38); XX(0x04); break;
4026            default: goto bad;
4027         }
4028 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseReRg.dst),
4029 vregEnc3210(i->Ain.SseReRg.src) );
4030 # undef XX
4031 goto done;
4033 case Ain_SseCMov:
4034 /* jmp fwds if !condition */
4035 *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
4036 *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
4037 ptmp = p;
4039 /* movaps %src, %dst */
4040 *p++ = clearWBit(
4041 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseCMov.dst),
4042 vregEnc3210(i->Ain.SseCMov.src) ));
4043 *p++ = 0x0F;
4044 *p++ = 0x28;
4045 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseCMov.dst),
4046 vregEnc3210(i->Ain.SseCMov.src) );
4048 /* Fill in the jump offset. */
4049 *(ptmp-1) = toUChar(p - ptmp);
4050 goto done;
4052 case Ain_SseShuf:
4053 *p++ = 0x66;
4054 *p++ = clearWBit(
4055 rexAMode_R_enc_enc( vregEnc3210(i->Ain.SseShuf.dst),
4056 vregEnc3210(i->Ain.SseShuf.src) ));
4057 *p++ = 0x0F;
4058 *p++ = 0x70;
4059 p = doAMode_R_enc_enc(p, vregEnc3210(i->Ain.SseShuf.dst),
4060 vregEnc3210(i->Ain.SseShuf.src) );
4061 *p++ = (UChar)(i->Ain.SseShuf.order);
4062 goto done;
4064 case Ain_SseShiftN: {
4065 UInt limit = 0;
4066 UInt shiftImm = i->Ain.SseShiftN.shiftBits;
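      /* These are the SSE2 "shift xmm by immediate" forms, all encoded as
         66 (REX) 0F <opc> /<subopc> ib.  Illustrative example: Asse_SHL128
         with shiftBits==64 becomes shiftImm==8 and emits roughly
         66 40 0F 73 FA 08, i.e. pslldq $8, %xmm2 (assuming dst is %xmm2;
         the 0x40 REX is redundant but harmless). */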
4067 switch (i->Ain.SseShiftN.op) {
4068 case Asse_SHL16: limit = 15; opc = 0x71; subopc_imm = 6; break;
4069 case Asse_SHL32: limit = 31; opc = 0x72; subopc_imm = 6; break;
4070 case Asse_SHL64: limit = 63; opc = 0x73; subopc_imm = 6; break;
4071 case Asse_SAR16: limit = 15; opc = 0x71; subopc_imm = 4; break;
4072 case Asse_SAR32: limit = 31; opc = 0x72; subopc_imm = 4; break;
4073 case Asse_SHR16: limit = 15; opc = 0x71; subopc_imm = 2; break;
4074 case Asse_SHR32: limit = 31; opc = 0x72; subopc_imm = 2; break;
4075 case Asse_SHR64: limit = 63; opc = 0x73; subopc_imm = 2; break;
4076 case Asse_SHL128:
4077 if ((shiftImm & 7) != 0) goto bad;
4078 shiftImm >>= 3;
4079 limit = 15; opc = 0x73; subopc_imm = 7;
4080 break;
4081 case Asse_SHR128:
4082 if ((shiftImm & 7) != 0) goto bad;
4083 shiftImm >>= 3;
4084 limit = 15; opc = 0x73; subopc_imm = 3;
4085 break;
4086 default:
4087 // This should never happen .. SSE2 only offers the above 10 insns
4088 // for the "shift with immediate" case
4089               goto bad;
4090         }
4091 vassert(limit > 0 && opc > 0 && subopc_imm > 0);
4092 if (shiftImm > limit) goto bad;
4093 *p++ = 0x66;
4094 *p++ = clearWBit(
4095 rexAMode_R_enc_enc( subopc_imm,
4096 vregEnc3210(i->Ain.SseShiftN.dst) ));
4097 *p++ = 0x0F;
4098 *p++ = opc;
4099 p = doAMode_R_enc_enc(p, subopc_imm, vregEnc3210(i->Ain.SseShiftN.dst));
4100 *p++ = shiftImm;
4101         goto done;
4102      }
4104 case Ain_SseMOVQ: {
4105 Bool toXMM = i->Ain.SseMOVQ.toXMM;
4106 HReg gpr = i->Ain.SseMOVQ.gpr;
4107 HReg xmm = i->Ain.SseMOVQ.xmm;
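      /* Illustrative example (assuming gpr=%rax, xmm=%xmm0): toXMM emits
         66 48 0F 6E C0 (movq %rax,%xmm0), otherwise 66 48 0F 7E C0
         (movq %xmm0,%rax); REX.W is forced on by setWBit. */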
4108 *p++ = 0x66;
4109 *p++ = setWBit( rexAMode_R_enc_enc( vregEnc3210(xmm), iregEnc3210(gpr)) );
4110 *p++ = 0x0F;
4111 *p++ = toXMM ? 0x6E : 0x7E;
4112 p = doAMode_R_enc_enc( p, vregEnc3210(xmm), iregEnc3210(gpr) );
4113         goto done;
4114      }
4116 //uu case Ain_AvxLdSt: {
4117 //uu UInt vex = vexAMode_M( dvreg2ireg(i->Ain.AvxLdSt.reg),
4118 //uu i->Ain.AvxLdSt.addr );
4119 //uu p = emitVexPrefix(p, vex);
4120 //uu *p++ = toUChar(i->Ain.AvxLdSt.isLoad ? 0x10 : 0x11);
4121 //uu p = doAMode_M(p, dvreg2ireg(i->Ain.AvxLdSt.reg), i->Ain.AvxLdSt.addr);
4122 //uu goto done;
4123 //uu }
4125 case Ain_Avx32FLo: {
4126 UInt d = vregEnc3210(i->Ain.Avx32FLo.dst);
4127 UInt v = vregEnc3210(i->Ain.Avx32FLo.src1);
4128 UInt s = vregEnc3210(i->Ain.Avx32FLo.src2);
4129 UInt m = 2, pp = 1;
4130 UInt opcode;
4131 switch (i->Ain.Avx32FLo.op) {
4132 case Asse_VFMADD213:
4133 // VFMADD213SS %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
4134 opcode = 0xa9;
4135 break;
4136 default:
4137               goto bad;
4138         }
4139 // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 0 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
4140 // : 1 1 d2 d1 d0 s2 s1 s0
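      // Worked example (illustrative): d=1, v=2, s=3 gives
      // C4 E2 69 A9 CB, i.e. vfmadd213ss %xmm3, %xmm2, %xmm1.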
4141 *p++ = 0xC4; // 3-byte VEX
4142 *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
4143 *p++ = ((~v&0x0f) << 3) | pp;
4144 *p++ = opcode;
4145 *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
4146         goto done;
4147      }
4148 case Ain_Avx64FLo: {
4149 UInt d = vregEnc3210(i->Ain.Avx64FLo.dst);
4150 UInt v = vregEnc3210(i->Ain.Avx64FLo.src1);
4151 UInt s = vregEnc3210(i->Ain.Avx64FLo.src2);
4152 UInt m = 2, pp = 1;
4153 UInt opcode;
4154 switch (i->Ain.Avx64FLo.op) {
4155 case Asse_VFMADD213:
4156 // VFMADD213SD %xmmS2, %xmmS1, %xmmD (xmm regs range 0 .. 15)
4157 opcode = 0xa9;
4158 break;
4159 default:
4160               goto bad;
4161         }
4162 // 0xC4 : ~d3 1 ~s3 o4 o3 o2 o1 o0 : 1 ~v3 ~v2 ~v1 ~v0 0 p1 p0 : opcode_byte
4163 // : 1 1 d2 d1 d0 s2 s1 s0
4164 *p++ = 0xC4; // 3-byte VEX
4165 *p++ = ((((~d)>>3)&1)<<7) | (1<<6) | ((((~s)>>3)&1)<<5) | m;
4166 *p++ = (1<<7)|((~v&0x0f) << 3) | pp;
4167 *p++ = opcode;
4168 *p++ = (1<<7) | (1<<6) | ((d&7) << 3) | ((s&7) << 0);
4169         goto done;
4170      }
4172 case Ain_EvCheck: {
4173 /* We generate:
4174 (3 bytes) decl 8(%rbp) 8 == offsetof(host_EvC_COUNTER)
4175 (2 bytes) jns nofail expected taken
4176 (3 bytes) jmp* 0(%rbp) 0 == offsetof(host_EvC_FAILADDR)
4177            nofail:
4178         */
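      /* Illustrative byte sequence, assuming the 8(%rbp)/0(%rbp) amodes
         shown above:  FF 4D 08  79 03  FF 65 00  -- 3+2+3 = 8 bytes,
         matching evCheckSzB_AMD64() below. */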
4179 /* This is heavily asserted re instruction lengths. It needs to
4180 be. If we get given unexpected forms of .amCounter or
4181 .amFailAddr -- basically, anything that's not of the form
4182 uimm7(%rbp) -- they are likely to fail. */
4183 /* Note also that after the decl we must be very careful not to
4184 read the carry flag, else we get a partial flags stall.
4185 js/jns avoids that, though. */
4186 UChar* p0 = p;
4187 /* --- decl 8(%rbp) --- */
4188 /* Need to compute the REX byte for the decl in order to prove
4189            that we don't need it, since this is a 32-bit dec and all
4190 registers involved in the amode are < r8. "1" because
4191 there's no register in this encoding; instead the register
4192 field is used as a sub opcode. The encoding for "decl r/m32"
4193 is FF /1, hence the "1". */
4194 rex = clearWBit(rexAMode_M_enc(1, i->Ain.EvCheck.amCounter));
4195 if (rex != 0x40) goto bad; /* We don't expect to need the REX byte. */
4196 *p++ = 0xFF;
4197 p = doAMode_M_enc(p, 1, i->Ain.EvCheck.amCounter);
4198 vassert(p - p0 == 3);
4199 /* --- jns nofail --- */
4200 *p++ = 0x79;
4201 *p++ = 0x03; /* need to check this 0x03 after the next insn */
4202 vassert(p - p0 == 5);
4203 /* --- jmp* 0(%rbp) --- */
4204 /* Once again, verify we don't need REX. The encoding is FF /4.
4205 We don't need REX.W since by default FF /4 in 64-bit mode
4206 implies a 64 bit load. */
4207 rex = clearWBit(rexAMode_M_enc(4, i->Ain.EvCheck.amFailAddr));
4208 if (rex != 0x40) goto bad;
4209 *p++ = 0xFF;
4210 p = doAMode_M_enc(p, 4, i->Ain.EvCheck.amFailAddr);
4211 vassert(p - p0 == 8); /* also ensures that 0x03 offset above is ok */
4212 /* And crosscheck .. */
4213 vassert(evCheckSzB_AMD64() == 8);
4214         goto done;
4215      }
4217 case Ain_ProfInc: {
4218 /* We generate movabsq $0, %r11
4219 incq (%r11)
4220 in the expectation that a later call to LibVEX_patchProfCtr
4221 will be used to fill in the immediate field once the right
4222 value is known.
4223 49 BB 00 00 00 00 00 00 00 00
4224            49 FF 03
4225         */
4226 *p++ = 0x49; *p++ = 0xBB;
4227 *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
4228 *p++ = 0x00; *p++ = 0x00; *p++ = 0x00; *p++ = 0x00;
4229 *p++ = 0x49; *p++ = 0xFF; *p++ = 0x03;
4230 /* Tell the caller .. */
4231 vassert(!(*is_profInc));
4232 *is_profInc = True;
4233         goto done;
4234      }
4236 default:
4237         goto bad;
4238      }
4240 bad:
4241 ppAMD64Instr(i, mode64);
4242 vpanic("emit_AMD64Instr");
4243 /*NOTREACHED*/
4245 done:
4246 vassert(p - &buf[0] <= 64);
4247    return p - &buf[0];
4248 }
4251 /* How big is an event check? See case for Ain_EvCheck in
4252 emit_AMD64Instr just above. That crosschecks what this returns, so
4253 we can tell if we're inconsistent. */
4254 Int evCheckSzB_AMD64 (void)
4255 {
4256    return 8;
4257 }
4260 /* NB: what goes on here has to be very closely coordinated with the
4261 emitInstr case for XDirect, above. */
4262 VexInvalRange chainXDirect_AMD64 ( VexEndness endness_host,
4263 void* place_to_chain,
4264 const void* disp_cp_chain_me_EXPECTED,
4265                                    const void* place_to_jump_to )
4266 {
4267 vassert(endness_host == VexEndnessLE);
4269 /* What we're expecting to see is:
4270 movabsq $disp_cp_chain_me_EXPECTED, %r11
4271 call *%r11
4273 49 BB <8 bytes value == disp_cp_chain_me_EXPECTED>
4274         41 FF D3
4275    */
4276 UChar* p = (UChar*)place_to_chain;
4277 vassert(p[0] == 0x49);
4278 vassert(p[1] == 0xBB);
4279 vassert(read_misaligned_ULong_LE(&p[2]) == (Addr)disp_cp_chain_me_EXPECTED);
4280 vassert(p[10] == 0x41);
4281 vassert(p[11] == 0xFF);
4282 vassert(p[12] == 0xD3);
4283 /* And what we want to change it to is either:
4284 (general case):
4285 movabsq $place_to_jump_to, %r11
4286 jmpq *%r11
4288 49 BB <8 bytes value == place_to_jump_to>
4289 41 FF E3
4290 So it's the same length (convenient, huh) and we don't
4291 need to change all the bits.
4292 ---OR---
4293 in the case where the displacement falls within 32 bits
4294 jmpq disp32 where disp32 is relative to the next insn
4295 ud2; ud2; ud2; ud2
4297 E9 <4 bytes == disp32>
4298 0F 0B 0F 0B 0F 0B 0F 0B
4300 In both cases the replacement has the same length as the original.
4301 To remain sane & verifiable,
4302 (1) limit the displacement for the short form to
4303 (say) +/- one billion, so as to avoid wraparound
4304 off-by-ones
4305 (2) even if the short form is applicable, once every (say)
4306 1024 times use the long form anyway, so as to maintain
4307         verifiability
4308    */
4309 /* This is the delta we need to put into a JMP d32 insn. It's
4310 relative to the start of the next insn, hence the -5. */
4311 Long delta = (Long)((const UChar *)place_to_jump_to - (const UChar*)p) - 5;
4312 Bool shortOK = delta >= -1000*1000*1000 && delta < 1000*1000*1000;
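   /* Illustrative: if place_to_jump_to were 0x1000 bytes beyond p, then
      delta would be 0xFFB and the short form below would be
      E9 FB 0F 00 00 followed by four ud2 instructions (0F 0B x 4). */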
4314 static UInt shortCTR = 0; /* DO NOT MAKE NON-STATIC */
4315 if (shortOK) {
4316 shortCTR++; // thread safety bleh
4317 if (0 == (shortCTR & 0x3FF)) {
4318 shortOK = False;
4319 if (0)
4320 vex_printf("QQQ chainXDirect_AMD64: shortCTR = %u, "
4321                    "using long jmp\n", shortCTR);
4322       }
4323    }
4325 /* And make the modifications. */
4326 if (shortOK) {
4327 p[0] = 0xE9;
4328 write_misaligned_UInt_LE(&p[1], (UInt)(Int)delta);
4329 p[5] = 0x0F; p[6] = 0x0B;
4330 p[7] = 0x0F; p[8] = 0x0B;
4331 p[9] = 0x0F; p[10] = 0x0B;
4332 p[11] = 0x0F; p[12] = 0x0B;
4333 /* sanity check on the delta -- top 32 are all 0 or all 1 */
4334 delta >>= 32;
4335 vassert(delta == 0LL || delta == -1LL);
4336 } else {
4337 /* Minimal modifications from the starting sequence. */
4338 write_misaligned_ULong_LE(&p[2], (ULong)(Addr)place_to_jump_to);
4339       p[12] = 0xE3;
4340    }
4341 VexInvalRange vir = { (HWord)place_to_chain, 13 };
4342    return vir;
4343 }
4346 /* NB: what goes on here has to be very closely coordinated with the
4347 emitInstr case for XDirect, above. */
4348 VexInvalRange unchainXDirect_AMD64 ( VexEndness endness_host,
4349 void* place_to_unchain,
4350 const void* place_to_jump_to_EXPECTED,
4351                                      const void* disp_cp_chain_me )
4352 {
4353 vassert(endness_host == VexEndnessLE);
4355 /* What we're expecting to see is either:
4356 (general case)
4357 movabsq $place_to_jump_to_EXPECTED, %r11
4358 jmpq *%r11
4360 49 BB <8 bytes value == place_to_jump_to_EXPECTED>
4361 41 FF E3
4362 ---OR---
4363 in the case where the displacement falls within 32 bits
4364 jmpq d32
4365 ud2; ud2; ud2; ud2
4367 E9 <4 bytes == disp32>
4368         0F 0B 0F 0B 0F 0B 0F 0B
4369    */
4370 UChar* p = (UChar*)place_to_unchain;
4371 Bool valid = False;
4372 if (p[0] == 0x49 && p[1] == 0xBB
4373 && read_misaligned_ULong_LE(&p[2])
4374 == (ULong)(Addr)place_to_jump_to_EXPECTED
4375 && p[10] == 0x41 && p[11] == 0xFF && p[12] == 0xE3) {
4376 /* it's the long form */
4377       valid = True;
4378    }
4379 else
4380 if (p[0] == 0xE9
4381 && p[5] == 0x0F && p[6] == 0x0B
4382 && p[7] == 0x0F && p[8] == 0x0B
4383 && p[9] == 0x0F && p[10] == 0x0B
4384 && p[11] == 0x0F && p[12] == 0x0B) {
4385 /* It's the short form. Check the offset is right. */
4386 Int s32 = (Int)read_misaligned_UInt_LE(&p[1]);
4387 Long s64 = (Long)s32;
4388 if ((UChar*)p + 5 + s64 == place_to_jump_to_EXPECTED) {
4389 valid = True;
4390 if (0)
4391            vex_printf("QQQ unchainXDirect_AMD64: found short form\n");
4392       }
4393    }
4394 vassert(valid);
4395 /* And what we want to change it to is:
4396 movabsq $disp_cp_chain_me, %r11
4397 call *%r11
4399 49 BB <8 bytes value == disp_cp_chain_me>
4400 41 FF D3
4401      So it's the same length (convenient, huh).
4402    */
4403 p[0] = 0x49;
4404 p[1] = 0xBB;
4405 write_misaligned_ULong_LE(&p[2], (ULong)(Addr)disp_cp_chain_me);
4406 p[10] = 0x41;
4407 p[11] = 0xFF;
4408 p[12] = 0xD3;
4409 VexInvalRange vir = { (HWord)place_to_unchain, 13 };
4410    return vir;
4411 }
4414 /* Patch the counter address into a profile inc point, as previously
4415 created by the Ain_ProfInc case for emit_AMD64Instr. */
4416 VexInvalRange patchProfInc_AMD64 ( VexEndness endness_host,
4417 void* place_to_patch,
4418                                    const ULong* location_of_counter )
4419 {
4420 vassert(endness_host == VexEndnessLE);
4421 vassert(sizeof(ULong*) == 8);
4422 UChar* p = (UChar*)place_to_patch;
4423 vassert(p[0] == 0x49);
4424 vassert(p[1] == 0xBB);
4425 vassert(p[2] == 0x00);
4426 vassert(p[3] == 0x00);
4427 vassert(p[4] == 0x00);
4428 vassert(p[5] == 0x00);
4429 vassert(p[6] == 0x00);
4430 vassert(p[7] == 0x00);
4431 vassert(p[8] == 0x00);
4432 vassert(p[9] == 0x00);
4433 vassert(p[10] == 0x49);
4434 vassert(p[11] == 0xFF);
4435 vassert(p[12] == 0x03);
4436 ULong imm64 = (ULong)(Addr)location_of_counter;
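   /* Illustrative: if location_of_counter were (hypothetically)
      0x1122334455667788, the byte stores below would write p[2..9] as
      88 77 66 55 44 33 22 11 (little-endian). */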
4437 p[2] = imm64 & 0xFF; imm64 >>= 8;
4438 p[3] = imm64 & 0xFF; imm64 >>= 8;
4439 p[4] = imm64 & 0xFF; imm64 >>= 8;
4440 p[5] = imm64 & 0xFF; imm64 >>= 8;
4441 p[6] = imm64 & 0xFF; imm64 >>= 8;
4442 p[7] = imm64 & 0xFF; imm64 >>= 8;
4443 p[8] = imm64 & 0xFF; imm64 >>= 8;
4444 p[9] = imm64 & 0xFF; imm64 >>= 8;
4445 VexInvalRange vir = { (HWord)place_to_patch, 13 };
4446    return vir;
4447 }
4450 /*---------------------------------------------------------------*/
4451 /*--- end host_amd64_defs.c ---*/
4452 /*---------------------------------------------------------------*/