tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
  28 #ifdef CONFIG_DEBUG_TCG
  29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  30 #if TCG_TARGET_REG_BITS == 64
  31     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  32 #else
  33     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  34 #endif
  35     "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  36     "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  37 #if TCG_TARGET_REG_BITS == 64
  38     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  39     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  40 #endif
  41 };
  42 #endif
  43
  44 static const int tcg_target_reg_alloc_order[] = {
  45 #if TCG_TARGET_REG_BITS == 64
  46     TCG_REG_RBP,
  47     TCG_REG_RBX,
  48     TCG_REG_R12,
  49     TCG_REG_R13,
  50     TCG_REG_R14,
  51     TCG_REG_R15,
  52     TCG_REG_R10,
  53     TCG_REG_R11,
  54     TCG_REG_R9,
  55     TCG_REG_R8,
  56     TCG_REG_RCX,
  57     TCG_REG_RDX,
  58     TCG_REG_RSI,
  59     TCG_REG_RDI,
  60     TCG_REG_RAX,
  61 #else
  62     TCG_REG_EBX,
  63     TCG_REG_ESI,
  64     TCG_REG_EDI,
  65     TCG_REG_EBP,
  66     TCG_REG_ECX,
  67     TCG_REG_EDX,
  68     TCG_REG_EAX,
  69 #endif
  70     TCG_REG_XMM0,
  71     TCG_REG_XMM1,
  72     TCG_REG_XMM2,
  73     TCG_REG_XMM3,
  74     TCG_REG_XMM4,
  75     TCG_REG_XMM5,
  76 #ifndef _WIN64
  77     /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
  78        any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  79     TCG_REG_XMM6,
  80     TCG_REG_XMM7,
  81 #if TCG_TARGET_REG_BITS == 64
  82     TCG_REG_XMM8,
  83     TCG_REG_XMM9,
  84     TCG_REG_XMM10,
  85     TCG_REG_XMM11,
  86     TCG_REG_XMM12,
  87     TCG_REG_XMM13,
  88     TCG_REG_XMM14,
  89     TCG_REG_XMM15,
  90 #endif
  91 #endif
  92 };
  93
  94 #define TCG_TMP_VEC  TCG_REG_XMM5
  95
  96 static const int tcg_target_call_iarg_regs[] = {
  97 #if TCG_TARGET_REG_BITS == 64
  98 #if defined(_WIN64)
  99     TCG_REG_RCX,
 100     TCG_REG_RDX,
 101 #else
 102     TCG_REG_RDI,
 103     TCG_REG_RSI,
 104     TCG_REG_RDX,
 105     TCG_REG_RCX,
 106 #endif
 107     TCG_REG_R8,
 108     TCG_REG_R9,
 109 #else
 110     /* 32 bit mode uses stack based calling convention (GCC default). */
 111 #endif
 112 };
 113
 114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 115 {
 116     switch (kind) {
 117     case TCG_CALL_RET_NORMAL:
 118         tcg_debug_assert(slot >= 0 && slot <= 1);
 119         return slot ? TCG_REG_EDX : TCG_REG_EAX;
 120 #ifdef _WIN64
 121     case TCG_CALL_RET_BY_VEC:
 122         tcg_debug_assert(slot == 0);
 123         return TCG_REG_XMM0;
 124 #endif
 125     default:
 126         g_assert_not_reached();
 127     }
 128 }
 129
 130 /* Constants we accept.  */
 131 #define TCG_CT_CONST_S32 0x100
 132 #define TCG_CT_CONST_U32 0x200
 133 #define TCG_CT_CONST_I32 0x400
 134 #define TCG_CT_CONST_WSZ 0x800
 135 #define TCG_CT_CONST_TST 0x1000
 136 #define TCG_CT_CONST_ZERO 0x2000
 137
 138 /* Registers used with L constraint, which are the first argument
 139    registers on x86_64, and two random call clobbered registers on
 140    i386. */
 141 #if TCG_TARGET_REG_BITS == 64
 142 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
 143 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
 144 #else
 145 # define TCG_REG_L0 TCG_REG_EAX
 146 # define TCG_REG_L1 TCG_REG_EDX
 147 #endif
 148
 149 #if TCG_TARGET_REG_BITS == 64
 150 # define ALL_GENERAL_REGS      0x0000ffffu
 151 # define ALL_VECTOR_REGS       0xffff0000u
 152 # define ALL_BYTEL_REGS        ALL_GENERAL_REGS
 153 #else
 154 # define ALL_GENERAL_REGS      0x000000ffu
 155 # define ALL_VECTOR_REGS       0x00ff0000u
 156 # define ALL_BYTEL_REGS        0x0000000fu
 157 #endif
 158 #define SOFTMMU_RESERVE_REGS \
 159     (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
 160
 161 #define have_bmi2       (cpuinfo & CPUINFO_BMI2)
 162 #define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
 163
 164 static const tcg_insn_unit *tb_ret_addr;
 165
 166 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 167                         intptr_t value, intptr_t addend)
 168 {
 169     value += addend;
 170     switch(type) {
 171     case R_386_PC32:
 172         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 173         if (value != (int32_t)value) {
 174             return false;
 175         }
 176         /* FALLTHRU */
 177     case R_386_32:
 178         tcg_patch32(code_ptr, value);
 179         break;
 180     case R_386_PC8:
 181         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 182         if (value != (int8_t)value) {
 183             return false;
 184         }
 185         tcg_patch8(code_ptr, value);
 186         break;
 187     default:
 188         g_assert_not_reached();
 189     }
 190     return true;
 191 }
 192
 193 /* test if a constant matches the constraint */
 194 static bool tcg_target_const_match(int64_t val, int ct,
 195                                    TCGType type, TCGCond cond, int vece)
 196 {
 197     if (ct & TCG_CT_CONST) {
 198         return 1;
 199     }
 200     if (type == TCG_TYPE_I32) {
 201         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 |
 202                   TCG_CT_CONST_I32 | TCG_CT_CONST_TST)) {
 203             return 1;
 204         }
 205     } else {
 206         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 207             return 1;
 208         }
 209         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 210             return 1;
 211         }
 212         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 213             return 1;
 214         }
 215         /*
 216          * This will be used in combination with TCG_CT_CONST_S32,
 217          * so "normal" TESTQ is already matched.  Also accept:
 218          *    TESTQ -> TESTL   (uint32_t)
 219          *    TESTQ -> BT      (is_power_of_2)
 220          */
 221         if ((ct & TCG_CT_CONST_TST)
 222             && is_tst_cond(cond)
 223             && (val == (uint32_t)val || is_power_of_2(val))) {
 224             return 1;
 225         }
 226     }
 227     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 228         return 1;
 229     }
 230     if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
 231         return 1;
 232     }
 233     return 0;
 234 }
 235
 236 # define LOWREGMASK(x)  ((x) & 7)
 237
 238 #define P_EXT           0x100           /* 0x0f opcode prefix */
 239 #define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
 240 #define P_DATA16        0x400           /* 0x66 opcode prefix */
 241 #define P_VEXW          0x1000          /* Set VEX.W = 1 */
 242 #if TCG_TARGET_REG_BITS == 64
 243 # define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
 244 # define P_REXB_R       0x2000          /* REG field as byte register */
 245 # define P_REXB_RM      0x4000          /* R/M field as byte register */
 246 # define P_GS           0x8000          /* gs segment override */
 247 #else
 248 # define P_REXW         0
 249 # define P_REXB_R       0
 250 # define P_REXB_RM      0
 251 # define P_GS           0
 252 #endif
 253 #define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
 254 #define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
 255 #define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
 256 #define P_VEXL          0x80000         /* Set VEX.L = 1 */
 257 #define P_EVEX          0x100000        /* Requires EVEX encoding */
 258
 259 #define OPC_ARITH_EbIb  (0x80)
 260 #define OPC_ARITH_EvIz  (0x81)
 261 #define OPC_ARITH_EvIb  (0x83)
 262 #define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 263 #define OPC_ANDN        (0xf2 | P_EXT38)
 264 #define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
 265 #define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
 266 #define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 267 #define OPC_BSF         (0xbc | P_EXT)
 268 #define OPC_BSR         (0xbd | P_EXT)
 269 #define OPC_BSWAP       (0xc8 | P_EXT)
 270 #define OPC_CALL_Jz     (0xe8)
 271 #define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
 272 #define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
 273 #define OPC_DEC_r32     (0x48)
 274 #define OPC_IMUL_GvEv   (0xaf | P_EXT)
 275 #define OPC_IMUL_GvEvIb (0x6b)
 276 #define OPC_IMUL_GvEvIz (0x69)
 277 #define OPC_INC_r32     (0x40)
 278 #define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
 279 #define OPC_JCC_short   (0x70)          /* ... plus condition code */
 280 #define OPC_JMP_long    (0xe9)
 281 #define OPC_JMP_short   (0xeb)
 282 #define OPC_LEA         (0x8d)
 283 #define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
 284 #define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
 285 #define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
 286 #define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
 287 #define OPC_MOVB_EvIz   (0xc6)
 288 #define OPC_MOVL_EvIz   (0xc7)
 289 #define OPC_MOVB_Ib     (0xb0)
 290 #define OPC_MOVL_Iv     (0xb8)
 291 #define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
 292 #define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
 293 #define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
 294 #define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
 295 #define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
 296 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
 297 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
 298 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
 299 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
 300 #define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
 301 #define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
 302 #define OPC_MOVSBL      (0xbe | P_EXT)
 303 #define OPC_MOVSWL      (0xbf | P_EXT)
 304 #define OPC_MOVSLQ      (0x63 | P_REXW)
 305 #define OPC_MOVZBL      (0xb6 | P_EXT)
 306 #define OPC_MOVZWL      (0xb7 | P_EXT)
 307 #define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
 308 #define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
 309 #define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
 310 #define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 311 #define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
 312 #define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
 313 #define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
 314 #define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
 315 #define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
 316 #define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
 317 #define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
 318 #define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
 319 #define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
 320 #define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
 321 #define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
 322 #define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
 323 #define OPC_PAND        (0xdb | P_EXT | P_DATA16)
 324 #define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
 325 #define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
 326 #define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
 327 #define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
 328 #define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
 329 #define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
 330 #define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
 331 #define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
 332 #define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
 333 #define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
 334 #define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
 335 #define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
 336 #define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
 337 #define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
 338 #define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
 339 #define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 340 #define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
 341 #define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
 342 #define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
 343 #define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 344 #define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
 345 #define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
 346 #define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
 347 #define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 348 #define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
 349 #define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
 350 #define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
 351 #define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 352 #define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
 353 #define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
 354 #define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
 355 #define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
 356 #define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
 357 #define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
 358 #define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
 359 #define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
 360 #define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 361 #define OPC_POR         (0xeb | P_EXT | P_DATA16)
 362 #define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
 363 #define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
 364 #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 365 #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 366 #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
 367 #define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
 368 #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 369 #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
 370 #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
 371 #define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
 372 #define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
 373 #define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
 374 #define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
 375 #define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
 376 #define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
 377 #define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
 378 #define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
 379 #define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
 380 #define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
 381 #define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
 382 #define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
 383 #define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
 384 #define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
 385 #define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
 386 #define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
 387 #define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
 388 #define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
 389 #define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
 390 #define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
 391 #define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
 392 #define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
 393 #define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
 394 #define OPC_PXOR        (0xef | P_EXT | P_DATA16)
 395 #define OPC_POP_r32     (0x58)
 396 #define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
 397 #define OPC_PUSH_r32    (0x50)
 398 #define OPC_PUSH_Iv     (0x68)
 399 #define OPC_PUSH_Ib     (0x6a)
 400 #define OPC_RET         (0xc3)
 401 #define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
 402 #define OPC_SHIFT_1     (0xd1)
 403 #define OPC_SHIFT_Ib    (0xc1)
 404 #define OPC_SHIFT_cl    (0xd3)
 405 #define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
 406 #define OPC_SHUFPS      (0xc6 | P_EXT)
 407 #define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
 408 #define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
 409 #define OPC_SHRD_Ib     (0xac | P_EXT)
 410 #define OPC_TESTB       (0x84)
 411 #define OPC_TESTL       (0x85)
 412 #define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
 413 #define OPC_UD2         (0x0b | P_EXT)
 414 #define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
 415 #define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
 416 #define OPC_VPBLENDMB   (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
 417 #define OPC_VPBLENDMW   (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 418 #define OPC_VPBLENDMD   (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
 419 #define OPC_VPBLENDMQ   (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 420 #define OPC_VPCMPB      (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
 421 #define OPC_VPCMPUB     (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
 422 #define OPC_VPCMPW      (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 423 #define OPC_VPCMPUW     (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 424 #define OPC_VPCMPD      (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
 425 #define OPC_VPCMPUD     (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
 426 #define OPC_VPCMPQ      (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 427 #define OPC_VPCMPUQ     (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 428 #define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
 429 #define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
 430 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
 431 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
 432 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
 433 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
 434 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
 435 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
 436 #define OPC_VPMOVM2B    (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
 437 #define OPC_VPMOVM2W    (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
 438 #define OPC_VPMOVM2D    (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
 439 #define OPC_VPMOVM2Q    (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
 440 #define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
 441 #define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
 442 #define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
 443 #define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 444 #define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
 445 #define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 446 #define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 447 #define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
 448 #define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 449 #define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 450 #define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
 451 #define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 452 #define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 453 #define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
 454 #define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 455 #define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 456 #define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
 457 #define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
 458 #define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 459 #define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
 460 #define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 461 #define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 462 #define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
 463 #define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
 464 #define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
 465 #define OPC_VPTESTMB    (0x26 | P_EXT38 | P_DATA16 | P_EVEX)
 466 #define OPC_VPTESTMW    (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 467 #define OPC_VPTESTMD    (0x27 | P_EXT38 | P_DATA16 | P_EVEX)
 468 #define OPC_VPTESTMQ    (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 469 #define OPC_VPTESTNMB   (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX)
 470 #define OPC_VPTESTNMW   (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
 471 #define OPC_VPTESTNMD   (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX)
 472 #define OPC_VPTESTNMQ   (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
 473 #define OPC_VZEROUPPER  (0x77 | P_EXT)
 474 #define OPC_XCHG_ax_r32 (0x90)
 475 #define OPC_XCHG_EvGv   (0x87)
 476
 477 #define OPC_GRP3_Eb     (0xf6)
 478 #define OPC_GRP3_Ev     (0xf7)
 479 #define OPC_GRP5        (0xff)
 480 #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 481 #define OPC_GRPBT       (0xba | P_EXT)
 482
 483 #define OPC_GRPBT_BT    4
 484 #define OPC_GRPBT_BTS   5
 485 #define OPC_GRPBT_BTR   6
 486 #define OPC_GRPBT_BTC   7
 487
 488 /* Group 1 opcode extensions for 0x80-0x83.
 489    These are also used as modifiers for OPC_ARITH.  */
 490 #define ARITH_ADD 0
 491 #define ARITH_OR  1
 492 #define ARITH_ADC 2
 493 #define ARITH_SBB 3
 494 #define ARITH_AND 4
 495 #define ARITH_SUB 5
 496 #define ARITH_XOR 6
 497 #define ARITH_CMP 7
 498
 499 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
 500 #define SHIFT_ROL 0
 501 #define SHIFT_ROR 1
 502 #define SHIFT_SHL 4
 503 #define SHIFT_SHR 5
 504 #define SHIFT_SAR 7
 505
 506 /* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
 507 #define EXT3_TESTi 0
 508 #define EXT3_NOT   2
 509 #define EXT3_NEG   3
 510 #define EXT3_MUL   4
 511 #define EXT3_IMUL  5
 512 #define EXT3_DIV   6
 513 #define EXT3_IDIV  7
 514
 515 /* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
 516 #define EXT5_INC_Ev     0
 517 #define EXT5_DEC_Ev     1
 518 #define EXT5_CALLN_Ev   2
 519 #define EXT5_JMPN_Ev    4
 520
 521 /* Condition codes to be added to OPC_JCC_{long,short}.  */
 522 #define JCC_JMP (-1)
 523 #define JCC_JO  0x0
 524 #define JCC_JNO 0x1
 525 #define JCC_JB  0x2
 526 #define JCC_JAE 0x3
 527 #define JCC_JE  0x4
 528 #define JCC_JNE 0x5
 529 #define JCC_JBE 0x6
 530 #define JCC_JA  0x7
 531 #define JCC_JS  0x8
 532 #define JCC_JNS 0x9
 533 #define JCC_JP  0xa
 534 #define JCC_JNP 0xb
 535 #define JCC_JL  0xc
 536 #define JCC_JGE 0xd
 537 #define JCC_JLE 0xe
 538 #define JCC_JG  0xf
 539
 540 static const uint8_t tcg_cond_to_jcc[] = {
 541     [TCG_COND_EQ] = JCC_JE,
 542     [TCG_COND_NE] = JCC_JNE,
 543     [TCG_COND_LT] = JCC_JL,
 544     [TCG_COND_GE] = JCC_JGE,
 545     [TCG_COND_LE] = JCC_JLE,
 546     [TCG_COND_GT] = JCC_JG,
 547     [TCG_COND_LTU] = JCC_JB,
 548     [TCG_COND_GEU] = JCC_JAE,
 549     [TCG_COND_LEU] = JCC_JBE,
 550     [TCG_COND_GTU] = JCC_JA,
 551     [TCG_COND_TSTEQ] = JCC_JE,
 552     [TCG_COND_TSTNE] = JCC_JNE,
 553 };
 554
 555 #if TCG_TARGET_REG_BITS == 64
 556 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 557 {
 558     int rex;
 559
 560     if (opc & P_GS) {
 561         tcg_out8(s, 0x65);
 562     }
 563     if (opc & P_DATA16) {
 564         /* We should never be asking for both 16 and 64-bit operation.  */
 565         tcg_debug_assert((opc & P_REXW) == 0);
 566         tcg_out8(s, 0x66);
 567     }
 568     if (opc & P_SIMDF3) {
 569         tcg_out8(s, 0xf3);
 570     } else if (opc & P_SIMDF2) {
 571         tcg_out8(s, 0xf2);
 572     }
 573
 574     rex = 0;
 575     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 576     rex |= (r & 8) >> 1;                /* REX.R */
 577     rex |= (x & 8) >> 2;                /* REX.X */
 578     rex |= (rm & 8) >> 3;               /* REX.B */
 579
 580     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 581        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 582        as otherwise the encoding indicates %[abcd]h.  Note that the values
 583        that are ORed in merely indicate that the REX byte must be present;
 584        those bits get discarded in output.  */
 585     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 586     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 587
 588     if (rex) {
 589         tcg_out8(s, (uint8_t)(rex | 0x40));
 590     }
 591
 592     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 593         tcg_out8(s, 0x0f);
 594         if (opc & P_EXT38) {
 595             tcg_out8(s, 0x38);
 596         } else if (opc & P_EXT3A) {
 597             tcg_out8(s, 0x3a);
 598         }
 599     }
 600
 601     tcg_out8(s, opc);
 602 }
 603 #else
 604 static void tcg_out_opc(TCGContext *s, int opc)
 605 {
 606     if (opc & P_DATA16) {
 607         tcg_out8(s, 0x66);
 608     }
 609     if (opc & P_SIMDF3) {
 610         tcg_out8(s, 0xf3);
 611     } else if (opc & P_SIMDF2) {
 612         tcg_out8(s, 0xf2);
 613     }
 614     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 615         tcg_out8(s, 0x0f);
 616         if (opc & P_EXT38) {
 617             tcg_out8(s, 0x38);
 618         } else if (opc & P_EXT3A) {
 619             tcg_out8(s, 0x3a);
 620         }
 621     }
 622     tcg_out8(s, opc);
 623 }
 624 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 625    the 32-bit compilation paths.  This method works with all versions of gcc,
 626    whereas relying on optimization may not be able to exclude them.  */
 627 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 628 #endif
 629
 630 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 631 {
 632     tcg_out_opc(s, opc, r, rm, 0);
 633     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 634 }
 635
 636 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 637                             int rm, int index)
 638 {
 639     int tmp;
 640
 641     if (opc & P_GS) {
 642         tcg_out8(s, 0x65);
 643     }
 644     /* Use the two byte form if possible, which cannot encode
 645        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 646     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 647         && ((rm | index) & 8) == 0) {
 648         /* Two byte VEX prefix.  */
 649         tcg_out8(s, 0xc5);
 650
 651         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 652     } else {
 653         /* Three byte VEX prefix.  */
 654         tcg_out8(s, 0xc4);
 655
 656         /* VEX.m-mmmm */
 657         if (opc & P_EXT3A) {
 658             tmp = 3;
 659         } else if (opc & P_EXT38) {
 660             tmp = 2;
 661         } else if (opc & P_EXT) {
 662             tmp = 1;
 663         } else {
 664             g_assert_not_reached();
 665         }
 666         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 667         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 668         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 669         tcg_out8(s, tmp);
 670
 671         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 672     }
 673
 674     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 675     /* VEX.pp */
 676     if (opc & P_DATA16) {
 677         tmp |= 1;                          /* 0x66 */
 678     } else if (opc & P_SIMDF3) {
 679         tmp |= 2;                          /* 0xf3 */
 680     } else if (opc & P_SIMDF2) {
 681         tmp |= 3;                          /* 0xf2 */
 682     }
 683     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 684     tcg_out8(s, tmp);
 685     tcg_out8(s, opc);
 686 }
 687
 688 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 689                              int rm, int index, int aaa, bool z)
 690 {
 691     /* The entire 4-byte evex prefix; with R' and V' set. */
 692     uint32_t p = 0x08041062;
 693     int mm, pp;
 694
 695     tcg_debug_assert(have_avx512vl);
 696
 697     /* EVEX.mm */
 698     if (opc & P_EXT3A) {
 699         mm = 3;
 700     } else if (opc & P_EXT38) {
 701         mm = 2;
 702     } else if (opc & P_EXT) {
 703         mm = 1;
 704     } else {
 705         g_assert_not_reached();
 706     }
 707
 708     /* EVEX.pp */
 709     if (opc & P_DATA16) {
 710         pp = 1;                          /* 0x66 */
 711     } else if (opc & P_SIMDF3) {
 712         pp = 2;                          /* 0xf3 */
 713     } else if (opc & P_SIMDF2) {
 714         pp = 3;                          /* 0xf2 */
 715     } else {
 716         pp = 0;
 717     }
 718
 719     p = deposit32(p, 8, 2, mm);
 720     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 721     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 722     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 723     p = deposit32(p, 16, 2, pp);
 724     p = deposit32(p, 19, 4, ~v);
 725     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 726     p = deposit32(p, 24, 3, aaa);
 727     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 728     p = deposit32(p, 31, 1, z);
 729
 730     tcg_out32(s, p);
 731     tcg_out8(s, opc);
 732 }
 733
 734 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 735 {
 736     if (opc & P_EVEX) {
 737         tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false);
 738     } else {
 739         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 740     }
 741     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 742 }
 743
 744 static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
 745                                    int r, int v, int rm, TCGType type)
 746 {
 747     if (type == TCG_TYPE_V256) {
 748         opc |= P_VEXL;
 749     }
 750     tcg_out_vex_modrm(s, opc, r, v, rm);
 751 }
 752
 753 static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
 754                                     int rm, int aaa, bool z, TCGType type)
 755 {
 756     if (type == TCG_TYPE_V256) {
 757         opc |= P_VEXL;
 758     }
 759     tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
 760     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 761 }
 762
 763 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 764    We handle either RM and INDEX missing with a negative value.  In 64-bit
 765    mode for absolute addresses, ~RM is the size of the immediate operand
 766    that will follow the instruction.  */
 767
 768 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 769                                int shift, intptr_t offset)
 770 {
 771     int mod, len;
 772
 773     if (index < 0 && rm < 0) {
 774         if (TCG_TARGET_REG_BITS == 64) {
 775             /* Try for a rip-relative addressing mode.  This has replaced
 776                the 32-bit-mode absolute addressing encoding.  */
 777             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 778             intptr_t disp = offset - pc;
 779             if (disp == (int32_t)disp) {
 780                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 781                 tcg_out32(s, disp);
 782                 return;
 783             }
 784
 785             /* Try for an absolute address encoding.  This requires the
 786                use of the MODRM+SIB encoding and is therefore larger than
 787                rip-relative addressing.  */
 788             if (offset == (int32_t)offset) {
 789                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 790                 tcg_out8(s, (4 << 3) | 5);
 791                 tcg_out32(s, offset);
 792                 return;
 793             }
 794
 795             /* ??? The memory isn't directly addressable.  */
 796             g_assert_not_reached();
 797         } else {
 798             /* Absolute address.  */
 799             tcg_out8(s, (r << 3) | 5);
 800             tcg_out32(s, offset);
 801             return;
 802         }
 803     }
 804
 805     /* Find the length of the immediate addend.  Note that the encoding
 806        that would be used for (%ebp) indicates absolute addressing.  */
 807     if (rm < 0) {
 808         mod = 0, len = 4, rm = 5;
 809     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 810         mod = 0, len = 0;
 811     } else if (offset == (int8_t)offset) {
 812         mod = 0x40, len = 1;
 813     } else {
 814         mod = 0x80, len = 4;
 815     }
 816
 817     /* Use a single byte MODRM format if possible.  Note that the encoding
 818        that would be used for %esp is the escape to the two byte form.  */
 819     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 820         /* Single byte MODRM format.  */
 821         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 822     } else {
 823         /* Two byte MODRM+SIB format.  */
 824
 825         /* Note that the encoding that would place %esp into the index
 826            field indicates no index register.  In 64-bit mode, the REX.X
 827            bit counts, so %r12 can be used as the index.  */
 828         if (index < 0) {
 829             index = 4;
 830         } else {
 831             tcg_debug_assert(index != TCG_REG_ESP);
 832         }
 833
 834         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 835         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 836     }
 837
 838     if (len == 1) {
 839         tcg_out8(s, offset);
 840     } else if (len == 4) {
 841         tcg_out32(s, offset);
 842     }
 843 }
 844
 845 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 846                                      int index, int shift, intptr_t offset)
 847 {
 848     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 849     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 850 }
 851
 852 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 853                                          int rm, int index, int shift,
 854                                          intptr_t offset)
 855 {
 856     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 857     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 858 }
 859
 860 /* A simplification of the above with no index or shift.  */
 861 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 862                                         int rm, intptr_t offset)
 863 {
 864     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 865 }
 866
 867 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 868                                             int v, int rm, intptr_t offset)
 869 {
 870     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 871 }
 872
 873 /* Output an opcode with an expected reference to the constant pool.  */
 874 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 875 {
 876     tcg_out_opc(s, opc, r, 0, 0);
 877     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 878     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 879     tcg_out32(s, 0);
 880 }
 881
 882 /* Output an opcode with an expected reference to the constant pool.  */
 883 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 884 {
 885     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 886     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 887     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 888     tcg_out32(s, 0);
 889 }
 890
 891 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 892 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 893 {
 894     /* Propagate an opcode prefix, such as P_REXW.  */
 895     int ext = subop & ~0x7;
 896     subop &= 0x7;
 897
 898     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 899 }
 900
 901 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 902 {
 903     int rexw = 0;
 904
 905     if (arg == ret) {
 906         return true;
 907     }
 908     switch (type) {
 909     case TCG_TYPE_I64:
 910         rexw = P_REXW;
 911         /* fallthru */
 912     case TCG_TYPE_I32:
 913         if (ret < 16) {
 914             if (arg < 16) {
 915                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 916             } else {
 917                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 918             }
 919         } else {
 920             if (arg < 16) {
 921                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 922             } else {
 923                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 924             }
 925         }
 926         break;
 927
 928     case TCG_TYPE_V64:
 929         tcg_debug_assert(ret >= 16 && arg >= 16);
 930         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 931         break;
 932     case TCG_TYPE_V128:
 933         tcg_debug_assert(ret >= 16 && arg >= 16);
 934         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 935         break;
 936     case TCG_TYPE_V256:
 937         tcg_debug_assert(ret >= 16 && arg >= 16);
 938         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 939         break;
 940
 941     default:
 942         g_assert_not_reached();
 943     }
 944     return true;
 945 }
 946
 947 static const int avx2_dup_insn[4] = {
 948     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 949     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 950 };
 951
 952 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 953                             TCGReg r, TCGReg a)
 954 {
 955     if (have_avx2) {
 956         tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
 957     } else {
 958         switch (vece) {
 959         case MO_8:
 960             /* ??? With zero in a register, use PSHUFB.  */
 961             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 962             a = r;
 963             /* FALLTHRU */
 964         case MO_16:
 965             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 966             a = r;
 967             /* FALLTHRU */
 968         case MO_32:
 969             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 970             /* imm8 operand: all output lanes selected from input lane 0.  */
 971             tcg_out8(s, 0);
 972             break;
 973         case MO_64:
 974             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 975             break;
 976         default:
 977             g_assert_not_reached();
 978         }
 979     }
 980     return true;
 981 }
 982
 983 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 984                              TCGReg r, TCGReg base, intptr_t offset)
 985 {
 986     if (have_avx2) {
 987         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 988         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 989                                  r, 0, base, offset);
 990     } else {
 991         switch (vece) {
 992         case MO_64:
 993             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 994             break;
 995         case MO_32:
 996             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 997             break;
 998         case MO_16:
 999             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
1000             tcg_out8(s, 0); /* imm8 */
1001             tcg_out_dup_vec(s, type, vece, r, r);
1002             break;
1003         case MO_8:
1004             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
1005             tcg_out8(s, 0); /* imm8 */
1006             tcg_out_dup_vec(s, type, vece, r, r);
1007             break;
1008         default:
1009             g_assert_not_reached();
1010         }
1011     }
1012     return true;
1013 }
1014
1015 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
1016                              TCGReg ret, int64_t arg)
1017 {
1018     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
1019
1020     if (arg == 0) {
1021         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1022         return;
1023     }
1024     if (arg == -1) {
1025         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
1026         return;
1027     }
1028
1029     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
1030         if (have_avx2) {
1031             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
1032         } else {
1033             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
1034         }
1035         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1036     } else {
1037         if (type == TCG_TYPE_V64) {
1038             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
1039         } else if (have_avx2) {
1040             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
1041         } else {
1042             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
1043         }
1044         if (TCG_TARGET_REG_BITS == 64) {
1045             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1046         } else {
1047             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
1048         }
1049     }
1050 }
1051
1052 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1053                              TCGReg ret, tcg_target_long arg)
1054 {
1055     if (arg == 0) {
1056         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1057         return;
1058     }
1059     if (arg == -1) {
1060         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1061         return;
1062     }
1063
1064     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1065     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1066     if (TCG_TARGET_REG_BITS == 64) {
1067         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1068     } else {
1069         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1070     }
1071 }
1072
1073 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1074                              TCGReg ret, tcg_target_long arg)
1075 {
1076     tcg_target_long diff;
1077
1078     if (arg == 0) {
1079         tgen_arithr(s, ARITH_XOR, ret, ret);
1080         return;
1081     }
1082     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1083         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1084         tcg_out32(s, arg);
1085         return;
1086     }
1087     if (arg == (int32_t)arg) {
1088         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1089         tcg_out32(s, arg);
1090         return;
1091     }
1092
1093     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1094     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1095     if (diff == (int32_t)diff) {
1096         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1097         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1098         tcg_out32(s, diff);
1099         return;
1100     }
1101
1102     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1103     tcg_out64(s, arg);
1104 }
1105
1106 static void tcg_out_movi(TCGContext *s, TCGType type,
1107                          TCGReg ret, tcg_target_long arg)
1108 {
1109     switch (type) {
1110     case TCG_TYPE_I32:
1111 #if TCG_TARGET_REG_BITS == 64
1112     case TCG_TYPE_I64:
1113 #endif
1114         if (ret < 16) {
1115             tcg_out_movi_int(s, type, ret, arg);
1116         } else {
1117             tcg_out_movi_vec(s, type, ret, arg);
1118         }
1119         break;
1120     default:
1121         g_assert_not_reached();
1122     }
1123 }
1124
1125 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1126 {
1127     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1128     tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1129     return true;
1130 }
1131
1132 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1133                              tcg_target_long imm)
1134 {
1135     /* This function is only used for passing structs by reference. */
1136     tcg_debug_assert(imm == (int32_t)imm);
1137     tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1138 }
1139
1140 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1141 {
1142     if (val == (int8_t)val) {
1143         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1144         tcg_out8(s, val);
1145     } else if (val == (int32_t)val) {
1146         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1147         tcg_out32(s, val);
1148     } else {
1149         g_assert_not_reached();
1150     }
1151 }
1152
1153 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1154 {
1155     /* Given the strength of x86 memory ordering, we only need care for
1156        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1157        faster than "mfence", so don't bother with the sse insn.  */
1158     if (a0 & TCG_MO_ST_LD) {
1159         tcg_out8(s, 0xf0);
1160         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1161         tcg_out8(s, 0);
1162     }
1163 }
1164
1165 static inline void tcg_out_push(TCGContext *s, int reg)
1166 {
1167     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1168 }
1169
1170 static inline void tcg_out_pop(TCGContext *s, int reg)
1171 {
1172     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1173 }
1174
1175 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1176                        TCGReg arg1, intptr_t arg2)
1177 {
1178     switch (type) {
1179     case TCG_TYPE_I32:
1180         if (ret < 16) {
1181             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1182         } else {
1183             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1184         }
1185         break;
1186     case TCG_TYPE_I64:
1187         if (ret < 16) {
1188             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1189             break;
1190         }
1191         /* FALLTHRU */
1192     case TCG_TYPE_V64:
1193         /* There is no instruction that can validate 8-byte alignment.  */
1194         tcg_debug_assert(ret >= 16);
1195         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1196         break;
1197     case TCG_TYPE_V128:
1198         /*
1199          * The gvec infrastructure is asserts that v128 vector loads
1200          * and stores use a 16-byte aligned offset.  Validate that the
1201          * final pointer is aligned by using an insn that will SIGSEGV.
1202          */
1203         tcg_debug_assert(ret >= 16);
1204         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1205         break;
1206     case TCG_TYPE_V256:
1207         /*
1208          * The gvec infrastructure only requires 16-byte alignment,
1209          * so here we must use an unaligned load.
1210          */
1211         tcg_debug_assert(ret >= 16);
1212         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1213                                  ret, 0, arg1, arg2);
1214         break;
1215     default:
1216         g_assert_not_reached();
1217     }
1218 }
1219
1220 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1221                        TCGReg arg1, intptr_t arg2)
1222 {
1223     switch (type) {
1224     case TCG_TYPE_I32:
1225         if (arg < 16) {
1226             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1227         } else {
1228             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1229         }
1230         break;
1231     case TCG_TYPE_I64:
1232         if (arg < 16) {
1233             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1234             break;
1235         }
1236         /* FALLTHRU */
1237     case TCG_TYPE_V64:
1238         /* There is no instruction that can validate 8-byte alignment.  */
1239         tcg_debug_assert(arg >= 16);
1240         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1241         break;
1242     case TCG_TYPE_V128:
1243         /*
1244          * The gvec infrastructure is asserts that v128 vector loads
1245          * and stores use a 16-byte aligned offset.  Validate that the
1246          * final pointer is aligned by using an insn that will SIGSEGV.
1247          *
1248          * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1249          * for _WIN64, which must have SSE2 but may not have AVX.
1250          */
1251         tcg_debug_assert(arg >= 16);
1252         if (have_avx1) {
1253             tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1254         } else {
1255             tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1256         }
1257         break;
1258     case TCG_TYPE_V256:
1259         /*
1260          * The gvec infrastructure only requires 16-byte alignment,
1261          * so here we must use an unaligned store.
1262          */
1263         tcg_debug_assert(arg >= 16);
1264         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1265                                  arg, 0, arg1, arg2);
1266         break;
1267     default:
1268         g_assert_not_reached();
1269     }
1270 }
1271
1272 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1273                         TCGReg base, intptr_t ofs)
1274 {
1275     int rexw = 0;
1276     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1277         if (val != (int32_t)val) {
1278             return false;
1279         }
1280         rexw = P_REXW;
1281     } else if (type != TCG_TYPE_I32) {
1282         return false;
1283     }
1284     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1285     tcg_out32(s, val);
1286     return true;
1287 }
1288
1289 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1290 {
1291     /* Propagate an opcode prefix, such as P_DATA16.  */
1292     int ext = subopc & ~0x7;
1293     subopc &= 0x7;
1294
1295     if (count == 1) {
1296         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1297     } else {
1298         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1299         tcg_out8(s, count);
1300     }
1301 }
1302
1303 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1304 {
1305     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1306 }
1307
1308 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1309 {
1310     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1311 }
1312
1313 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1314 {
1315     /* movzbl */
1316     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1317     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1318 }
1319
1320 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1321 {
1322     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1323     /* movsbl */
1324     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1325     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1326 }
1327
1328 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1329 {
1330     /* movzwl */
1331     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1332 }
1333
1334 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1335 {
1336     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1337     /* movsw[lq] */
1338     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1339 }
1340
1341 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1342 {
1343     /* 32-bit mov zero extends.  */
1344     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1345 }
1346
1347 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1348 {
1349     tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1350     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1351 }
1352
1353 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1354 {
1355     tcg_out_ext32s(s, dest, src);
1356 }
1357
1358 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1359 {
1360     if (dest != src) {
1361         tcg_out_ext32u(s, dest, src);
1362     }
1363 }
1364
1365 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1366 {
1367     tcg_out_ext32u(s, dest, src);
1368 }
1369
1370 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1371 {
1372     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1373 }
1374
1375 static void tgen_arithi(TCGContext *s, int c, int r0,
1376                         tcg_target_long val, int cf)
1377 {
1378     int rexw = 0;
1379
1380     if (TCG_TARGET_REG_BITS == 64) {
1381         rexw = c & -8;
1382         c &= 7;
1383     }
1384
1385     switch (c) {
1386     case ARITH_ADD:
1387     case ARITH_SUB:
1388         if (!cf) {
1389             /*
1390              * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1391              * partial flags update stalls on Pentium4 and are not recommended
1392              * by current Intel optimization manuals.
1393              */
1394             if (val == 1 || val == -1) {
1395                 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1396                 if (TCG_TARGET_REG_BITS == 64) {
1397                     /*
1398                      * The single-byte increment encodings are re-tasked
1399                      * as the REX prefixes.  Use the MODRM encoding.
1400                      */
1401                     tcg_out_modrm(s, OPC_GRP5 + rexw,
1402                                   (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1403                 } else {
1404                     tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1405                 }
1406                 return;
1407             }
1408             if (val == 128) {
1409                 /*
1410                  * Facilitate using an 8-bit immediate.  Carry is inverted
1411                  * by this transformation, so do it only if cf == 0.
1412                  */
1413                 c ^= ARITH_ADD ^ ARITH_SUB;
1414                 val = -128;
1415             }
1416         }
1417         break;
1418
1419     case ARITH_AND:
1420         if (TCG_TARGET_REG_BITS == 64) {
1421             if (val == 0xffffffffu) {
1422                 tcg_out_ext32u(s, r0, r0);
1423                 return;
1424             }
1425             if (val == (uint32_t)val) {
1426                 /* AND with no high bits set can use a 32-bit operation.  */
1427                 rexw = 0;
1428             }
1429         }
1430         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1431             tcg_out_ext8u(s, r0, r0);
1432             return;
1433         }
1434         if (val == 0xffffu) {
1435             tcg_out_ext16u(s, r0, r0);
1436             return;
1437         }
1438         break;
1439
1440     case ARITH_OR:
1441     case ARITH_XOR:
1442         if (val >= 0x80 && val <= 0xff
1443             && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1444             tcg_out_modrm(s, OPC_ARITH_EbIb + P_REXB_RM, c, r0);
1445             tcg_out8(s, val);
1446             return;
1447         }
1448         break;
1449     }
1450
1451     if (val == (int8_t)val) {
1452         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1453         tcg_out8(s, val);
1454         return;
1455     }
1456     if (rexw == 0 || val == (int32_t)val) {
1457         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1458         tcg_out32(s, val);
1459         return;
1460     }
1461
1462     g_assert_not_reached();
1463 }
1464
1465 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1466 {
1467     if (val != 0) {
1468         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1469     }
1470 }
1471
1472 /* Set SMALL to force a short forward branch.  */
1473 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1474 {
1475     int32_t val, val1;
1476
1477     if (l->has_value) {
1478         val = tcg_pcrel_diff(s, l->u.value_ptr);
1479         val1 = val - 2;
1480         if ((int8_t)val1 == val1) {
1481             if (opc == -1) {
1482                 tcg_out8(s, OPC_JMP_short);
1483             } else {
1484                 tcg_out8(s, OPC_JCC_short + opc);
1485             }
1486             tcg_out8(s, val1);
1487         } else {
1488             tcg_debug_assert(!small);
1489             if (opc == -1) {
1490                 tcg_out8(s, OPC_JMP_long);
1491                 tcg_out32(s, val - 5);
1492             } else {
1493                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1494                 tcg_out32(s, val - 6);
1495             }
1496         }
1497     } else if (small) {
1498         if (opc == -1) {
1499             tcg_out8(s, OPC_JMP_short);
1500         } else {
1501             tcg_out8(s, OPC_JCC_short + opc);
1502         }
1503         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1504         s->code_ptr += 1;
1505     } else {
1506         if (opc == -1) {
1507             tcg_out8(s, OPC_JMP_long);
1508         } else {
1509             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1510         }
1511         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1512         s->code_ptr += 4;
1513     }
1514 }
1515
1516 static int tcg_out_cmp(TCGContext *s, TCGCond cond, TCGArg arg1,
1517                        TCGArg arg2, int const_arg2, int rexw)
1518 {
1519     int jz, js;
1520
1521     if (!is_tst_cond(cond)) {
1522         if (!const_arg2) {
1523             tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1524         } else if (arg2 == 0) {
1525             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1526         } else {
1527             tcg_debug_assert(!rexw || arg2 == (int32_t)arg2);
1528             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1529         }
1530         return tcg_cond_to_jcc[cond];
1531     }
1532
1533     jz = tcg_cond_to_jcc[cond];
1534     js = (cond == TCG_COND_TSTNE ? JCC_JS : JCC_JNS);
1535
1536     if (!const_arg2) {
1537         tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg2);
1538         return jz;
1539     }
1540
1541     if (arg2 <= 0xff && (TCG_TARGET_REG_BITS == 64 || arg1 < 4)) {
1542         if (arg2 == 0x80) {
1543             tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1544             return js;
1545         }
1546         if (arg2 == 0xff) {
1547             tcg_out_modrm(s, OPC_TESTB | P_REXB_R, arg1, arg1);
1548             return jz;
1549         }
1550         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, arg1);
1551         tcg_out8(s, arg2);
1552         return jz;
1553     }
1554
1555     if ((arg2 & ~0xff00) == 0 && arg1 < 4) {
1556         if (arg2 == 0x8000) {
1557             tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1558             return js;
1559         }
1560         if (arg2 == 0xff00) {
1561             tcg_out_modrm(s, OPC_TESTB, arg1 + 4, arg1 + 4);
1562             return jz;
1563         }
1564         tcg_out_modrm(s, OPC_GRP3_Eb, EXT3_TESTi, arg1 + 4);
1565         tcg_out8(s, arg2 >> 8);
1566         return jz;
1567     }
1568
1569     if (arg2 == 0xffff) {
1570         tcg_out_modrm(s, OPC_TESTL | P_DATA16, arg1, arg1);
1571         return jz;
1572     }
1573     if (arg2 == 0xffffffffu) {
1574         tcg_out_modrm(s, OPC_TESTL, arg1, arg1);
1575         return jz;
1576     }
1577
1578     if (is_power_of_2(rexw ? arg2 : (uint32_t)arg2)) {
1579         int jc = (cond == TCG_COND_TSTNE ? JCC_JB : JCC_JAE);
1580         int sh = ctz64(arg2);
1581
1582         rexw = (sh & 32 ? P_REXW : 0);
1583         if ((sh & 31) == 31) {
1584             tcg_out_modrm(s, OPC_TESTL | rexw, arg1, arg1);
1585             return js;
1586         } else {
1587             tcg_out_modrm(s, OPC_GRPBT | rexw, OPC_GRPBT_BT, arg1);
1588             tcg_out8(s, sh);
1589             return jc;
1590         }
1591     }
1592
1593     if (rexw) {
1594         if (arg2 == (uint32_t)arg2) {
1595             rexw = 0;
1596         } else {
1597             tcg_debug_assert(arg2 == (int32_t)arg2);
1598         }
1599     }
1600     tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_TESTi, arg1);
1601     tcg_out32(s, arg2);
1602     return jz;
1603 }
1604
1605 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1606                            TCGArg arg1, TCGArg arg2, int const_arg2,
1607                            TCGLabel *label, bool small)
1608 {
1609     int jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, rexw);
1610     tcg_out_jxx(s, jcc, label, small);
1611 }
1612
1613 #if TCG_TARGET_REG_BITS == 32
1614 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1615                             const int *const_args, bool small)
1616 {
1617     TCGLabel *label_next = gen_new_label();
1618     TCGLabel *label_this = arg_label(args[5]);
1619     TCGCond cond = args[4];
1620
1621     switch (cond) {
1622     case TCG_COND_EQ:
1623     case TCG_COND_TSTEQ:
1624         tcg_out_brcond(s, 0, tcg_invert_cond(cond),
1625                        args[0], args[2], const_args[2], label_next, 1);
1626         tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1627                        label_this, small);
1628         break;
1629     case TCG_COND_NE:
1630     case TCG_COND_TSTNE:
1631         tcg_out_brcond(s, 0, cond, args[0], args[2], const_args[2],
1632                        label_this, small);
1633         tcg_out_brcond(s, 0, cond, args[1], args[3], const_args[3],
1634                        label_this, small);
1635         break;
1636     case TCG_COND_LT:
1637         tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1638                        label_this, small);
1639         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1640         tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1641                        label_this, small);
1642         break;
1643     case TCG_COND_LE:
1644         tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1645                        label_this, small);
1646         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1647         tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1648                        label_this, small);
1649         break;
1650     case TCG_COND_GT:
1651         tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1652                        label_this, small);
1653         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1654         tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1655                        label_this, small);
1656         break;
1657     case TCG_COND_GE:
1658         tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1659                        label_this, small);
1660         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1661         tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1662                        label_this, small);
1663         break;
1664     case TCG_COND_LTU:
1665         tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1666                        label_this, small);
1667         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1668         tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1669                        label_this, small);
1670         break;
1671     case TCG_COND_LEU:
1672         tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1673                        label_this, small);
1674         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1675         tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1676                        label_this, small);
1677         break;
1678     case TCG_COND_GTU:
1679         tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1680                        label_this, small);
1681         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1682         tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1683                        label_this, small);
1684         break;
1685     case TCG_COND_GEU:
1686         tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1687                        label_this, small);
1688         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1689         tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1690                        label_this, small);
1691         break;
1692     default:
1693         g_assert_not_reached();
1694     }
1695     tcg_out_label(s, label_next);
1696 }
1697 #endif
1698
1699 static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1700                             TCGArg dest, TCGArg arg1, TCGArg arg2,
1701                             int const_arg2, bool neg)
1702 {
1703     int cmp_rexw = rexw;
1704     bool inv = false;
1705     bool cleared;
1706     int jcc;
1707
1708     switch (cond) {
1709     case TCG_COND_NE:
1710         inv = true;
1711         /* fall through */
1712     case TCG_COND_EQ:
1713         /* If arg2 is 0, convert to LTU/GEU vs 1. */
1714         if (const_arg2 && arg2 == 0) {
1715             arg2 = 1;
1716             goto do_ltu;
1717         }
1718         break;
1719
1720     case TCG_COND_TSTNE:
1721         inv = true;
1722         /* fall through */
1723     case TCG_COND_TSTEQ:
1724         /* If arg2 is -1, convert to LTU/GEU vs 1. */
1725         if (const_arg2 && arg2 == 0xffffffffu) {
1726             arg2 = 1;
1727             cmp_rexw = 0;
1728             goto do_ltu;
1729         }
1730         break;
1731
1732     case TCG_COND_LEU:
1733         inv = true;
1734         /* fall through */
1735     case TCG_COND_GTU:
1736         /* If arg2 is a register, swap for LTU/GEU. */
1737         if (!const_arg2) {
1738             TCGReg t = arg1;
1739             arg1 = arg2;
1740             arg2 = t;
1741             goto do_ltu;
1742         }
1743         break;
1744
1745     case TCG_COND_GEU:
1746         inv = true;
1747         /* fall through */
1748     case TCG_COND_LTU:
1749     do_ltu:
1750         /*
1751          * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1752          * We can then use NEG or INC to produce the desired result.
1753          * This is always smaller than the SETCC expansion.
1754          */
1755         tcg_out_cmp(s, TCG_COND_LTU, arg1, arg2, const_arg2, cmp_rexw);
1756
1757         /* X - X - C = -C = (C ? -1 : 0) */
1758         tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1759         if (inv && neg) {
1760             /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1761             tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1762         } else if (inv) {
1763             /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1764             tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1765         } else if (!neg) {
1766             /* -(C ? -1 : 0) = (C ? 1 : 0) */
1767             tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1768         }
1769         return;
1770
1771     case TCG_COND_GE:
1772         inv = true;
1773         /* fall through */
1774     case TCG_COND_LT:
1775         /* If arg2 is 0, extract the sign bit. */
1776         if (const_arg2 && arg2 == 0) {
1777             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1778             if (inv) {
1779                 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1780             }
1781             tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1782                            dest, rexw ? 63 : 31);
1783             return;
1784         }
1785         break;
1786
1787     default:
1788         break;
1789     }
1790
1791     /*
1792      * If dest does not overlap the inputs, clearing it first is preferred.
1793      * The XOR breaks any false dependency for the low-byte write to dest,
1794      * and is also one byte smaller than MOVZBL.
1795      */
1796     cleared = false;
1797     if (dest != arg1 && (const_arg2 || dest != arg2)) {
1798         tgen_arithr(s, ARITH_XOR, dest, dest);
1799         cleared = true;
1800     }
1801
1802     jcc = tcg_out_cmp(s, cond, arg1, arg2, const_arg2, cmp_rexw);
1803     tcg_out_modrm(s, OPC_SETCC | jcc, 0, dest);
1804
1805     if (!cleared) {
1806         tcg_out_ext8u(s, dest, dest);
1807     }
1808     if (neg) {
1809         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1810     }
1811 }
1812
1813 #if TCG_TARGET_REG_BITS == 32
1814 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1815                              const int *const_args)
1816 {
1817     TCGArg new_args[6];
1818     TCGLabel *label_true, *label_over;
1819
1820     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1821
1822     if (args[0] == args[1] || args[0] == args[2]
1823         || (!const_args[3] && args[0] == args[3])
1824         || (!const_args[4] && args[0] == args[4])) {
1825         /* When the destination overlaps with one of the argument
1826            registers, don't do anything tricky.  */
1827         label_true = gen_new_label();
1828         label_over = gen_new_label();
1829
1830         new_args[5] = label_arg(label_true);
1831         tcg_out_brcond2(s, new_args, const_args+1, 1);
1832
1833         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1834         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1835         tcg_out_label(s, label_true);
1836
1837         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1838         tcg_out_label(s, label_over);
1839     } else {
1840         /* When the destination does not overlap one of the arguments,
1841            clear the destination first, jump if cond false, and emit an
1842            increment in the true case.  This results in smaller code.  */
1843
1844         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1845
1846         label_over = gen_new_label();
1847         new_args[4] = tcg_invert_cond(new_args[4]);
1848         new_args[5] = label_arg(label_over);
1849         tcg_out_brcond2(s, new_args, const_args+1, 1);
1850
1851         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1852         tcg_out_label(s, label_over);
1853     }
1854 }
1855 #endif
1856
1857 static void tcg_out_cmov(TCGContext *s, int jcc, int rexw,
1858                          TCGReg dest, TCGReg v1)
1859 {
1860     tcg_out_modrm(s, OPC_CMOVCC | jcc | rexw, dest, v1);
1861 }
1862
1863 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1864                             TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1865                             TCGReg v1)
1866 {
1867     int jcc = tcg_out_cmp(s, cond, c1, c2, const_c2, rexw);
1868     tcg_out_cmov(s, jcc, rexw, dest, v1);
1869 }
1870
1871 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1872                         TCGArg arg2, bool const_a2)
1873 {
1874     if (have_bmi1) {
1875         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1876         if (const_a2) {
1877             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1878         } else {
1879             tcg_debug_assert(dest != arg2);
1880             tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1881         }
1882     } else {
1883         tcg_debug_assert(dest != arg2);
1884         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1885         tcg_out_cmov(s, JCC_JE, rexw, dest, arg2);
1886     }
1887 }
1888
1889 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1890                         TCGArg arg2, bool const_a2)
1891 {
1892     if (have_lzcnt) {
1893         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1894         if (const_a2) {
1895             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1896         } else {
1897             tcg_debug_assert(dest != arg2);
1898             tcg_out_cmov(s, JCC_JB, rexw, dest, arg2);
1899         }
1900     } else {
1901         tcg_debug_assert(!const_a2);
1902         tcg_debug_assert(dest != arg1);
1903         tcg_debug_assert(dest != arg2);
1904
1905         /* Recall that the output of BSR is the index not the count.  */
1906         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1907         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1908
1909         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1910         int jcc = tcg_out_cmp(s, TCG_COND_EQ, arg1, 0, 1, rexw);
1911         tcg_out_cmov(s, jcc, rexw, dest, arg2);
1912     }
1913 }
1914
1915 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1916 {
1917     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1918
1919     if (disp == (int32_t)disp) {
1920         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1921         tcg_out32(s, disp);
1922     } else {
1923         /* rip-relative addressing into the constant pool.
1924            This is 6 + 8 = 14 bytes, as compared to using an
1925            immediate load 10 + 6 = 16 bytes, plus we may
1926            be able to re-use the pool constant for more calls.  */
1927         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1928         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1929         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1930         tcg_out32(s, 0);
1931     }
1932 }
1933
1934 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1935                          const TCGHelperInfo *info)
1936 {
1937     tcg_out_branch(s, 1, dest);
1938
1939 #ifndef _WIN32
1940     if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1941         /*
1942          * The sysv i386 abi for struct return places a reference as the
1943          * first argument of the stack, and pops that argument with the
1944          * return statement.  Since we want to retain the aligned stack
1945          * pointer for the callee, we do not want to actually push that
1946          * argument before the call but rely on the normal store to the
1947          * stack slot.  But we do need to compensate for the pop in order
1948          * to reset our correct stack pointer value.
1949          * Pushing a garbage value back onto the stack is quickest.
1950          */
1951         tcg_out_push(s, TCG_REG_EAX);
1952     }
1953 #endif
1954 }
1955
1956 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1957 {
1958     tcg_out_branch(s, 0, dest);
1959 }
1960
1961 static void tcg_out_nopn(TCGContext *s, int n)
1962 {
1963     int i;
1964     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1965      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1966      * duplicate prefix, and all of the interesting recent cores can
1967      * decode and discard the duplicates in a single cycle.
1968      */
1969     tcg_debug_assert(n >= 1);
1970     for (i = 1; i < n; ++i) {
1971         tcg_out8(s, 0x66);
1972     }
1973     tcg_out8(s, 0x90);
1974 }
1975
1976 typedef struct {
1977     TCGReg base;
1978     int index;
1979     int ofs;
1980     int seg;
1981     TCGAtomAlign aa;
1982 } HostAddress;
1983
1984 bool tcg_target_has_memory_bswap(MemOp memop)
1985 {
1986     TCGAtomAlign aa;
1987
1988     if (!have_movbe) {
1989         return false;
1990     }
1991     if ((memop & MO_SIZE) < MO_128) {
1992         return true;
1993     }
1994
1995     /*
1996      * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1997      * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1998      */
1999     aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
2000     return aa.atom < MO_128;
2001 }
2002
2003 /*
2004  * Because i686 has no register parameters and because x86_64 has xchg
2005  * to handle addr/data register overlap, we have placed all input arguments
2006  * before we need might need a scratch reg.
2007  *
2008  * Even then, a scratch is only needed for l->raddr.  Rather than expose
2009  * a general-purpose scratch when we don't actually know it's available,
2010  * use the ra_gen hook to load into RAX if needed.
2011  */
2012 #if TCG_TARGET_REG_BITS == 64
2013 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
2014 {
2015     if (arg < 0) {
2016         arg = TCG_REG_RAX;
2017     }
2018     tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
2019     return arg;
2020 }
2021 static const TCGLdstHelperParam ldst_helper_param = {
2022     .ra_gen = ldst_ra_gen
2023 };
2024 #else
2025 static const TCGLdstHelperParam ldst_helper_param = { };
2026 #endif
2027
2028 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
2029                                 TCGReg l, TCGReg h, TCGReg v)
2030 {
2031     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
2032
2033     /* vpmov{d,q} %v, %l */
2034     tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
2035     /* vpextr{d,q} $1, %v, %h */
2036     tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
2037     tcg_out8(s, 1);
2038 }
2039
2040 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
2041                                 TCGReg v, TCGReg l, TCGReg h)
2042 {
2043     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
2044
2045     /* vmov{d,q} %l, %v */
2046     tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
2047     /* vpinsr{d,q} $1, %h, %v, %v */
2048     tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
2049     tcg_out8(s, 1);
2050 }
2051
2052 /*
2053  * Generate code for the slow path for a load at the end of block
2054  */
2055 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2056 {
2057     MemOp opc = get_memop(l->oi);
2058     tcg_insn_unit **label_ptr = &l->label_ptr[0];
2059
2060     /* resolve label address */
2061     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2062     if (label_ptr[1]) {
2063         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2064     }
2065
2066     tcg_out_ld_helper_args(s, l, &ldst_helper_param);
2067     tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
2068     tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
2069
2070     tcg_out_jmp(s, l->raddr);
2071     return true;
2072 }
2073
2074 /*
2075  * Generate code for the slow path for a store at the end of block
2076  */
2077 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
2078 {
2079     MemOp opc = get_memop(l->oi);
2080     tcg_insn_unit **label_ptr = &l->label_ptr[0];
2081
2082     /* resolve label address */
2083     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
2084     if (label_ptr[1]) {
2085         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
2086     }
2087
2088     tcg_out_st_helper_args(s, l, &ldst_helper_param);
2089     tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
2090
2091     tcg_out_jmp(s, l->raddr);
2092     return true;
2093 }
2094
2095 #ifdef CONFIG_USER_ONLY
2096 static HostAddress x86_guest_base = {
2097     .index = -1
2098 };
2099
2100 #if defined(__x86_64__) && defined(__linux__)
2101 # include <asm/prctl.h>
2102 # include <sys/prctl.h>
2103 int arch_prctl(int code, unsigned long addr);
2104 static inline int setup_guest_base_seg(void)
2105 {
2106     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2107         return P_GS;
2108     }
2109     return 0;
2110 }
2111 #define setup_guest_base_seg  setup_guest_base_seg
2112 #elif defined(__x86_64__) && \
2113       (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
2114 # include <machine/sysarch.h>
2115 static inline int setup_guest_base_seg(void)
2116 {
2117     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2118         return P_GS;
2119     }
2120     return 0;
2121 }
2122 #define setup_guest_base_seg  setup_guest_base_seg
2123 #endif
2124 #else
2125 # define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
2126 #endif /* CONFIG_USER_ONLY */
2127 #ifndef setup_guest_base_seg
2128 # define setup_guest_base_seg()  0
2129 #endif
2130
2131 #define MIN_TLB_MASK_TABLE_OFS  INT_MIN
2132
2133 /*
2134  * For softmmu, perform the TLB load and compare.
2135  * For useronly, perform any required alignment tests.
2136  * In both cases, return a TCGLabelQemuLdst structure if the slow path
2137  * is required and fill in @h with the host address for the fast path.
2138  */
2139 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
2140                                            TCGReg addrlo, TCGReg addrhi,
2141                                            MemOpIdx oi, bool is_ld)
2142 {
2143     TCGLabelQemuLdst *ldst = NULL;
2144     MemOp opc = get_memop(oi);
2145     MemOp s_bits = opc & MO_SIZE;
2146     unsigned a_mask;
2147
2148     if (tcg_use_softmmu) {
2149         h->index = TCG_REG_L0;
2150         h->ofs = 0;
2151         h->seg = 0;
2152     } else {
2153         *h = x86_guest_base;
2154     }
2155     h->base = addrlo;
2156     h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2157     a_mask = (1 << h->aa.align) - 1;
2158
2159     if (tcg_use_softmmu) {
2160         int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2161                             : offsetof(CPUTLBEntry, addr_write);
2162         TCGType ttype = TCG_TYPE_I32;
2163         TCGType tlbtype = TCG_TYPE_I32;
2164         int trexw = 0, hrexw = 0, tlbrexw = 0;
2165         unsigned mem_index = get_mmuidx(oi);
2166         unsigned s_mask = (1 << s_bits) - 1;
2167         int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2168         int tlb_mask;
2169
2170         ldst = new_ldst_label(s);
2171         ldst->is_ld = is_ld;
2172         ldst->oi = oi;
2173         ldst->addrlo_reg = addrlo;
2174         ldst->addrhi_reg = addrhi;
2175
2176         if (TCG_TARGET_REG_BITS == 64) {
2177             ttype = s->addr_type;
2178             trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2179             if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2180                 hrexw = P_REXW;
2181                 if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2182                     tlbtype = TCG_TYPE_I64;
2183                     tlbrexw = P_REXW;
2184                 }
2185             }
2186         }
2187
2188         tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2189         tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2190                        s->page_bits - CPU_TLB_ENTRY_BITS);
2191
2192         tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2193                              fast_ofs + offsetof(CPUTLBDescFast, mask));
2194
2195         tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2196                              fast_ofs + offsetof(CPUTLBDescFast, table));
2197
2198         /*
2199          * If the required alignment is at least as large as the access,
2200          * simply copy the address and mask.  For lesser alignments,
2201          * check that we don't cross pages for the complete access.
2202          */
2203         if (a_mask >= s_mask) {
2204             tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2205         } else {
2206             tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2207                                  addrlo, s_mask - a_mask);
2208         }
2209         tlb_mask = s->page_mask | a_mask;
2210         tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2211
2212         /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2213         tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2214                              TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2215
2216         /* jne slow_path */
2217         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2218         ldst->label_ptr[0] = s->code_ptr;
2219         s->code_ptr += 4;
2220
2221         if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2222             /* cmp 4(TCG_REG_L0), addrhi */
2223             tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2224                                  TCG_REG_L0, cmp_ofs + 4);
2225
2226             /* jne slow_path */
2227             tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2228             ldst->label_ptr[1] = s->code_ptr;
2229             s->code_ptr += 4;
2230         }
2231
2232         /* TLB Hit.  */
2233         tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2234                    offsetof(CPUTLBEntry, addend));
2235     } else if (a_mask) {
2236         int jcc;
2237
2238         ldst = new_ldst_label(s);
2239         ldst->is_ld = is_ld;
2240         ldst->oi = oi;
2241         ldst->addrlo_reg = addrlo;
2242         ldst->addrhi_reg = addrhi;
2243
2244         /* jne slow_path */
2245         jcc = tcg_out_cmp(s, TCG_COND_TSTNE, addrlo, a_mask, true, false);
2246         tcg_out_opc(s, OPC_JCC_long + jcc, 0, 0, 0);
2247         ldst->label_ptr[0] = s->code_ptr;
2248         s->code_ptr += 4;
2249     }
2250
2251     return ldst;
2252 }
2253
2254 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2255                                    HostAddress h, TCGType type, MemOp memop)
2256 {
2257     bool use_movbe = false;
2258     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2259     int movop = OPC_MOVL_GvEv;
2260
2261     /* Do big-endian loads with movbe.  */
2262     if (memop & MO_BSWAP) {
2263         tcg_debug_assert(have_movbe);
2264         use_movbe = true;
2265         movop = OPC_MOVBE_GyMy;
2266     }
2267
2268     switch (memop & MO_SSIZE) {
2269     case MO_UB:
2270         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2271                                  h.base, h.index, 0, h.ofs);
2272         break;
2273     case MO_SB:
2274         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2275                                  h.base, h.index, 0, h.ofs);
2276         break;
2277     case MO_UW:
2278         if (use_movbe) {
2279             /* There is no extending movbe; only low 16-bits are modified.  */
2280             if (datalo != h.base && datalo != h.index) {
2281                 /* XOR breaks dependency chains.  */
2282                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2283                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2284                                          datalo, h.base, h.index, 0, h.ofs);
2285             } else {
2286                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2287                                          datalo, h.base, h.index, 0, h.ofs);
2288                 tcg_out_ext16u(s, datalo, datalo);
2289             }
2290         } else {
2291             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2292                                      h.base, h.index, 0, h.ofs);
2293         }
2294         break;
2295     case MO_SW:
2296         if (use_movbe) {
2297             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2298                                      datalo, h.base, h.index, 0, h.ofs);
2299             tcg_out_ext16s(s, type, datalo, datalo);
2300         } else {
2301             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2302                                      datalo, h.base, h.index, 0, h.ofs);
2303         }
2304         break;
2305     case MO_UL:
2306         tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2307                                  h.base, h.index, 0, h.ofs);
2308         break;
2309 #if TCG_TARGET_REG_BITS == 64
2310     case MO_SL:
2311         if (use_movbe) {
2312             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2313                                      h.base, h.index, 0, h.ofs);
2314             tcg_out_ext32s(s, datalo, datalo);
2315         } else {
2316             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2317                                      h.base, h.index, 0, h.ofs);
2318         }
2319         break;
2320 #endif
2321     case MO_UQ:
2322         if (TCG_TARGET_REG_BITS == 64) {
2323             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2324                                      h.base, h.index, 0, h.ofs);
2325             break;
2326         }
2327         if (use_movbe) {
2328             TCGReg t = datalo;
2329             datalo = datahi;
2330             datahi = t;
2331         }
2332         if (h.base == datalo || h.index == datalo) {
2333             tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2334                                      h.base, h.index, 0, h.ofs);
2335             tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2336             tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2337         } else {
2338             tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2339                                      h.base, h.index, 0, h.ofs);
2340             tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2341                                      h.base, h.index, 0, h.ofs + 4);
2342         }
2343         break;
2344
2345     case MO_128:
2346         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2347
2348         /*
2349          * Without 16-byte atomicity, use integer regs.
2350          * That is where we want the data, and it allows bswaps.
2351          */
2352         if (h.aa.atom < MO_128) {
2353             if (use_movbe) {
2354                 TCGReg t = datalo;
2355                 datalo = datahi;
2356                 datahi = t;
2357             }
2358             if (h.base == datalo || h.index == datalo) {
2359                 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2360                                          h.base, h.index, 0, h.ofs);
2361                 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2362                                      datalo, datahi, 0);
2363                 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2364                                      datahi, datahi, 8);
2365             } else {
2366                 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2367                                          h.base, h.index, 0, h.ofs);
2368                 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2369                                          h.base, h.index, 0, h.ofs + 8);
2370             }
2371             break;
2372         }
2373
2374         /*
2375          * With 16-byte atomicity, a vector load is required.
2376          * If we already have 16-byte alignment, then VMOVDQA always works.
2377          * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2378          * Else use we require a runtime test for alignment for VMOVDQA;
2379          * use VMOVDQU on the unaligned nonatomic path for simplicity.
2380          */
2381         if (h.aa.align >= MO_128) {
2382             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2383                                          TCG_TMP_VEC, 0,
2384                                          h.base, h.index, 0, h.ofs);
2385         } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2386             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2387                                          TCG_TMP_VEC, 0,
2388                                          h.base, h.index, 0, h.ofs);
2389         } else {
2390             TCGLabel *l1 = gen_new_label();
2391             TCGLabel *l2 = gen_new_label();
2392             int jcc;
2393
2394             jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2395             tcg_out_jxx(s, jcc, l1, true);
2396
2397             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2398                                          TCG_TMP_VEC, 0,
2399                                          h.base, h.index, 0, h.ofs);
2400             tcg_out_jxx(s, JCC_JMP, l2, true);
2401
2402             tcg_out_label(s, l1);
2403             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2404                                          TCG_TMP_VEC, 0,
2405                                          h.base, h.index, 0, h.ofs);
2406             tcg_out_label(s, l2);
2407         }
2408         tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2409         break;
2410
2411     default:
2412         g_assert_not_reached();
2413     }
2414 }
2415
2416 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2417                             TCGReg addrlo, TCGReg addrhi,
2418                             MemOpIdx oi, TCGType data_type)
2419 {
2420     TCGLabelQemuLdst *ldst;
2421     HostAddress h;
2422
2423     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2424     tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2425
2426     if (ldst) {
2427         ldst->type = data_type;
2428         ldst->datalo_reg = datalo;
2429         ldst->datahi_reg = datahi;
2430         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2431     }
2432 }
2433
2434 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2435                                    HostAddress h, MemOp memop)
2436 {
2437     bool use_movbe = false;
2438     int movop = OPC_MOVL_EvGv;
2439
2440     /*
2441      * Do big-endian stores with movbe or system-mode.
2442      * User-only without movbe will have its swapping done generically.
2443      */
2444     if (memop & MO_BSWAP) {
2445         tcg_debug_assert(have_movbe);
2446         use_movbe = true;
2447         movop = OPC_MOVBE_MyGy;
2448     }
2449
2450     switch (memop & MO_SIZE) {
2451     case MO_8:
2452         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2453         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2454         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2455                                  datalo, h.base, h.index, 0, h.ofs);
2456         break;
2457     case MO_16:
2458         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2459                                  h.base, h.index, 0, h.ofs);
2460         break;
2461     case MO_32:
2462         tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2463                                  h.base, h.index, 0, h.ofs);
2464         break;
2465     case MO_64:
2466         if (TCG_TARGET_REG_BITS == 64) {
2467             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2468                                      h.base, h.index, 0, h.ofs);
2469         } else {
2470             if (use_movbe) {
2471                 TCGReg t = datalo;
2472                 datalo = datahi;
2473                 datahi = t;
2474             }
2475             tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2476                                      h.base, h.index, 0, h.ofs);
2477             tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2478                                      h.base, h.index, 0, h.ofs + 4);
2479         }
2480         break;
2481
2482     case MO_128:
2483         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2484
2485         /*
2486          * Without 16-byte atomicity, use integer regs.
2487          * That is where we have the data, and it allows bswaps.
2488          */
2489         if (h.aa.atom < MO_128) {
2490             if (use_movbe) {
2491                 TCGReg t = datalo;
2492                 datalo = datahi;
2493                 datahi = t;
2494             }
2495             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2496                                      h.base, h.index, 0, h.ofs);
2497             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2498                                      h.base, h.index, 0, h.ofs + 8);
2499             break;
2500         }
2501
2502         /*
2503          * With 16-byte atomicity, a vector store is required.
2504          * If we already have 16-byte alignment, then VMOVDQA always works.
2505          * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2506          * Else use we require a runtime test for alignment for VMOVDQA;
2507          * use VMOVDQU on the unaligned nonatomic path for simplicity.
2508          */
2509         tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2510         if (h.aa.align >= MO_128) {
2511             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2512                                          TCG_TMP_VEC, 0,
2513                                          h.base, h.index, 0, h.ofs);
2514         } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2515             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2516                                          TCG_TMP_VEC, 0,
2517                                          h.base, h.index, 0, h.ofs);
2518         } else {
2519             TCGLabel *l1 = gen_new_label();
2520             TCGLabel *l2 = gen_new_label();
2521             int jcc;
2522
2523             jcc = tcg_out_cmp(s, TCG_COND_TSTNE, h.base, 15, true, false);
2524             tcg_out_jxx(s, jcc, l1, true);
2525
2526             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2527                                          TCG_TMP_VEC, 0,
2528                                          h.base, h.index, 0, h.ofs);
2529             tcg_out_jxx(s, JCC_JMP, l2, true);
2530
2531             tcg_out_label(s, l1);
2532             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2533                                          TCG_TMP_VEC, 0,
2534                                          h.base, h.index, 0, h.ofs);
2535             tcg_out_label(s, l2);
2536         }
2537         break;
2538
2539     default:
2540         g_assert_not_reached();
2541     }
2542 }
2543
2544 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2545                             TCGReg addrlo, TCGReg addrhi,
2546                             MemOpIdx oi, TCGType data_type)
2547 {
2548     TCGLabelQemuLdst *ldst;
2549     HostAddress h;
2550
2551     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2552     tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2553
2554     if (ldst) {
2555         ldst->type = data_type;
2556         ldst->datalo_reg = datalo;
2557         ldst->datahi_reg = datahi;
2558         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2559     }
2560 }
2561
2562 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2563 {
2564     /* Reuse the zeroing that exists for goto_ptr.  */
2565     if (a0 == 0) {
2566         tcg_out_jmp(s, tcg_code_gen_epilogue);
2567     } else {
2568         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2569         tcg_out_jmp(s, tb_ret_addr);
2570     }
2571 }
2572
2573 static void tcg_out_goto_tb(TCGContext *s, int which)
2574 {
2575     /*
2576      * Jump displacement must be aligned for atomic patching;
2577      * see if we need to add extra nops before jump
2578      */
2579     int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2580     if (gap != 1) {
2581         tcg_out_nopn(s, gap - 1);
2582     }
2583     tcg_out8(s, OPC_JMP_long); /* jmp im */
2584     set_jmp_insn_offset(s, which);
2585     tcg_out32(s, 0);
2586     set_jmp_reset_offset(s, which);
2587 }
2588
2589 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2590                               uintptr_t jmp_rx, uintptr_t jmp_rw)
2591 {
2592     /* patch the branch destination */
2593     uintptr_t addr = tb->jmp_target_addr[n];
2594     qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2595     /* no need to flush icache explicitly */
2596 }
2597
2598 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2599                               const TCGArg args[TCG_MAX_OP_ARGS],
2600                               const int const_args[TCG_MAX_OP_ARGS])
2601 {
2602     TCGArg a0, a1, a2;
2603     int c, const_a2, vexop, rexw = 0;
2604
2605 #if TCG_TARGET_REG_BITS == 64
2606 # define OP_32_64(x) \
2607         case glue(glue(INDEX_op_, x), _i64): \
2608             rexw = P_REXW; /* FALLTHRU */    \
2609         case glue(glue(INDEX_op_, x), _i32)
2610 #else
2611 # define OP_32_64(x) \
2612         case glue(glue(INDEX_op_, x), _i32)
2613 #endif
2614
2615     /* Hoist the loads of the most common arguments.  */
2616     a0 = args[0];
2617     a1 = args[1];
2618     a2 = args[2];
2619     const_a2 = const_args[2];
2620
2621     switch (opc) {
2622     case INDEX_op_goto_ptr:
2623         /* jmp to the given host address (could be epilogue) */
2624         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2625         break;
2626     case INDEX_op_br:
2627         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2628         break;
2629     OP_32_64(ld8u):
2630         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2631         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2632         break;
2633     OP_32_64(ld8s):
2634         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2635         break;
2636     OP_32_64(ld16u):
2637         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2638         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2639         break;
2640     OP_32_64(ld16s):
2641         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2642         break;
2643 #if TCG_TARGET_REG_BITS == 64
2644     case INDEX_op_ld32u_i64:
2645 #endif
2646     case INDEX_op_ld_i32:
2647         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2648         break;
2649
2650     OP_32_64(st8):
2651         if (const_args[0]) {
2652             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2653             tcg_out8(s, a0);
2654         } else {
2655             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2656         }
2657         break;
2658     OP_32_64(st16):
2659         if (const_args[0]) {
2660             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2661             tcg_out16(s, a0);
2662         } else {
2663             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2664         }
2665         break;
2666 #if TCG_TARGET_REG_BITS == 64
2667     case INDEX_op_st32_i64:
2668 #endif
2669     case INDEX_op_st_i32:
2670         if (const_args[0]) {
2671             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2672             tcg_out32(s, a0);
2673         } else {
2674             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2675         }
2676         break;
2677
2678     OP_32_64(add):
2679         /* For 3-operand addition, use LEA.  */
2680         if (a0 != a1) {
2681             TCGArg c3 = 0;
2682             if (const_a2) {
2683                 c3 = a2, a2 = -1;
2684             } else if (a0 == a2) {
2685                 /* Watch out for dest = src + dest, since we've removed
2686                    the matching constraint on the add.  */
2687                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2688                 break;
2689             }
2690
2691             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2692             break;
2693         }
2694         c = ARITH_ADD;
2695         goto gen_arith;
2696     OP_32_64(sub):
2697         c = ARITH_SUB;
2698         goto gen_arith;
2699     OP_32_64(and):
2700         c = ARITH_AND;
2701         goto gen_arith;
2702     OP_32_64(or):
2703         c = ARITH_OR;
2704         goto gen_arith;
2705     OP_32_64(xor):
2706         c = ARITH_XOR;
2707         goto gen_arith;
2708     gen_arith:
2709         if (const_a2) {
2710             tgen_arithi(s, c + rexw, a0, a2, 0);
2711         } else {
2712             tgen_arithr(s, c + rexw, a0, a2);
2713         }
2714         break;
2715
2716     OP_32_64(andc):
2717         if (const_a2) {
2718             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2719             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2720         } else {
2721             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2722         }
2723         break;
2724
2725     OP_32_64(mul):
2726         if (const_a2) {
2727             int32_t val;
2728             val = a2;
2729             if (val == (int8_t)val) {
2730                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2731                 tcg_out8(s, val);
2732             } else {
2733                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2734                 tcg_out32(s, val);
2735             }
2736         } else {
2737             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2738         }
2739         break;
2740
2741     OP_32_64(div2):
2742         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2743         break;
2744     OP_32_64(divu2):
2745         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2746         break;
2747
2748     OP_32_64(shl):
2749         /* For small constant 3-operand shift, use LEA.  */
2750         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2751             if (a2 - 1 == 0) {
2752                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2753                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2754             } else {
2755                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2756                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2757             }
2758             break;
2759         }
2760         c = SHIFT_SHL;
2761         vexop = OPC_SHLX;
2762         goto gen_shift_maybe_vex;
2763     OP_32_64(shr):
2764         c = SHIFT_SHR;
2765         vexop = OPC_SHRX;
2766         goto gen_shift_maybe_vex;
2767     OP_32_64(sar):
2768         c = SHIFT_SAR;
2769         vexop = OPC_SARX;
2770         goto gen_shift_maybe_vex;
2771     OP_32_64(rotl):
2772         c = SHIFT_ROL;
2773         goto gen_shift;
2774     OP_32_64(rotr):
2775         c = SHIFT_ROR;
2776         goto gen_shift;
2777     gen_shift_maybe_vex:
2778         if (have_bmi2) {
2779             if (!const_a2) {
2780                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2781                 break;
2782             }
2783             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2784         }
2785         /* FALLTHRU */
2786     gen_shift:
2787         if (const_a2) {
2788             tcg_out_shifti(s, c + rexw, a0, a2);
2789         } else {
2790             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2791         }
2792         break;
2793
2794     OP_32_64(ctz):
2795         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2796         break;
2797     OP_32_64(clz):
2798         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2799         break;
2800     OP_32_64(ctpop):
2801         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2802         break;
2803
2804     OP_32_64(brcond):
2805         tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2806                        arg_label(args[3]), 0);
2807         break;
2808     OP_32_64(setcond):
2809         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2810         break;
2811     OP_32_64(negsetcond):
2812         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2813         break;
2814     OP_32_64(movcond):
2815         tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2816         break;
2817
2818     OP_32_64(bswap16):
2819         if (a2 & TCG_BSWAP_OS) {
2820             /* Output must be sign-extended. */
2821             if (rexw) {
2822                 tcg_out_bswap64(s, a0);
2823                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2824             } else {
2825                 tcg_out_bswap32(s, a0);
2826                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2827             }
2828         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2829             /* Output must be zero-extended, but input isn't. */
2830             tcg_out_bswap32(s, a0);
2831             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2832         } else {
2833             tcg_out_rolw_8(s, a0);
2834         }
2835         break;
2836     OP_32_64(bswap32):
2837         tcg_out_bswap32(s, a0);
2838         if (rexw && (a2 & TCG_BSWAP_OS)) {
2839             tcg_out_ext32s(s, a0, a0);
2840         }
2841         break;
2842
2843     OP_32_64(neg):
2844         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2845         break;
2846     OP_32_64(not):
2847         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2848         break;
2849
2850     case INDEX_op_qemu_ld_a64_i32:
2851         if (TCG_TARGET_REG_BITS == 32) {
2852             tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2853             break;
2854         }
2855         /* fall through */
2856     case INDEX_op_qemu_ld_a32_i32:
2857         tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2858         break;
2859     case INDEX_op_qemu_ld_a32_i64:
2860         if (TCG_TARGET_REG_BITS == 64) {
2861             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2862         } else {
2863             tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2864         }
2865         break;
2866     case INDEX_op_qemu_ld_a64_i64:
2867         if (TCG_TARGET_REG_BITS == 64) {
2868             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2869         } else {
2870             tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2871         }
2872         break;
2873     case INDEX_op_qemu_ld_a32_i128:
2874     case INDEX_op_qemu_ld_a64_i128:
2875         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2876         tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2877         break;
2878
2879     case INDEX_op_qemu_st_a64_i32:
2880     case INDEX_op_qemu_st8_a64_i32:
2881         if (TCG_TARGET_REG_BITS == 32) {
2882             tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2883             break;
2884         }
2885         /* fall through */
2886     case INDEX_op_qemu_st_a32_i32:
2887     case INDEX_op_qemu_st8_a32_i32:
2888         tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2889         break;
2890     case INDEX_op_qemu_st_a32_i64:
2891         if (TCG_TARGET_REG_BITS == 64) {
2892             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2893         } else {
2894             tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2895         }
2896         break;
2897     case INDEX_op_qemu_st_a64_i64:
2898         if (TCG_TARGET_REG_BITS == 64) {
2899             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2900         } else {
2901             tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2902         }
2903         break;
2904     case INDEX_op_qemu_st_a32_i128:
2905     case INDEX_op_qemu_st_a64_i128:
2906         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2907         tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2908         break;
2909
2910     OP_32_64(mulu2):
2911         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2912         break;
2913     OP_32_64(muls2):
2914         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2915         break;
2916     OP_32_64(add2):
2917         if (const_args[4]) {
2918             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2919         } else {
2920             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2921         }
2922         if (const_args[5]) {
2923             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2924         } else {
2925             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2926         }
2927         break;
2928     OP_32_64(sub2):
2929         if (const_args[4]) {
2930             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2931         } else {
2932             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2933         }
2934         if (const_args[5]) {
2935             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2936         } else {
2937             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2938         }
2939         break;
2940
2941 #if TCG_TARGET_REG_BITS == 32
2942     case INDEX_op_brcond2_i32:
2943         tcg_out_brcond2(s, args, const_args, 0);
2944         break;
2945     case INDEX_op_setcond2_i32:
2946         tcg_out_setcond2(s, args, const_args);
2947         break;
2948 #else /* TCG_TARGET_REG_BITS == 64 */
2949     case INDEX_op_ld32s_i64:
2950         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2951         break;
2952     case INDEX_op_ld_i64:
2953         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2954         break;
2955     case INDEX_op_st_i64:
2956         if (const_args[0]) {
2957             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2958             tcg_out32(s, a0);
2959         } else {
2960             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2961         }
2962         break;
2963
2964     case INDEX_op_bswap64_i64:
2965         tcg_out_bswap64(s, a0);
2966         break;
2967     case INDEX_op_extrh_i64_i32:
2968         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2969         break;
2970 #endif
2971
2972     OP_32_64(deposit):
2973         if (args[3] == 0 && args[4] == 8) {
2974             /* load bits 0..7 */
2975             if (const_a2) {
2976                 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2977                             0, a0, 0);
2978                 tcg_out8(s, a2);
2979             } else {
2980                 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2981             }
2982         } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2983             /* load bits 8..15 */
2984             if (const_a2) {
2985                 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2986                 tcg_out8(s, a2);
2987             } else {
2988                 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2989             }
2990         } else if (args[3] == 0 && args[4] == 16) {
2991             /* load bits 0..15 */
2992             if (const_a2) {
2993                 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2994                             0, a0, 0);
2995                 tcg_out16(s, a2);
2996             } else {
2997                 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2998             }
2999         } else {
3000             g_assert_not_reached();
3001         }
3002         break;
3003
3004     case INDEX_op_extract_i64:
3005         if (a2 + args[3] == 32) {
3006             /* This is a 32-bit zero-extending right shift.  */
3007             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
3008             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
3009             break;
3010         }
3011         /* FALLTHRU */
3012     case INDEX_op_extract_i32:
3013         /* On the off-chance that we can use the high-byte registers.
3014            Otherwise we emit the same ext16 + shift pattern that we
3015            would have gotten from the normal tcg-op.c expansion.  */
3016         tcg_debug_assert(a2 == 8 && args[3] == 8);
3017         if (a1 < 4 && a0 < 8) {
3018             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
3019         } else {
3020             tcg_out_ext16u(s, a0, a1);
3021             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
3022         }
3023         break;
3024
3025     case INDEX_op_sextract_i32:
3026         /* We don't implement sextract_i64, as we cannot sign-extend to
3027            64-bits without using the REX prefix that explicitly excludes
3028            access to the high-byte registers.  */
3029         tcg_debug_assert(a2 == 8 && args[3] == 8);
3030         if (a1 < 4 && a0 < 8) {
3031             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
3032         } else {
3033             tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
3034             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
3035         }
3036         break;
3037
3038     OP_32_64(extract2):
3039         /* Note that SHRD outputs to the r/m operand.  */
3040         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
3041         tcg_out8(s, args[3]);
3042         break;
3043
3044     case INDEX_op_mb:
3045         tcg_out_mb(s, a0);
3046         break;
3047     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
3048     case INDEX_op_mov_i64:
3049     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
3050     case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
3051     case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
3052     case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
3053     case INDEX_op_ext8s_i64:
3054     case INDEX_op_ext8u_i32:
3055     case INDEX_op_ext8u_i64:
3056     case INDEX_op_ext16s_i32:
3057     case INDEX_op_ext16s_i64:
3058     case INDEX_op_ext16u_i32:
3059     case INDEX_op_ext16u_i64:
3060     case INDEX_op_ext32s_i64:
3061     case INDEX_op_ext32u_i64:
3062     case INDEX_op_ext_i32_i64:
3063     case INDEX_op_extu_i32_i64:
3064     case INDEX_op_extrl_i64_i32:
3065     default:
3066         g_assert_not_reached();
3067     }
3068
3069 #undef OP_32_64
3070 }
3071
3072 static int const umin_insn[4] = {
3073     OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
3074 };
3075
3076 static int const umax_insn[4] = {
3077     OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
3078 };
3079
3080 static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece,
3081                                   TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
3082 {
3083     static int const cmpeq_insn[4] = {
3084         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
3085     };
3086     static int const cmpgt_insn[4] = {
3087         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
3088     };
3089
3090     enum {
3091         NEED_INV  = 1,
3092         NEED_SWAP = 2,
3093         NEED_UMIN = 4,
3094         NEED_UMAX = 8,
3095         INVALID   = 16,
3096     };
3097     static const uint8_t cond_fixup[16] = {
3098         [0 ... 15] = INVALID,
3099         [TCG_COND_EQ] = 0,
3100         [TCG_COND_GT] = 0,
3101         [TCG_COND_NE] = NEED_INV,
3102         [TCG_COND_LE] = NEED_INV,
3103         [TCG_COND_LT] = NEED_SWAP,
3104         [TCG_COND_GE] = NEED_SWAP | NEED_INV,
3105         [TCG_COND_LEU] = NEED_UMIN,
3106         [TCG_COND_GTU] = NEED_UMIN | NEED_INV,
3107         [TCG_COND_GEU] = NEED_UMAX,
3108         [TCG_COND_LTU] = NEED_UMAX | NEED_INV,
3109     };
3110     int fixup = cond_fixup[cond];
3111
3112     assert(!(fixup & INVALID));
3113
3114     if (fixup & NEED_INV) {
3115         cond = tcg_invert_cond(cond);
3116     }
3117
3118     if (fixup & NEED_SWAP) {
3119         TCGReg swap = v1;
3120         v1 = v2;
3121         v2 = swap;
3122         cond = tcg_swap_cond(cond);
3123     }
3124
3125     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3126         int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]);
3127
3128         /* avx2 does not have 64-bit min/max; adjusted during expand. */
3129         assert(vece <= MO_32);
3130
3131         tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type);
3132         v2 = TCG_TMP_VEC;
3133         cond = TCG_COND_EQ;
3134     }
3135
3136     switch (cond) {
3137     case TCG_COND_EQ:
3138         tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type);
3139         break;
3140     case TCG_COND_GT:
3141         tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type);
3142         break;
3143     default:
3144         g_assert_not_reached();
3145     }
3146     return fixup & NEED_INV;
3147 }
3148
3149 static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece,
3150                                TCGReg v1, TCGReg v2, TCGCond cond)
3151 {
3152     static const int cmpm_insn[2][4] = {
3153         { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ },
3154         { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ }
3155     };
3156     static const int testm_insn[4] = {
3157         OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ
3158     };
3159     static const int testnm_insn[4] = {
3160         OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ
3161     };
3162
3163     static const int cond_ext[16] = {
3164         [TCG_COND_EQ] = 0,
3165         [TCG_COND_NE] = 4,
3166         [TCG_COND_LT] = 1,
3167         [TCG_COND_LTU] = 1,
3168         [TCG_COND_LE] = 2,
3169         [TCG_COND_LEU] = 2,
3170         [TCG_COND_NEVER] = 3,
3171         [TCG_COND_GE] = 5,
3172         [TCG_COND_GEU] = 5,
3173         [TCG_COND_GT] = 6,
3174         [TCG_COND_GTU] = 6,
3175         [TCG_COND_ALWAYS] = 7,
3176     };
3177
3178     switch (cond) {
3179     case TCG_COND_TSTNE:
3180         tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type);
3181         break;
3182     case TCG_COND_TSTEQ:
3183         tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type);
3184         break;
3185     default:
3186         tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece],
3187                                /* k1 */ 1, v1, v2, type);
3188         tcg_out8(s, cond_ext[cond]);
3189         break;
3190     }
3191 }
3192
3193 static void tcg_out_k1_to_vec(TCGContext *s, TCGType type,
3194                               unsigned vece, TCGReg dest)
3195 {
3196     static const int movm_insn[] = {
3197         OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q
3198     };
3199     tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type);
3200 }
3201
3202 static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
3203                             TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
3204 {
3205     /*
3206      * With avx512, we have a complete set of comparisons into mask.
3207      * Unless there's a single insn expansion for the comparision,
3208      * expand via a mask in k1.
3209      */
3210     if ((vece <= MO_16 ? have_avx512bw : have_avx512dq)
3211         && cond != TCG_COND_EQ
3212         && cond != TCG_COND_LT
3213         && cond != TCG_COND_GT) {
3214         tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond);
3215         tcg_out_k1_to_vec(s, type, vece, v0);
3216         return;
3217     }
3218
3219     if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) {
3220         tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
3221         tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type);
3222     }
3223 }
3224
3225 static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
3226                                   TCGReg v0, TCGReg c1, TCGReg c2,
3227                                   TCGReg v3, TCGReg v4, TCGCond cond)
3228 {
3229     static const int vpblendm_insn[] = {
3230         OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
3231     };
3232     bool z = false;
3233
3234     /* Swap to place constant in V4 to take advantage of zero-masking. */
3235     if (!v3) {
3236         z = true;
3237         v3 = v4;
3238         cond = tcg_invert_cond(cond);
3239     }
3240
3241     tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond);
3242     tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3,
3243                             /* k1 */1, z, type);
3244 }
3245
3246 static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
3247                                TCGReg v0, TCGReg c1, TCGReg c2,
3248                                TCGReg v3, TCGReg v4, TCGCond cond)
3249 {
3250     bool inv;
3251
3252     if (vece <= MO_16 ? have_avx512bw : have_avx512vl) {
3253         tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
3254         return;
3255     }
3256
3257     inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
3258
3259     /*
3260      * Since XMM0 is 16, the only way we get 0 into V3
3261      * is via the constant zero constraint.
3262      */
3263     if (!v3) {
3264         if (inv) {
3265             tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type);
3266         } else {
3267             tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type);
3268         }
3269     } else {
3270         if (inv) {
3271             TCGReg swap = v3;
3272             v3 = v4;
3273             v4 = swap;
3274         }
3275         tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type);
3276         tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4);
3277     }
3278 }
3279
3280 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
3281                            unsigned vecl, unsigned vece,
3282                            const TCGArg args[TCG_MAX_OP_ARGS],
3283                            const int const_args[TCG_MAX_OP_ARGS])
3284 {
3285     static int const add_insn[4] = {
3286         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
3287     };
3288     static int const ssadd_insn[4] = {
3289         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
3290     };
3291     static int const usadd_insn[4] = {
3292         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
3293     };
3294     static int const sub_insn[4] = {
3295         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
3296     };
3297     static int const sssub_insn[4] = {
3298         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
3299     };
3300     static int const ussub_insn[4] = {
3301         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
3302     };
3303     static int const mul_insn[4] = {
3304         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
3305     };
3306     static int const shift_imm_insn[4] = {
3307         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
3308     };
3309     static int const punpckl_insn[4] = {
3310         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
3311     };
3312     static int const punpckh_insn[4] = {
3313         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
3314     };
3315     static int const packss_insn[4] = {
3316         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
3317     };
3318     static int const packus_insn[4] = {
3319         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
3320     };
3321     static int const smin_insn[4] = {
3322         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
3323     };
3324     static int const smax_insn[4] = {
3325         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
3326     };
3327     static int const rotlv_insn[4] = {
3328         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
3329     };
3330     static int const rotrv_insn[4] = {
3331         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
3332     };
3333     static int const shlv_insn[4] = {
3334         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
3335     };
3336     static int const shrv_insn[4] = {
3337         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
3338     };
3339     static int const sarv_insn[4] = {
3340         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3341     };
3342     static int const shls_insn[4] = {
3343         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3344     };
3345     static int const shrs_insn[4] = {
3346         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3347     };
3348     static int const sars_insn[4] = {
3349         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3350     };
3351     static int const vpshldi_insn[4] = {
3352         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3353     };
3354     static int const vpshldv_insn[4] = {
3355         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3356     };
3357     static int const vpshrdv_insn[4] = {
3358         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3359     };
3360     static int const abs_insn[4] = {
3361         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3362     };
3363
3364     TCGType type = vecl + TCG_TYPE_V64;
3365     int insn, sub;
3366     TCGArg a0, a1, a2, a3;
3367
3368     a0 = args[0];
3369     a1 = args[1];
3370     a2 = args[2];
3371
3372     switch (opc) {
3373     case INDEX_op_add_vec:
3374         insn = add_insn[vece];
3375         goto gen_simd;
3376     case INDEX_op_ssadd_vec:
3377         insn = ssadd_insn[vece];
3378         goto gen_simd;
3379     case INDEX_op_usadd_vec:
3380         insn = usadd_insn[vece];
3381         goto gen_simd;
3382     case INDEX_op_sub_vec:
3383         insn = sub_insn[vece];
3384         goto gen_simd;
3385     case INDEX_op_sssub_vec:
3386         insn = sssub_insn[vece];
3387         goto gen_simd;
3388     case INDEX_op_ussub_vec:
3389         insn = ussub_insn[vece];
3390         goto gen_simd;
3391     case INDEX_op_mul_vec:
3392         insn = mul_insn[vece];
3393         goto gen_simd;
3394     case INDEX_op_and_vec:
3395         insn = OPC_PAND;
3396         goto gen_simd;
3397     case INDEX_op_or_vec:
3398         insn = OPC_POR;
3399         goto gen_simd;
3400     case INDEX_op_xor_vec:
3401         insn = OPC_PXOR;
3402         goto gen_simd;
3403     case INDEX_op_smin_vec:
3404         insn = smin_insn[vece];
3405         goto gen_simd;
3406     case INDEX_op_umin_vec:
3407         insn = umin_insn[vece];
3408         goto gen_simd;
3409     case INDEX_op_smax_vec:
3410         insn = smax_insn[vece];
3411         goto gen_simd;
3412     case INDEX_op_umax_vec:
3413         insn = umax_insn[vece];
3414         goto gen_simd;
3415     case INDEX_op_shlv_vec:
3416         insn = shlv_insn[vece];
3417         goto gen_simd;
3418     case INDEX_op_shrv_vec:
3419         insn = shrv_insn[vece];
3420         goto gen_simd;
3421     case INDEX_op_sarv_vec:
3422         insn = sarv_insn[vece];
3423         goto gen_simd;
3424     case INDEX_op_rotlv_vec:
3425         insn = rotlv_insn[vece];
3426         goto gen_simd;
3427     case INDEX_op_rotrv_vec:
3428         insn = rotrv_insn[vece];
3429         goto gen_simd;
3430     case INDEX_op_shls_vec:
3431         insn = shls_insn[vece];
3432         goto gen_simd;
3433     case INDEX_op_shrs_vec:
3434         insn = shrs_insn[vece];
3435         goto gen_simd;
3436     case INDEX_op_sars_vec:
3437         insn = sars_insn[vece];
3438         goto gen_simd;
3439     case INDEX_op_x86_punpckl_vec:
3440         insn = punpckl_insn[vece];
3441         goto gen_simd;
3442     case INDEX_op_x86_punpckh_vec:
3443         insn = punpckh_insn[vece];
3444         goto gen_simd;
3445     case INDEX_op_x86_packss_vec:
3446         insn = packss_insn[vece];
3447         goto gen_simd;
3448     case INDEX_op_x86_packus_vec:
3449         insn = packus_insn[vece];
3450         goto gen_simd;
3451     case INDEX_op_x86_vpshldv_vec:
3452         insn = vpshldv_insn[vece];
3453         a1 = a2;
3454         a2 = args[3];
3455         goto gen_simd;
3456     case INDEX_op_x86_vpshrdv_vec:
3457         insn = vpshrdv_insn[vece];
3458         a1 = a2;
3459         a2 = args[3];
3460         goto gen_simd;
3461 #if TCG_TARGET_REG_BITS == 32
3462     case INDEX_op_dup2_vec:
3463         /* First merge the two 32-bit inputs to a single 64-bit element. */
3464         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3465         /* Then replicate the 64-bit elements across the rest of the vector. */
3466         if (type != TCG_TYPE_V64) {
3467             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3468         }
3469         break;
3470 #endif
3471     case INDEX_op_abs_vec:
3472         insn = abs_insn[vece];
3473         a2 = a1;
3474         a1 = 0;
3475         goto gen_simd;
3476     gen_simd:
3477         tcg_debug_assert(insn != OPC_UD2);
3478         tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
3479         break;
3480
3481     case INDEX_op_cmp_vec:
3482         tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]);
3483         break;
3484
3485     case INDEX_op_cmpsel_vec:
3486         tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2,
3487                            args[3], args[4], args[5]);
3488         break;
3489
3490     case INDEX_op_andc_vec:
3491         insn = OPC_PANDN;
3492         tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type);
3493         break;
3494
3495     case INDEX_op_shli_vec:
3496         insn = shift_imm_insn[vece];
3497         sub = 6;
3498         goto gen_shift;
3499     case INDEX_op_shri_vec:
3500         insn = shift_imm_insn[vece];
3501         sub = 2;
3502         goto gen_shift;
3503     case INDEX_op_sari_vec:
3504         if (vece == MO_64) {
3505             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3506         } else {
3507             insn = shift_imm_insn[vece];
3508         }
3509         sub = 4;
3510         goto gen_shift;
3511     case INDEX_op_rotli_vec:
3512         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3513         if (vece == MO_64) {
3514             insn |= P_VEXW;
3515         }
3516         sub = 1;
3517         goto gen_shift;
3518     gen_shift:
3519         tcg_debug_assert(vece != MO_8);
3520         tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type);
3521         tcg_out8(s, a2);
3522         break;
3523
3524     case INDEX_op_ld_vec:
3525         tcg_out_ld(s, type, a0, a1, a2);
3526         break;
3527     case INDEX_op_st_vec:
3528         tcg_out_st(s, type, a0, a1, a2);
3529         break;
3530     case INDEX_op_dupm_vec:
3531         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3532         break;
3533
3534     case INDEX_op_x86_shufps_vec:
3535         insn = OPC_SHUFPS;
3536         sub = args[3];
3537         goto gen_simd_imm8;
3538     case INDEX_op_x86_blend_vec:
3539         if (vece == MO_16) {
3540             insn = OPC_PBLENDW;
3541         } else if (vece == MO_32) {
3542             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3543         } else {
3544             g_assert_not_reached();
3545         }
3546         sub = args[3];
3547         goto gen_simd_imm8;
3548     case INDEX_op_x86_vperm2i128_vec:
3549         insn = OPC_VPERM2I128;
3550         sub = args[3];
3551         goto gen_simd_imm8;
3552     case INDEX_op_x86_vpshldi_vec:
3553         insn = vpshldi_insn[vece];
3554         sub = args[3];
3555         goto gen_simd_imm8;
3556
3557     case INDEX_op_not_vec:
3558         insn = OPC_VPTERNLOGQ;
3559         a2 = a1;
3560         sub = 0x33; /* !B */
3561         goto gen_simd_imm8;
3562     case INDEX_op_nor_vec:
3563         insn = OPC_VPTERNLOGQ;
3564         sub = 0x11; /* norCB */
3565         goto gen_simd_imm8;
3566     case INDEX_op_nand_vec:
3567         insn = OPC_VPTERNLOGQ;
3568         sub = 0x77; /* nandCB */
3569         goto gen_simd_imm8;
3570     case INDEX_op_eqv_vec:
3571         insn = OPC_VPTERNLOGQ;
3572         sub = 0x99; /* xnorCB */
3573         goto gen_simd_imm8;
3574     case INDEX_op_orc_vec:
3575         insn = OPC_VPTERNLOGQ;
3576         sub = 0xdd; /* orB!C */
3577         goto gen_simd_imm8;
3578
3579     case INDEX_op_bitsel_vec:
3580         insn = OPC_VPTERNLOGQ;
3581         a3 = args[3];
3582         if (a0 == a1) {
3583             a1 = a2;
3584             a2 = a3;
3585             sub = 0xca; /* A?B:C */
3586         } else if (a0 == a2) {
3587             a2 = a3;
3588             sub = 0xe2; /* B?A:C */
3589         } else {
3590             tcg_out_mov(s, type, a0, a3);
3591             sub = 0xb8; /* B?C:A */
3592         }
3593         goto gen_simd_imm8;
3594
3595     gen_simd_imm8:
3596         tcg_debug_assert(insn != OPC_UD2);
3597         tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
3598         tcg_out8(s, sub);
3599         break;
3600
3601     case INDEX_op_x86_psrldq_vec:
3602         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3603         tcg_out8(s, a2);
3604         break;
3605
3606     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3607     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3608     default:
3609         g_assert_not_reached();
3610     }
3611 }
3612
3613 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3614 {
3615     switch (op) {
3616     case INDEX_op_goto_ptr:
3617         return C_O0_I1(r);
3618
3619     case INDEX_op_ld8u_i32:
3620     case INDEX_op_ld8u_i64:
3621     case INDEX_op_ld8s_i32:
3622     case INDEX_op_ld8s_i64:
3623     case INDEX_op_ld16u_i32:
3624     case INDEX_op_ld16u_i64:
3625     case INDEX_op_ld16s_i32:
3626     case INDEX_op_ld16s_i64:
3627     case INDEX_op_ld_i32:
3628     case INDEX_op_ld32u_i64:
3629     case INDEX_op_ld32s_i64:
3630     case INDEX_op_ld_i64:
3631         return C_O1_I1(r, r);
3632
3633     case INDEX_op_st8_i32:
3634     case INDEX_op_st8_i64:
3635         return C_O0_I2(qi, r);
3636
3637     case INDEX_op_st16_i32:
3638     case INDEX_op_st16_i64:
3639     case INDEX_op_st_i32:
3640     case INDEX_op_st32_i64:
3641         return C_O0_I2(ri, r);
3642
3643     case INDEX_op_st_i64:
3644         return C_O0_I2(re, r);
3645
3646     case INDEX_op_add_i32:
3647     case INDEX_op_add_i64:
3648         return C_O1_I2(r, r, re);
3649
3650     case INDEX_op_sub_i32:
3651     case INDEX_op_sub_i64:
3652     case INDEX_op_mul_i32:
3653     case INDEX_op_mul_i64:
3654     case INDEX_op_or_i32:
3655     case INDEX_op_or_i64:
3656     case INDEX_op_xor_i32:
3657     case INDEX_op_xor_i64:
3658         return C_O1_I2(r, 0, re);
3659
3660     case INDEX_op_and_i32:
3661     case INDEX_op_and_i64:
3662         return C_O1_I2(r, 0, reZ);
3663
3664     case INDEX_op_andc_i32:
3665     case INDEX_op_andc_i64:
3666         return C_O1_I2(r, r, rI);
3667
3668     case INDEX_op_shl_i32:
3669     case INDEX_op_shl_i64:
3670     case INDEX_op_shr_i32:
3671     case INDEX_op_shr_i64:
3672     case INDEX_op_sar_i32:
3673     case INDEX_op_sar_i64:
3674         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3675
3676     case INDEX_op_rotl_i32:
3677     case INDEX_op_rotl_i64:
3678     case INDEX_op_rotr_i32:
3679     case INDEX_op_rotr_i64:
3680         return C_O1_I2(r, 0, ci);
3681
3682     case INDEX_op_brcond_i32:
3683     case INDEX_op_brcond_i64:
3684         return C_O0_I2(r, reT);
3685
3686     case INDEX_op_bswap16_i32:
3687     case INDEX_op_bswap16_i64:
3688     case INDEX_op_bswap32_i32:
3689     case INDEX_op_bswap32_i64:
3690     case INDEX_op_bswap64_i64:
3691     case INDEX_op_neg_i32:
3692     case INDEX_op_neg_i64:
3693     case INDEX_op_not_i32:
3694     case INDEX_op_not_i64:
3695     case INDEX_op_extrh_i64_i32:
3696         return C_O1_I1(r, 0);
3697
3698     case INDEX_op_ext8s_i32:
3699     case INDEX_op_ext8s_i64:
3700     case INDEX_op_ext8u_i32:
3701     case INDEX_op_ext8u_i64:
3702         return C_O1_I1(r, q);
3703
3704     case INDEX_op_ext16s_i32:
3705     case INDEX_op_ext16s_i64:
3706     case INDEX_op_ext16u_i32:
3707     case INDEX_op_ext16u_i64:
3708     case INDEX_op_ext32s_i64:
3709     case INDEX_op_ext32u_i64:
3710     case INDEX_op_ext_i32_i64:
3711     case INDEX_op_extu_i32_i64:
3712     case INDEX_op_extrl_i64_i32:
3713     case INDEX_op_extract_i32:
3714     case INDEX_op_extract_i64:
3715     case INDEX_op_sextract_i32:
3716     case INDEX_op_ctpop_i32:
3717     case INDEX_op_ctpop_i64:
3718         return C_O1_I1(r, r);
3719
3720     case INDEX_op_extract2_i32:
3721     case INDEX_op_extract2_i64:
3722         return C_O1_I2(r, 0, r);
3723
3724     case INDEX_op_deposit_i32:
3725     case INDEX_op_deposit_i64:
3726         return C_O1_I2(q, 0, qi);
3727
3728     case INDEX_op_setcond_i32:
3729     case INDEX_op_setcond_i64:
3730     case INDEX_op_negsetcond_i32:
3731     case INDEX_op_negsetcond_i64:
3732         return C_O1_I2(q, r, reT);
3733
3734     case INDEX_op_movcond_i32:
3735     case INDEX_op_movcond_i64:
3736         return C_O1_I4(r, r, reT, r, 0);
3737
3738     case INDEX_op_div2_i32:
3739     case INDEX_op_div2_i64:
3740     case INDEX_op_divu2_i32:
3741     case INDEX_op_divu2_i64:
3742         return C_O2_I3(a, d, 0, 1, r);
3743
3744     case INDEX_op_mulu2_i32:
3745     case INDEX_op_mulu2_i64:
3746     case INDEX_op_muls2_i32:
3747     case INDEX_op_muls2_i64:
3748         return C_O2_I2(a, d, a, r);
3749
3750     case INDEX_op_add2_i32:
3751     case INDEX_op_add2_i64:
3752     case INDEX_op_sub2_i32:
3753     case INDEX_op_sub2_i64:
3754         return C_N1_O1_I4(r, r, 0, 1, re, re);
3755
3756     case INDEX_op_ctz_i32:
3757     case INDEX_op_ctz_i64:
3758         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3759
3760     case INDEX_op_clz_i32:
3761     case INDEX_op_clz_i64:
3762         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3763
3764     case INDEX_op_qemu_ld_a32_i32:
3765         return C_O1_I1(r, L);
3766     case INDEX_op_qemu_ld_a64_i32:
3767         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3768
3769     case INDEX_op_qemu_st_a32_i32:
3770         return C_O0_I2(L, L);
3771     case INDEX_op_qemu_st_a64_i32:
3772         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3773     case INDEX_op_qemu_st8_a32_i32:
3774         return C_O0_I2(s, L);
3775     case INDEX_op_qemu_st8_a64_i32:
3776         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3777
3778     case INDEX_op_qemu_ld_a32_i64:
3779         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3780     case INDEX_op_qemu_ld_a64_i64:
3781         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3782
3783     case INDEX_op_qemu_st_a32_i64:
3784         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3785     case INDEX_op_qemu_st_a64_i64:
3786         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3787
3788     case INDEX_op_qemu_ld_a32_i128:
3789     case INDEX_op_qemu_ld_a64_i128:
3790         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3791         return C_O2_I1(r, r, L);
3792     case INDEX_op_qemu_st_a32_i128:
3793     case INDEX_op_qemu_st_a64_i128:
3794         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3795         return C_O0_I3(L, L, L);
3796
3797     case INDEX_op_brcond2_i32:
3798         return C_O0_I4(r, r, ri, ri);
3799
3800     case INDEX_op_setcond2_i32:
3801         return C_O1_I4(r, r, r, ri, ri);
3802
3803     case INDEX_op_ld_vec:
3804     case INDEX_op_dupm_vec:
3805         return C_O1_I1(x, r);
3806
3807     case INDEX_op_st_vec:
3808         return C_O0_I2(x, r);
3809
3810     case INDEX_op_add_vec:
3811     case INDEX_op_sub_vec:
3812     case INDEX_op_mul_vec:
3813     case INDEX_op_and_vec:
3814     case INDEX_op_or_vec:
3815     case INDEX_op_xor_vec:
3816     case INDEX_op_andc_vec:
3817     case INDEX_op_orc_vec:
3818     case INDEX_op_nand_vec:
3819     case INDEX_op_nor_vec:
3820     case INDEX_op_eqv_vec:
3821     case INDEX_op_ssadd_vec:
3822     case INDEX_op_usadd_vec:
3823     case INDEX_op_sssub_vec:
3824     case INDEX_op_ussub_vec:
3825     case INDEX_op_smin_vec:
3826     case INDEX_op_umin_vec:
3827     case INDEX_op_smax_vec:
3828     case INDEX_op_umax_vec:
3829     case INDEX_op_shlv_vec:
3830     case INDEX_op_shrv_vec:
3831     case INDEX_op_sarv_vec:
3832     case INDEX_op_rotlv_vec:
3833     case INDEX_op_rotrv_vec:
3834     case INDEX_op_shls_vec:
3835     case INDEX_op_shrs_vec:
3836     case INDEX_op_sars_vec:
3837     case INDEX_op_cmp_vec:
3838     case INDEX_op_x86_shufps_vec:
3839     case INDEX_op_x86_blend_vec:
3840     case INDEX_op_x86_packss_vec:
3841     case INDEX_op_x86_packus_vec:
3842     case INDEX_op_x86_vperm2i128_vec:
3843     case INDEX_op_x86_punpckl_vec:
3844     case INDEX_op_x86_punpckh_vec:
3845     case INDEX_op_x86_vpshldi_vec:
3846 #if TCG_TARGET_REG_BITS == 32
3847     case INDEX_op_dup2_vec:
3848 #endif
3849         return C_O1_I2(x, x, x);
3850
3851     case INDEX_op_abs_vec:
3852     case INDEX_op_dup_vec:
3853     case INDEX_op_not_vec:
3854     case INDEX_op_shli_vec:
3855     case INDEX_op_shri_vec:
3856     case INDEX_op_sari_vec:
3857     case INDEX_op_rotli_vec:
3858     case INDEX_op_x86_psrldq_vec:
3859         return C_O1_I1(x, x);
3860
3861     case INDEX_op_x86_vpshldv_vec:
3862     case INDEX_op_x86_vpshrdv_vec:
3863         return C_O1_I3(x, 0, x, x);
3864
3865     case INDEX_op_bitsel_vec:
3866         return C_O1_I3(x, x, x, x);
3867     case INDEX_op_cmpsel_vec:
3868         return C_O1_I4(x, x, x, xO, x);
3869
3870     default:
3871         g_assert_not_reached();
3872     }
3873 }
3874
3875 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3876 {
3877     switch (opc) {
3878     case INDEX_op_add_vec:
3879     case INDEX_op_sub_vec:
3880     case INDEX_op_and_vec:
3881     case INDEX_op_or_vec:
3882     case INDEX_op_xor_vec:
3883     case INDEX_op_andc_vec:
3884     case INDEX_op_orc_vec:
3885     case INDEX_op_nand_vec:
3886     case INDEX_op_nor_vec:
3887     case INDEX_op_eqv_vec:
3888     case INDEX_op_not_vec:
3889     case INDEX_op_bitsel_vec:
3890         return 1;
3891     case INDEX_op_cmp_vec:
3892     case INDEX_op_cmpsel_vec:
3893         return -1;
3894
3895     case INDEX_op_rotli_vec:
3896         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3897
3898     case INDEX_op_shli_vec:
3899     case INDEX_op_shri_vec:
3900         /* We must expand the operation for MO_8.  */
3901         return vece == MO_8 ? -1 : 1;
3902
3903     case INDEX_op_sari_vec:
3904         switch (vece) {
3905         case MO_8:
3906             return -1;
3907         case MO_16:
3908         case MO_32:
3909             return 1;
3910         case MO_64:
3911             if (have_avx512vl) {
3912                 return 1;
3913             }
3914             /*
3915              * We can emulate this for MO_64, but it does not pay off
3916              * unless we're producing at least 4 values.
3917              */
3918             return type >= TCG_TYPE_V256 ? -1 : 0;
3919         }
3920         return 0;
3921
3922     case INDEX_op_shls_vec:
3923     case INDEX_op_shrs_vec:
3924         return vece >= MO_16;
3925     case INDEX_op_sars_vec:
3926         switch (vece) {
3927         case MO_16:
3928         case MO_32:
3929             return 1;
3930         case MO_64:
3931             return have_avx512vl;
3932         }
3933         return 0;
3934     case INDEX_op_rotls_vec:
3935         return vece >= MO_16 ? -1 : 0;
3936
3937     case INDEX_op_shlv_vec:
3938     case INDEX_op_shrv_vec:
3939         switch (vece) {
3940         case MO_16:
3941             return have_avx512bw;
3942         case MO_32:
3943         case MO_64:
3944             return have_avx2;
3945         }
3946         return 0;
3947     case INDEX_op_sarv_vec:
3948         switch (vece) {
3949         case MO_16:
3950             return have_avx512bw;
3951         case MO_32:
3952             return have_avx2;
3953         case MO_64:
3954             return have_avx512vl;
3955         }
3956         return 0;
3957     case INDEX_op_rotlv_vec:
3958     case INDEX_op_rotrv_vec:
3959         switch (vece) {
3960         case MO_16:
3961             return have_avx512vbmi2 ? -1 : 0;
3962         case MO_32:
3963         case MO_64:
3964             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3965         }
3966         return 0;
3967
3968     case INDEX_op_mul_vec:
3969         switch (vece) {
3970         case MO_8:
3971             return -1;
3972         case MO_64:
3973             return have_avx512dq;
3974         }
3975         return 1;
3976
3977     case INDEX_op_ssadd_vec:
3978     case INDEX_op_usadd_vec:
3979     case INDEX_op_sssub_vec:
3980     case INDEX_op_ussub_vec:
3981         return vece <= MO_16;
3982     case INDEX_op_smin_vec:
3983     case INDEX_op_smax_vec:
3984     case INDEX_op_umin_vec:
3985     case INDEX_op_umax_vec:
3986     case INDEX_op_abs_vec:
3987         return vece <= MO_32 || have_avx512vl;
3988
3989     default:
3990         return 0;
3991     }
3992 }
3993
3994 static void expand_vec_shi(TCGType type, unsigned vece, bool right,
3995                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3996 {
3997     uint8_t mask;
3998
3999     tcg_debug_assert(vece == MO_8);
4000     if (right) {
4001         mask = 0xff >> imm;
4002         tcg_gen_shri_vec(MO_16, v0, v1, imm);
4003     } else {
4004         mask = 0xff << imm;
4005         tcg_gen_shli_vec(MO_16, v0, v1, imm);
4006     }
4007     tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
4008 }
4009
4010 static void expand_vec_sari(TCGType type, unsigned vece,
4011                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
4012 {
4013     TCGv_vec t1, t2;
4014
4015     switch (vece) {
4016     case MO_8:
4017         /* Unpack to 16-bit, shift, and repack.  */
4018         t1 = tcg_temp_new_vec(type);
4019         t2 = tcg_temp_new_vec(type);
4020         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4021                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
4022         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4023                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
4024         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
4025         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
4026         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
4027                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
4028         tcg_temp_free_vec(t1);
4029         tcg_temp_free_vec(t2);
4030         break;
4031
4032     case MO_64:
4033         t1 = tcg_temp_new_vec(type);
4034         if (imm <= 32) {
4035             /*
4036              * We can emulate a small sign extend by performing an arithmetic
4037              * 32-bit shift and overwriting the high half of a 64-bit logical
4038              * shift.  Note that the ISA says shift of 32 is valid, but TCG
4039              * does not, so we have to bound the smaller shift -- we get the
4040              * same result in the high half either way.
4041              */
4042             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
4043             tcg_gen_shri_vec(MO_64, v0, v1, imm);
4044             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
4045                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
4046                       tcgv_vec_arg(t1), 0xaa);
4047         } else {
4048             /* Otherwise we will need to use a compare vs 0 to produce
4049              * the sign-extend, shift and merge.
4050              */
4051             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
4052                             tcg_constant_vec(type, MO_64, 0), v1);
4053             tcg_gen_shri_vec(MO_64, v0, v1, imm);
4054             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
4055             tcg_gen_or_vec(MO_64, v0, v0, t1);
4056         }
4057         tcg_temp_free_vec(t1);
4058         break;
4059
4060     default:
4061         g_assert_not_reached();
4062     }
4063 }
4064
4065 static void expand_vec_rotli(TCGType type, unsigned vece,
4066                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
4067 {
4068     TCGv_vec t;
4069
4070     if (vece != MO_8 && have_avx512vbmi2) {
4071         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
4072                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
4073         return;
4074     }
4075
4076     t = tcg_temp_new_vec(type);
4077     tcg_gen_shli_vec(vece, t, v1, imm);
4078     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
4079     tcg_gen_or_vec(vece, v0, v0, t);
4080     tcg_temp_free_vec(t);
4081 }
4082
4083 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
4084                             TCGv_vec v1, TCGv_vec sh, bool right)
4085 {
4086     TCGv_vec t;
4087
4088     if (have_avx512vbmi2) {
4089         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
4090                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
4091                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
4092         return;
4093     }
4094
4095     t = tcg_temp_new_vec(type);
4096     tcg_gen_dupi_vec(vece, t, 8 << vece);
4097     tcg_gen_sub_vec(vece, t, t, sh);
4098     if (right) {
4099         tcg_gen_shlv_vec(vece, t, v1, t);
4100         tcg_gen_shrv_vec(vece, v0, v1, sh);
4101     } else {
4102         tcg_gen_shrv_vec(vece, t, v1, t);
4103         tcg_gen_shlv_vec(vece, v0, v1, sh);
4104     }
4105     tcg_gen_or_vec(vece, v0, v0, t);
4106     tcg_temp_free_vec(t);
4107 }
4108
4109 static void expand_vec_rotls(TCGType type, unsigned vece,
4110                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
4111 {
4112     TCGv_vec t = tcg_temp_new_vec(type);
4113
4114     tcg_debug_assert(vece != MO_8);
4115
4116     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
4117         tcg_gen_dup_i32_vec(vece, t, lsh);
4118         if (vece >= MO_32) {
4119             tcg_gen_rotlv_vec(vece, v0, v1, t);
4120         } else {
4121             expand_vec_rotv(type, vece, v0, v1, t, false);
4122         }
4123     } else {
4124         TCGv_i32 rsh = tcg_temp_new_i32();
4125
4126         tcg_gen_neg_i32(rsh, lsh);
4127         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
4128         tcg_gen_shls_vec(vece, t, v1, lsh);
4129         tcg_gen_shrs_vec(vece, v0, v1, rsh);
4130         tcg_gen_or_vec(vece, v0, v0, t);
4131
4132         tcg_temp_free_i32(rsh);
4133     }
4134
4135     tcg_temp_free_vec(t);
4136 }
4137
4138 static void expand_vec_mul(TCGType type, unsigned vece,
4139                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
4140 {
4141     TCGv_vec t1, t2, t3, t4, zero;
4142
4143     tcg_debug_assert(vece == MO_8);
4144
4145     /*
4146      * Unpack v1 bytes to words, 0 | x.
4147      * Unpack v2 bytes to words, y | 0.
4148      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
4149      * Shift logical right by 8 bits to clear the high 8 bytes before
4150      * using an unsigned saturated pack.
4151      *
4152      * The difference between the V64, V128 and V256 cases is merely how
4153      * we distribute the expansion between temporaries.
4154      */
4155     switch (type) {
4156     case TCG_TYPE_V64:
4157         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
4158         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
4159         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
4160         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
4161                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4162         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
4163                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4164         tcg_gen_mul_vec(MO_16, t1, t1, t2);
4165         tcg_gen_shri_vec(MO_16, t1, t1, 8);
4166         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
4167                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
4168         tcg_temp_free_vec(t1);
4169         tcg_temp_free_vec(t2);
4170         break;
4171
4172     case TCG_TYPE_V128:
4173     case TCG_TYPE_V256:
4174         t1 = tcg_temp_new_vec(type);
4175         t2 = tcg_temp_new_vec(type);
4176         t3 = tcg_temp_new_vec(type);
4177         t4 = tcg_temp_new_vec(type);
4178         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
4179         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4180                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4181         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
4182                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4183         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4184                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
4185         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
4186                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
4187         tcg_gen_mul_vec(MO_16, t1, t1, t2);
4188         tcg_gen_mul_vec(MO_16, t3, t3, t4);
4189         tcg_gen_shri_vec(MO_16, t1, t1, 8);
4190         tcg_gen_shri_vec(MO_16, t3, t3, 8);
4191         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
4192                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
4193         tcg_temp_free_vec(t1);
4194         tcg_temp_free_vec(t2);
4195         tcg_temp_free_vec(t3);
4196         tcg_temp_free_vec(t4);
4197         break;
4198
4199     default:
4200         g_assert_not_reached();
4201     }
4202 }
4203
4204 static TCGCond expand_vec_cond(TCGType type, unsigned vece,
4205                                TCGArg *a1, TCGArg *a2, TCGCond cond)
4206 {
4207     /*
4208      * Without AVX512, there are no 64-bit unsigned comparisons.
4209      * We must bias the inputs so that they become signed.
4210      * All other swapping and inversion are handled during code generation.
4211      */
4212     if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
4213         TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
4214         TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
4215         TCGv_vec t1 = tcg_temp_new_vec(type);
4216         TCGv_vec t2 = tcg_temp_new_vec(type);
4217         TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4218
4219         tcg_gen_sub_vec(vece, t1, v1, t3);
4220         tcg_gen_sub_vec(vece, t2, v2, t3);
4221         *a1 = tcgv_vec_arg(t1);
4222         *a2 = tcgv_vec_arg(t2);
4223         cond = tcg_signed_cond(cond);
4224     }
4225     return cond;
4226 }
4227
4228 static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
4229                            TCGArg a1, TCGArg a2, TCGCond cond)
4230 {
4231     cond = expand_vec_cond(type, vece, &a1, &a2, cond);
4232     /* Expand directly; do not recurse.  */
4233     vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
4234 }
4235
4236 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
4237                               TCGArg a1, TCGArg a2,
4238                               TCGArg a3, TCGArg a4, TCGCond cond)
4239 {
4240     cond = expand_vec_cond(type, vece, &a1, &a2, cond);
4241     /* Expand directly; do not recurse.  */
4242     vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
4243 }
4244
4245 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4246                        TCGArg a0, ...)
4247 {
4248     va_list va;
4249     TCGArg a1, a2, a3, a4, a5;
4250     TCGv_vec v0, v1, v2;
4251
4252     va_start(va, a0);
4253     a1 = va_arg(va, TCGArg);
4254     a2 = va_arg(va, TCGArg);
4255     v0 = temp_tcgv_vec(arg_temp(a0));
4256     v1 = temp_tcgv_vec(arg_temp(a1));
4257
4258     switch (opc) {
4259     case INDEX_op_shli_vec:
4260         expand_vec_shi(type, vece, false, v0, v1, a2);
4261         break;
4262     case INDEX_op_shri_vec:
4263         expand_vec_shi(type, vece, true, v0, v1, a2);
4264         break;
4265     case INDEX_op_sari_vec:
4266         expand_vec_sari(type, vece, v0, v1, a2);
4267         break;
4268
4269     case INDEX_op_rotli_vec:
4270         expand_vec_rotli(type, vece, v0, v1, a2);
4271         break;
4272
4273     case INDEX_op_rotls_vec:
4274         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4275         break;
4276
4277     case INDEX_op_rotlv_vec:
4278         v2 = temp_tcgv_vec(arg_temp(a2));
4279         expand_vec_rotv(type, vece, v0, v1, v2, false);
4280         break;
4281     case INDEX_op_rotrv_vec:
4282         v2 = temp_tcgv_vec(arg_temp(a2));
4283         expand_vec_rotv(type, vece, v0, v1, v2, true);
4284         break;
4285
4286     case INDEX_op_mul_vec:
4287         v2 = temp_tcgv_vec(arg_temp(a2));
4288         expand_vec_mul(type, vece, v0, v1, v2);
4289         break;
4290
4291     case INDEX_op_cmp_vec:
4292         a3 = va_arg(va, TCGArg);
4293         expand_vec_cmp(type, vece, a0, a1, a2, a3);
4294         break;
4295
4296     case INDEX_op_cmpsel_vec:
4297         a3 = va_arg(va, TCGArg);
4298         a4 = va_arg(va, TCGArg);
4299         a5 = va_arg(va, TCGArg);
4300         expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
4301         break;
4302
4303     default:
4304         break;
4305     }
4306
4307     va_end(va);
4308 }
4309
4310 static const int tcg_target_callee_save_regs[] = {
4311 #if TCG_TARGET_REG_BITS == 64
4312     TCG_REG_RBP,
4313     TCG_REG_RBX,
4314 #if defined(_WIN64)
4315     TCG_REG_RDI,
4316     TCG_REG_RSI,
4317 #endif
4318     TCG_REG_R12,
4319     TCG_REG_R13,
4320     TCG_REG_R14, /* Currently used for the global env. */
4321     TCG_REG_R15,
4322 #else
4323     TCG_REG_EBP, /* Currently used for the global env. */
4324     TCG_REG_EBX,
4325     TCG_REG_ESI,
4326     TCG_REG_EDI,
4327 #endif
4328 };
4329
4330 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
4331    and tcg_register_jit.  */
4332
4333 #define PUSH_SIZE \
4334     ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4335      * (TCG_TARGET_REG_BITS / 8))
4336
4337 #define FRAME_SIZE \
4338     ((PUSH_SIZE \
4339       + TCG_STATIC_CALL_ARGS_SIZE \
4340       + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4341       + TCG_TARGET_STACK_ALIGN - 1) \
4342      & ~(TCG_TARGET_STACK_ALIGN - 1))
4343
4344 /* Generate global QEMU prologue and epilogue code */
4345 static void tcg_target_qemu_prologue(TCGContext *s)
4346 {
4347     int i, stack_addend;
4348
4349     /* TB prologue */
4350
4351     /* Reserve some stack space, also for TCG temps.  */
4352     stack_addend = FRAME_SIZE - PUSH_SIZE;
4353     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4354                   CPU_TEMP_BUF_NLONGS * sizeof(long));
4355
4356     /* Save all callee saved registers.  */
4357     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4358         tcg_out_push(s, tcg_target_callee_save_regs[i]);
4359     }
4360
4361     if (!tcg_use_softmmu && guest_base) {
4362         int seg = setup_guest_base_seg();
4363         if (seg != 0) {
4364             x86_guest_base.seg = seg;
4365         } else if (guest_base == (int32_t)guest_base) {
4366             x86_guest_base.ofs = guest_base;
4367         } else {
4368             assert(TCG_TARGET_REG_BITS == 64);
4369             /* Choose R12 because, as a base, it requires a SIB byte. */
4370             x86_guest_base.index = TCG_REG_R12;
4371             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4372             tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4373         }
4374     }
4375
4376     if (TCG_TARGET_REG_BITS == 32) {
4377         tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4378                    (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4379         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4380         /* jmp *tb.  */
4381         tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4382                              (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4383                              + stack_addend);
4384     } else {
4385         tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4386         tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4387         /* jmp *tb.  */
4388         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4389     }
4390
4391     /*
4392      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4393      * and fall through to the rest of the epilogue.
4394      */
4395     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4396     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4397
4398     /* TB epilogue */
4399     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4400
4401     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4402
4403     if (have_avx2) {
4404         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4405     }
4406     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4407         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4408     }
4409     tcg_out_opc(s, OPC_RET, 0, 0, 0);
4410 }
4411
4412 static void tcg_out_tb_start(TCGContext *s)
4413 {
4414     /* nothing to do */
4415 }
4416
4417 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4418 {
4419     memset(p, 0x90, count);
4420 }
4421
4422 static void tcg_target_init(TCGContext *s)
4423 {
4424     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4425     if (TCG_TARGET_REG_BITS == 64) {
4426         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4427     }
4428     if (have_avx1) {
4429         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4430         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4431     }
4432     if (have_avx2) {
4433         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4434     }
4435
4436     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4437     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4438     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4439     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4440     if (TCG_TARGET_REG_BITS == 64) {
4441 #if !defined(_WIN64)
4442         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4443         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4444 #endif
4445         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4446         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4447         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4448         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4449     }
4450
4451     s->reserved_regs = 0;
4452     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4453     tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4454 #ifdef _WIN64
4455     /* These are call saved, and we don't save them, so don't use them. */
4456     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4457     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4458     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4459     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4460     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4461     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4462     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4463     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4464     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4465     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4466 #endif
4467 }
4468
4469 typedef struct {
4470     DebugFrameHeader h;
4471     uint8_t fde_def_cfa[4];
4472     uint8_t fde_reg_ofs[14];
4473 } DebugFrame;
4474
4475 /* We're expecting a 2 byte uleb128 encoded value.  */
4476 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4477
4478 #if !defined(__ELF__)
4479     /* Host machine without ELF. */
4480 #elif TCG_TARGET_REG_BITS == 64
4481 #define ELF_HOST_MACHINE EM_X86_64
4482 static const DebugFrame debug_frame = {
4483     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4484     .h.cie.id = -1,
4485     .h.cie.version = 1,
4486     .h.cie.code_align = 1,
4487     .h.cie.data_align = 0x78,             /* sleb128 -8 */
4488     .h.cie.return_column = 16,
4489
4490     /* Total FDE size does not include the "len" member.  */
4491     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4492
4493     .fde_def_cfa = {
4494         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4495         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4496         (FRAME_SIZE >> 7)
4497     },
4498     .fde_reg_ofs = {
4499         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4500         /* The following ordering must match tcg_target_callee_save_regs.  */
4501         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4502         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4503         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4504         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4505         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4506         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4507     }
4508 };
4509 #else
4510 #define ELF_HOST_MACHINE EM_386
4511 static const DebugFrame debug_frame = {
4512     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4513     .h.cie.id = -1,
4514     .h.cie.version = 1,
4515     .h.cie.code_align = 1,
4516     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4517     .h.cie.return_column = 8,
4518
4519     /* Total FDE size does not include the "len" member.  */
4520     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4521
4522     .fde_def_cfa = {
4523         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4524         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4525         (FRAME_SIZE >> 7)
4526     },
4527     .fde_reg_ofs = {
4528         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4529         /* The following ordering must match tcg_target_callee_save_regs.  */
4530         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4531         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4532         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4533         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4534     }
4535 };
4536 #endif
4537
4538 #if defined(ELF_HOST_MACHINE)
4539 void tcg_register_jit(const void *buf, size_t buf_size)
4540 {
4541     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4542 }
4543 #endif