tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
  28 #ifdef CONFIG_DEBUG_TCG
  29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  30 #if TCG_TARGET_REG_BITS == 64
  31     "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
  32 #else
  33     "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
  34 #endif
  35     "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
  36     "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
  37 #if TCG_TARGET_REG_BITS == 64
  38     "%xmm8", "%xmm9", "%xmm10", "%xmm11",
  39     "%xmm12", "%xmm13", "%xmm14", "%xmm15",
  40 #endif
  41 };
  42 #endif
  43
  44 static const int tcg_target_reg_alloc_order[] = {
  45 #if TCG_TARGET_REG_BITS == 64
  46     TCG_REG_RBP,
  47     TCG_REG_RBX,
  48     TCG_REG_R12,
  49     TCG_REG_R13,
  50     TCG_REG_R14,
  51     TCG_REG_R15,
  52     TCG_REG_R10,
  53     TCG_REG_R11,
  54     TCG_REG_R9,
  55     TCG_REG_R8,
  56     TCG_REG_RCX,
  57     TCG_REG_RDX,
  58     TCG_REG_RSI,
  59     TCG_REG_RDI,
  60     TCG_REG_RAX,
  61 #else
  62     TCG_REG_EBX,
  63     TCG_REG_ESI,
  64     TCG_REG_EDI,
  65     TCG_REG_EBP,
  66     TCG_REG_ECX,
  67     TCG_REG_EDX,
  68     TCG_REG_EAX,
  69 #endif
  70     TCG_REG_XMM0,
  71     TCG_REG_XMM1,
  72     TCG_REG_XMM2,
  73     TCG_REG_XMM3,
  74     TCG_REG_XMM4,
  75     TCG_REG_XMM5,
  76 #ifndef _WIN64
  77     /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save
  78        any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
  79     TCG_REG_XMM6,
  80     TCG_REG_XMM7,
  81 #if TCG_TARGET_REG_BITS == 64
  82     TCG_REG_XMM8,
  83     TCG_REG_XMM9,
  84     TCG_REG_XMM10,
  85     TCG_REG_XMM11,
  86     TCG_REG_XMM12,
  87     TCG_REG_XMM13,
  88     TCG_REG_XMM14,
  89     TCG_REG_XMM15,
  90 #endif
  91 #endif
  92 };
  93
  94 static const int tcg_target_call_iarg_regs[] = {
  95 #if TCG_TARGET_REG_BITS == 64
  96 #if defined(_WIN64)
  97     TCG_REG_RCX,
  98     TCG_REG_RDX,
  99 #else
 100     TCG_REG_RDI,
 101     TCG_REG_RSI,
 102     TCG_REG_RDX,
 103     TCG_REG_RCX,
 104 #endif
 105     TCG_REG_R8,
 106     TCG_REG_R9,
 107 #else
 108     /* 32 bit mode uses stack based calling convention (GCC default). */
 109 #endif
 110 };
 111
 112 static const int tcg_target_call_oarg_regs[] = {
 113     TCG_REG_EAX,
 114 #if TCG_TARGET_REG_BITS == 32
 115     TCG_REG_EDX
 116 #endif
 117 };
 118
/* Constants we accept.  These constraint bits are interpreted by
   tcg_target_const_match(); for TCG_TYPE_I32 operands any of the first
   three accepts every value.  */
#define TCG_CT_CONST_S32 0x100  /* val == (int32_t)val */
#define TCG_CT_CONST_U32 0x200  /* val == (uint32_t)val */
#define TCG_CT_CONST_I32 0x400  /* ~val == (int32_t)~val */
#define TCG_CT_CONST_WSZ 0x800  /* val is the operand width: 32 or 64 */
 124
/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386. */
#if TCG_TARGET_REG_BITS == 64
/* Note: these expand to array reads, not integer constant expressions. */
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* Register-set bitmasks: bit N stands for register number N.  Integer
   registers occupy the low bits; vector registers start at bit 16.  */
#define ALL_BYTEH_REGS         0x0000000fu  /* registers 0-3 only */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
/* Every general register has a low-byte form on 64-bit hosts. */
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
/* On 32-bit hosts the low-byte set is the same as the high-byte set. */
# define ALL_BYTEL_REGS        ALL_BYTEH_REGS
#endif
/* Mask of the two L-constraint registers when CONFIG_SOFTMMU is defined,
   otherwise empty.  */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
 151
/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
/* 32-bit host with <cpuid.h>: a variable, initially false.
   NOTE(review): presumably set by a CPUID probe elsewhere in this file.  */
static bool have_cmov;
#else
/* 32-bit host without <cpuid.h>: assume CMOV is absent. */
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;
bool have_avx512bw;
bool have_avx512dq;
bool have_avx512vbmi2;
bool have_avx512vl;
bool have_movbe;

/* These two are private to this file: variables when <cpuid.h> is
   available, compile-time zero otherwise.  */
#ifdef CONFIG_CPUID_H
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_bmi2 0
# define have_lzcnt 0
#endif
 187
/* NOTE(review): presumably the code address that generated code jumps to
   when leaving a translation block; it is assigned outside this section
   of the file -- confirm against the prologue generator.  */
static const tcg_insn_unit *tb_ret_addr;
 189
 190 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 191                         intptr_t value, intptr_t addend)
 192 {
 193     value += addend;
 194     switch(type) {
 195     case R_386_PC32:
 196         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 197         if (value != (int32_t)value) {
 198             return false;
 199         }
 200         /* FALLTHRU */
 201     case R_386_32:
 202         tcg_patch32(code_ptr, value);
 203         break;
 204     case R_386_PC8:
 205         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 206         if (value != (int8_t)value) {
 207             return false;
 208         }
 209         tcg_patch8(code_ptr, value);
 210         break;
 211     default:
 212         tcg_abort();
 213     }
 214     return true;
 215 }
 216
 217 /* test if a constant matches the constraint */
 218 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 219 {
 220     if (ct & TCG_CT_CONST) {
 221         return 1;
 222     }
 223     if (type == TCG_TYPE_I32) {
 224         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 225             return 1;
 226         }
 227     } else {
 228         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 229             return 1;
 230         }
 231         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 232             return 1;
 233         }
 234         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 235             return 1;
 236         }
 237     }
 238     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 239         return 1;
 240     }
 241     return 0;
 242 }
 243
/* The three bits of a register number that fit a ModRM/SIB field; bit 3
   is carried separately by the REX/VEX/EVEX prefix.  */
# define LOWREGMASK(x)  ((x) & 7)

/* Flag bits ORed into an opcode value above its low byte.  The opcode
   emitters in this file turn them into prefix and escape bytes.  */
#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
/* These flags are zero on 32-bit hosts, so ORing them in is a no-op. */
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
 266
/* Opcode values.  The low byte is the final opcode byte; the P_* flags
   ORed in above it select the prefix and escape bytes that tcg_out_opc(),
   tcg_out_vex_opc() and tcg_out_evex_opc() emit in front of it.  */
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
 376 #define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
 377 #define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
 378 #define OPC_VPSRAQ      (0x72 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
/* SARX, SHLX and SHRX share opcode byte 0xf7 in the 0x0f 0x38 map and
   differ only in the prefix flag.  */
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
/* Entries carrying P_EVEX are routed to tcg_out_evex_opc() by
   tcg_out_vex_modrm(); the others use the VEX encoder.  */
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)

/* Group opcodes: the operation is selected by the EXT3_* and EXT5_*
   opcode-extension values defined further down.  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16) /* same value as OPC_PSHIFTQ_Ib */
 459
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
/* Shifted left by three they also select the OPC_ARITH_GvEv variant, as
   in OPC_ADD_GvEv, OPC_AND_GvEv and OPC_CMP_GvEv.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4

/* Condition codes to be added to OPC_JCC_{long,short}.  */
/* JCC_JMP is not a 4-bit condition code.  NOTE(review): presumably a
   sentinel meaning "unconditional jump" for emitters that take a JCC_*
   argument; confirm against the jump emitter.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
 511
/* Map a TCG comparison condition to the x86 condition code (JCC_*).
   Signed comparisons use the L/GE/LE/G codes and unsigned ones the
   B/AE/BE/A codes.  Any index not listed here is zero-initialized, which
   is the same value as JCC_JO.  */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
 524
 525 #if TCG_TARGET_REG_BITS == 64
 526 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 527 {
 528     int rex;
 529
 530     if (opc & P_GS) {
 531         tcg_out8(s, 0x65);
 532     }
 533     if (opc & P_DATA16) {
 534         /* We should never be asking for both 16 and 64-bit operation.  */
 535         tcg_debug_assert((opc & P_REXW) == 0);
 536         tcg_out8(s, 0x66);
 537     }
 538     if (opc & P_SIMDF3) {
 539         tcg_out8(s, 0xf3);
 540     } else if (opc & P_SIMDF2) {
 541         tcg_out8(s, 0xf2);
 542     }
 543
 544     rex = 0;
 545     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 546     rex |= (r & 8) >> 1;                /* REX.R */
 547     rex |= (x & 8) >> 2;                /* REX.X */
 548     rex |= (rm & 8) >> 3;               /* REX.B */
 549
 550     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 551        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 552        as otherwise the encoding indicates %[abcd]h.  Note that the values
 553        that are ORed in merely indicate that the REX byte must be present;
 554        those bits get discarded in output.  */
 555     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 556     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 557
 558     if (rex) {
 559         tcg_out8(s, (uint8_t)(rex | 0x40));
 560     }
 561
 562     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 563         tcg_out8(s, 0x0f);
 564         if (opc & P_EXT38) {
 565             tcg_out8(s, 0x38);
 566         } else if (opc & P_EXT3A) {
 567             tcg_out8(s, 0x3a);
 568         }
 569     }
 570
 571     tcg_out8(s, opc);
 572 }
 573 #else
 574 static void tcg_out_opc(TCGContext *s, int opc)
 575 {
 576     if (opc & P_DATA16) {
 577         tcg_out8(s, 0x66);
 578     }
 579     if (opc & P_SIMDF3) {
 580         tcg_out8(s, 0xf3);
 581     } else if (opc & P_SIMDF2) {
 582         tcg_out8(s, 0xf2);
 583     }
 584     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 585         tcg_out8(s, 0x0f);
 586         if (opc & P_EXT38) {
 587             tcg_out8(s, 0x38);
 588         } else if (opc & P_EXT3A) {
 589             tcg_out8(s, 0x3a);
 590         }
 591     }
 592     tcg_out8(s, opc);
 593 }
 594 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 595    the 32-bit compilation paths.  This method works with all versions of gcc,
 596    whereas relying on optimization may not be able to exclude them.  */
 597 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 598 #endif
 599
 600 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 601 {
 602     tcg_out_opc(s, opc, r, rm, 0);
 603     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 604 }
 605
 606 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 607                             int rm, int index)
 608 {
 609     int tmp;
 610
 611     /* Use the two byte form if possible, which cannot encode
 612        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 613     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 614         && ((rm | index) & 8) == 0) {
 615         /* Two byte VEX prefix.  */
 616         tcg_out8(s, 0xc5);
 617
 618         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 619     } else {
 620         /* Three byte VEX prefix.  */
 621         tcg_out8(s, 0xc4);
 622
 623         /* VEX.m-mmmm */
 624         if (opc & P_EXT3A) {
 625             tmp = 3;
 626         } else if (opc & P_EXT38) {
 627             tmp = 2;
 628         } else if (opc & P_EXT) {
 629             tmp = 1;
 630         } else {
 631             g_assert_not_reached();
 632         }
 633         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 634         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 635         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 636         tcg_out8(s, tmp);
 637
 638         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 639     }
 640
 641     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 642     /* VEX.pp */
 643     if (opc & P_DATA16) {
 644         tmp |= 1;                          /* 0x66 */
 645     } else if (opc & P_SIMDF3) {
 646         tmp |= 2;                          /* 0xf3 */
 647     } else if (opc & P_SIMDF2) {
 648         tmp |= 3;                          /* 0xf2 */
 649     }
 650     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 651     tcg_out8(s, tmp);
 652     tcg_out8(s, opc);
 653 }
 654
 655 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 656                              int rm, int index)
 657 {
 658     /* The entire 4-byte evex prefix; with R' and V' set. */
 659     uint32_t p = 0x08041062;
 660     int mm, pp;
 661
 662     tcg_debug_assert(have_avx512vl);
 663
 664     /* EVEX.mm */
 665     if (opc & P_EXT3A) {
 666         mm = 3;
 667     } else if (opc & P_EXT38) {
 668         mm = 2;
 669     } else if (opc & P_EXT) {
 670         mm = 1;
 671     } else {
 672         g_assert_not_reached();
 673     }
 674
 675     /* EVEX.pp */
 676     if (opc & P_DATA16) {
 677         pp = 1;                          /* 0x66 */
 678     } else if (opc & P_SIMDF3) {
 679         pp = 2;                          /* 0xf3 */
 680     } else if (opc & P_SIMDF2) {
 681         pp = 3;                          /* 0xf2 */
 682     } else {
 683         pp = 0;
 684     }
 685
 686     p = deposit32(p, 8, 2, mm);
 687     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 688     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 689     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 690     p = deposit32(p, 16, 2, pp);
 691     p = deposit32(p, 19, 4, ~v);
 692     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 693     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 694
 695     tcg_out32(s, p);
 696     tcg_out8(s, opc);
 697 }
 698
 699 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 700 {
 701     if (opc & P_EVEX) {
 702         tcg_out_evex_opc(s, opc, r, v, rm, 0);
 703     } else {
 704         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 705     }
 706     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 707 }
 708
 709 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 710    We handle either RM and INDEX missing with a negative value.  In 64-bit
 711    mode for absolute addresses, ~RM is the size of the immediate operand
 712    that will follow the instruction.  */
 713
 714 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 715                                int shift, intptr_t offset)
 716 {
 717     int mod, len;
 718
 719     if (index < 0 && rm < 0) {
 720         if (TCG_TARGET_REG_BITS == 64) {
 721             /* Try for a rip-relative addressing mode.  This has replaced
 722                the 32-bit-mode absolute addressing encoding.  */
 723             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 724             intptr_t disp = offset - pc;
 725             if (disp == (int32_t)disp) {
 726                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 727                 tcg_out32(s, disp);
 728                 return;
 729             }
 730
 731             /* Try for an absolute address encoding.  This requires the
 732                use of the MODRM+SIB encoding and is therefore larger than
 733                rip-relative addressing.  */
 734             if (offset == (int32_t)offset) {
 735                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 736                 tcg_out8(s, (4 << 3) | 5);
 737                 tcg_out32(s, offset);
 738                 return;
 739             }
 740
 741             /* ??? The memory isn't directly addressable.  */
 742             g_assert_not_reached();
 743         } else {
 744             /* Absolute address.  */
 745             tcg_out8(s, (r << 3) | 5);
 746             tcg_out32(s, offset);
 747             return;
 748         }
 749     }
 750
 751     /* Find the length of the immediate addend.  Note that the encoding
 752        that would be used for (%ebp) indicates absolute addressing.  */
 753     if (rm < 0) {
 754         mod = 0, len = 4, rm = 5;
 755     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 756         mod = 0, len = 0;
 757     } else if (offset == (int8_t)offset) {
 758         mod = 0x40, len = 1;
 759     } else {
 760         mod = 0x80, len = 4;
 761     }
 762
 763     /* Use a single byte MODRM format if possible.  Note that the encoding
 764        that would be used for %esp is the escape to the two byte form.  */
 765     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 766         /* Single byte MODRM format.  */
 767         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 768     } else {
 769         /* Two byte MODRM+SIB format.  */
 770
 771         /* Note that the encoding that would place %esp into the index
 772            field indicates no index register.  In 64-bit mode, the REX.X
 773            bit counts, so %r12 can be used as the index.  */
 774         if (index < 0) {
 775             index = 4;
 776         } else {
 777             tcg_debug_assert(index != TCG_REG_ESP);
 778         }
 779
 780         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 781         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 782     }
 783
 784     if (len == 1) {
 785         tcg_out8(s, offset);
 786     } else if (len == 4) {
 787         tcg_out32(s, offset);
 788     }
 789 }
 790
 791 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 792                                      int index, int shift, intptr_t offset)
 793 {
 794     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 795     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 796 }
 797
 798 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 799                                          int rm, int index, int shift,
 800                                          intptr_t offset)
 801 {
 802     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 803     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 804 }
 805
 806 /* A simplification of the above with no index or shift.  */
 807 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 808                                         int rm, intptr_t offset)
 809 {
 810     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 811 }
 812
 813 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 814                                             int v, int rm, intptr_t offset)
 815 {
 816     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 817 }
 818
 819 /* Output an opcode with an expected reference to the constant pool.  */
 820 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 821 {
 822     tcg_out_opc(s, opc, r, 0, 0);
 823     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 824     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 825     tcg_out32(s, 0);
 826 }
 827
 828 /* Output an opcode with an expected reference to the constant pool.  */
 829 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 830 {
 831     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 832     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 833     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 834     tcg_out32(s, 0);
 835 }
 836
 837 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 838 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 839 {
 840     /* Propagate an opcode prefix, such as P_REXW.  */
 841     int ext = subop & ~0x7;
 842     subop &= 0x7;
 843
 844     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 845 }
 846
 847 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 848 {
 849     int rexw = 0;
 850
 851     if (arg == ret) {
 852         return true;
 853     }
 854     switch (type) {
 855     case TCG_TYPE_I64:
 856         rexw = P_REXW;
 857         /* fallthru */
 858     case TCG_TYPE_I32:
 859         if (ret < 16) {
 860             if (arg < 16) {
 861                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 862             } else {
 863                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 864             }
 865         } else {
 866             if (arg < 16) {
 867                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 868             } else {
 869                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 870             }
 871         }
 872         break;
 873
 874     case TCG_TYPE_V64:
 875         tcg_debug_assert(ret >= 16 && arg >= 16);
 876         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 877         break;
 878     case TCG_TYPE_V128:
 879         tcg_debug_assert(ret >= 16 && arg >= 16);
 880         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 881         break;
 882     case TCG_TYPE_V256:
 883         tcg_debug_assert(ret >= 16 && arg >= 16);
 884         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 885         break;
 886
 887     default:
 888         g_assert_not_reached();
 889     }
 890     return true;
 891 }
 892
 893 static const int avx2_dup_insn[4] = {
 894     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 895     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 896 };
 897
 898 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 899                             TCGReg r, TCGReg a)
 900 {
 901     if (have_avx2) {
 902         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 903         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 904     } else {
 905         switch (vece) {
 906         case MO_8:
 907             /* ??? With zero in a register, use PSHUFB.  */
 908             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 909             a = r;
 910             /* FALLTHRU */
 911         case MO_16:
 912             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 913             a = r;
 914             /* FALLTHRU */
 915         case MO_32:
 916             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 917             /* imm8 operand: all output lanes selected from input lane 0.  */
 918             tcg_out8(s, 0);
 919             break;
 920         case MO_64:
 921             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 922             break;
 923         default:
 924             g_assert_not_reached();
 925         }
 926     }
 927     return true;
 928 }
 929
 930 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 931                              TCGReg r, TCGReg base, intptr_t offset)
 932 {
 933     if (have_avx2) {
 934         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 935         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 936                                  r, 0, base, offset);
 937     } else {
 938         switch (vece) {
 939         case MO_64:
 940             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 941             break;
 942         case MO_32:
 943             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 944             break;
 945         case MO_16:
 946             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 947             tcg_out8(s, 0); /* imm8 */
 948             tcg_out_dup_vec(s, type, vece, r, r);
 949             break;
 950         case MO_8:
 951             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 952             tcg_out8(s, 0); /* imm8 */
 953             tcg_out_dup_vec(s, type, vece, r, r);
 954             break;
 955         default:
 956             g_assert_not_reached();
 957         }
 958     }
 959     return true;
 960 }
 961
 962 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 963                              TCGReg ret, int64_t arg)
 964 {
 965     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 966
 967     if (arg == 0) {
 968         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 969         return;
 970     }
 971     if (arg == -1) {
 972         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 973         return;
 974     }
 975
 976     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 977         if (have_avx2) {
 978             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 979         } else {
 980             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 981         }
 982         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 983     } else {
 984         if (type == TCG_TYPE_V64) {
 985             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 986         } else if (have_avx2) {
 987             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 988         } else {
 989             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 990         }
 991         if (TCG_TARGET_REG_BITS == 64) {
 992             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 993         } else {
 994             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 995         }
 996     }
 997 }
 998
 999 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
1000                              TCGReg ret, tcg_target_long arg)
1001 {
1002     if (arg == 0) {
1003         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
1004         return;
1005     }
1006     if (arg == -1) {
1007         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
1008         return;
1009     }
1010
1011     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1012     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1013     if (TCG_TARGET_REG_BITS == 64) {
1014         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1015     } else {
1016         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1017     }
1018 }
1019
1020 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1021                              TCGReg ret, tcg_target_long arg)
1022 {
1023     tcg_target_long diff;
1024
1025     if (arg == 0) {
1026         tgen_arithr(s, ARITH_XOR, ret, ret);
1027         return;
1028     }
1029     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1030         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1031         tcg_out32(s, arg);
1032         return;
1033     }
1034     if (arg == (int32_t)arg) {
1035         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1036         tcg_out32(s, arg);
1037         return;
1038     }
1039
1040     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1041     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1042     if (diff == (int32_t)diff) {
1043         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1044         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1045         tcg_out32(s, diff);
1046         return;
1047     }
1048
1049     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1050     tcg_out64(s, arg);
1051 }
1052
1053 static void tcg_out_movi(TCGContext *s, TCGType type,
1054                          TCGReg ret, tcg_target_long arg)
1055 {
1056     switch (type) {
1057     case TCG_TYPE_I32:
1058 #if TCG_TARGET_REG_BITS == 64
1059     case TCG_TYPE_I64:
1060 #endif
1061         if (ret < 16) {
1062             tcg_out_movi_int(s, type, ret, arg);
1063         } else {
1064             tcg_out_movi_vec(s, type, ret, arg);
1065         }
1066         break;
1067     default:
1068         g_assert_not_reached();
1069     }
1070 }
1071
1072 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1073 {
1074     if (val == (int8_t)val) {
1075         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1076         tcg_out8(s, val);
1077     } else if (val == (int32_t)val) {
1078         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1079         tcg_out32(s, val);
1080     } else {
1081         tcg_abort();
1082     }
1083 }
1084
1085 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1086 {
1087     /* Given the strength of x86 memory ordering, we only need care for
1088        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1089        faster than "mfence", so don't bother with the sse insn.  */
1090     if (a0 & TCG_MO_ST_LD) {
1091         tcg_out8(s, 0xf0);
1092         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1093         tcg_out8(s, 0);
1094     }
1095 }
1096
1097 static inline void tcg_out_push(TCGContext *s, int reg)
1098 {
1099     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1100 }
1101
1102 static inline void tcg_out_pop(TCGContext *s, int reg)
1103 {
1104     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1105 }
1106
1107 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1108                        TCGReg arg1, intptr_t arg2)
1109 {
1110     switch (type) {
1111     case TCG_TYPE_I32:
1112         if (ret < 16) {
1113             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1114         } else {
1115             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1116         }
1117         break;
1118     case TCG_TYPE_I64:
1119         if (ret < 16) {
1120             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1121             break;
1122         }
1123         /* FALLTHRU */
1124     case TCG_TYPE_V64:
1125         /* There is no instruction that can validate 8-byte alignment.  */
1126         tcg_debug_assert(ret >= 16);
1127         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1128         break;
1129     case TCG_TYPE_V128:
1130         /*
1131          * The gvec infrastructure is asserts that v128 vector loads
1132          * and stores use a 16-byte aligned offset.  Validate that the
1133          * final pointer is aligned by using an insn that will SIGSEGV.
1134          */
1135         tcg_debug_assert(ret >= 16);
1136         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1137         break;
1138     case TCG_TYPE_V256:
1139         /*
1140          * The gvec infrastructure only requires 16-byte alignment,
1141          * so here we must use an unaligned load.
1142          */
1143         tcg_debug_assert(ret >= 16);
1144         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1145                                  ret, 0, arg1, arg2);
1146         break;
1147     default:
1148         g_assert_not_reached();
1149     }
1150 }
1151
1152 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1153                        TCGReg arg1, intptr_t arg2)
1154 {
1155     switch (type) {
1156     case TCG_TYPE_I32:
1157         if (arg < 16) {
1158             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1159         } else {
1160             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1161         }
1162         break;
1163     case TCG_TYPE_I64:
1164         if (arg < 16) {
1165             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1166             break;
1167         }
1168         /* FALLTHRU */
1169     case TCG_TYPE_V64:
1170         /* There is no instruction that can validate 8-byte alignment.  */
1171         tcg_debug_assert(arg >= 16);
1172         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1173         break;
1174     case TCG_TYPE_V128:
1175         /*
1176          * The gvec infrastructure is asserts that v128 vector loads
1177          * and stores use a 16-byte aligned offset.  Validate that the
1178          * final pointer is aligned by using an insn that will SIGSEGV.
1179          */
1180         tcg_debug_assert(arg >= 16);
1181         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1182         break;
1183     case TCG_TYPE_V256:
1184         /*
1185          * The gvec infrastructure only requires 16-byte alignment,
1186          * so here we must use an unaligned store.
1187          */
1188         tcg_debug_assert(arg >= 16);
1189         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1190                                  arg, 0, arg1, arg2);
1191         break;
1192     default:
1193         g_assert_not_reached();
1194     }
1195 }
1196
1197 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1198                         TCGReg base, intptr_t ofs)
1199 {
1200     int rexw = 0;
1201     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1202         if (val != (int32_t)val) {
1203             return false;
1204         }
1205         rexw = P_REXW;
1206     } else if (type != TCG_TYPE_I32) {
1207         return false;
1208     }
1209     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1210     tcg_out32(s, val);
1211     return true;
1212 }
1213
1214 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1215 {
1216     /* Propagate an opcode prefix, such as P_DATA16.  */
1217     int ext = subopc & ~0x7;
1218     subopc &= 0x7;
1219
1220     if (count == 1) {
1221         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1222     } else {
1223         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1224         tcg_out8(s, count);
1225     }
1226 }
1227
1228 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1229 {
1230     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1231 }
1232
1233 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1234 {
1235     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1236 }
1237
1238 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1239 {
1240     /* movzbl */
1241     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1242     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1243 }
1244
1245 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1246 {
1247     /* movsbl */
1248     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1249     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1250 }
1251
1252 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1253 {
1254     /* movzwl */
1255     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1256 }
1257
1258 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1259 {
1260     /* movsw[lq] */
1261     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1262 }
1263
1264 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1265 {
1266     /* 32-bit mov zero extends.  */
1267     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1268 }
1269
1270 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1271 {
1272     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1273 }
1274
1275 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1276 {
1277     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1278 }
1279
1280 static void tgen_arithi(TCGContext *s, int c, int r0,
1281                         tcg_target_long val, int cf)
1282 {
1283     int rexw = 0;
1284
1285     if (TCG_TARGET_REG_BITS == 64) {
1286         rexw = c & -8;
1287         c &= 7;
1288     }
1289
1290     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1291        partial flags update stalls on Pentium4 and are not recommended
1292        by current Intel optimization manuals.  */
1293     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1294         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1295         if (TCG_TARGET_REG_BITS == 64) {
1296             /* The single-byte increment encodings are re-tasked as the
1297                REX prefixes.  Use the MODRM encoding.  */
1298             tcg_out_modrm(s, OPC_GRP5 + rexw,
1299                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1300         } else {
1301             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1302         }
1303         return;
1304     }
1305
1306     if (c == ARITH_AND) {
1307         if (TCG_TARGET_REG_BITS == 64) {
1308             if (val == 0xffffffffu) {
1309                 tcg_out_ext32u(s, r0, r0);
1310                 return;
1311             }
1312             if (val == (uint32_t)val) {
1313                 /* AND with no high bits set can use a 32-bit operation.  */
1314                 rexw = 0;
1315             }
1316         }
1317         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1318             tcg_out_ext8u(s, r0, r0);
1319             return;
1320         }
1321         if (val == 0xffffu) {
1322             tcg_out_ext16u(s, r0, r0);
1323             return;
1324         }
1325     }
1326
1327     if (val == (int8_t)val) {
1328         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1329         tcg_out8(s, val);
1330         return;
1331     }
1332     if (rexw == 0 || val == (int32_t)val) {
1333         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1334         tcg_out32(s, val);
1335         return;
1336     }
1337
1338     tcg_abort();
1339 }
1340
1341 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1342 {
1343     if (val != 0) {
1344         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1345     }
1346 }
1347
1348 /* Use SMALL != 0 to force a short forward branch.  */
1349 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1350 {
1351     int32_t val, val1;
1352
1353     if (l->has_value) {
1354         val = tcg_pcrel_diff(s, l->u.value_ptr);
1355         val1 = val - 2;
1356         if ((int8_t)val1 == val1) {
1357             if (opc == -1) {
1358                 tcg_out8(s, OPC_JMP_short);
1359             } else {
1360                 tcg_out8(s, OPC_JCC_short + opc);
1361             }
1362             tcg_out8(s, val1);
1363         } else {
1364             if (small) {
1365                 tcg_abort();
1366             }
1367             if (opc == -1) {
1368                 tcg_out8(s, OPC_JMP_long);
1369                 tcg_out32(s, val - 5);
1370             } else {
1371                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1372                 tcg_out32(s, val - 6);
1373             }
1374         }
1375     } else if (small) {
1376         if (opc == -1) {
1377             tcg_out8(s, OPC_JMP_short);
1378         } else {
1379             tcg_out8(s, OPC_JCC_short + opc);
1380         }
1381         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1382         s->code_ptr += 1;
1383     } else {
1384         if (opc == -1) {
1385             tcg_out8(s, OPC_JMP_long);
1386         } else {
1387             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1388         }
1389         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1390         s->code_ptr += 4;
1391     }
1392 }
1393
1394 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1395                         int const_arg2, int rexw)
1396 {
1397     if (const_arg2) {
1398         if (arg2 == 0) {
1399             /* test r, r */
1400             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1401         } else {
1402             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1403         }
1404     } else {
1405         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1406     }
1407 }
1408
1409 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1410                              TCGArg arg1, TCGArg arg2, int const_arg2,
1411                              TCGLabel *label, int small)
1412 {
1413     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1414     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1415 }
1416
1417 #if TCG_TARGET_REG_BITS == 64
1418 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1419                              TCGArg arg1, TCGArg arg2, int const_arg2,
1420                              TCGLabel *label, int small)
1421 {
1422     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1423     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1424 }
1425 #else
1426 /* XXX: we implement it at the target level to avoid having to
1427    handle cross basic blocks temporaries */
1428 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1429                             const int *const_args, int small)
1430 {
1431     TCGLabel *label_next = gen_new_label();
1432     TCGLabel *label_this = arg_label(args[5]);
1433
1434     switch(args[4]) {
1435     case TCG_COND_EQ:
1436         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1437                          label_next, 1);
1438         tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1439                          label_this, small);
1440         break;
1441     case TCG_COND_NE:
1442         tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1443                          label_this, small);
1444         tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1445                          label_this, small);
1446         break;
1447     case TCG_COND_LT:
1448         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1449                          label_this, small);
1450         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1451         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1452                          label_this, small);
1453         break;
1454     case TCG_COND_LE:
1455         tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1456                          label_this, small);
1457         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1458         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1459                          label_this, small);
1460         break;
1461     case TCG_COND_GT:
1462         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1463                          label_this, small);
1464         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1465         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1466                          label_this, small);
1467         break;
1468     case TCG_COND_GE:
1469         tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1470                          label_this, small);
1471         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1472         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1473                          label_this, small);
1474         break;
1475     case TCG_COND_LTU:
1476         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1477                          label_this, small);
1478         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1479         tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1480                          label_this, small);
1481         break;
1482     case TCG_COND_LEU:
1483         tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1484                          label_this, small);
1485         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1486         tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1487                          label_this, small);
1488         break;
1489     case TCG_COND_GTU:
1490         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1491                          label_this, small);
1492         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1493         tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1494                          label_this, small);
1495         break;
1496     case TCG_COND_GEU:
1497         tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1498                          label_this, small);
1499         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1500         tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1501                          label_this, small);
1502         break;
1503     default:
1504         tcg_abort();
1505     }
1506     tcg_out_label(s, label_next);
1507 }
1508 #endif
1509
1510 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1511                               TCGArg arg1, TCGArg arg2, int const_arg2)
1512 {
1513     tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1514     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1515     tcg_out_ext8u(s, dest, dest);
1516 }
1517
1518 #if TCG_TARGET_REG_BITS == 64
1519 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1520                               TCGArg arg1, TCGArg arg2, int const_arg2)
1521 {
1522     tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1523     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1524     tcg_out_ext8u(s, dest, dest);
1525 }
1526 #else
1527 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1528                              const int *const_args)
1529 {
1530     TCGArg new_args[6];
1531     TCGLabel *label_true, *label_over;
1532
1533     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1534
1535     if (args[0] == args[1] || args[0] == args[2]
1536         || (!const_args[3] && args[0] == args[3])
1537         || (!const_args[4] && args[0] == args[4])) {
1538         /* When the destination overlaps with one of the argument
1539            registers, don't do anything tricky.  */
1540         label_true = gen_new_label();
1541         label_over = gen_new_label();
1542
1543         new_args[5] = label_arg(label_true);
1544         tcg_out_brcond2(s, new_args, const_args+1, 1);
1545
1546         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1547         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1548         tcg_out_label(s, label_true);
1549
1550         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1551         tcg_out_label(s, label_over);
1552     } else {
1553         /* When the destination does not overlap one of the arguments,
1554            clear the destination first, jump if cond false, and emit an
1555            increment in the true case.  This results in smaller code.  */
1556
1557         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1558
1559         label_over = gen_new_label();
1560         new_args[4] = tcg_invert_cond(new_args[4]);
1561         new_args[5] = label_arg(label_over);
1562         tcg_out_brcond2(s, new_args, const_args+1, 1);
1563
1564         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1565         tcg_out_label(s, label_over);
1566     }
1567 }
1568 #endif
1569
1570 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1571                          TCGReg dest, TCGReg v1)
1572 {
1573     if (have_cmov) {
1574         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1575     } else {
1576         TCGLabel *over = gen_new_label();
1577         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1578         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1579         tcg_out_label(s, over);
1580     }
1581 }
1582
1583 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1584                               TCGReg c1, TCGArg c2, int const_c2,
1585                               TCGReg v1)
1586 {
1587     tcg_out_cmp(s, c1, c2, const_c2, 0);
1588     tcg_out_cmov(s, cond, 0, dest, v1);
1589 }
1590
1591 #if TCG_TARGET_REG_BITS == 64
1592 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1593                               TCGReg c1, TCGArg c2, int const_c2,
1594                               TCGReg v1)
1595 {
1596     tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1597     tcg_out_cmov(s, cond, P_REXW, dest, v1);
1598 }
1599 #endif
1600
1601 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1602                         TCGArg arg2, bool const_a2)
1603 {
1604     if (have_bmi1) {
1605         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1606         if (const_a2) {
1607             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1608         } else {
1609             tcg_debug_assert(dest != arg2);
1610             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1611         }
1612     } else {
1613         tcg_debug_assert(dest != arg2);
1614         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1615         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1616     }
1617 }
1618
1619 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1620                         TCGArg arg2, bool const_a2)
1621 {
1622     if (have_lzcnt) {
1623         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1624         if (const_a2) {
1625             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1626         } else {
1627             tcg_debug_assert(dest != arg2);
1628             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1629         }
1630     } else {
1631         tcg_debug_assert(!const_a2);
1632         tcg_debug_assert(dest != arg1);
1633         tcg_debug_assert(dest != arg2);
1634
1635         /* Recall that the output of BSR is the index not the count.  */
1636         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1637         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1638
1639         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1640         tcg_out_cmp(s, arg1, 0, 1, rexw);
1641         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1642     }
1643 }
1644
1645 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1646 {
1647     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1648
1649     if (disp == (int32_t)disp) {
1650         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1651         tcg_out32(s, disp);
1652     } else {
1653         /* rip-relative addressing into the constant pool.
1654            This is 6 + 8 = 14 bytes, as compared to using an
1655            an immediate load 10 + 6 = 16 bytes, plus we may
1656            be able to re-use the pool constant for more calls.  */
1657         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1658         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1659         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1660         tcg_out32(s, 0);
1661     }
1662 }
1663
1664 static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1665 {
1666     tcg_out_branch(s, 1, dest);
1667 }
1668
1669 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1670 {
1671     tcg_out_branch(s, 0, dest);
1672 }
1673
1674 static void tcg_out_nopn(TCGContext *s, int n)
1675 {
1676     int i;
1677     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1678      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1679      * duplicate prefix, and all of the interesting recent cores can
1680      * decode and discard the duplicates in a single cycle.
1681      */
1682     tcg_debug_assert(n >= 1);
1683     for (i = 1; i < n; ++i) {
1684         tcg_out8(s, 0x66);
1685     }
1686     tcg_out8(s, 0x90);
1687 }
1688
1689 #if defined(CONFIG_SOFTMMU)
1690 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1691  *                                     int mmu_idx, uintptr_t ra)
1692  */
1693 static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1694     [MO_UB]   = helper_ret_ldub_mmu,
1695     [MO_LEUW] = helper_le_lduw_mmu,
1696     [MO_LEUL] = helper_le_ldul_mmu,
1697     [MO_LEUQ] = helper_le_ldq_mmu,
1698     [MO_BEUW] = helper_be_lduw_mmu,
1699     [MO_BEUL] = helper_be_ldul_mmu,
1700     [MO_BEUQ] = helper_be_ldq_mmu,
1701 };
1702
1703 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1704  *                                     uintxx_t val, int mmu_idx, uintptr_t ra)
1705  */
1706 static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1707     [MO_UB]   = helper_ret_stb_mmu,
1708     [MO_LEUW] = helper_le_stw_mmu,
1709     [MO_LEUL] = helper_le_stl_mmu,
1710     [MO_LEUQ] = helper_le_stq_mmu,
1711     [MO_BEUW] = helper_be_stw_mmu,
1712     [MO_BEUL] = helper_be_stl_mmu,
1713     [MO_BEUQ] = helper_be_stq_mmu,
1714 };
1715
/* Perform the TLB load and compare.

   Inputs:
   ADDRLO and ADDRHI contain the low and high part of the address.

   MEM_INDEX is the memory context.  OPC supplies the log2 size of the
   access (opc & MO_SIZE) and its required alignment.

   WHICH is the offset into the CPUTLBEntry structure of the slot to read.
   This should be offsetof addr_read or addr_write.

   Outputs:
   LABEL_PTR is filled with the positions of the 32-bit displacements of
   the forward jumps to the TLB miss case: one entry, or two when the
   guest address is wider than a host register
   (TARGET_LONG_BITS > TCG_TARGET_REG_BITS).  The displacements are left
   unwritten here and are patched by the slow path generators.

   TCG_REG_L1 is loaded with the low part of the address.
   In the TLB hit case, it has been adjusted as indicated by the TLB
   and so is a host address.  In the TLB miss case, it continues to
   hold a guest address.  (The 64-bit slow paths rely on this register
   being the second call argument register.)

   TCG_REG_L0 is clobbered; it ends up pointing at the TLB entry.  */

static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
                                    int mem_index, MemOp opc,
                                    tcg_insn_unit **label_ptr, int which)
{
    const TCGReg r0 = TCG_REG_L0;
    const TCGReg r1 = TCG_REG_L1;
    /* ttype/trexw: operand size used for guest address arithmetic.  */
    TCGType ttype = TCG_TYPE_I32;
    /* tlbtype/tlbrexw: operand size used for computing the TLB index.  */
    TCGType tlbtype = TCG_TYPE_I32;
    /* hrexw: operand size used for host pointer arithmetic.  */
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned a_bits = get_alignment_bits(opc);
    unsigned s_bits = opc & MO_SIZE;
    unsigned a_mask = (1 << a_bits) - 1;
    unsigned s_mask = (1 << s_bits) - 1;
    target_ulong tlb_mask;

    /* Only a 64-bit host can need REX.W; a 32-bit host keeps all zeros.  */
    if (TCG_TARGET_REG_BITS == 64) {
        if (TARGET_LONG_BITS == 64) {
            ttype = TCG_TYPE_I64;
            trexw = P_REXW;
        }
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    /* r0 = addrlo >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS) */
    tcg_out_mov(s, tlbtype, r0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

    /* r0 &= the CPUTLBDescFast mask for MEM_INDEX, read relative to env */
    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, mask));

    /* r0 += the CPUTLBDescFast table pointer; r0 now addresses the entry */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
                         TLB_MASK_TABLE_OFS(mem_index) +
                         offsetof(CPUTLBDescFast, table));

    /* If the required alignment is at least as large as the access, simply
       copy the address and mask.  For lesser alignments, check that we don't
       cross pages for the complete access.  */
    if (a_bits >= s_bits) {
        tcg_out_mov(s, ttype, r1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
    }
    /* Keep the page number plus the low bits that must be zero when the
       access is aligned as required.  */
    tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);

    /* cmp which(r0), r1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);

    /* Prepare for both the fast path add of the tlb addend, and the slow
       path function argument setup.  Note this is emitted between the
       compare and the conditional jump, so it relies on tcg_out_mov not
       disturbing the flags.  */
    tcg_out_mov(s, ttype, r1, addrlo);

    /* jne slow_path; remember where the displacement goes and reserve
       its 4 bytes without writing them.  */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    /* Guest address wider than a host register: also compare the high
       half against the next 4 bytes of the TLB comparator.  */
    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
        /* cmp which+4(r0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */

    /* add addend(r0), r1 */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
                         offsetof(CPUTLBEntry, addend));
}
1817
1818 /*
1819  * Record the context of a call to the out of line helper code for the slow path
1820  * for a load or store, so that we can later generate the correct helper code
1821  */
1822 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1823                                 MemOpIdx oi,
1824                                 TCGReg datalo, TCGReg datahi,
1825                                 TCGReg addrlo, TCGReg addrhi,
1826                                 tcg_insn_unit *raddr,
1827                                 tcg_insn_unit **label_ptr)
1828 {
1829     TCGLabelQemuLdst *label = new_ldst_label(s);
1830
1831     label->is_ld = is_ld;
1832     label->oi = oi;
1833     label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1834     label->datalo_reg = datalo;
1835     label->datahi_reg = datahi;
1836     label->addrlo_reg = addrlo;
1837     label->addrhi_reg = addrhi;
1838     label->raddr = tcg_splitwx_to_rx(raddr);
1839     label->label_ptr[0] = label_ptr[0];
1840     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1841         label->label_ptr[1] = label_ptr[1];
1842     }
1843 }
1844
1845 /*
1846  * Generate code for the slow path for a load at the end of block
1847  */
1848 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1849 {
1850     MemOpIdx oi = l->oi;
1851     MemOp opc = get_memop(oi);
1852     TCGReg data_reg;
1853     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1854     int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1855
1856     /* resolve label address */
1857     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1858     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1859         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1860     }
1861
1862     if (TCG_TARGET_REG_BITS == 32) {
1863         int ofs = 0;
1864
1865         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1866         ofs += 4;
1867
1868         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1869         ofs += 4;
1870
1871         if (TARGET_LONG_BITS == 64) {
1872             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1873             ofs += 4;
1874         }
1875
1876         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1877         ofs += 4;
1878
1879         tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1880     } else {
1881         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1882         /* The second argument is already loaded with addrlo.  */
1883         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1884         tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1885                      (uintptr_t)l->raddr);
1886     }
1887
1888     tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1889
1890     data_reg = l->datalo_reg;
1891     switch (opc & MO_SSIZE) {
1892     case MO_SB:
1893         tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1894         break;
1895     case MO_SW:
1896         tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1897         break;
1898 #if TCG_TARGET_REG_BITS == 64
1899     case MO_SL:
1900         tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1901         break;
1902 #endif
1903     case MO_UB:
1904     case MO_UW:
1905         /* Note that the helpers have zero-extended to tcg_target_long.  */
1906     case MO_UL:
1907         tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1908         break;
1909     case MO_UQ:
1910         if (TCG_TARGET_REG_BITS == 64) {
1911             tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1912         } else if (data_reg == TCG_REG_EDX) {
1913             /* xchg %edx, %eax */
1914             tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1915             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1916         } else {
1917             tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1918             tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1919         }
1920         break;
1921     default:
1922         tcg_abort();
1923     }
1924
1925     /* Jump to the code corresponding to next IR of qemu_st */
1926     tcg_out_jmp(s, l->raddr);
1927     return true;
1928 }
1929
1930 /*
1931  * Generate code for the slow path for a store at the end of block
1932  */
1933 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1934 {
1935     MemOpIdx oi = l->oi;
1936     MemOp opc = get_memop(oi);
1937     MemOp s_bits = opc & MO_SIZE;
1938     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1939     TCGReg retaddr;
1940
1941     /* resolve label address */
1942     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1943     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1944         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1945     }
1946
1947     if (TCG_TARGET_REG_BITS == 32) {
1948         int ofs = 0;
1949
1950         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1951         ofs += 4;
1952
1953         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1954         ofs += 4;
1955
1956         if (TARGET_LONG_BITS == 64) {
1957             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1958             ofs += 4;
1959         }
1960
1961         tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1962         ofs += 4;
1963
1964         if (s_bits == MO_64) {
1965             tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1966             ofs += 4;
1967         }
1968
1969         tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1970         ofs += 4;
1971
1972         retaddr = TCG_REG_EAX;
1973         tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1974         tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1975     } else {
1976         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1977         /* The second argument is already loaded with addrlo.  */
1978         tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1979                     tcg_target_call_iarg_regs[2], l->datalo_reg);
1980         tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1981
1982         if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1983             retaddr = tcg_target_call_iarg_regs[4];
1984             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1985         } else {
1986             retaddr = TCG_REG_RAX;
1987             tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1988             tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1989                        TCG_TARGET_CALL_STACK_OFFSET);
1990         }
1991     }
1992
1993     /* "Tail call" to the helper, with the return address back inline.  */
1994     tcg_out_push(s, retaddr);
1995     tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1996     return true;
1997 }
1998 #else
1999
2000 static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
2001                                    TCGReg addrhi, unsigned a_bits)
2002 {
2003     unsigned a_mask = (1 << a_bits) - 1;
2004     TCGLabelQemuLdst *label;
2005
2006     /*
2007      * We are expecting a_bits to max out at 7, so we can usually use testb.
2008      * For i686, we have to use testl for %esi/%edi.
2009      */
2010     if (a_mask <= 0xff && (TCG_TARGET_REG_BITS == 64 || addrlo < 4)) {
2011         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, addrlo);
2012         tcg_out8(s, a_mask);
2013     } else {
2014         tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, addrlo);
2015         tcg_out32(s, a_mask);
2016     }
2017
2018     /* jne slow_path */
2019     tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2020
2021     label = new_ldst_label(s);
2022     label->is_ld = is_ld;
2023     label->addrlo_reg = addrlo;
2024     label->addrhi_reg = addrhi;
2025     label->raddr = tcg_splitwx_to_rx(s->code_ptr + 4);
2026     label->label_ptr[0] = s->code_ptr;
2027
2028     s->code_ptr += 4;
2029 }
2030
2031 static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
2032 {
2033     /* resolve label address */
2034     tcg_patch32(l->label_ptr[0], s->code_ptr - l->label_ptr[0] - 4);
2035
2036     if (TCG_TARGET_REG_BITS == 32) {
2037         int ofs = 0;
2038
2039         tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
2040         ofs += 4;
2041
2042         tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
2043         ofs += 4;
2044         if (TARGET_LONG_BITS == 64) {
2045             tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
2046             ofs += 4;
2047         }
2048
2049         tcg_out_pushi(s, (uintptr_t)l->raddr);
2050     } else {
2051         tcg_out_mov(s, TCG_TYPE_TL, tcg_target_call_iarg_regs[1],
2052                     l->addrlo_reg);
2053         tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
2054
2055         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX, (uintptr_t)l->raddr);
2056         tcg_out_push(s, TCG_REG_RAX);
2057     }
2058
2059     /* "Tail call" to the helper, with the return address back inline. */
2060     tcg_out_jmp(s, (const void *)(l->is_ld ? helper_unaligned_ld
2061                                   : helper_unaligned_st));
2062     return true;
2063 }
2064
/*
 * Variant used when CONFIG_SOFTMMU is not defined: the only slow path
 * for a load is the alignment failure, so defer to
 * tcg_out_fail_alignment.
 */
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    return tcg_out_fail_alignment(s, l);
}
2069
/*
 * Variant used when CONFIG_SOFTMMU is not defined: the only slow path
 * for a store is the alignment failure, so defer to
 * tcg_out_fail_alignment.
 */
static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
{
    return tcg_out_fail_alignment(s, l);
}
2074
2075 #if TCG_TARGET_REG_BITS == 32
2076 # define x86_guest_base_seg     0
2077 # define x86_guest_base_index   -1
2078 # define x86_guest_base_offset  guest_base
2079 #else
2080 static int x86_guest_base_seg;
2081 static int x86_guest_base_index = -1;
2082 static int32_t x86_guest_base_offset;
2083 # if defined(__x86_64__) && defined(__linux__)
2084 #  include <asm/prctl.h>
2085 #  include <sys/prctl.h>
2086 int arch_prctl(int code, unsigned long addr);
2087 static inline int setup_guest_base_seg(void)
2088 {
2089     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
2090         return P_GS;
2091     }
2092     return 0;
2093 }
2094 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
2095 #  include <machine/sysarch.h>
2096 static inline int setup_guest_base_seg(void)
2097 {
2098     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
2099         return P_GS;
2100     }
2101     return 0;
2102 }
2103 # else
2104 static inline int setup_guest_base_seg(void)
2105 {
2106     return 0;
2107 }
2108 # endif
2109 #endif
2110 #endif /* SOFTMMU */
2111
2112 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2113                                    TCGReg base, int index, intptr_t ofs,
2114                                    int seg, bool is64, MemOp memop)
2115 {
2116     bool use_movbe = false;
2117     int rexw = is64 * P_REXW;
2118     int movop = OPC_MOVL_GvEv;
2119
2120     /* Do big-endian loads with movbe.  */
2121     if (memop & MO_BSWAP) {
2122         tcg_debug_assert(have_movbe);
2123         use_movbe = true;
2124         movop = OPC_MOVBE_GyMy;
2125     }
2126
2127     switch (memop & MO_SSIZE) {
2128     case MO_UB:
2129         tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2130                                  base, index, 0, ofs);
2131         break;
2132     case MO_SB:
2133         tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
2134                                  base, index, 0, ofs);
2135         break;
2136     case MO_UW:
2137         if (use_movbe) {
2138             /* There is no extending movbe; only low 16-bits are modified.  */
2139             if (datalo != base && datalo != index) {
2140                 /* XOR breaks dependency chains.  */
2141                 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2142                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2143                                          datalo, base, index, 0, ofs);
2144             } else {
2145                 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2146                                          datalo, base, index, 0, ofs);
2147                 tcg_out_ext16u(s, datalo, datalo);
2148             }
2149         } else {
2150             tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2151                                      base, index, 0, ofs);
2152         }
2153         break;
2154     case MO_SW:
2155         if (use_movbe) {
2156             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2157                                      datalo, base, index, 0, ofs);
2158             tcg_out_ext16s(s, datalo, datalo, rexw);
2159         } else {
2160             tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2161                                      datalo, base, index, 0, ofs);
2162         }
2163         break;
2164     case MO_UL:
2165         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2166         break;
2167 #if TCG_TARGET_REG_BITS == 64
2168     case MO_SL:
2169         if (use_movbe) {
2170             tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2171                                      base, index, 0, ofs);
2172             tcg_out_ext32s(s, datalo, datalo);
2173         } else {
2174             tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2175                                      base, index, 0, ofs);
2176         }
2177         break;
2178 #endif
2179     case MO_UQ:
2180         if (TCG_TARGET_REG_BITS == 64) {
2181             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2182                                      base, index, 0, ofs);
2183         } else {
2184             if (use_movbe) {
2185                 TCGReg t = datalo;
2186                 datalo = datahi;
2187                 datahi = t;
2188             }
2189             if (base != datalo) {
2190                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2191                                          base, index, 0, ofs);
2192                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2193                                          base, index, 0, ofs + 4);
2194             } else {
2195                 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2196                                          base, index, 0, ofs + 4);
2197                 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2198                                          base, index, 0, ofs);
2199             }
2200         }
2201         break;
2202     default:
2203         g_assert_not_reached();
2204     }
2205 }
2206
2207 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2208    EAX. It will be useful once fixed registers globals are less
2209    common. */
2210 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2211 {
2212     TCGReg datalo, datahi, addrlo;
2213     TCGReg addrhi __attribute__((unused));
2214     MemOpIdx oi;
2215     MemOp opc;
2216 #if defined(CONFIG_SOFTMMU)
2217     int mem_index;
2218     tcg_insn_unit *label_ptr[2];
2219 #else
2220     unsigned a_bits;
2221 #endif
2222
2223     datalo = *args++;
2224     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2225     addrlo = *args++;
2226     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2227     oi = *args++;
2228     opc = get_memop(oi);
2229
2230 #if defined(CONFIG_SOFTMMU)
2231     mem_index = get_mmuidx(oi);
2232
2233     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2234                      label_ptr, offsetof(CPUTLBEntry, addr_read));
2235
2236     /* TLB Hit.  */
2237     tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2238
2239     /* Record the current context of a load into ldst label */
2240     add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2241                         s->code_ptr, label_ptr);
2242 #else
2243     a_bits = get_alignment_bits(opc);
2244     if (a_bits) {
2245         tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
2246     }
2247
2248     tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2249                            x86_guest_base_offset, x86_guest_base_seg,
2250                            is64, opc);
2251 #endif
2252 }
2253
2254 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2255                                    TCGReg base, int index, intptr_t ofs,
2256                                    int seg, MemOp memop)
2257 {
2258     bool use_movbe = false;
2259     int movop = OPC_MOVL_EvGv;
2260
2261     /*
2262      * Do big-endian stores with movbe or softmmu.
2263      * User-only without movbe will have its swapping done generically.
2264      */
2265     if (memop & MO_BSWAP) {
2266         tcg_debug_assert(have_movbe);
2267         use_movbe = true;
2268         movop = OPC_MOVBE_MyGy;
2269     }
2270
2271     switch (memop & MO_SIZE) {
2272     case MO_8:
2273         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2274         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2275         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2276                                  datalo, base, index, 0, ofs);
2277         break;
2278     case MO_16:
2279         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2280                                  base, index, 0, ofs);
2281         break;
2282     case MO_32:
2283         tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2284         break;
2285     case MO_64:
2286         if (TCG_TARGET_REG_BITS == 64) {
2287             tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2288                                      base, index, 0, ofs);
2289         } else {
2290             if (use_movbe) {
2291                 TCGReg t = datalo;
2292                 datalo = datahi;
2293                 datahi = t;
2294             }
2295             tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2296                                      base, index, 0, ofs);
2297             tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2298                                      base, index, 0, ofs + 4);
2299         }
2300         break;
2301     default:
2302         g_assert_not_reached();
2303     }
2304 }
2305
2306 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2307 {
2308     TCGReg datalo, datahi, addrlo;
2309     TCGReg addrhi __attribute__((unused));
2310     MemOpIdx oi;
2311     MemOp opc;
2312 #if defined(CONFIG_SOFTMMU)
2313     int mem_index;
2314     tcg_insn_unit *label_ptr[2];
2315 #else
2316     unsigned a_bits;
2317 #endif
2318
2319     datalo = *args++;
2320     datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2321     addrlo = *args++;
2322     addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2323     oi = *args++;
2324     opc = get_memop(oi);
2325
2326 #if defined(CONFIG_SOFTMMU)
2327     mem_index = get_mmuidx(oi);
2328
2329     tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2330                      label_ptr, offsetof(CPUTLBEntry, addr_write));
2331
2332     /* TLB Hit.  */
2333     tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2334
2335     /* Record the current context of a store into ldst label */
2336     add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2337                         s->code_ptr, label_ptr);
2338 #else
2339     a_bits = get_alignment_bits(opc);
2340     if (a_bits) {
2341         tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
2342     }
2343
2344     tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2345                            x86_guest_base_offset, x86_guest_base_seg, opc);
2346 #endif
2347 }
2348
2349 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2350                               const TCGArg args[TCG_MAX_OP_ARGS],
2351                               const int const_args[TCG_MAX_OP_ARGS])
2352 {
2353     TCGArg a0, a1, a2;
2354     int c, const_a2, vexop, rexw = 0;
2355
2356 #if TCG_TARGET_REG_BITS == 64
2357 # define OP_32_64(x) \
2358         case glue(glue(INDEX_op_, x), _i64): \
2359             rexw = P_REXW; /* FALLTHRU */    \
2360         case glue(glue(INDEX_op_, x), _i32)
2361 #else
2362 # define OP_32_64(x) \
2363         case glue(glue(INDEX_op_, x), _i32)
2364 #endif
2365
2366     /* Hoist the loads of the most common arguments.  */
2367     a0 = args[0];
2368     a1 = args[1];
2369     a2 = args[2];
2370     const_a2 = const_args[2];
2371
2372     switch (opc) {
2373     case INDEX_op_exit_tb:
2374         /* Reuse the zeroing that exists for goto_ptr.  */
2375         if (a0 == 0) {
2376             tcg_out_jmp(s, tcg_code_gen_epilogue);
2377         } else {
2378             tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2379             tcg_out_jmp(s, tb_ret_addr);
2380         }
2381         break;
2382     case INDEX_op_goto_tb:
2383         if (s->tb_jmp_insn_offset) {
2384             /* direct jump method */
2385             int gap;
2386             /* jump displacement must be aligned for atomic patching;
2387              * see if we need to add extra nops before jump
2388              */
2389             gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2390             if (gap != 1) {
2391                 tcg_out_nopn(s, gap - 1);
2392             }
2393             tcg_out8(s, OPC_JMP_long); /* jmp im */
2394             s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2395             tcg_out32(s, 0);
2396         } else {
2397             /* indirect jump method */
2398             tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2399                                  (intptr_t)(s->tb_jmp_target_addr + a0));
2400         }
2401         set_jmp_reset_offset(s, a0);
2402         break;
2403     case INDEX_op_goto_ptr:
2404         /* jmp to the given host address (could be epilogue) */
2405         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2406         break;
2407     case INDEX_op_br:
2408         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2409         break;
2410     OP_32_64(ld8u):
2411         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2412         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2413         break;
2414     OP_32_64(ld8s):
2415         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2416         break;
2417     OP_32_64(ld16u):
2418         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2419         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2420         break;
2421     OP_32_64(ld16s):
2422         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2423         break;
2424 #if TCG_TARGET_REG_BITS == 64
2425     case INDEX_op_ld32u_i64:
2426 #endif
2427     case INDEX_op_ld_i32:
2428         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2429         break;
2430
2431     OP_32_64(st8):
2432         if (const_args[0]) {
2433             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2434             tcg_out8(s, a0);
2435         } else {
2436             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2437         }
2438         break;
2439     OP_32_64(st16):
2440         if (const_args[0]) {
2441             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2442             tcg_out16(s, a0);
2443         } else {
2444             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2445         }
2446         break;
2447 #if TCG_TARGET_REG_BITS == 64
2448     case INDEX_op_st32_i64:
2449 #endif
2450     case INDEX_op_st_i32:
2451         if (const_args[0]) {
2452             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2453             tcg_out32(s, a0);
2454         } else {
2455             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2456         }
2457         break;
2458
2459     OP_32_64(add):
2460         /* For 3-operand addition, use LEA.  */
2461         if (a0 != a1) {
2462             TCGArg c3 = 0;
2463             if (const_a2) {
2464                 c3 = a2, a2 = -1;
2465             } else if (a0 == a2) {
2466                 /* Watch out for dest = src + dest, since we've removed
2467                    the matching constraint on the add.  */
2468                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2469                 break;
2470             }
2471
2472             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2473             break;
2474         }
2475         c = ARITH_ADD;
2476         goto gen_arith;
2477     OP_32_64(sub):
2478         c = ARITH_SUB;
2479         goto gen_arith;
2480     OP_32_64(and):
2481         c = ARITH_AND;
2482         goto gen_arith;
2483     OP_32_64(or):
2484         c = ARITH_OR;
2485         goto gen_arith;
2486     OP_32_64(xor):
2487         c = ARITH_XOR;
2488         goto gen_arith;
2489     gen_arith:
2490         if (const_a2) {
2491             tgen_arithi(s, c + rexw, a0, a2, 0);
2492         } else {
2493             tgen_arithr(s, c + rexw, a0, a2);
2494         }
2495         break;
2496
2497     OP_32_64(andc):
2498         if (const_a2) {
2499             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2500             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2501         } else {
2502             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2503         }
2504         break;
2505
2506     OP_32_64(mul):
2507         if (const_a2) {
2508             int32_t val;
2509             val = a2;
2510             if (val == (int8_t)val) {
2511                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2512                 tcg_out8(s, val);
2513             } else {
2514                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2515                 tcg_out32(s, val);
2516             }
2517         } else {
2518             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2519         }
2520         break;
2521
2522     OP_32_64(div2):
2523         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2524         break;
2525     OP_32_64(divu2):
2526         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2527         break;
2528
2529     OP_32_64(shl):
2530         /* For small constant 3-operand shift, use LEA.  */
2531         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2532             if (a2 - 1 == 0) {
2533                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2534                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2535             } else {
2536                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2537                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2538             }
2539             break;
2540         }
2541         c = SHIFT_SHL;
2542         vexop = OPC_SHLX;
2543         goto gen_shift_maybe_vex;
2544     OP_32_64(shr):
2545         c = SHIFT_SHR;
2546         vexop = OPC_SHRX;
2547         goto gen_shift_maybe_vex;
2548     OP_32_64(sar):
2549         c = SHIFT_SAR;
2550         vexop = OPC_SARX;
2551         goto gen_shift_maybe_vex;
2552     OP_32_64(rotl):
2553         c = SHIFT_ROL;
2554         goto gen_shift;
2555     OP_32_64(rotr):
2556         c = SHIFT_ROR;
2557         goto gen_shift;
2558     gen_shift_maybe_vex:
2559         if (have_bmi2) {
2560             if (!const_a2) {
2561                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2562                 break;
2563             }
2564             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2565         }
2566         /* FALLTHRU */
2567     gen_shift:
2568         if (const_a2) {
2569             tcg_out_shifti(s, c + rexw, a0, a2);
2570         } else {
2571             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2572         }
2573         break;
2574
2575     OP_32_64(ctz):
2576         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2577         break;
2578     OP_32_64(clz):
2579         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2580         break;
2581     OP_32_64(ctpop):
2582         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2583         break;
2584
2585     case INDEX_op_brcond_i32:
2586         tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2587         break;
2588     case INDEX_op_setcond_i32:
2589         tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2590         break;
2591     case INDEX_op_movcond_i32:
2592         tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2593         break;
2594
2595     OP_32_64(bswap16):
2596         if (a2 & TCG_BSWAP_OS) {
2597             /* Output must be sign-extended. */
2598             if (rexw) {
2599                 tcg_out_bswap64(s, a0);
2600                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2601             } else {
2602                 tcg_out_bswap32(s, a0);
2603                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2604             }
2605         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2606             /* Output must be zero-extended, but input isn't. */
2607             tcg_out_bswap32(s, a0);
2608             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2609         } else {
2610             tcg_out_rolw_8(s, a0);
2611         }
2612         break;
2613     OP_32_64(bswap32):
2614         tcg_out_bswap32(s, a0);
2615         if (rexw && (a2 & TCG_BSWAP_OS)) {
2616             tcg_out_ext32s(s, a0, a0);
2617         }
2618         break;
2619
2620     OP_32_64(neg):
2621         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2622         break;
2623     OP_32_64(not):
2624         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2625         break;
2626
2627     OP_32_64(ext8s):
2628         tcg_out_ext8s(s, a0, a1, rexw);
2629         break;
2630     OP_32_64(ext16s):
2631         tcg_out_ext16s(s, a0, a1, rexw);
2632         break;
2633     OP_32_64(ext8u):
2634         tcg_out_ext8u(s, a0, a1);
2635         break;
2636     OP_32_64(ext16u):
2637         tcg_out_ext16u(s, a0, a1);
2638         break;
2639
2640     case INDEX_op_qemu_ld_i32:
2641         tcg_out_qemu_ld(s, args, 0);
2642         break;
2643     case INDEX_op_qemu_ld_i64:
2644         tcg_out_qemu_ld(s, args, 1);
2645         break;
2646     case INDEX_op_qemu_st_i32:
2647     case INDEX_op_qemu_st8_i32:
2648         tcg_out_qemu_st(s, args, 0);
2649         break;
2650     case INDEX_op_qemu_st_i64:
2651         tcg_out_qemu_st(s, args, 1);
2652         break;
2653
2654     OP_32_64(mulu2):
2655         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2656         break;
2657     OP_32_64(muls2):
2658         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2659         break;
2660     OP_32_64(add2):
2661         if (const_args[4]) {
2662             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2663         } else {
2664             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2665         }
2666         if (const_args[5]) {
2667             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2668         } else {
2669             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2670         }
2671         break;
2672     OP_32_64(sub2):
2673         if (const_args[4]) {
2674             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2675         } else {
2676             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2677         }
2678         if (const_args[5]) {
2679             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2680         } else {
2681             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2682         }
2683         break;
2684
2685 #if TCG_TARGET_REG_BITS == 32
2686     case INDEX_op_brcond2_i32:
2687         tcg_out_brcond2(s, args, const_args, 0);
2688         break;
2689     case INDEX_op_setcond2_i32:
2690         tcg_out_setcond2(s, args, const_args);
2691         break;
2692 #else /* TCG_TARGET_REG_BITS == 64 */
2693     case INDEX_op_ld32s_i64:
2694         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2695         break;
2696     case INDEX_op_ld_i64:
2697         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2698         break;
2699     case INDEX_op_st_i64:
2700         if (const_args[0]) {
2701             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2702             tcg_out32(s, a0);
2703         } else {
2704             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2705         }
2706         break;
2707
2708     case INDEX_op_brcond_i64:
2709         tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2710         break;
2711     case INDEX_op_setcond_i64:
2712         tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2713         break;
2714     case INDEX_op_movcond_i64:
2715         tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2716         break;
2717
2718     case INDEX_op_bswap64_i64:
2719         tcg_out_bswap64(s, a0);
2720         break;
2721     case INDEX_op_extu_i32_i64:
2722     case INDEX_op_ext32u_i64:
2723     case INDEX_op_extrl_i64_i32:
2724         tcg_out_ext32u(s, a0, a1);
2725         break;
2726     case INDEX_op_ext_i32_i64:
2727     case INDEX_op_ext32s_i64:
2728         tcg_out_ext32s(s, a0, a1);
2729         break;
2730     case INDEX_op_extrh_i64_i32:
2731         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2732         break;
2733 #endif
2734
2735     OP_32_64(deposit):
2736         if (args[3] == 0 && args[4] == 8) {
2737             /* load bits 0..7 */
2738             tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2739         } else if (args[3] == 8 && args[4] == 8) {
2740             /* load bits 8..15 */
2741             tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2742         } else if (args[3] == 0 && args[4] == 16) {
2743             /* load bits 0..15 */
2744             tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2745         } else {
2746             tcg_abort();
2747         }
2748         break;
2749
2750     case INDEX_op_extract_i64:
2751         if (a2 + args[3] == 32) {
2752             /* This is a 32-bit zero-extending right shift.  */
2753             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2754             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2755             break;
2756         }
2757         /* FALLTHRU */
2758     case INDEX_op_extract_i32:
2759         /* On the off-chance that we can use the high-byte registers.
2760            Otherwise we emit the same ext16 + shift pattern that we
2761            would have gotten from the normal tcg-op.c expansion.  */
2762         tcg_debug_assert(a2 == 8 && args[3] == 8);
2763         if (a1 < 4 && a0 < 8) {
2764             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2765         } else {
2766             tcg_out_ext16u(s, a0, a1);
2767             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2768         }
2769         break;
2770
2771     case INDEX_op_sextract_i32:
2772         /* We don't implement sextract_i64, as we cannot sign-extend to
2773            64-bits without using the REX prefix that explicitly excludes
2774            access to the high-byte registers.  */
2775         tcg_debug_assert(a2 == 8 && args[3] == 8);
2776         if (a1 < 4 && a0 < 8) {
2777             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2778         } else {
2779             tcg_out_ext16s(s, a0, a1, 0);
2780             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2781         }
2782         break;
2783
2784     OP_32_64(extract2):
2785         /* Note that SHRD outputs to the r/m operand.  */
2786         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2787         tcg_out8(s, args[3]);
2788         break;
2789
2790     case INDEX_op_mb:
2791         tcg_out_mb(s, a0);
2792         break;
2793     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2794     case INDEX_op_mov_i64:
2795     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2796     default:
2797         tcg_abort();
2798     }
2799
2800 #undef OP_32_64
2801 }
2802
/*
 * Emit host code for one vector opcode.
 *
 * @s:          code generation context, handed on to every tcg_out_* emitter
 * @opc:        the INDEX_op_*_vec opcode to emit
 * @vecl:       vector length selector; the vector type is vecl + TCG_TYPE_V64
 * @vece:       element-size index used to pick an entry from the 4-entry
 *              opcode tables below (compared against MO_8 / MO_16 / MO_32 /
 *              MO_64; presumably these are 0..3 -- TODO confirm)
 * @args:       operands; args[0..2] are cached in a0..a2, args[3] is read
 *              only by the opcodes that take a fourth operand
 * @const_args: not read by this function
 *
 * Most cases only choose an instruction (and sometimes shuffle a1/a2 or set
 * the extra value 'sub') and then jump to one of three shared emit tails:
 *   gen_simd       - tcg_out_vex_modrm(s, insn, a0, a1, a2)
 *   gen_shift      - tcg_out_vex_modrm(s, insn, sub, a0, a1) + byte a2
 *   gen_simd_imm8  - tcg_out_vex_modrm(s, insn, a0, a1, a2) + byte sub
 * Each tail ORs P_VEXL into insn when the type is TCG_TYPE_V256.
 * Opcodes that are not handled here trip g_assert_not_reached().
 */
2803 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2804                            unsigned vecl, unsigned vece,
2805                            const TCGArg args[TCG_MAX_OP_ARGS],
2806                            const int const_args[TCG_MAX_OP_ARGS])
2807 {
         /*
          * Opcode tables indexed by vece.  OPC_UD2 is used as a placeholder
          * for element sizes that have no entry; gen_simd and gen_simd_imm8
          * assert that the placeholder was never selected.
          */
2808     static int const add_insn[4] = {
2809         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2810     };
2811     static int const ssadd_insn[4] = {
2812         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2813     };
2814     static int const usadd_insn[4] = {
2815         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2816     };
2817     static int const sub_insn[4] = {
2818         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2819     };
2820     static int const sssub_insn[4] = {
2821         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2822     };
2823     static int const ussub_insn[4] = {
2824         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2825     };
2826     static int const mul_insn[4] = {
2827         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2828     };
2829     static int const shift_imm_insn[4] = {
2830         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2831     };
2832     static int const cmpeq_insn[4] = {
2833         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2834     };
2835     static int const cmpgt_insn[4] = {
2836         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2837     };
2838     static int const punpckl_insn[4] = {
2839         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2840     };
2841     static int const punpckh_insn[4] = {
2842         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2843     };
2844     static int const packss_insn[4] = {
2845         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2846     };
2847     static int const packus_insn[4] = {
2848         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2849     };
2850     static int const smin_insn[4] = {
2851         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2852     };
2853     static int const smax_insn[4] = {
2854         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2855     };
2856     static int const umin_insn[4] = {
2857         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2858     };
2859     static int const umax_insn[4] = {
2860         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2861     };
2862     static int const rotlv_insn[4] = {
2863         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2864     };
2865     static int const rotrv_insn[4] = {
2866         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2867     };
2868     static int const shlv_insn[4] = {
2869         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2870     };
2871     static int const shrv_insn[4] = {
2872         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2873     };
2874     static int const sarv_insn[4] = {
2875         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2876     };
2877     static int const shls_insn[4] = {
2878         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2879     };
2880     static int const shrs_insn[4] = {
2881         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2882     };
2883     static int const sars_insn[4] = {
2884         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2885     };
2886     static int const vpshldi_insn[4] = {
2887         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2888     };
2889     static int const vpshldv_insn[4] = {
2890         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2891     };
2892     static int const vpshrdv_insn[4] = {
2893         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2894     };
2895     static int const abs_insn[4] = {
2896         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2897     };
2898
2899     TCGType type = vecl + TCG_TYPE_V64;
         /*
          * insn: instruction selected by the case, consumed by a shared tail.
          * sub:  extra value whose meaning depends on the tail that uses it
          *       (comparison condition, first operand of gen_shift, or the
          *       trailing immediate byte of gen_simd_imm8).
          */
2900     int insn, sub;
2901     TCGArg a0, a1, a2, a3;
2902
2903     a0 = args[0];
2904     a1 = args[1];
2905     a2 = args[2];
2906
2907     switch (opc) {
2908     case INDEX_op_add_vec:
2909         insn = add_insn[vece];
2910         goto gen_simd;
2911     case INDEX_op_ssadd_vec:
2912         insn = ssadd_insn[vece];
2913         goto gen_simd;
2914     case INDEX_op_usadd_vec:
2915         insn = usadd_insn[vece];
2916         goto gen_simd;
2917     case INDEX_op_sub_vec:
2918         insn = sub_insn[vece];
2919         goto gen_simd;
2920     case INDEX_op_sssub_vec:
2921         insn = sssub_insn[vece];
2922         goto gen_simd;
2923     case INDEX_op_ussub_vec:
2924         insn = ussub_insn[vece];
2925         goto gen_simd;
2926     case INDEX_op_mul_vec:
2927         insn = mul_insn[vece];
2928         goto gen_simd;
2929     case INDEX_op_and_vec:
2930         insn = OPC_PAND;
2931         goto gen_simd;
2932     case INDEX_op_or_vec:
2933         insn = OPC_POR;
2934         goto gen_simd;
2935     case INDEX_op_xor_vec:
2936         insn = OPC_PXOR;
2937         goto gen_simd;
2938     case INDEX_op_smin_vec:
2939         insn = smin_insn[vece];
2940         goto gen_simd;
2941     case INDEX_op_umin_vec:
2942         insn = umin_insn[vece];
2943         goto gen_simd;
2944     case INDEX_op_smax_vec:
2945         insn = smax_insn[vece];
2946         goto gen_simd;
2947     case INDEX_op_umax_vec:
2948         insn = umax_insn[vece];
2949         goto gen_simd;
2950     case INDEX_op_shlv_vec:
2951         insn = shlv_insn[vece];
2952         goto gen_simd;
2953     case INDEX_op_shrv_vec:
2954         insn = shrv_insn[vece];
2955         goto gen_simd;
2956     case INDEX_op_sarv_vec:
2957         insn = sarv_insn[vece];
2958         goto gen_simd;
2959     case INDEX_op_rotlv_vec:
2960         insn = rotlv_insn[vece];
2961         goto gen_simd;
2962     case INDEX_op_rotrv_vec:
2963         insn = rotrv_insn[vece];
2964         goto gen_simd;
2965     case INDEX_op_shls_vec:
2966         insn = shls_insn[vece];
2967         goto gen_simd;
2968     case INDEX_op_shrs_vec:
2969         insn = shrs_insn[vece];
2970         goto gen_simd;
2971     case INDEX_op_sars_vec:
2972         insn = sars_insn[vece];
2973         goto gen_simd;
2974     case INDEX_op_x86_punpckl_vec:
2975         insn = punpckl_insn[vece];
2976         goto gen_simd;
2977     case INDEX_op_x86_punpckh_vec:
2978         insn = punpckh_insn[vece];
2979         goto gen_simd;
2980     case INDEX_op_x86_packss_vec:
2981         insn = packss_insn[vece];
2982         goto gen_simd;
2983     case INDEX_op_x86_packus_vec:
2984         insn = packus_insn[vece];
2985         goto gen_simd;
2986     case INDEX_op_x86_vpshldv_vec:
             /*
              * These two opcodes have three inputs.  args[1] is dropped and
              * args[2]/args[3] become the two source operands -- presumably
              * because the first input is tied to the output register by
              * the C_O1_I3(x, 0, x, x) constraint in tcg_target_op_def.
              */
2987         insn = vpshldv_insn[vece];
2988         a1 = a2;
2989         a2 = args[3];
2990         goto gen_simd;
2991     case INDEX_op_x86_vpshrdv_vec:
2992         insn = vpshrdv_insn[vece];
2993         a1 = a2;
2994         a2 = args[3];
2995         goto gen_simd;
2996 #if TCG_TARGET_REG_BITS == 32
2997     case INDEX_op_dup2_vec:
2998         /* First merge the two 32-bit inputs to a single 64-bit element. */
2999         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3000         /* Then replicate the 64-bit elements across the rest of the vector. */
3001         if (type != TCG_TYPE_V64) {
3002             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3003         }
3004         break;
3005 #endif
3006     case INDEX_op_abs_vec:
             /*
              * Single-input operation: the source moves to the last operand
              * slot and 0 is passed in the middle slot.
              */
3007         insn = abs_insn[vece];
3008         a2 = a1;
3009         a1 = 0;
3010         goto gen_simd;
         /* Shared tail: emit insn with operands (a0, a1, a2). */
3011     gen_simd:
3012         tcg_debug_assert(insn != OPC_UD2);
3013         if (type == TCG_TYPE_V256) {
3014             insn |= P_VEXL;
3015         }
3016         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3017         break;
3018
3019     case INDEX_op_cmp_vec:
             /*
              * 'sub' temporarily holds the comparison condition from args[3].
              * Only TCG_COND_EQ and TCG_COND_GT are accepted here; anything
              * else asserts.
              */
3020         sub = args[3];
3021         if (sub == TCG_COND_EQ) {
3022             insn = cmpeq_insn[vece];
3023         } else if (sub == TCG_COND_GT) {
3024             insn = cmpgt_insn[vece];
3025         } else {
3026             g_assert_not_reached();
3027         }
3028         goto gen_simd;
3029
3030     case INDEX_op_andc_vec:
             /*
              * Emitted with a1 and a2 swapped relative to gen_simd --
              * presumably because PANDN complements its first source
              * operand (TODO confirm against the instruction reference).
              */
3031         insn = OPC_PANDN;
3032         if (type == TCG_TYPE_V256) {
3033             insn |= P_VEXL;
3034         }
3035         tcg_out_vex_modrm(s, insn, a0, a2, a1);
3036         break;
3037
         /*
          * Shift/rotate by immediate.  Each case stores a small constant in
          * 'sub', which gen_shift passes in the first operand slot of
          * tcg_out_vex_modrm (where gen_simd passes the destination) --
          * presumably the opcode-extension field that selects the kind of
          * shift; TODO confirm against tcg_out_vex_modrm.
          */
3038     case INDEX_op_shli_vec:
3039         insn = shift_imm_insn[vece];
3040         sub = 6;
3041         goto gen_shift;
3042     case INDEX_op_shri_vec:
3043         insn = shift_imm_insn[vece];
3044         sub = 2;
3045         goto gen_shift;
3046     case INDEX_op_sari_vec:
             /* MO_64 bypasses the table and uses the D-form opcode with
                P_VEXW and P_EVEX set. */
3047         if (vece == MO_64) {
3048             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3049         } else {
3050             insn = shift_imm_insn[vece];
3051         }
3052         sub = 4;
3053         goto gen_shift;
3054     case INDEX_op_rotli_vec:
3055         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3056         if (vece == MO_64) {
3057             insn |= P_VEXW;
3058         }
3059         sub = 1;
3060         goto gen_shift;
         /* Shared tail: emit insn with (sub, a0, a1), then a2 as one byte. */
3061     gen_shift:
3062         tcg_debug_assert(vece != MO_8);
3063         if (type == TCG_TYPE_V256) {
3064             insn |= P_VEXL;
3065         }
3066         tcg_out_vex_modrm(s, insn, sub, a0, a1);
3067         tcg_out8(s, a2);
3068         break;
3069
3070     case INDEX_op_ld_vec:
3071         tcg_out_ld(s, type, a0, a1, a2);
3072         break;
3073     case INDEX_op_st_vec:
3074         tcg_out_st(s, type, a0, a1, a2);
3075         break;
3076     case INDEX_op_dupm_vec:
3077         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3078         break;
3079
         /*
          * Operations followed by an immediate byte.  For the first group
          * the byte comes straight from args[3].
          */
3080     case INDEX_op_x86_shufps_vec:
3081         insn = OPC_SHUFPS;
3082         sub = args[3];
3083         goto gen_simd_imm8;
3084     case INDEX_op_x86_blend_vec:
3085         if (vece == MO_16) {
3086             insn = OPC_PBLENDW;
3087         } else if (vece == MO_32) {
3088             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3089         } else {
3090             g_assert_not_reached();
3091         }
3092         sub = args[3];
3093         goto gen_simd_imm8;
3094     case INDEX_op_x86_vperm2i128_vec:
3095         insn = OPC_VPERM2I128;
3096         sub = args[3];
3097         goto gen_simd_imm8;
3098     case INDEX_op_x86_vpshldi_vec:
3099         insn = vpshldi_insn[vece];
3100         sub = args[3];
3101         goto gen_simd_imm8;
3102
         /*
          * Logical operations built on OPC_VPTERNLOGQ.  'sub' is the
          * immediate byte emitted after the instruction; the comment next
          * to each value names the boolean function it is meant to select.
          */
3103     case INDEX_op_not_vec:
3104         insn = OPC_VPTERNLOGQ;
             /* Single input: the source is passed in both operand slots. */
3105         a2 = a1;
3106         sub = 0x33; /* !B */
3107         goto gen_simd_imm8;
3108     case INDEX_op_nor_vec:
3109         insn = OPC_VPTERNLOGQ;
3110         sub = 0x11; /* norCB */
3111         goto gen_simd_imm8;
3112     case INDEX_op_nand_vec:
3113         insn = OPC_VPTERNLOGQ;
3114         sub = 0x77; /* nandCB */
3115         goto gen_simd_imm8;
3116     case INDEX_op_eqv_vec:
3117         insn = OPC_VPTERNLOGQ;
3118         sub = 0x99; /* xnorCB */
3119         goto gen_simd_imm8;
3120     case INDEX_op_orc_vec:
3121         insn = OPC_VPTERNLOGQ;
3122         sub = 0xdd; /* orB!C */
3123         goto gen_simd_imm8;
3124
3125     case INDEX_op_bitsel_vec:
             /*
              * Three inputs (a1, a2, a3).  The immediate is chosen according
              * to which input already sits in the output register a0; if
              * neither a1 nor a2 does, a3 is first copied into a0.
              */
3126         insn = OPC_VPTERNLOGQ;
3127         a3 = args[3];
3128         if (a0 == a1) {
3129             a1 = a2;
3130             a2 = a3;
3131             sub = 0xca; /* A?B:C */
3132         } else if (a0 == a2) {
3133             a2 = a3;
3134             sub = 0xe2; /* B?A:C */
3135         } else {
3136             tcg_out_mov(s, type, a0, a3);
3137             sub = 0xb8; /* B?C:A */
3138         }
3139         goto gen_simd_imm8;
3140
         /* Shared tail: emit insn with (a0, a1, a2), then sub as one byte. */
3141     gen_simd_imm8:
3142         tcg_debug_assert(insn != OPC_UD2);
3143         if (type == TCG_TYPE_V256) {
3144             insn |= P_VEXL;
3145         }
3146         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3147         tcg_out8(s, sub);
3148         break;
3149
3150     case INDEX_op_x86_vpblendvb_vec:
3151         insn = OPC_VPBLENDVB;
3152         if (type == TCG_TYPE_V256) {
3153             insn |= P_VEXL;
3154         }
3155         tcg_out_vex_modrm(s, insn, a0, a1, a2);
             /* The fourth operand goes in the upper four bits of the
                trailing byte. */
3156         tcg_out8(s, args[3] << 4);
3157         break;
3158
3159     case INDEX_op_x86_psrldq_vec:
             /*
              * Same operand layout as gen_shift with 3 in the first slot and
              * a2 as the trailing byte.  Unlike the other emit paths, P_VEXL
              * is never set here.
              */
3160         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3161         tcg_out8(s, a2);
3162         break;
3163
3164     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3165     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3166     default:
3167         g_assert_not_reached();
3168     }
3169 }
3170
/*
 * Return the operand-constraint set to use for opcode @op.
 *
 * Every opcode listed maps to one C_O<n>_I<m>(...) or C_N1_I2(...) value.
 * Presumably <n>/<m> are the output/input counts and the arguments are the
 * per-operand constraint letters, with a digit tying an input to that
 * numbered output -- TODO confirm against the macro definitions, which are
 * outside this part of the file.
 *
 * A few entries depend on the have_bmi1 / have_bmi2 / have_lzcnt flags or on
 * TARGET_LONG_BITS versus TCG_TARGET_REG_BITS.  An opcode that is not listed
 * reaches g_assert_not_reached().
 */
3171 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3172 {
3173     switch (op) {
3174     case INDEX_op_goto_ptr:
3175         return C_O0_I1(r);
3176
         /* Loads: one output, one input. */
3177     case INDEX_op_ld8u_i32:
3178     case INDEX_op_ld8u_i64:
3179     case INDEX_op_ld8s_i32:
3180     case INDEX_op_ld8s_i64:
3181     case INDEX_op_ld16u_i32:
3182     case INDEX_op_ld16u_i64:
3183     case INDEX_op_ld16s_i32:
3184     case INDEX_op_ld16s_i64:
3185     case INDEX_op_ld_i32:
3186     case INDEX_op_ld32u_i64:
3187     case INDEX_op_ld32s_i64:
3188     case INDEX_op_ld_i64:
3189         return C_O1_I1(r, r);
3190
         /* Stores: the three groups differ only in the constraint on the
            stored value (qi, ri, re). */
3191     case INDEX_op_st8_i32:
3192     case INDEX_op_st8_i64:
3193         return C_O0_I2(qi, r);
3194
3195     case INDEX_op_st16_i32:
3196     case INDEX_op_st16_i64:
3197     case INDEX_op_st_i32:
3198     case INDEX_op_st32_i64:
3199         return C_O0_I2(ri, r);
3200
3201     case INDEX_op_st_i64:
3202         return C_O0_I2(re, r);
3203
         /* add is the only two-input arithmetic op below whose first input
            is 'r' rather than '0'. */
3204     case INDEX_op_add_i32:
3205     case INDEX_op_add_i64:
3206         return C_O1_I2(r, r, re);
3207
3208     case INDEX_op_sub_i32:
3209     case INDEX_op_sub_i64:
3210     case INDEX_op_mul_i32:
3211     case INDEX_op_mul_i64:
3212     case INDEX_op_or_i32:
3213     case INDEX_op_or_i64:
3214     case INDEX_op_xor_i32:
3215     case INDEX_op_xor_i64:
3216         return C_O1_I2(r, 0, re);
3217
3218     case INDEX_op_and_i32:
3219     case INDEX_op_and_i64:
3220         return C_O1_I2(r, 0, reZ);
3221
3222     case INDEX_op_andc_i32:
3223     case INDEX_op_andc_i64:
3224         return C_O1_I2(r, r, rI);
3225
3226     case INDEX_op_shl_i32:
3227     case INDEX_op_shl_i64:
3228     case INDEX_op_shr_i32:
3229     case INDEX_op_shr_i64:
3230     case INDEX_op_sar_i32:
3231     case INDEX_op_sar_i64:
             /*
              * Matches gen_shift_maybe_vex in tcg_out_op: with have_bmi2 a
              * three-operand VEX form is emitted for a register count,
              * otherwise the OPC_SHIFT_cl form operating on a0 is used.
              */
3232         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3233
3234     case INDEX_op_rotl_i32:
3235     case INDEX_op_rotl_i64:
3236     case INDEX_op_rotr_i32:
3237     case INDEX_op_rotr_i64:
3238         return C_O1_I2(r, 0, ci);
3239
3240     case INDEX_op_brcond_i32:
3241     case INDEX_op_brcond_i64:
3242         return C_O0_I2(r, re);
3243
         /* Single-input ops whose input uses the '0' constraint. */
3244     case INDEX_op_bswap16_i32:
3245     case INDEX_op_bswap16_i64:
3246     case INDEX_op_bswap32_i32:
3247     case INDEX_op_bswap32_i64:
3248     case INDEX_op_bswap64_i64:
3249     case INDEX_op_neg_i32:
3250     case INDEX_op_neg_i64:
3251     case INDEX_op_not_i32:
3252     case INDEX_op_not_i64:
3253     case INDEX_op_extrh_i64_i32:
3254         return C_O1_I1(r, 0);
3255
3256     case INDEX_op_ext8s_i32:
3257     case INDEX_op_ext8s_i64:
3258     case INDEX_op_ext8u_i32:
3259     case INDEX_op_ext8u_i64:
3260         return C_O1_I1(r, q);
3261
3262     case INDEX_op_ext16s_i32:
3263     case INDEX_op_ext16s_i64:
3264     case INDEX_op_ext16u_i32:
3265     case INDEX_op_ext16u_i64:
3266     case INDEX_op_ext32s_i64:
3267     case INDEX_op_ext32u_i64:
3268     case INDEX_op_ext_i32_i64:
3269     case INDEX_op_extu_i32_i64:
3270     case INDEX_op_extrl_i64_i32:
3271     case INDEX_op_extract_i32:
3272     case INDEX_op_extract_i64:
3273     case INDEX_op_sextract_i32:
3274     case INDEX_op_ctpop_i32:
3275     case INDEX_op_ctpop_i64:
3276         return C_O1_I1(r, r);
3277
3278     case INDEX_op_extract2_i32:
3279     case INDEX_op_extract2_i64:
3280         return C_O1_I2(r, 0, r);
3281
3282     case INDEX_op_deposit_i32:
3283     case INDEX_op_deposit_i64:
3284         return C_O1_I2(Q, 0, Q);
3285
3286     case INDEX_op_setcond_i32:
3287     case INDEX_op_setcond_i64:
3288         return C_O1_I2(q, r, re);
3289
3290     case INDEX_op_movcond_i32:
3291     case INDEX_op_movcond_i64:
3292         return C_O1_I4(r, r, re, r, 0);
3293
         /* Two-output ops: the outputs use the 'a' and 'd' constraints. */
3294     case INDEX_op_div2_i32:
3295     case INDEX_op_div2_i64:
3296     case INDEX_op_divu2_i32:
3297     case INDEX_op_divu2_i64:
3298         return C_O2_I3(a, d, 0, 1, r);
3299
3300     case INDEX_op_mulu2_i32:
3301     case INDEX_op_mulu2_i64:
3302     case INDEX_op_muls2_i32:
3303     case INDEX_op_muls2_i64:
3304         return C_O2_I2(a, d, a, r);
3305
3306     case INDEX_op_add2_i32:
3307     case INDEX_op_add2_i64:
3308     case INDEX_op_sub2_i32:
3309     case INDEX_op_sub2_i64:
3310         return C_O2_I4(r, r, 0, 1, re, re);
3311
         /* ctz/clz: the last input additionally accepts 'W' when the
            corresponding feature flag is set. */
3312     case INDEX_op_ctz_i32:
3313     case INDEX_op_ctz_i64:
3314         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3315
3316     case INDEX_op_clz_i32:
3317     case INDEX_op_clz_i64:
3318         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3319
         /*
          * Guest memory accesses.  One more 'L' input is added when
          * TARGET_LONG_BITS exceeds TCG_TARGET_REG_BITS -- presumably the
          * guest address then needs two host registers.  The 64-bit forms
          * likewise use two data operands when TCG_TARGET_REG_BITS is not 64.
          */
3320     case INDEX_op_qemu_ld_i32:
3321         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3322                 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3323
3324     case INDEX_op_qemu_st_i32:
3325         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3326                 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3327     case INDEX_op_qemu_st8_i32:
3328         return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3329                 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3330
3331     case INDEX_op_qemu_ld_i64:
3332         return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3333                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3334                 : C_O2_I2(r, r, L, L));
3335
3336     case INDEX_op_qemu_st_i64:
3337         return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3338                 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3339                 : C_O0_I4(L, L, L, L));
3340
3341     case INDEX_op_brcond2_i32:
3342         return C_O0_I4(r, r, ri, ri);
3343
3344     case INDEX_op_setcond2_i32:
3345         return C_O1_I4(r, r, r, ri, ri);
3346
         /* Vector opcodes: vector operands use the 'x' constraint. */
3347     case INDEX_op_ld_vec:
3348     case INDEX_op_dupm_vec:
3349         return C_O1_I1(x, r);
3350
3351     case INDEX_op_st_vec:
3352         return C_O0_I2(x, r);
3353
3354     case INDEX_op_add_vec:
3355     case INDEX_op_sub_vec:
3356     case INDEX_op_mul_vec:
3357     case INDEX_op_and_vec:
3358     case INDEX_op_or_vec:
3359     case INDEX_op_xor_vec:
3360     case INDEX_op_andc_vec:
3361     case INDEX_op_orc_vec:
3362     case INDEX_op_nand_vec:
3363     case INDEX_op_nor_vec:
3364     case INDEX_op_eqv_vec:
3365     case INDEX_op_ssadd_vec:
3366     case INDEX_op_usadd_vec:
3367     case INDEX_op_sssub_vec:
3368     case INDEX_op_ussub_vec:
3369     case INDEX_op_smin_vec:
3370     case INDEX_op_umin_vec:
3371     case INDEX_op_smax_vec:
3372     case INDEX_op_umax_vec:
3373     case INDEX_op_shlv_vec:
3374     case INDEX_op_shrv_vec:
3375     case INDEX_op_sarv_vec:
3376     case INDEX_op_rotlv_vec:
3377     case INDEX_op_rotrv_vec:
3378     case INDEX_op_shls_vec:
3379     case INDEX_op_shrs_vec:
3380     case INDEX_op_sars_vec:
3381     case INDEX_op_cmp_vec:
3382     case INDEX_op_x86_shufps_vec:
3383     case INDEX_op_x86_blend_vec:
3384     case INDEX_op_x86_packss_vec:
3385     case INDEX_op_x86_packus_vec:
3386     case INDEX_op_x86_vperm2i128_vec:
3387     case INDEX_op_x86_punpckl_vec:
3388     case INDEX_op_x86_punpckh_vec:
3389     case INDEX_op_x86_vpshldi_vec:
3390 #if TCG_TARGET_REG_BITS == 32
3391     case INDEX_op_dup2_vec:
3392 #endif
3393         return C_O1_I2(x, x, x);
3394
3395     case INDEX_op_abs_vec:
3396     case INDEX_op_dup_vec:
3397     case INDEX_op_not_vec:
3398     case INDEX_op_shli_vec:
3399     case INDEX_op_shri_vec:
3400     case INDEX_op_sari_vec:
3401     case INDEX_op_rotli_vec:
3402     case INDEX_op_x86_psrldq_vec:
3403         return C_O1_I1(x, x);
3404
         /* tcg_out_vec_op ignores args[1] for these two opcodes and uses
            args[2]/args[3] as the sources, consistent with the '0' here. */
3405     case INDEX_op_x86_vpshldv_vec:
3406     case INDEX_op_x86_vpshrdv_vec:
3407         return C_O1_I3(x, 0, x, x);
3408
3409     case INDEX_op_bitsel_vec:
3410     case INDEX_op_x86_vpblendvb_vec:
3411         return C_O1_I3(x, x, x, x);
3412
3413     default:
3414         g_assert_not_reached();
3415     }
3416 }
3417
/*
 * Report whether vector opcode @opc is available for vector type @type and
 * element-size index @vece (compared against MO_8 .. MO_64).
 *
 * Return value, as far as the comments inside this function establish it:
 *   > 0  the opcode is accepted as is (presumably emitted directly by
 *        tcg_out_vec_op -- TODO confirm with the caller);
 *   -1   the opcode is accepted but has to be expanded / emulated;
 *    0   the opcode is not available.
 * Several cases return a condition or a have_* flag directly, so the
 * positive value is whatever that expression evaluates to.
 * Opcodes not listed return 0.
 */
3418 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3419 {
3420     switch (opc) {
         /* Accepted for every element size and vector type. */
3421     case INDEX_op_add_vec:
3422     case INDEX_op_sub_vec:
3423     case INDEX_op_and_vec:
3424     case INDEX_op_or_vec:
3425     case INDEX_op_xor_vec:
3426     case INDEX_op_andc_vec:
3427     case INDEX_op_orc_vec:
3428     case INDEX_op_nand_vec:
3429     case INDEX_op_nor_vec:
3430     case INDEX_op_eqv_vec:
3431     case INDEX_op_not_vec:
3432     case INDEX_op_bitsel_vec:
3433         return 1;
         /* Comparisons always take the -1 path. */
3434     case INDEX_op_cmp_vec:
3435     case INDEX_op_cmpsel_vec:
3436         return -1;
3437
3438     case INDEX_op_rotli_vec:
             /* 1 only with have_avx512vl and vece >= MO_32; -1 otherwise. */
3439         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3440
3441     case INDEX_op_shli_vec:
3442     case INDEX_op_shri_vec:
3443         /* We must expand the operation for MO_8.  */
3444         return vece == MO_8 ? -1 : 1;
3445
3446     case INDEX_op_sari_vec:
3447         switch (vece) {
3448         case MO_8:
3449             return -1;
3450         case MO_16:
3451         case MO_32:
3452             return 1;
3453         case MO_64:
3454             if (have_avx512vl) {
3455                 return 1;
3456             }
3457             /*
3458              * We can emulate this for MO_64, but it does not pay off
3459              * unless we're producing at least 4 values.
3460              */
3461             return type >= TCG_TYPE_V256 ? -1 : 0;
3462         }
             /* Any other vece value falls out of the switch. */
3463         return 0;
3464
         /* Shifts by a scalar count: nothing is offered for MO_8. */
3465     case INDEX_op_shls_vec:
3466     case INDEX_op_shrs_vec:
3467         return vece >= MO_16;
3468     case INDEX_op_sars_vec:
3469         switch (vece) {
3470         case MO_16:
3471         case MO_32:
3472             return 1;
3473         case MO_64:
3474             return have_avx512vl;
3475         }
3476         return 0;
3477     case INDEX_op_rotls_vec:
3478         return vece >= MO_16 ? -1 : 0;
3479
         /* Per-element variable shifts and rotates: availability depends on
            the have_* flag tested for each element size. */
3480     case INDEX_op_shlv_vec:
3481     case INDEX_op_shrv_vec:
3482         switch (vece) {
3483         case MO_16:
3484             return have_avx512bw;
3485         case MO_32:
3486         case MO_64:
3487             return have_avx2;
3488         }
3489         return 0;
3490     case INDEX_op_sarv_vec:
3491         switch (vece) {
3492         case MO_16:
3493             return have_avx512bw;
3494         case MO_32:
3495             return have_avx2;
3496         case MO_64:
3497             return have_avx512vl;
3498         }
3499         return 0;
3500     case INDEX_op_rotlv_vec:
3501     case INDEX_op_rotrv_vec:
3502         switch (vece) {
3503         case MO_16:
3504             return have_avx512vbmi2 ? -1 : 0;
3505         case MO_32:
3506         case MO_64:
3507             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3508         }
3509         return 0;
3510
3511     case INDEX_op_mul_vec:
3512         switch (vece) {
3513         case MO_8:
3514             return -1;
3515         case MO_64:
3516             return have_avx512dq;
3517         }
             /* Remaining element sizes (MO_16, MO_32). */
3518         return 1;
3519
3520     case INDEX_op_ssadd_vec:
3521     case INDEX_op_usadd_vec:
3522     case INDEX_op_sssub_vec:
3523     case INDEX_op_ussub_vec:
             /* Matches the saturating-arithmetic tables in tcg_out_vec_op,
                which hold OPC_UD2 in their last two entries. */
3524         return vece <= MO_16;
3525     case INDEX_op_smin_vec:
3526     case INDEX_op_smax_vec:
3527     case INDEX_op_umin_vec:
3528     case INDEX_op_umax_vec:
3529     case INDEX_op_abs_vec:
3530         return vece <= MO_32 || have_avx512vl;
3531
3532     default:
3533         return 0;
3534     }
3535 }
3536
3537 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3538                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3539 {
3540     TCGv_vec t1, t2;
3541
3542     tcg_debug_assert(vece == MO_8);
3543
3544     t1 = tcg_temp_new_vec(type);
3545     t2 = tcg_temp_new_vec(type);
3546
3547     /*
3548      * Unpack to W, shift, and repack.  Tricky bits:
3549      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3550      *     i.e. duplicate in other half of the 16-bit lane.
3551      * (2) For right-shift, add 8 so that the high half of the lane
3552      *     becomes zero.  For left-shift, and left-rotate, we must
3553      *     shift up and down again.
3554      * (3) Step 2 leaves high half zero such that PACKUSWB
3555      *     (pack with unsigned saturation) does not modify
3556      *     the quantity.
3557      */
3558     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3559               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3560     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3561               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3562
3563     if (opc != INDEX_op_rotli_vec) {
3564         imm += 8;
3565     }
3566     if (opc == INDEX_op_shri_vec) {
3567         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3568         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3569     } else {
3570         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3571         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3572         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3573         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3574     }
3575
3576     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3577               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3578     tcg_temp_free_vec(t1);
3579     tcg_temp_free_vec(t2);
3580 }
3581
3582 static void expand_vec_sari(TCGType type, unsigned vece,
3583                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3584 {
3585     TCGv_vec t1, t2;
3586
3587     switch (vece) {
3588     case MO_8:
3589         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3590         t1 = tcg_temp_new_vec(type);
3591         t2 = tcg_temp_new_vec(type);
3592         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3593                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3594         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3595                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3596         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3597         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3598         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3599                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3600         tcg_temp_free_vec(t1);
3601         tcg_temp_free_vec(t2);
3602         break;
3603
3604     case MO_64:
3605         if (imm <= 32) {
3606             /*
3607              * We can emulate a small sign extend by performing an arithmetic
3608              * 32-bit shift and overwriting the high half of a 64-bit logical
3609              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3610              * does not, so we have to bound the smaller shift -- we get the
3611              * same result in the high half either way.
3612              */
3613             t1 = tcg_temp_new_vec(type);
3614             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3615             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3616             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3617                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3618                       tcgv_vec_arg(t1), 0xaa);
3619             tcg_temp_free_vec(t1);
3620         } else {
3621             /* Otherwise we will need to use a compare vs 0 to produce
3622              * the sign-extend, shift and merge.
3623              */
3624             t1 = tcg_const_zeros_vec(type);
3625             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3626             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3627             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3628             tcg_gen_or_vec(MO_64, v0, v0, t1);
3629             tcg_temp_free_vec(t1);
3630         }
3631         break;
3632
3633     default:
3634         g_assert_not_reached();
3635     }
3636 }
3637
3638 static void expand_vec_rotli(TCGType type, unsigned vece,
3639                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3640 {
3641     TCGv_vec t;
3642
3643     if (vece == MO_8) {
3644         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3645         return;
3646     }
3647
3648     if (have_avx512vbmi2) {
3649         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3650                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3651         return;
3652     }
3653
3654     t = tcg_temp_new_vec(type);
3655     tcg_gen_shli_vec(vece, t, v1, imm);
3656     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3657     tcg_gen_or_vec(vece, v0, v0, t);
3658     tcg_temp_free_vec(t);
3659 }
3660
3661 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3662                             TCGv_vec v1, TCGv_vec sh, bool right)
3663 {
3664     TCGv_vec t;
3665
3666     if (have_avx512vbmi2) {
3667         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3668                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3669                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3670         return;
3671     }
3672
3673     t = tcg_temp_new_vec(type);
3674     tcg_gen_dupi_vec(vece, t, 8 << vece);
3675     tcg_gen_sub_vec(vece, t, t, sh);
3676     if (right) {
3677         tcg_gen_shlv_vec(vece, t, v1, t);
3678         tcg_gen_shrv_vec(vece, v0, v1, sh);
3679     } else {
3680         tcg_gen_shrv_vec(vece, t, v1, t);
3681         tcg_gen_shlv_vec(vece, v0, v1, sh);
3682     }
3683     tcg_gen_or_vec(vece, v0, v0, t);
3684     tcg_temp_free_vec(t);
3685 }
3686
3687 static void expand_vec_rotls(TCGType type, unsigned vece,
3688                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3689 {
3690     TCGv_vec t = tcg_temp_new_vec(type);
3691
3692     tcg_debug_assert(vece != MO_8);
3693
3694     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3695         tcg_gen_dup_i32_vec(vece, t, lsh);
3696         if (vece >= MO_32) {
3697             tcg_gen_rotlv_vec(vece, v0, v1, t);
3698         } else {
3699             expand_vec_rotv(type, vece, v0, v1, t, false);
3700         }
3701     } else {
3702         TCGv_i32 rsh = tcg_temp_new_i32();
3703
3704         tcg_gen_neg_i32(rsh, lsh);
3705         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3706         tcg_gen_shls_vec(vece, t, v1, lsh);
3707         tcg_gen_shrs_vec(vece, v0, v1, rsh);
3708         tcg_gen_or_vec(vece, v0, v0, t);
3709
3710         tcg_temp_free_i32(rsh);
3711     }
3712
3713     tcg_temp_free_vec(t);
3714 }
3715
3716 static void expand_vec_mul(TCGType type, unsigned vece,
3717                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3718 {
3719     TCGv_vec t1, t2, t3, t4, zero;
3720
3721     tcg_debug_assert(vece == MO_8);
3722
3723     /*
3724      * Unpack v1 bytes to words, 0 | x.
3725      * Unpack v2 bytes to words, y | 0.
3726      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3727      * Shift logical right by 8 bits to clear the high 8 bytes before
3728      * using an unsigned saturated pack.
3729      *
3730      * The difference between the V64, V128 and V256 cases is merely how
3731      * we distribute the expansion between temporaries.
3732      */
3733     switch (type) {
3734     case TCG_TYPE_V64:
3735         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3736         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3737         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3738         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3739                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3740         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3741                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3742         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3743         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3744         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3745                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3746         tcg_temp_free_vec(t1);
3747         tcg_temp_free_vec(t2);
3748         break;
3749
3750     case TCG_TYPE_V128:
3751     case TCG_TYPE_V256:
3752         t1 = tcg_temp_new_vec(type);
3753         t2 = tcg_temp_new_vec(type);
3754         t3 = tcg_temp_new_vec(type);
3755         t4 = tcg_temp_new_vec(type);
3756         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3757         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3758                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3759         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3760                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3761         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3762                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3763         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3764                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3765         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3766         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3767         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3768         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3769         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3770                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3771         tcg_temp_free_vec(t1);
3772         tcg_temp_free_vec(t2);
3773         tcg_temp_free_vec(t3);
3774         tcg_temp_free_vec(t4);
3775         break;
3776
3777     default:
3778         g_assert_not_reached();
3779     }
3780 }
3781
3782 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3783                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3784 {
3785     enum {
3786         NEED_INV  = 1,
3787         NEED_SWAP = 2,
3788         NEED_BIAS = 4,
3789         NEED_UMIN = 8,
3790         NEED_UMAX = 16,
3791     };
3792     TCGv_vec t1, t2, t3;
3793     uint8_t fixup;
3794
3795     switch (cond) {
3796     case TCG_COND_EQ:
3797     case TCG_COND_GT:
3798         fixup = 0;
3799         break;
3800     case TCG_COND_NE:
3801     case TCG_COND_LE:
3802         fixup = NEED_INV;
3803         break;
3804     case TCG_COND_LT:
3805         fixup = NEED_SWAP;
3806         break;
3807     case TCG_COND_GE:
3808         fixup = NEED_SWAP | NEED_INV;
3809         break;
3810     case TCG_COND_LEU:
3811         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3812             fixup = NEED_UMIN;
3813         } else {
3814             fixup = NEED_BIAS | NEED_INV;
3815         }
3816         break;
3817     case TCG_COND_GTU:
3818         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3819             fixup = NEED_UMIN | NEED_INV;
3820         } else {
3821             fixup = NEED_BIAS;
3822         }
3823         break;
3824     case TCG_COND_GEU:
3825         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3826             fixup = NEED_UMAX;
3827         } else {
3828             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3829         }
3830         break;
3831     case TCG_COND_LTU:
3832         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3833             fixup = NEED_UMAX | NEED_INV;
3834         } else {
3835             fixup = NEED_BIAS | NEED_SWAP;
3836         }
3837         break;
3838     default:
3839         g_assert_not_reached();
3840     }
3841
3842     if (fixup & NEED_INV) {
3843         cond = tcg_invert_cond(cond);
3844     }
3845     if (fixup & NEED_SWAP) {
3846         t1 = v1, v1 = v2, v2 = t1;
3847         cond = tcg_swap_cond(cond);
3848     }
3849
3850     t1 = t2 = NULL;
3851     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3852         t1 = tcg_temp_new_vec(type);
3853         if (fixup & NEED_UMIN) {
3854             tcg_gen_umin_vec(vece, t1, v1, v2);
3855         } else {
3856             tcg_gen_umax_vec(vece, t1, v1, v2);
3857         }
3858         v2 = t1;
3859         cond = TCG_COND_EQ;
3860     } else if (fixup & NEED_BIAS) {
3861         t1 = tcg_temp_new_vec(type);
3862         t2 = tcg_temp_new_vec(type);
3863         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3864         tcg_gen_sub_vec(vece, t1, v1, t3);
3865         tcg_gen_sub_vec(vece, t2, v2, t3);
3866         v1 = t1;
3867         v2 = t2;
3868         cond = tcg_signed_cond(cond);
3869     }
3870
3871     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3872     /* Expand directly; do not recurse.  */
3873     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3874               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3875
3876     if (t1) {
3877         tcg_temp_free_vec(t1);
3878         if (t2) {
3879             tcg_temp_free_vec(t2);
3880         }
3881     }
3882     return fixup & NEED_INV;
3883 }
3884
3885 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3886                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3887 {
3888     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3889         tcg_gen_not_vec(vece, v0, v0);
3890     }
3891 }
3892
3893 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3894                               TCGv_vec c1, TCGv_vec c2,
3895                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3896 {
3897     TCGv_vec t = tcg_temp_new_vec(type);
3898
3899     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3900         /* Invert the sense of the compare by swapping arguments.  */
3901         TCGv_vec x;
3902         x = v3, v3 = v4, v4 = x;
3903     }
3904     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3905               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3906               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3907     tcg_temp_free_vec(t);
3908 }
3909
3910 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3911                        TCGArg a0, ...)
3912 {
3913     va_list va;
3914     TCGArg a2;
3915     TCGv_vec v0, v1, v2, v3, v4;
3916
3917     va_start(va, a0);
3918     v0 = temp_tcgv_vec(arg_temp(a0));
3919     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3920     a2 = va_arg(va, TCGArg);
3921
3922     switch (opc) {
3923     case INDEX_op_shli_vec:
3924     case INDEX_op_shri_vec:
3925         expand_vec_shi(type, vece, opc, v0, v1, a2);
3926         break;
3927
3928     case INDEX_op_sari_vec:
3929         expand_vec_sari(type, vece, v0, v1, a2);
3930         break;
3931
3932     case INDEX_op_rotli_vec:
3933         expand_vec_rotli(type, vece, v0, v1, a2);
3934         break;
3935
3936     case INDEX_op_rotls_vec:
3937         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3938         break;
3939
3940     case INDEX_op_rotlv_vec:
3941         v2 = temp_tcgv_vec(arg_temp(a2));
3942         expand_vec_rotv(type, vece, v0, v1, v2, false);
3943         break;
3944     case INDEX_op_rotrv_vec:
3945         v2 = temp_tcgv_vec(arg_temp(a2));
3946         expand_vec_rotv(type, vece, v0, v1, v2, true);
3947         break;
3948
3949     case INDEX_op_mul_vec:
3950         v2 = temp_tcgv_vec(arg_temp(a2));
3951         expand_vec_mul(type, vece, v0, v1, v2);
3952         break;
3953
3954     case INDEX_op_cmp_vec:
3955         v2 = temp_tcgv_vec(arg_temp(a2));
3956         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3957         break;
3958
3959     case INDEX_op_cmpsel_vec:
3960         v2 = temp_tcgv_vec(arg_temp(a2));
3961         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3962         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3963         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3964         break;
3965
3966     default:
3967         break;
3968     }
3969
3970     va_end(va);
3971 }
3972
/*
 * Registers saved by the prologue and restored by the epilogue.
 * tcg_target_qemu_prologue() pushes them in array order and pops them in
 * reverse order; the debug_frame tables further down encode the same
 * order, so any change here must be mirrored there.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    /* Only part of the saved set when building for _WIN64.  */
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
3992
/*
 * The frame size is computed with macros so that the same values are
 * available both to tcg_target_qemu_prologue and to the static
 * debug_frame tables handed to tcg_register_jit.
 */

/*
 * Bytes occupied by the pushed callee-saved registers plus one extra
 * register-sized slot (presumably the return address; compare the first
 * DW_CFA_offset entry of debug_frame).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * Total frame: the pushes, the static call-argument area and the TCG
 * temp buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4006
4007 /* Generate global QEMU prologue and epilogue code */
4008 static void tcg_target_qemu_prologue(TCGContext *s)
4009 {
4010     int i, stack_addend;
4011
4012     /* TB prologue */
4013
4014     /* Reserve some stack space, also for TCG temps.  */
4015     stack_addend = FRAME_SIZE - PUSH_SIZE;
4016     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4017                   CPU_TEMP_BUF_NLONGS * sizeof(long));
4018
4019     /* Save all callee saved registers.  */
4020     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4021         tcg_out_push(s, tcg_target_callee_save_regs[i]);
4022     }
4023
4024 #if TCG_TARGET_REG_BITS == 32
4025     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4026                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4027     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4028     /* jmp *tb.  */
4029     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4030                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4031                          + stack_addend);
4032 #else
4033 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
4034     if (guest_base) {
4035         int seg = setup_guest_base_seg();
4036         if (seg != 0) {
4037             x86_guest_base_seg = seg;
4038         } else if (guest_base == (int32_t)guest_base) {
4039             x86_guest_base_offset = guest_base;
4040         } else {
4041             /* Choose R12 because, as a base, it requires a SIB byte. */
4042             x86_guest_base_index = TCG_REG_R12;
4043             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
4044             tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
4045         }
4046     }
4047 # endif
4048     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4049     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4050     /* jmp *tb.  */
4051     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4052 #endif
4053
4054     /*
4055      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4056      * and fall through to the rest of the epilogue.
4057      */
4058     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4059     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4060
4061     /* TB epilogue */
4062     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4063
4064     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4065
4066     if (have_avx2) {
4067         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4068     }
4069     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4070         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4071     }
4072     tcg_out_opc(s, OPC_RET, 0, 0, 0);
4073 }
4074
4075 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4076 {
4077     memset(p, 0x90, count);
4078 }
4079
4080 static void tcg_target_init(TCGContext *s)
4081 {
4082 #ifdef CONFIG_CPUID_H
4083     unsigned a, b, c, d, b7 = 0, c7 = 0;
4084     unsigned max = __get_cpuid_max(0, 0);
4085
4086     if (max >= 7) {
4087         /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs.  */
4088         __cpuid_count(7, 0, a, b7, c7, d);
4089         have_bmi1 = (b7 & bit_BMI) != 0;
4090         have_bmi2 = (b7 & bit_BMI2) != 0;
4091     }
4092
4093     if (max >= 1) {
4094         __cpuid(1, a, b, c, d);
4095 #ifndef have_cmov
4096         /* For 32-bit, 99% certainty that we're running on hardware that
4097            supports cmov, but we still need to check.  In case cmov is not
4098            available, we'll use a small forward branch.  */
4099         have_cmov = (d & bit_CMOV) != 0;
4100 #endif
4101
4102         /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
4103            need to probe for it.  */
4104         have_movbe = (c & bit_MOVBE) != 0;
4105         have_popcnt = (c & bit_POPCNT) != 0;
4106
4107         /* There are a number of things we must check before we can be
4108            sure of not hitting invalid opcode.  */
4109         if (c & bit_OSXSAVE) {
4110             unsigned xcrl, xcrh;
4111             /* The xgetbv instruction is not available to older versions of
4112              * the assembler, so we encode the instruction manually.
4113              */
4114             asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
4115             if ((xcrl & 6) == 6) {
4116                 have_avx1 = (c & bit_AVX) != 0;
4117                 have_avx2 = (b7 & bit_AVX2) != 0;
4118
4119                 /*
4120                  * There are interesting instructions in AVX512, so long
4121                  * as we have AVX512VL, which indicates support for EVEX
4122                  * on sizes smaller than 512 bits.  We are required to
4123                  * check that OPMASK and all extended ZMM state are enabled
4124                  * even if we're not using them -- the insns will fault.
4125                  */
4126                 if ((xcrl & 0xe0) == 0xe0
4127                     && (b7 & bit_AVX512F)
4128                     && (b7 & bit_AVX512VL)) {
4129                     have_avx512vl = true;
4130                     have_avx512bw = (b7 & bit_AVX512BW) != 0;
4131                     have_avx512dq = (b7 & bit_AVX512DQ) != 0;
4132                     have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0;
4133                 }
4134             }
4135         }
4136     }
4137
4138     max = __get_cpuid_max(0x8000000, 0);
4139     if (max >= 1) {
4140         __cpuid(0x80000001, a, b, c, d);
4141         /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs.  */
4142         have_lzcnt = (c & bit_LZCNT) != 0;
4143     }
4144 #endif /* CONFIG_CPUID_H */
4145
4146     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4147     if (TCG_TARGET_REG_BITS == 64) {
4148         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4149     }
4150     if (have_avx1) {
4151         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4152         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4153     }
4154     if (have_avx2) {
4155         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4156     }
4157
4158     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4159     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4160     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4161     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4162     if (TCG_TARGET_REG_BITS == 64) {
4163 #if !defined(_WIN64)
4164         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4165         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4166 #endif
4167         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4168         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4169         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4170         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4171     }
4172
4173     s->reserved_regs = 0;
4174     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4175 }
4176
/*
 * Unwind information passed to tcg_register_jit_int(): the common
 * header followed by the call-frame instruction bytes of the FDE.
 */
typedef struct {
    DebugFrameHeader h;
    /* DW_CFA_def_cfa: opcode, register, 2-byte uleb128 FRAME_SIZE.  */
    uint8_t fde_def_cfa[4];
    /* Up to seven 2-byte DW_CFA_offset entries (register, offset).  */
    uint8_t fde_reg_ofs[14];
} DebugFrame;

/*
 * We're expecting a 2 byte uleb128 encoded value: two bytes carry
 * 14 bits, so FRAME_SIZE must stay below 1 << 14.
 */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4185
/*
 * Static unwind tables describing the frame built by
 * tcg_target_qemu_prologue, one variant per host word size.  Nothing is
 * defined for hosts without ELF.
 *
 * In fde_reg_ofs each entry is (0x80 | register number) followed by an
 * offset that is scaled by cie.data_align, e.g. 0x90 is register 16,
 * which is also the return column of the 64-bit variant.
 */
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    /* Only five entries are given here; the rest of the array is zero.  */
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
4245
/*
 * Only built when one of the ELF debug_frame variants above was
 * selected (ELF_HOST_MACHINE is defined alongside it).
 */
#if defined(ELF_HOST_MACHINE)
/*
 * Register the code buffer @buf of @buf_size bytes together with this
 * backend's static debug_frame by forwarding to tcg_register_jit_int().
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif