/* Definitions of x86 tunable features.
   Copyright (C) 2013-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Tuning for a given CPU XXXX consists of:
    - adding new CPU into:
      - adding PROCESSOR_XXX to processor_type (in i386.h)
      - possibly adding XXX into CPU attribute in i386.md
      - adding XXX to processor_alias_table (in i386.cc)
    - introducing ix86_XXX_cost in i386.cc
      - Stringop generation table can be built based on the test_stringop
        script (once the rest of the tuning is complete)
    - designing a scheduler model in the XXXX.md file
      - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
      - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
        and ix86_sched_init_global if those tricks are needed.
    - Tuning the flags below.  Those are split into sections and each
      section is very roughly ordered by importance.  */

/*****************************************************************************/
/* Scheduling flags.                                                         */
/*****************************************************************************/

/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example prefer MOVZBL or MOVQ to load an 8bit
   value over MOVB.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

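/* Illustrative sketch (assumed codegen, not taken from i386.md): with this
   tuning enabled, an 8bit load that feeds a 32bit use is emitted as

       movzbl (%rdx), %eax      # writes the whole destination register

   rather than

       movb   (%rdx), %al       # partial write; later 32bit uses of %eax
                                # would need a partial-register merge  */
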
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids
   partial write to the destination in scalar SSE conversion from FP
   to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
          "sse_partial_reg_fp_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial
   write to the destination in scalar SSE conversion from integer to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
          "sse_partial_reg_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

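/* Illustrative sketch (one way such a dependency can be broken; the exact
   expansion depends on the tunings above):

       pxor     %xmm0, %xmm0    # clear the destination first, so the
       cvtsi2sd %eax, %xmm0     # partial write of the conversion does not
                                # create a false dependency on the previous
                                # value of %xmm0  */
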
/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts a zero-idiom before
   several insns to break false dependency on the dest register for GLC
   micro-architecture.  */
DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
          "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_CORE_HYBRID)

/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in proper format, leaving the
   upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)

/* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables the use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on the previous
   instruction setting the full flags.

   The flag does not affect generation of INC and DEC; that is controlled
   by X86_TUNE_USE_INCDEC.  */
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2)

/* X86_TUNE_MOVX: Enable zero extending integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_INTEL
          | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_CORE_AVX2 | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   a full memory read of the same address (store-to-load forwarding fails
   in that case).  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32 bit TARGET.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)

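/* Illustrative sketch (assumed codegen): keeping the compare directly in
   front of the branch lets the decoder fuse the pair into one macro-op, e.g.

       cmpl   %esi, %edi
       jne    .L3               # cmp+jne decode and retire as a single uop

   so GCC avoids scheduling unrelated instructions between them.  */
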
/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
   jump instruction when the alu instruction produces the CCFLAG consumed by
   the conditional jump instruction.

   TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR and AND;
   there are also limitations on the immediates and displacements
   supported.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)

/* X86_TUNE_FUSE_MOV_AND_ALU: Fuse mov and alu when the mov is a reg-reg
   mov and its destination is used by the alu.  The alu must be one of
   ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR.  */
DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
          m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)

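/* Illustrative sketch (assumed codegen): a fusible pair as described above is

       movl   %ecx, %eax        # reg-reg mov ...
       addl   %edx, %eax        # ... whose destination feeds the alu

   which the listed cores can dispatch as a single fused operation.  */
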
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences.               */
/*****************************************************************************/

/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in the prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable
   win for targets without a stack engine, where multiple push operations
   cannot happen in parallel.  */
DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_ZHAOXIN)

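/* Illustrative sketch (assumed codegen): with this flag the prologue does a
   single "subl $N, %esp" and each call site stores its arguments with plain
   moves,

       movl   %eax, (%esp)
       movl   %edx, 4(%esp)
       call   foo

   instead of pushing them (and adjusting %esp back) around every call.  */
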
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like the 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_K6_GEODE)

/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT)

/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_PPRO)

/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)

/*****************************************************************************/
/* Branch predictor tuning                                                   */
/*****************************************************************************/

/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)

/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
   of a conditional jump or is directly preceded by another jump instruction.
   This is important for AMD K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2 byte window.  Failing to
   pad returns leads to a misaligned return stack.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)

/*****************************************************************************/
/* Integer instruction selection tuning                                      */
/*****************************************************************************/

/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)

/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Corei7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_ZHAOXIN | m_GENERIC)

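/* Illustrative sketch (assumed encoding details): a move like

       movw   $0x1234, (%rax)   # 66 C7 00 34 12 - the 0x66 prefix changes
                                # the immediate from 4 to 2 bytes, which the
                                # pre-decoder only discovers late (LCP stall)

   so the tuning prefers loading the constant into a register first.  */
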
/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))

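/* Illustrative sketch (assumed codegen): a load feeding an add is emitted as

       addl   (%rdx), %eax      # read-modify insn with a memory source

   rather than a separate movl followed by an addl when this flag is set.  */
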
/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.

   Core2 and Nehalem have a stall of 7 cycles for partial flag register stalls.
   Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell this extra
   uop is output only when the values really need to be merged, which is not
   done by GCC generated code.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
            | m_BONNELL | m_SILVERMONT | m_INTEL | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_ZHAOXIN | m_GENERIC))

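/* Illustrative sketch (assumed codegen): when this flag is off, GCC emits

       addl   $1, %eax          # rewrites all arithmetic flags

   instead of

       incl   %eax              # leaves CF untouched - the partial flags
                                # write is what causes the stalls above  */
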
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ZHAOXIN)

/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS)

/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is a
   vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)

/* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
   move/set sequences of bytes with known size.  */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
          "prefer_known_rep_movsb_stosb",
          m_SKYLAKE | m_CORE_HYBRID | m_CORE_ATOM | m_TREMONT | m_CORE_AVX512)

/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
          | m_ZNVER | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_USE_CLTD: Controls use of the CLTD and CQTO instructions.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_INTEL
            | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))

/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_LAKEMONT
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
   for bit-manipulation instructions.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
          | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
          | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)

/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
   an if-converted sequence to one.  */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
          m_SILVERMONT | m_HASWELL | m_SKYLAKE | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_ZHAOXIN)

/* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

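/* Illustrative sketch (assumed codegen): a full barrier can then be a dummy
   locked read-modify-write on the stack, e.g.

       lock orl  $0, (%rsp)     # serializes like mfence but is cheaper on
                                # the listed cores

   instead of an explicit mfence instruction.  */
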
/* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
   generating instructions for
   abs (x) = (((signed) x >> (W-1)) ^ x) - ((signed) x >> (W-1))
   instead of cmove or SSE max/abs instructions.  */
DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
          m_CORE_ALL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_ZHAOXIN)

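/* Illustrative sketch (assumed codegen) of the expansion for W = 32:

       movl   %eax, %edx
       sarl   $31, %edx         # mask = x >> 31 (0 or -1)
       xorl   %edx, %eax        # x ^ mask
       subl   %edx, %eax        # (x ^ mask) - mask = abs (x)  */
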
/*****************************************************************************/
/* 387 instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)

/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
   integer operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
            | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_ZHAOXIN)

/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/*****************************************************************************/
/* SSE instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)

/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
   of a sequence loading registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_ZHAOXIN
          | m_GENERIC)

/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
   precision 128bit instructions instead of double where possible.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_NONE)

/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_ZHAOXIN | m_CORE_ALL | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.
   Enable this flag for generic - the only relevant architecture preferring
   no inter-unit moves is Bulldozer.  While this causes a small regression on
   SPECfp scores (under 0.3%), disabling inter-unit moves noticeably penalizes
   hand-written vectorized code which uses e.g. _mm_set_epi16.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))

/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)

/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))

/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
   fp converts to destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  This form of instructions avoids partial write to the
   destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)

/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)

/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_INTEL)

/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
   elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
          ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
   elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
          ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_USE_GATHER_8PARTS: Use gather instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
          ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
            | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_8PARTS: Use scatter instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
          | m_YONGFENG | m_SHIJIDADAO | m_GENERIC)

/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
          m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
          | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)

/* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
   for v2df vector reduction.  */
DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
          "v2df_reduction_prefer_haddpd", m_NONE)

/* X86_TUNE_SSE_MOVCC_USE_BLENDV: Prefer blendv instructions to the
   3-instruction sequence (op1 & mask) | (op2 & ~mask)
   for vector condition move.
   For Crestmont, 4-operand vex blendv instructions come from MSROM
   and are slow.  */
DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV,
          "sse_movcc_use_blendv", ~m_CORE_ATOM)

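/* Illustrative sketch (assumed codegen) for "dest = mask ? b : a" on V4SF,
   with the mask in %xmm0, a in %xmm1 and b in %xmm2:

       blendvps %xmm2, %xmm1    # one insn; %xmm1 = mask ? b : a
                                # (mask is implicitly taken from %xmm0)

   versus the three-instruction fallback

       andps    %xmm0, %xmm2    # b & mask
       andnps   %xmm1, %xmm0    # a & ~mask
       orps     %xmm2, %xmm0    # combine; result in %xmm0  */
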
/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
/*****************************************************************************/

/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE))

/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))

/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops.  */
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs", m_BDVER | m_BTVER2
          | m_ZNVER1 | m_CORE_ATOM)

/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
   instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)

/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
   vector permutation instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
          "avx256_avoid_vec_perm", m_CORE_ATOM)

/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)

/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)

/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)

/*****************************************************************************/
/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs       */
/*****************************************************************************/

/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)

/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486 | m_LAKEMONT))

/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is slow.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)

/* X86_TUNE_SHIFT1: Enables use of the short encoding of "sal reg" instead of
   the longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)

/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of MOVZX to zero extend.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
          m_486 | m_PENT)

/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
   and SImode multiply, but 386 and 486 do HImode multiply faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))

/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   turn "and $-65536, reg" into a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
          ~(m_386 | m_486 | m_PENT | m_LAKEMONT))

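/* Illustrative sketch (assumed codegen): clearing the low 16 bits of a
   register can then be emitted as

       movw   $0, %ax           # 16bit store, needs the 0x66 prefix

   instead of

       andl   $-65536, %eax     # 32bit and with a 4-byte immediate  */
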
/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
          ~(m_PENT | m_LAKEMONT))

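/* Illustrative sketch (assumed codegen): a memory increment is emitted as

       addl   $1, (%rdx)        # single read-modify-write insn

   rather than a separate load, add and store when this flag is set.  */
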
/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
   than a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)

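/* Illustrative sketch (assumed codegen):

       orl    $-1, %eax         # 83 C8 FF - 3 bytes, but clobbers flags

   versus

       movl   $-1, %eax         # B8 FF FF FF FF - 5 bytes  */
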
/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)

/* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not eliminated
   very well - they can be introduced via subregs synthesized by combine
   and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)

/* X86_TUNE_PARTIAL_MEMORY_READ_STALL: Reading a (possibly unaligned) part of
   a memory location after a large write to the same address causes a
   store-to-load forwarding stall.  */
DEF_TUNE (X86_TUNE_PARTIAL_MEMORY_READ_STALL, "partial_memory_read_stall",
          m_386 | m_486 | m_PENT | m_LAKEMONT | m_PPRO | m_P4_NOCONA | m_CORE2
          | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic into the
   corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",

/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)

/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)

/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)

/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)

/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)

/* X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN: Starting with the Redwood Cove
   microarchitecture, if the predictor has no stored information about a branch
   and the branch has the Intel SSE2 branch taken hint (i.e., instruction
   prefix 3EH), the CPU flips the branch's prediction from not-taken to taken
   when it decodes the branch.  It then flushes the pipeline in front of it
   and steers this pipeline to fetch the taken path of the branch.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN, "branch_prediction_hints_taken", m_NONE)

/*****************************************************************************/
/* This never worked well before.                                            */
/*****************************************************************************/

/* X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN: Branch hints were put in P4
   based on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  It also increases the code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN, "branch_prediction_hints_not_taken", m_NONE)

/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)

/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
   arithmetic to 32bit via the PROMOTE_MODE macro.  This code generation scheme
   is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)

/* X86_TUNE_SLOW_STC: This disables use of the stc, clc and cmc carry flag
   modifications on architectures where these operations are slow.  */
DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)

/* X86_TUNE_USE_RCR: Controls use of the rcr 1 instruction instead of shrd.  */
DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE)