/* Definitions of x86 tunable features.
   Copyright (C) 2013-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
/* Tuning for a given CPU XXXX consists of:
    - adding new CPU into:
      - adding PROCESSOR_XXX to processor_type (in i386.h)
      - possibly adding XXX into CPU attribute in i386.md
      - adding XXX to processor_alias_table (in i386.cc)
      - introducing ix86_XXX_cost in i386.cc
      - Stringop generation table can be built based on test_stringop
        script (once rest of tuning is complete)
    - designing a scheduler model in
      - XXXX.md file
      - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
      - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
        and ix86_sched_init_global if those tricks are needed.
    - Tuning the flags below.  Those are split into sections and each
      section is very roughly ordered by importance.  */
/*****************************************************************************/
/* Scheduling flags.                                                         */
/*****************************************************************************/

/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)
/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example prefer MOVZBL or MOVQ to load an 8bit
   value over movb.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)
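/* Illustrative sketch (not tied to a particular insn pattern): with this
   tuning an 8bit load such as
     movb   (%rsi), %al    (partial write, false dependency on old %eax)
   is emitted instead as
     movzbl (%rsi), %eax   (writes the whole register, no false dependency).  */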
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)
/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids
   partial write to the destination in scalar SSE conversion from FP
   to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
          "sse_partial_reg_fp_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial
   write to the destination in scalar SSE conversion from integer to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
          "sse_partial_reg_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)
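/* Rough sketch of the idea: a scalar conversion such as
     cvtsi2ss %edi, %xmm0
   only writes the low element and therefore depends on the previous value
   of %xmm0; with this knob the destination is cleared first, e.g.
     pxor     %xmm0, %xmm0
     cvtsi2ss %edi, %xmm0
   so the rename hardware sees a full-register write.  */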
/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts a zero-idiom before
   several insns to break a false dependency on the dest register for the GLC
   micro-architecture.  */
DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
          "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_CORE_HYBRID
          | m_CORE_ATOM)

/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in the proper format, leaving
   the upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)
/* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on the previous
   instruction setting the full flags.

   The flag does not affect generation of INC and DEC, which is controlled
   by X86_TUNE_USE_INCDEC.  */
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2)
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_INTEL
          | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_CORE_AVX2 | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   full sized loads.  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)
/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32 bit TARGET.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)
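/* Illustrative example of the macro-fusion being targeted: a pair like
     cmpl %esi, %edi
     jne  .L3
   can be decoded as a single fused micro-op on these cores; the exact
   pairing rules are implemented elsewhere in the i386 back end.  */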
/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)
/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
   jump instruction when the alu instruction produces the CCFLAG consumed by
   the conditional jump instruction.

   TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR and AND.
   There are also limitations on the immediates and displacements
   supported.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)

/* X86_TUNE_FUSE_MOV_AND_ALU: Fuse a mov with a subsequent alu when the mov
   is a reg-reg mov and its destination is used by the alu.  The alu must be
   one of ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR,
   SAR.  */
DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
          m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences.               */
/*****************************************************************************/

/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable
   win for targets without a stack engine, where the lack of one prevents
   multiple push operations from happening in parallel.  */

DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_ZHAOXIN)
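/* Illustrative sketch: with this tuning the outgoing argument area is
   reserved once in the prologue,
     subl $24, %esp
     ...
     movl %eax, (%esp)     (store an argument)
     call foo
   instead of pushing arguments (and popping the stack) around every
   call.  */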
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)
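/* "leave" is equivalent to the two-instruction frame-pointer epilogue
     movl %ebp, %esp
     popl %ebp
   (or its 64bit counterpart), so it saves code size where a frame pointer
   is in use.  */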
/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like the 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)
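/* I.e. prefer
     pushl 8(%ebp)
   over
     movl  8(%ebp), %eax
     pushl %eax
   on cores where the memory form is not slower (illustrative only).  */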
/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_K6_GEODE)

/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT
          | m_K6_GEODE)

/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_PPRO)

/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)

/*****************************************************************************/
/* Branch predictor tuning                                                   */
/*****************************************************************************/
/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)
/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
   of a conditional jump or is directly preceded by another jump instruction.
   This is important for AMD K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2 byte window.  Failing to
   pad returns leads to a misaligned return stack.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
/*****************************************************************************/
/* Integer instruction selection tuning                                      */
/*****************************************************************************/

/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)

/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Core i7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))
/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.

   Core2 and Nehalem have a stall of 7 cycles for partial flag register
   stalls.  Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell
   this extra uop is output only when the value really needs to be merged,
   which is not done by GCC generated code.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
            | m_BONNELL | m_SILVERMONT | m_INTEL | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_ZHAOXIN | m_GENERIC))
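/* When this tuning is off, GCC prefers
     addl $1, %eax
   over
     incl %eax
   because INC/DEC update only a subset of the flags (CF is left untouched)
   and can therefore create a partial flag register dependency on the
   preceding flag setter.  */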
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ZHAOXIN)

/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS)

/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is
   a vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)
/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)

/* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
   move/set sequences of bytes with known size.  */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
          "prefer_known_rep_movsb_stosb",
          m_SKYLAKE | m_CORE_HYBRID | m_CORE_ATOM | m_TREMONT | m_CORE_AVX512
          | m_ZHAOXIN)
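/* E.g. a copy whose size is known at compile time, such as
     memcpy (dst, src, 128);
   may then be expanded inline as a REP MOVSB sequence instead of a library
   call or an unrolled move loop (sketch of the intent only; the actual
   choice is made by the stringop strategy code).  */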
/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
          | m_ZNVER | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)
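/* SAHF copies AH into the low byte of EFLAGS; it is used e.g. to test the
   x87 status word after an fnstsw %ax in floating point compares, so this
   knob matters mainly for code mixing x87 compares with integer
   branches.  */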
/* X86_TUNE_USE_CLTD: Controls use of CLTD and CQTO instructions.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_INTEL
            | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))
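/* E.g. before a signed 32bit division GCC can emit
     cltd               (sign-extend %eax into %edx:%eax)
     idivl %ecx
   instead of materializing the sign bits with a mov/sar pair
   (illustrative only).  */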
/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_LAKEMONT
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)
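/* E.g. a variable bit test such as "if (x & (1u << n))" can be emitted as
     btl %ecx, %eax
     jc  .L1
   (BT copies the selected bit into CF) rather than as a shift/and/test
   sequence; illustrative sketch only.  */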
/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
   for bit-manipulation instructions.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
          | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
          | m_ZHAOXIN | m_GENERIC)
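/* On the affected cores instructions such as LZCNT/TZCNT (and POPCNT) have
   a false dependency on their destination register, so the destination may
   be zeroed first, e.g.
     xorl   %eax, %eax
     lzcntl %edi, %eax
   to break the dependency chain (illustrative sketch only).  */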
/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)

/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in an
   if-converted sequence to one.  */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
          m_SILVERMONT | m_HASWELL | m_SKYLAKE | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_ZHAOXIN)
/* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)
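/* The idea, roughly: a sequentially consistent fence can be implemented as
   a locked read-modify-write on the stack, e.g. something like
     lock orl $0, (%esp)
   which acts as a full barrier and is cheaper than MFENCE on many cores
   (sketch only; the exact operand GCC uses may differ).  */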
/* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
   generating instructions for abs (x) = (((signed) x >> (W-1)) ^ x) -
   ((signed) x >> (W-1)) instead of cmove or SSE max/abs instructions.  */
DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
          m_CORE_ALL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_ZHAOXIN)
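/* For 32bit x this is the classic branchless sequence
     movl %eax, %edx
     sarl $31, %edx       (mask = x >> 31, all ones if x is negative)
     xorl %edx, %eax
     subl %edx, %eax      (abs = (x ^ mask) - mask)
   shown here for illustration only.  */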
/*****************************************************************************/
/* 387 instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)
/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
   integer operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
            | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_USE_FFREEP: Use the ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_ZHAOXIN)
/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)
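/* I.e. allow constants such as pi and log2(e) to be loaded with the
   dedicated x87 instructions FLDPI and FLDL2E instead of from memory.  */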
/*****************************************************************************/
/* SSE instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)

/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
   of a sequence loading registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_ZHAOXIN
          | m_GENERIC)
/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
   precision 128bit instructions instead of double where possible.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_BDVER | m_ZNVER)

/* X86_TUNE_SSE_TYPELESS_STORES: Always use movaps/movups for 128bit stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_ZHAOXIN | m_CORE_ALL | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load 0, as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)
/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.
   Enable this flag for generic - the only relevant architecture preferring
   no inter-unit moves is Bulldozer.  While this causes a small regression on
   SPECfp scores (sub 0.3%), disabling inter-unit moves noticeably penalizes
   hand-written vectorized code which uses e.g. _mm_set_epi16.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))

/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)

/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))

/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split the memory operand
   for fp converts to the destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  This form of instructions avoids partial write to the
   destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)

/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)

/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)
/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_INTEL)

/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
   elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
          ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
   elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
          ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_USE_GATHER_8PARTS: Use gather instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
          ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
            | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_8PARTS: Use scatter instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
          ~(m_ZNVER4 | m_ZNVER5))
/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
          | m_YONGFENG | m_SHIJIDADAO | m_GENERIC)

/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
          m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
          | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)

/* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
   for v2df vector reduction.  */
DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
          "v2df_reduction_prefer_haddpd", m_NONE)
/* X86_TUNE_SSE_MOVCC_USE_BLENDV: Prefer blendv instructions to the
   3-instruction sequence (op1 & mask) | (op2 & ~mask)
   for vector conditional moves.
   For Crestmont, 4-operand vex blendv instructions come from MSROM,
   which is slow.  */
DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV,
          "sse_movcc_use_blendv", ~m_CORE_ATOM)
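/* E.g. a byte-wise vector select can be emitted as the single SSE4.1
   instruction
     pblendvb %xmm2, %xmm1     (mask implicitly in %xmm0)
   instead of a pand/pandn/por sequence; illustrative only, the exact
   instruction depends on element type and ISA level.  */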
/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too)  */
/*****************************************************************************/

/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE))

/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))

/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops.  */
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs", m_BDVER | m_BTVER2
          | m_ZNVER1 | m_CORE_ATOM)
/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
   instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)

/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
   vector permutation instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
          "avx256_avoid_vec_perm", m_CORE_ATOM)

/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)
/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)

/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)
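/* E.g. with avx512_move_by_pieces a 64-byte block copy of known size can be
   expanded as a single
     vmovdqu64 (%rsi), %zmm0
     vmovdqu64 %zmm0, (%rdi)
   pair instead of a chain of smaller moves (illustrative sketch only).  */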
/*****************************************************************************/
/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs        */
/*****************************************************************************/

/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)
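/* I.e. prefer "addl %eax, %eax" over "sall $1, %eax" when a value simply
   needs to be doubled.  */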
/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486 | m_LAKEMONT))

/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is bad.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)
/* X86_TUNE_SHIFT1: Enables use of the short encoding of "sal reg" instead of
   the longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)

/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of movzbl/movzwl.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
          m_486 | m_PENT)

/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
   and SImode multiply, but 386 and 486 do HImode multiply faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))

/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   convert "and $-65536, reg" into a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
          ~(m_386 | m_486 | m_PENT | m_LAKEMONT))
/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
          ~(m_PENT | m_LAKEMONT))

/* X86_TUNE_MOVE_M1_VIA_OR: On Pentiums, it is faster to load -1 via OR
   than via MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)
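/* I.e. emit "orl $-1, %eax" rather than "movl $-1, %eax"; the OR form has a
   shorter encoding, at the cost of writing the flags.  */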
/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)

/* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not
   eliminated very well - they can be introduced via subregs synthesized by
   combine and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)

/* X86_TUNE_PARTIAL_MEMORY_READ_STALL: Reading a (possibly unaligned) part of
   a memory location after a large write to the same address causes a
   store-to-load forwarding stall.  */
DEF_TUNE (X86_TUNE_PARTIAL_MEMORY_READ_STALL, "partial_memory_read_stall",
          m_386 | m_486 | m_PENT | m_LAKEMONT | m_PPRO | m_P4_NOCONA | m_CORE2
          | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10)
/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic into
   corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",
          ~m_PPRO)

/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)

/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)

/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)

/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)

/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)
/* X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN: Starting with the Redwood Cove
   microarchitecture, if the predictor has no stored information about a
   branch and the branch has the Intel® SSE2 branch taken hint (i.e.,
   instruction prefix 3EH), the decoder flips the branch's prediction from
   not-taken to taken.  It then flushes the pipeline in front of it and
   steers this pipeline to fetch the taken path of the branch.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN, "branch_prediction_hints_taken", m_NONE)
/*****************************************************************************/
/* This never worked well before.                                            */
/*****************************************************************************/

/* X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN: Branch hints were put in P4
   based on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  They also increase code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN, "branch_prediction_hints_not_taken", m_NONE)
/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)

/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
   arithmetic to 32bit via PROMOTE_MODE macro.  This code generation scheme
   is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)
/* X86_TUNE_SLOW_STC: This disables use of stc, clc and cmc carry flag
   modifications on architectures where these operations are slow.  */
DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)

/* X86_TUNE_USE_RCR: Controls use of the rcr 1 instruction instead of shrd.  */
DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE)