/* Definitions of x86 tunable features.
   Copyright (C) 2013-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

/* Tuning for a given CPU XXXX consists of:
    - adding new CPU into:
      - adding PROCESSOR_XXX to processor_type (in i386.h)
      - possibly adding XXX into CPU attribute in i386.md
      - adding XXX to processor_alias_table (in i386.cc)
    - introducing ix86_XXX_cost in i386.cc
      - Stringop generation table can be built based on the test_stringop
        script (once the rest of the tuning is complete)
    - designing a scheduler model in the XXXX.md file
      - Updating ix86_issue_rate and ix86_adjust_cost in i386.md
      - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder
        and ix86_sched_init_global if those tricks are needed.
    - Tuning the flags below.  Those are split into sections and each
      section is very roughly ordered by importance.  */

/*****************************************************************************/
/* Scheduling flags.                                                         */
/*****************************************************************************/

/* X86_TUNE_SCHEDULE: Enable scheduling.  */
DEF_TUNE (X86_TUNE_SCHEDULE, "schedule",
          m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming
   on modern chips.  Prefer stores affecting the whole integer register
   over partial stores.  For example prefer MOVZBL or MOVQ to load an 8bit
   value over MOVB.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency",
          m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

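/* Illustrative sketch (assumed codegen, not taken from i386.md): with this
   tuning enabled, an 8bit load that feeds a 32bit use is emitted as

       movzbl (%rdx), %eax      # writes the whole destination register

   rather than

       movb   (%rdx), %al       # partial write; later 32bit uses of %eax
                                # would need a partial-register merge  */
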
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store
   destinations to be 128bit to allow register renaming on 128bit SSE units,
   but usually results in one extra microop on 64bit SSE units.
   Experimental results show that disabling this option on P4 brings over 20%
   SPECfp regression, while enabling it on K8 brings roughly 2.4% regression
   that can be partly masked by careful scheduling of moves.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY: This knob avoids
   partial write to the destination in scalar SSE conversion from FP
   to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY,
          "sse_partial_reg_fp_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/* X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY: This knob avoids partial
   write to the destination in scalar SSE conversion from integer to FP.  */
DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY,
          "sse_partial_reg_converts_dependency",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10
          | m_BDVER | m_ZNVER | m_ZHAOXIN | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

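/* Illustrative sketch (one way such a dependency can be broken; the exact
   expansion depends on the tunings above):

       pxor     %xmm0, %xmm0    # clear the destination first, so the
       cvtsi2sd %eax, %xmm0     # partial write of the conversion does not
                                # create a false dependency on the previous
                                # value of %xmm0  */
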
/* X86_TUNE_DEST_FALSE_DEP_FOR_GLC: This knob inserts a zero-idiom before
   several insns to break false dependency on the dest register for GLC
   micro-architecture.  */
DEF_TUNE (X86_TUNE_DEST_FALSE_DEP_FOR_GLC,
          "dest_false_dep_for_glc", m_SAPPHIRERAPIDS | m_CORE_HYBRID)

/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
   are resolved on SSE register parts instead of whole registers, so we may
   maintain just the lower part of scalar values in proper format, leaving the
   upper part undefined.  */
DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8)

/* X86_TUNE_PARTIAL_FLAG_REG_STALL: This flag disables the use of flags
   set by instructions affecting just some flags (in particular shifts).
   This is because Core2 resolves dependencies on the whole flags register
   and such sequences introduce a false dependency on the previous
   instruction setting the full flags.

   The flag does not affect generation of INC and DEC; that is controlled
   by X86_TUNE_USE_INCDEC.  */
DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall",
          m_CORE2)

/* X86_TUNE_MOVX: Enable zero extending integer registers to avoid
   partial dependencies.  */
DEF_TUNE (X86_TUNE_MOVX, "movx",
          m_PPRO | m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
          | m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_INTEL
          | m_GOLDMONT_PLUS | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_CORE_AVX2 | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by
   a full memory read of the same address (store-to-load forwarding fails
   in that case).  */
DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall",
          m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent
   conditional jump instruction for 32 bit TARGET.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)

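/* Illustrative sketch (assumed codegen): keeping the compare directly in
   front of the branch lets the decoder fuse the pair into one macro-op, e.g.

       cmpl   %esi, %edi
       jne    .L3               # cmp+jne decode and retire as a single uop

   so GCC avoids scheduling unrelated instructions between them.  */
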
/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
   conditional jump instruction for TARGET_64BIT.  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
   subsequent conditional jump instruction when the conditional jump
   checks the sign flag (SF) or overflow flag (OF).  */
DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_BDVER
          | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
   jump instruction when the alu instruction produces the CCFLAG consumed by
   the conditional jump instruction.

   TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR and AND;
   there are also limitations on the immediates and displacements
   supported.  */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
          m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)

/* X86_TUNE_FUSE_MOV_AND_ALU: Fuse mov and alu when the mov is a reg-reg
   mov and its destination is used by the alu.  The alu must be one of
   ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR.  */
DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
          m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)

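/* Illustrative sketch (assumed codegen): a fusible pair as described above is

       movl   %ecx, %eax        # reg-reg mov ...
       addl   %edx, %eax        # ... whose destination feeds the alu

   which the listed cores can dispatch as a single fused operation.  */
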
/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences.               */
/*****************************************************************************/

/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing
   arguments in the prologue/epilogue instead of separately for each call
   by push/pop instructions.
   This increases code size by about 5% in 32bit mode, less so in 64bit mode
   because parameters are passed in registers.  It is a considerable
   win for targets without a stack engine, where multiple push operations
   cannot happen in parallel.  */
DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_ATHLON_K8 | m_ZHAOXIN)

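/* Illustrative sketch (assumed codegen): with this flag the prologue does a
   single "subl $N, %esp" and each call site stores its arguments with plain
   moves,

       movl   %eax, (%esp)
       movl   %edx, 4(%esp)
       call   foo

   instead of pushing them (and adjusting %esp back) around every call.  */
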
/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_EPILOGUE_USING_MOVE: Do not use push/pop in epilogues that are
   considered on critical path.  */
DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move",
          m_PPRO | m_ATHLON_K8)

/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits.  */
DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave",
          m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions.
   Some chips, like the 486 and Pentium, work faster with separate load
   and push instructions.  */
DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory",
          m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_K6_GEODE)

/* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
   over esp subtraction.  */
DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT)

/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT
          | m_LAKEMONT | m_PPRO)

/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
   over esp addition.  */
DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)

/*****************************************************************************/
/* Branch predictor tuning                                                   */
/*****************************************************************************/

/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function at least 4
   instructions long.  */
DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL)

/* X86_TUNE_PAD_RETURNS: Place a NOP before every RET that is a destination
   of a conditional jump or is directly preceded by another jump instruction.
   This is important for AMD K8-AMDFAM10 because the branch prediction
   architecture expects at most one jump per 2 byte window.  Failing to
   pad returns leads to a misaligned return stack.  */
DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns",
          m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
   than 4 branch instructions in the 16 byte window.  */
DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
          m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)

/*****************************************************************************/
/* Integer instruction selection tuning                                      */
/*****************************************************************************/

/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
   at -O3.  For the moment, the prefetching seems badly tuned for Intel
   chips.  */
DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial",
          m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)

/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
   on 16-bit immediate moves into memory on Core2 and Corei7.  */
DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_ZHAOXIN | m_GENERIC)

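/* Illustrative sketch (assumed encoding details): a move like

       movw   $0x1234, (%rax)   # 66 C7 00 34 12 - the 0x66 prefix changes
                                # the immediate from 4 to 2 bytes, which the
                                # pre-decoder only discovers late (LCP stall)

   so the tuning prefers loading the constant into a register first.  */
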
/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such
   as "add mem, reg".  */
DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO))

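/* Illustrative sketch (assumed codegen): a load feeding an add is emitted as

       addl   (%rdx), %eax      # read-modify insn with a memory source

   rather than a separate movl followed by an addl when this flag is set.  */
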
/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions.

   Core2 and Nehalem have a stall of 7 cycles for partial flag register stalls.
   Sandy Bridge and Ivy Bridge generate an extra uop.  On Haswell this extra
   uop is output only when the values really need to be merged, which is not
   done by GCC generated code.  */
DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec",
          ~(m_P4_NOCONA | m_CORE2 | m_NEHALEM | m_SANDYBRIDGE
            | m_BONNELL | m_SILVERMONT | m_INTEL | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_ZHAOXIN | m_GENERIC))

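/* Illustrative sketch (assumed codegen): when this flag is off, GCC emits

       addl   $1, %eax          # rewrites all arithmetic flags

   instead of

       incl   %eax              # leaves CF untouched - the partial flags
                                # write is what causes the stalls above  */
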
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
   for DFmode copies.  */
DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves",
          ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
            | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit.  This flag
   will impact LEA instruction selection.  */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_INTEL | m_ZHAOXIN)

/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation.  */
DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS)

/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is a
   vector path on AMD machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector path on AMD
   machines.
   FIXME: Do we need to enable this for core?  */
DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8",
          m_K8 | m_AMDFAM10)

/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
   a conditional move.  */
DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such
   as MOVS and STOS (without a REP prefix) to move/set sequences of bytes.  */
DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)

/* X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB: Enable use of REP MOVSB/STOSB to
   move/set sequences of bytes with known size.  */
DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
          "prefer_known_rep_movsb_stosb",
          m_SKYLAKE | m_CORE_HYBRID | m_CORE_ATOM | m_TREMONT | m_CORE_AVX512)

/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
   compact prologues and epilogues by issuing misaligned moves.  This
   requires the target to handle misaligned moves and partial memory stalls
   reasonably well.
   FIXME: This may actually be a win on more targets than listed here.  */
DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES,
          "misaligned_move_string_pro_epilogues",
          m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_ZHAOXIN | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_USE_SAHF: Controls use of SAHF.  */
DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER
          | m_ZNVER | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_USE_CLTD: Controls use of the CLTD and CQTO instructions.  */
DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd",
          ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_INTEL
            | m_K6 | m_GOLDMONT | m_GOLDMONT_PLUS))

/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions.  */
DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
          m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL | m_LAKEMONT
          | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
   for bit-manipulation instructions.  */
DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
          m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
          | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
          | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
   on hardware capabilities.  Bdver3 hardware has a loop buffer which makes
   unrolling small loops less important.  For such architectures we adjust
   the unroll factor so that the unrolled loop fits the loop buffer.  */
DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4)

/* X86_TUNE_ONE_IF_CONV_INSN: Restrict the number of cmov insns in
   an if-converted sequence to one.  */
DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
          m_SILVERMONT | m_HASWELL | m_SKYLAKE | m_GOLDMONT | m_GOLDMONT_PLUS
          | m_TREMONT | m_ZHAOXIN)

/* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
          m_CORE_ALL | m_BDVER | m_ZNVER | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

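/* Illustrative sketch (assumed codegen): a full barrier can then be a dummy
   locked read-modify-write on the stack, e.g.

       lock orl  $0, (%rsp)     # serializes like mfence but is cheaper on
                                # the listed cores

   instead of an explicit mfence instruction.  */
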
/* X86_TUNE_EXPAND_ABS: This enables a new abs pattern by
   generating instructions for
   abs (x) = (((signed) x >> (W-1)) ^ x) - ((signed) x >> (W-1))
   instead of cmove or SSE max/abs instructions.  */
DEF_TUNE (X86_TUNE_EXPAND_ABS, "expand_abs",
          m_CORE_ALL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_ZHAOXIN)

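/* Illustrative sketch (assumed codegen) of the expansion for W = 32:

       movl   %eax, %edx
       sarl   $31, %edx         # mask = x >> 31 (0 or -1)
       xorl   %edx, %eax        # x ^ mask
       subl   %edx, %eax        # (x ^ mask) - mask = abs (x)  */
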
/*****************************************************************************/
/* 387 instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit
   integer operand.
   FIXME: Why is this disabled for modern chips?  */
DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop",
          m_386 | m_486 | m_K6_GEODE)

/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit
   integer operand.  */
DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop",
          ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL
            | m_SILVERMONT | m_INTEL | m_AMD_MULTIPLE | m_ZHAOXIN | m_GOLDMONT
            | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
            | m_GENERIC))

/* X86_TUNE_USE_FFREEP: Use ffreep instruction instead of fstp.  */
DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE | m_ZHAOXIN)

/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI.  */
DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT
          | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_ZHAOXIN | m_GOLDMONT
          | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM
          | m_GENERIC)

/*****************************************************************************/
/* SSE instruction selection tuning                                          */
/*****************************************************************************/

/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
   regs instead of memory.  */
DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill",
          m_CORE_ALL)

/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead
   of a sequence loading registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT | m_INTEL
          | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER | m_ZHAOXIN
          | m_GENERIC)

/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores
   instead of a sequence storing registers by parts.  */
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
          m_NEHALEM | m_SANDYBRIDGE | m_CORE_AVX2 | m_SILVERMONT
          | m_INTEL | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_BDVER | m_ZNVER | m_ZHAOXIN | m_GENERIC)

/* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL: Use packed single
   precision 128bit instructions instead of double where possible.  */
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal",
          m_NONE)

/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores.  */
DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores",
          m_AMD_MULTIPLE | m_ZHAOXIN | m_CORE_ALL | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to
   xorps/xorpd and other variants.  */
DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor",
          m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER
          | m_ZHAOXIN | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves from integer
   to SSE registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.
   Enable this flag for generic - the only relevant architecture preferring
   no inter-unit moves is Bulldozer.  While this causes a small regression on
   SPECfp scores (under 0.3%), disabling inter-unit moves noticeably penalizes
   hand-written vectorized code which uses e.g. _mm_set_epi16.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec",
          ~(m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER))

/* X86_TUNE_INTER_UNIT_MOVES_FROM_VEC: Enable moves from SSE
   to integer registers.  If disabled, the moves will be done by storing
   the value to memory and reloading.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec",
          ~m_ATHLON_K8)

/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions
   to use both SSE and integer registers at the same time.  */
DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions",
          ~(m_AMDFAM10 | m_BDVER))

/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for
   fp converts to destination register.  */
DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
   from FP to FP.  This form of instructions avoids partial write to the
   destination.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts",
          m_AMDFAM10)

/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
   from integer to FP.  */
DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)

/* X86_TUNE_SLOW_PSHUFB: Indicates tunings with slow pshufb instruction.  */
DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb",
          m_BONNELL | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_INTEL)

/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes.  */
DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
          m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT | m_CORE_HYBRID
          | m_CORE_ATOM | m_INTEL)

/* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
   elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
          ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_2PARTS: Use scatter instructions for vectors with 2
   elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
          ~(m_ZNVER | m_CORE_HYBRID
            | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_4PARTS: Use scatter instructions for vectors with 4
   elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_USE_GATHER_8PARTS: Use gather instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
          ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
            | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))

/* X86_TUNE_USE_SCATTER_8PARTS: Use scatter instructions for vectors with 8
   or more elements.  */
DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
          ~(m_ZNVER4 | m_ZNVER5))

/* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
          | m_YONGFENG | m_SHIJIDADAO | m_GENERIC)

/* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
          m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
          | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)

/* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
   smaller FMA chains.  */
DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)

/* X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD: Prefer haddpd
   for v2df vector reduction.  */
DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
          "v2df_reduction_prefer_haddpd", m_NONE)

/* X86_TUNE_SSE_MOVCC_USE_BLENDV: Prefer blendv instructions to the
   3-instruction sequence (op1 & mask) | (op2 & ~mask)
   for vector condition move.
   For Crestmont, 4-operand vex blendv instructions come from MSROM
   and are slow.  */
DEF_TUNE (X86_TUNE_SSE_MOVCC_USE_BLENDV,
          "sse_movcc_use_blendv", ~m_CORE_ATOM)

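/* Illustrative sketch (assumed codegen) for "dest = mask ? b : a" on V4SF,
   with the mask in %xmm0, a in %xmm1 and b in %xmm2:

       blendvps %xmm2, %xmm1    # one insn; %xmm1 = mask ? b : a
                                # (mask is implicitly taken from %xmm0)

   versus the three-instruction fallback

       andps    %xmm0, %xmm2    # b & mask
       andnps   %xmm1, %xmm0    # a & ~mask
       orps     %xmm2, %xmm0    # combine; result in %xmm0  */
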
/*****************************************************************************/
/* AVX instruction selection tuning (some of the SSE flags affect AVX, too) */
/*****************************************************************************/

/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE))

/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
   split.  */
DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
          ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1))

/* X86_TUNE_AVX256_SPLIT_REGS: if true, AVX256 ops are split into two AVX128 ops.  */
DEF_TUNE (X86_TUNE_AVX256_SPLIT_REGS, "avx256_split_regs", m_BDVER | m_BTVER2
          | m_ZNVER1 | m_CORE_ATOM)

/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
   the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
          | m_ZNVER1)

/* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 512-bit AVX
   instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_OPTIMAL, "avx256_optimal", m_CORE_AVX512)

/* X86_TUNE_AVX256_AVOID_VEC_PERM: Avoid using 256-bit cross-lane
   vector permutation instructions in the auto-vectorizer.  */
DEF_TUNE (X86_TUNE_AVX256_AVOID_VEC_PERM,
          "avx256_avoid_vec_perm", m_CORE_ATOM)

/* X86_TUNE_AVX512_SPLIT_REGS: if true, AVX512 ops are split into two AVX256 ops.  */
DEF_TUNE (X86_TUNE_AVX512_SPLIT_REGS, "avx512_split_regs", m_ZNVER4)

/* X86_TUNE_AVX256_MOVE_BY_PIECES: Optimize move_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_MOVE_BY_PIECES, "avx256_move_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX256_STORE_BY_PIECES: Optimize store_by_pieces with 256-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX256_STORE_BY_PIECES, "avx256_store_by_pieces",
          m_CORE_HYBRID | m_CORE_AVX2 | m_ZNVER1 | m_ZNVER2 | m_ZNVER3)

/* X86_TUNE_AVX512_MOVE_BY_PIECES: Optimize move_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_MOVE_BY_PIECES, "avx512_move_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)

/* X86_TUNE_AVX512_STORE_BY_PIECES: Optimize store_by_pieces with 512-bit
   AVX instructions.  */
DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
          m_SAPPHIRERAPIDS | m_ZNVER4 | m_ZNVER5)

/*****************************************************************************/
/*****************************************************************************/
/* Historical relics: tuning flags that help specific old CPU designs       */
/*****************************************************************************/

/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in
   an integer register.  */
DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386)

/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations,
   such as fsqrt, fprem, fsin, fcos, fsincos etc.
   Should be enabled for all targets that always have a coprocessor.  */
DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387",
          ~(m_386 | m_486 | m_LAKEMONT))

/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for
   inline strlen.  This affects only -minline-all-stringops mode.  By
   default we always dispatch to a library since our internal strlen
   is slow.  */
DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386)

/* X86_TUNE_SHIFT1: Enables use of the short encoding of "sal reg" instead of
   the longer "sal $1, reg".  */
DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486)

/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead
   of MOVZX to zero extend.  */
DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and",
          m_486 | m_PENT)

/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for HImode
   and SImode multiply, but 386 and 486 do HImode multiply faster.  */
DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul",
          ~(m_386 | m_486))

/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic
   into 16bit/8bit when the resulting sequence is shorter.  For example,
   turn "and $-65536, reg" into a 16bit store of 0.  */
DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix",
          ~(m_386 | m_486 | m_PENT | m_LAKEMONT))

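/* Illustrative sketch (assumed codegen): clearing the low 16 bits of a
   register can then be emitted as

       movw   $0, %ax           # 16bit store, needs the 0x66 prefix

   instead of

       andl   $-65536, %eax     # 32bit and with a 4-byte immediate  */
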
/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions
   such as "add $1, mem".  */
DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write",
          ~(m_PENT | m_LAKEMONT))

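/* Illustrative sketch (assumed codegen): a memory increment is emitted as

       addl   $1, (%rdx)        # single read-modify-write insn

   rather than a separate load, add and store when this flag is set.  */
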
/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
   than a MOV.  */
DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT)

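/* Illustrative sketch (assumed codegen):

       orl    $-1, %eax         # 83 C8 FF - 3 bytes, but clobbers flags

   versus

       movl   $-1, %eax         # B8 FF FF FF FF - 5 bytes  */
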
/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
   but one byte longer.  */
DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT)

/* X86_TUNE_PARTIAL_REG_STALL: Pentium Pro, unlike later chips, handled
   use of partial registers by renaming.  This improved performance of 16bit
   code where upper halves of registers are not used.  It also leads to
   a penalty whenever a 16bit store is followed by a 32bit use.  This flag
   disables production of such sequences in common cases.
   See also X86_TUNE_HIMODE_MATH.

   In the current implementation the partial register stalls are not eliminated
   very well - they can be introduced via subregs synthesized by combine
   and can happen in caller/callee saving sequences.  */
DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO)

/* X86_TUNE_PARTIAL_MEMORY_READ_STALL: Reading a (possibly unaligned) part of
   a memory location after a large write to the same address causes a
   store-to-load forwarding stall.  */
DEF_TUNE (X86_TUNE_PARTIAL_MEMORY_READ_STALL, "partial_memory_read_stall",
          m_386 | m_486 | m_PENT | m_LAKEMONT | m_PPRO | m_P4_NOCONA | m_CORE2
          | m_SILVERMONT | m_GOLDMONT | m_GOLDMONT_PLUS | m_TREMONT
          | m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10)

/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic into the
   corresponding 32bit arithmetic.  */
DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode",

/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit arithmetic.  Again we avoid
   partial register stalls on PentiumPro targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO)

/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic.
   On PPro this flag is meant to avoid partial register stalls.  */
DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO)

/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates
   directly to memory.  */
DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO)

/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4)

/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear
   an integer register.  */
DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6)

/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
   operand that cannot be represented using a modRM byte.  The XOR
   replacement is long decoded, so this split helps here as well.  */
DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6)

/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded
   forms of instructions on K8 targets.  */
DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode",
          m_K8)

/* X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN: Starting with the Redwood Cove
   microarchitecture, if the predictor has no stored information about a branch
   and the branch has the Intel SSE2 branch taken hint (i.e., instruction
   prefix 3EH), the CPU flips the branch's prediction from not-taken to taken
   when it decodes the branch.  It then flushes the pipeline in front of it
   and steers this pipeline to fetch the taken path of the branch.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN, "branch_prediction_hints_taken", m_NONE)

/*****************************************************************************/
/* This never worked well before.                                            */
/*****************************************************************************/

/* X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN: Branch hints were put in P4
   based on simulation results.  But after P4 was made, no performance benefit
   was observed with branch hints.  It also increases the code size.
   As a result, icc never generates branch hints.  */
DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN, "branch_prediction_hints_not_taken", m_NONE)

/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic.  */
DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)

/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit
   arithmetic to 32bit via the PROMOTE_MODE macro.  This code generation scheme
   is usually used for RISC targets.  */
DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", m_NONE)

/* X86_TUNE_SLOW_STC: This disables use of the stc, clc and cmc carry flag
   modifications on architectures where these operations are slow.  */
DEF_TUNE (X86_TUNE_SLOW_STC, "slow_stc", m_PENT4)

/* X86_TUNE_USE_RCR: Controls use of the rcr 1 instruction instead of shrd.  */
DEF_TUNE (X86_TUNE_USE_RCR, "use_rcr", m_AMD_MULTIPLE)