//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCalls(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(false),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

140 extern "C" void LLVMInitializeAMDGPUTarget() {
141 // Register the target
142 RegisterTargetMachine
<R600TargetMachine
> X(getTheAMDGPUTarget());
143 RegisterTargetMachine
<GCNTargetMachine
> Y(getTheGCNTarget());
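
  // Initialize the target-specific passes with the PassRegistry so the pass
  // manager infrastructure knows about them and they can be referenced by name.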
  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIDebuggerInsertNopsPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIFixWWMLivenessPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

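// Register the custom schedulers with the MachineScheduler framework so they
// can be selected on the command line via -misched=<name>.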
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
}

static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  if (TT.getArch() == Triple::amdgcn)
    return "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM) {
  if (CM)
    return *CM;
  return CodeModel::Small;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM),
                      getEffectiveCodeModel(CM), OptLevel),
    TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

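// Expose the AMDGPU address-space alias analysis results to generic passes
// through the ExternalAAWrapperPass hook.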
static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
        AAR.addAAResult(WrapperPass->getResult());
      });
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableAMDGPUFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableAMDGPUFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
                                         legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPULowerIntrinsicsPass());

  if (TM.getTargetTriple().getArch() == Triple::r600 ||
      !EnableAMDGPUFunctionCalls) {
    // Function calls are not supported, so make sure we inline everything.
    addPass(createAMDGPUAlwaysInlinePass());
    addPass(createAlwaysInlinerLegacyPass());
    // We need to add the barrier noop pass, otherwise adding the function
    // inlining pass will cause all of the PassConfigs passes to be run
    // one function at a time, which means if we have a module with two
    // functions, then we will generate code for the first function
    // without ever running any passes on the second.
    addPass(createBarrierNoopPass());
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(createAMDGPUAnnotateKernelFeaturesPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run after SILowerControlFlow, since it needs to use the
  // machine-level CFG, but before register allocation.
  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // hazard types.
  //
  // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
  // be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}