//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));
// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(true),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
    cl::init(true), cl::Hidden,
    cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);
extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFixupVectorISelPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
  initializeGCNRegBankReassignPass(*PR);
  initializeGCNNSAReassignPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                 createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
         "-ni:7";
}
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}
void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
                                            legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//
R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);

  // Override the default since calls aren't supported for r600.
  if (EnableFunctionCalls &&
      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
    EnableFunctionCalls = false;
}
const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//
namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;

  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};

std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}
class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};
class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace
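
// GVN subsumes EarlyCSE but is more expensive, so it is only used at the
// highest optimization level; lower levels fall back to EarlyCSE.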
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}
void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAtomicExpandPass());

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    if (EnableScalarIRPasses)
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
    addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
    addPass(createAMDGPUAnnotateKernelFeaturesPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  addPass(&AMDGPUPerfHintAnalysisID);

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}
bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  // Defer the verifier until FinalizeISel.
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}
//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//
bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}
TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}
//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//
ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}
bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  return false;
}
void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}
bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}
bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  addPass(createSIFixupVectorISelPass());
  addPass(createSIAddIMGInitPass());
  // FIXME: Remove this once the phi on CF_END is cleaned up by either removing
  // LCSSA or other ways.
  addPass(&UnreachableMachineBlockElimID);
  return false;
}
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}
void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}
void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  TargetPassConfig::addFastRegAlloc();
}
void GCNPassConfig::addOptimizedRegAlloc() {
  if (OptExecMaskPreRA) {
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
    insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
  } else {
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
  }

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  if (EnableDCEInRA)
    insertPass(&RenameIndependentSubregsID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}
bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign) {
    addPass(&GCNNSAReassignID);
    addPass(&GCNRegBankReassignID);
  }
  return true;
}
void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
}
void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIModeRegisterPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // hazard types.
  //
  // FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
  // be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(&BranchRelaxationPassID);
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}
yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}
yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(*MFI,
                                         *MF.getSubtarget().getRegisterInfo());
}
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MFI->initializeBaseYamlFields(YamlMFI);

  auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
    if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      unsigned Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SReg_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  return false;
}