//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;
static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
  "amdgpu-function-calls",
  cl::desc("Enable AMDGPU function call support"),
  cl::location(AMDGPUTargetMachine::EnableFunctionCalls),
  cl::init(true),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);
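
// Note: the cl::opt flags above are developer and testing knobs rather than
// stable user options. They are typically toggled on the llc command line, for
// example "llc -march=amdgcn -amdgpu-sdwa-peephole=0 ...", or from a clang
// invocation via "-mllvm -amdgpu-sdwa-peephole=0".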

extern "C" void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFixupVectorISelPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUFixFunctionBitcastsPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPULowerIntrinsicsPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPUPropagateAttributesEarlyPass(*PR);
  initializeAMDGPUPropagateAttributesLatePass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIInsertSkipsPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUInlinerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeGCNRegBankReassignPass(*PR);
  initializeGCNNSAReassignPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}
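
// The cluster and macro-fusion mutations above add artificial DAG edges so
// that nearby loads and stores get scheduled back to back and fusable
// instruction pairs stay adjacent; similar mutations are applied to the
// iterative schedulers below.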

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);
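
// Each MachineSchedRegistry entry above makes its scheduler selectable by
// name, e.g. "llc -misched=gcn-max-occupancy" or "-misched=gcn-ilp". When no
// scheduler is requested explicitly, the target chooses one through
// createMachineScheduler() in the pass configs below.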

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
         "-ni:7";
}

static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || AMDGPU::isEntryFunctionCC(F->getCallingConv());

  return !GV.use_empty();
}
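
// In effect, Internalize driven by this predicate keeps kernels (entry
// functions) and external declarations visible, while a device helper function
// that is only reachable from a kernel can be internalized and later removed
// by GlobalDCE once inlining leaves it unused.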

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createAMDGPUFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPrintfRuntimeBinding());
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize) {
        PM.add(createInternalizePass(mustPreserveGV));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  const auto &Opt = Options;
  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
                                            legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());
  });
}
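
// These extension points only affect the middle-end pipeline built with the
// legacy PassManagerBuilder (the opt/clang optimization pipeline); the codegen
// pipeline itself is assembled by the pass configs below.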

//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     Optional<CodeModel::Model> CM,
                                     CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);

  // Override the default since calls aren't supported for r600.
  if (EnableFunctionCalls &&
      EnableAMDGPUFunctionCallsOpt.getNumOccurrences() == 0)
    EnableFunctionCalls = false;
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

TargetTransformInfo
R600TargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(R600TTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;

  std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};

std::unique_ptr<CSEConfigBase> AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  bool addInstSelector() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;
  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());

  // This must occur before inlining, as the inliner will not look through
  // bitcast calls.
  addPass(createAMDGPUFixFunctionBitcastsPass());

  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAtomicExpandPass());

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfig's passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    if (EnableScalarIRPasses)
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn)
    addPass(createAMDGPUAnnotateKernelFeaturesPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  addPass(&AMDGPUPerfHintAnalysisID);

  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createLowerSwitchPass());
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  // Defer the verifier until FinalizeISel.
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()), false);
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

bool R600PassConfig::addInstSelector() {
  addPass(createR600ISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableAtomicOptimizations) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  addPass(createLCSSAPass());

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  addPass(createSIFixupVectorISelPass());
  addPass(createSIAddIMGInitPass());
  // FIXME: Remove this once the phi on CF_END is cleaned up by either removing
  // LCSSA or other ways.
  addPass(&UnreachableMachineBlockElimID);
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  if (OptExecMaskPreRA) {
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
    insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
  } else {
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
  }

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  // This must be run just after RegisterCoalescing.
  insertPass(&RegisterCoalescerID, &SIPreAllocateWWMRegsID, false);

  if (EnableDCEInRA)
    insertPass(&RenameIndependentSubregsID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign) {
    addPass(&GCNNSAReassignID);
    addPass(&GCNRegBankReassignID);
  }
  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());
  addPass(createSIShrinkInstructionsPass());
  addPass(createSIModeRegisterPass());

  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // hazards.
  //
  // FIXME: This stand-alone pass will emit individual S_NOP 0, as needed. It
  // would be better for it to emit S_NOP <N> when possible.
  addPass(&PostRAHazardRecognizerID);

  addPass(&SIInsertSkipsPassID);
  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(*MFI,
                                         *MF.getSubtarget().getRegisterInfo());
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      reinterpret_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  MFI->initializeBaseYamlFields(YamlMFI);

  auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
    if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }

    return false;
  };

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      unsigned Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, A->Mask.getValue());

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SReg_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  return false;
}
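
// For reference, the fields parsed above come from the target-specific
// machineFunctionInfo block of a MIR file. A minimal sketch is shown below;
// the exact key names are defined by SIMachineFunctionInfo's YAML mapping and
// may differ slightly from this illustration:
//
//   machineFunctionInfo:
//     scratchRSrcReg:    '$sgpr0_sgpr1_sgpr2_sgpr3'
//     frameOffsetReg:    '$sgpr33'
//     stackPtrOffsetReg: '$sgpr32'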