//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCodeGenPassBuilder.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPURegBankSelect.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600MachineFunctionInfo.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
using namespace llvm;
using namespace llvm::PatternMatch;

namespace {

class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const MachineRegisterInfo &MRI,
                              const Register Reg) {
  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}
/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
    SGPRRegAlloc("sgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
    VGPRRegAlloc("vgpr-regalloc", cl::Hidden,
                 cl::init(&useDefaultRegisterAllocator),
                 cl::desc("Register allocator to use for VGPRs"));
static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}
static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}
static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);

static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);

static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);

} // anonymous namespace
static cl::opt<bool>
    EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                            cl::desc("Run early if-conversion"));

static cl::opt<bool>
    OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                     cl::desc("Run pre-RA exec mask optimizations"));

static cl::opt<bool>
    LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
                  cl::desc("Lower GPU ctor / dtors to globals on the device."),
                  cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
    "amdgpu-load-store-vectorizer",
    cl::desc("Enable load store vectorizer"));

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
    "amdgpu-scalarize-global-loads",
    cl::desc("Enable global load scalarization"));

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
    "amdgpu-internalize-symbols",
    cl::desc("Enable elimination of non-kernel functions and unused globals"));

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
    "amdgpu-early-inline-all",
    cl::desc("Inline all functions early"));

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"));

static cl::opt<bool> EnableSDWAPeephole(
    "amdgpu-sdwa-peephole",
    cl::desc("Enable SDWA peepholer"));

static cl::opt<bool> EnableDPPCombine(
    "amdgpu-dpp-combine",
    cl::desc("Enable DPP combiner"));

// Enable address space based alias analysis
static cl::opt<bool>
    EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
                              cl::desc("Enable AMDGPU Alias Analysis"));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
    "amdgpu-late-structurize",
    cl::desc("Enable late CFG structurization"),
    cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG));

// Disable structurizer-based control-flow lowering in order to test convergence
// control tokens. This should eventually be replaced by the wave-transform.
static cl::opt<bool, true> DisableStructurizer(
    "amdgpu-disable-structurizer",
    cl::desc("Disable structurizer for experiments; produces unusable code"),
    cl::location(AMDGPUTargetMachine::DisableStructurizer), cl::ReallyHidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
    "amdgpu-simplify-libcall",
    cl::desc("Enable amdgpu library simplifications"));

static cl::opt<bool> EnableLowerKernelArguments(
    "amdgpu-ir-lower-kernel-arguments",
    cl::desc("Lower kernel argument loads in IR pass"));

static cl::opt<bool> EnableRegReassign(
    "amdgpu-reassign-regs",
    cl::desc("Enable register reassign optimizations on gfx10+"));

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
    "amdgpu-mode-register",
    cl::desc("Enable mode register pass"));

// Enable GFX11.5+ s_singleuse_vdst insertion
static cl::opt<bool>
    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
                              cl::desc("Enable s_singleuse_vdst insertion"),
                              cl::init(false), cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
    EnableDCEInRA("amdgpu-dce-in-ra",
                  cl::init(true), cl::Hidden,
                  cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
    "amdgpu-scalar-ir-passes",
    cl::desc("Enable scalar IR passes"));

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true));

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true));

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true));

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableImageIntrinsicOptimizer(
    "amdgpu-enable-image-intrinsic-optimizer",
    cl::desc("Enable image intrinsic optimizer pass"), cl::init(true));

static cl::opt<bool>
    EnableLoopPrefetch("amdgpu-loop-prefetch",
                       cl::desc("Enable loop data prefetch on AMDGPU"),
                       cl::Hidden, cl::init(false));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
    "amdgpu-enable-max-ilp-scheduling-strategy",
    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(true));

static cl::opt<bool> EnableHipStdPar(
    "amdgpu-enable-hipstdpar",
    cl::desc("Enable HIP Standard Parallelism Offload support"),
    cl::init(false));
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
  initializeSILowerWWMCopiesPass(*PR);
  initializeAMDGPUMarkLastScratchLoadPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorLegacyPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
  initializeAMDGPULowerModuleLDSLegacyPass(*PR);
  initializeAMDGPULowerBufferFatPointersPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
  initializeAMDGPUInsertDelayAluPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeGCNCreateVOPDPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
  initializeGCNPreRALongBranchRegPass(*PR);
  initializeGCNRewritePartialRegUsesPass(*PR);
  initializeGCNRegPressurePrinterPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}
static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(
      C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}
static MachineSchedRegistry
    SISchedRegistry("si", "Run SI's custom scheduler",
                    createSIMachineScheduler);

static MachineSchedRegistry
    GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                                 "Run GCN scheduler to maximize occupancy",
                                 createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                           createGCNMaxILPMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);
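// Illustrative only: any of the registries above can be selected through the
// generic machine-scheduler override, e.g. llc -misched=gcn-max-ilp, instead
// of relying on the target's default choice in createMachineScheduler().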
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
         "v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
         "G1";
}
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         const TargetOptions &Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOptLevel OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
bool AMDGPUTargetMachine::DisableStructurizer = false;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().starts_with("__asan_") ||
           F->getName().starts_with("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}
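// Note (summary of the predicate above): declarations, __asan_* and
// __sanitizer_* runtime hooks, entry points such as amdgpu_kernel functions,
// and any global that still has uses are preserved; everything else may be
// internalized and later dropped by GlobalDCE.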
void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}
static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
  if (Params.empty())
    return ScanOptions::Iterative;

  Params.consume_front("strategy=");
  auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
                    .Case("dpp", ScanOptions::DPP)
                    .Cases("iterative", "", ScanOptions::Iterative)
                    .Case("none", ScanOptions::None)
                    .Default(std::nullopt);

  if (Result)
    return *Result;

  return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}
Error AMDGPUTargetMachine::buildCodeGenPipeline(
    ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
    CodeGenFileType FileType, const CGPassBuilderOption &Opts,
    PassInstrumentationCallbacks *PIC) {
  AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
  return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {

#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"

  PB.registerPipelineStartEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
        if (EnableHipStdPar)
          PM.addPass(HipStdParAcceleratorCodeSelectionPass());
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerPeepholeEPCallback(
      [](FunctionPassManager &FPM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify)
          FPM.addPass(AMDGPUSimplifyLibCallsPass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });

  // FIXME: Why is AMDGPUAttributor not in CGSCC?
  PB.registerOptimizerLastEPCallback(
      [this](ModulePassManager &MPM, OptimizationLevel Level) {
        if (Level != OptimizationLevel::O0) {
          MPM.addPass(AMDGPUAttributorPass(*this));
        }
      });

  PB.registerFullLinkTimeOptimizationLastEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        // We want to support the -lto-partitions=N option as "best effort".
        // For that, we need to lower LDS earlier in the pipeline before the
        // module is partitioned for codegen.
        if (EnableLowerModuleLDS)
          PM.addPass(AMDGPULowerModuleLDSPass(*this));
      });

  PB.registerRegClassFilterParsingCallback(
      [](StringRef FilterName) -> RegAllocFilterFunc {
        if (FilterName == "sgpr")
          return onlyAllocateSGPRs;
        if (FilterName == "vgpr")
          return onlyAllocateVGPRs;
        return nullptr;
      });
}
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}
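// In other words (informal note): in the LDS, private and region address
// spaces the null pointer is encoded as the all-ones value -1, since address
// 0 is a valid, allocatable location there; flat and global keep 0 as null.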
bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}
unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}
std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }

  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}
unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}
bool AMDGPUTargetMachine::splitModule(
    Module &M, unsigned NumParts,
    function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
  // FIXME(?): Would be better to use an already existing Analysis/PassManager,
  // but all current users of this API don't have one ready and would need to
  // create one anyway. Let's hide the boilerplate for now to keep it simple.

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB(this);
  PB.registerModuleAnalyses(MAM);
  PB.registerFunctionAnalyses(FAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
  MPM.run(M, MAM);
  return true;
}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   const TargetOptions &Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOptLevel OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}
namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(
        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
    if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace
AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOptLevel::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
    addPass(createLoopDataPrefetchPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}
void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  Triple::ArchType Arch = TM.getTargetTriple().getArch();
  if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
    addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  if (isPassEnabled(EnableImageIntrinsicOptimizer))
    addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));

  // This can be disabled by passing ::Disable here or on the command line
  // with --expand-variadics-override=disable.
  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (Arch == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
  }

  if (TM.getOptLevel() > CodeGenOptLevel::None)
    addPass(createInferAddressSpacesPass());

  // Run atomic optimizer before Atomic Expand
  if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
      (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  addPass(createAtomicExpandLegacyPass());

  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOptLevel::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = mul %a, 4
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    // This lowering has been placed after codegenprepare to take advantage of
    // address mode matching (which is why it isn't put with the LDS lowerings).
    // It could be placed anywhere before uniformity annotations (an analysis
    // that it changes by splitting up fat pointers into their components)
    // but has been put before switch lowering and CFG flattening so that those
    // passes can run on the more optimized control flow this pass creates in
    // many cases.
    //
    // FIXME: This should ideally be put after the LoadStoreVectorizer.
    // However, due to some annoying facts about ResourceUsageAnalysis,
    // (especially as exercised in the resource-usage-dead-function test),
    // we need all the function passes codegenprepare all the way through
    // said resource usage analysis to run on the call graph produced
    // before codegenprepare runs (because codegenprepare will knock some
    // nodes out of the graph, which leads to function-level passes not
    // being run on them, which causes crashes in the resource usage analysis).
    addPass(createAMDGPULowerBufferFatPointersPass());
    // In accordance with the above FIXME, manually force all the
    // function-level passes into a CGSCCPassManager.
    addPass(new DummyCGSCCPass());
  }

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better, so that these blocks would get cleaned up by
  // UnreachableBlockElim inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}
bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}
llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}
MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
      Allocator, F, static_cast<const R600Subtarget *>(STI));
}
//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  if (EnableMaxIlpSchedStrategy)
    return createGCNMaxILPMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}
bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSinkingPass());

  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize && !DisableStructurizer) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize && !DisableStructurizer) {
    addPass(createSIAnnotateControlFlowPass());
    // TODO: Move this right after structurizeCFG to avoid extra divergence
    // analysis. This depends on stopping SIAnnotateControlFlow from making
    // control flow modifications.
    addPass(createAMDGPURewriteUndefForPHILegacyPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}
void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}
bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new AMDGPURegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}
void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);

  TargetPassConfig::addFastRegAlloc();
}
void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOptLevel::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}
bool GCNPassConfig::addPreRewrite() {
  addPass(&SILowerWWMCopiesID);
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}
FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}
static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(false));

  addPass(&SILowerWWMCopiesID);
  return true;
}
bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);
  addPass(&SIPreAllocateWWMRegsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  addPass(&AMDGPUMarkLastScratchLoadID);

  return true;
}
void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOptLevel::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}
void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOptLevel::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertSingleUseVDSTID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}
MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}
yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}
bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  if (ST.hasIEEEMode())
    MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  if (ST.hasDX10ClampMode())
    MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;