//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
/// code.  When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDKernelCodeTUtils.h"
#include "Utils/SIDefinesUtils.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
// This should get the default rounding mode from the kernel. We just set the
// default here, but this could change if the OpenCL rounding mode pragmas are
// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
// and sin_f32, cos_f32 on most parts).

// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
         FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}
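// Illustrative note (the exact field layout is an assumption taken from the
// SIDefines.h macros, not restated here): each FP_ROUND_*/FP_DENORM_* macro
// above fills a two-bit slice of the 8-bit FLOAT_MODE value. With
// round-to-nearest (0) for both precisions and denormals fully enabled (3)
// for both SP and DP, the packed result would be 0xF0 -- round bits in [3:0],
// denorm bits in [7:4].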
static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}
84 extern "C" LLVM_EXTERNAL_VISIBILITY
void LLVMInitializeAMDGPUAsmPrinter() {
85 TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
86 llvm::createR600AsmPrinterPass
);
87 TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
88 createAMDGPUAsmPrinterPass
);
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
    : AsmPrinter(TM, std::move(Streamer)) {
  assert(OutStreamer && "AsmPrinter constructed without streamer");
}
StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}
const MCSubtargetInfo *AMDGPUAsmPrinter::getGlobalSTI() const {
  return TM.getMCSubtargetInfo();
}
AMDGPUTargetStreamer *AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;
  return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
}
void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
  IsTargetStreamerInitialized = false;
}
void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
  IsTargetStreamerInitialized = true;

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
    initializeTargetID(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  getTargetStreamer()->EmitDirectiveAMDGCNTarget();

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
        CodeObjectVersion);
    HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
  }

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
  // Init target streamer if it has not yet happened
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(M);

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    getTargetStreamer()->EmitISAVersion();

  // Emit HSA Metadata (NT_AMD_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream->end();
    bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
    (void)Success;
    assert(Success && "Malformed HSA Metadata");
  }
}
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();

  // TODO: We're checking this late, would be nice to check it earlier.
  if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
    report_fatal_error(
        STM.getCPU() + " is only available on code object version 6 or better",
        /*gen_crash_diag*/ false);
  }

  // TODO: Which one is called first, emitStartOfAsmFile or
  // emitFunctionBodyStart?
  if (!getTargetStreamer()->getTargetID())
    initializeTargetID(*F.getParent());

  const auto &FunctionTargetID = STM.getTargetID();
  // Make sure function's xnack settings are compatible with module's
  // settings.
  if (FunctionTargetID.isXnackSupported() &&
      FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getXnackSetting() !=
          getTargetStreamer()->getTargetID()->getXnackSetting()) {
    OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
                                   "' function does not match module xnack setting");
    return;
  }
  // Make sure function's sramecc settings are compatible with module's
  // settings.
  if (FunctionTargetID.isSramEccSupported() &&
      FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
      FunctionTargetID.getSramEccSetting() !=
          getTargetStreamer()->getTargetID()->getSramEccSetting()) {
    OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
                                   "' function does not match module sramecc setting");
    return;
  }

  if (!MFI.isEntryFunction())
    return;

  if (STM.isMesaKernel(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    AMDGPUMCKernelCodeT KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    KernelCode.validate(&STM, MF->getContext());
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (STM.isAmdHsaOS())
    HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);

  if (MFI.getNumKernargPreloadedSGPRs() > 0) {
    assert(AMDGPU::hasKernargPreload(STM));
    getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI(),
                                                  STM.isAmdHsaOS());
  }
}
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.pushSection();
  Streamer.switchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
  ReadOnlySection.ensureMinAlignment(Align(64));

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
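  // The SGPR operand below deliberately subtracts the extra SGPRs (VCC,
  // FLAT_SCRATCH, XNACK) from NumSGPRsForWavesPerEU via createSub; presumably
  // the target streamer accounts for them again when it granulates the
  // register counts for the descriptor.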
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      MCBinaryExpr::createSub(
          CurrentProgramInfo.NumSGPRsForWavesPerEU,
          AMDGPUMCExpr::createExtraSGPRs(
              CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
              getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
          Context),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);

  Streamer.popSection();
}
void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
  Register RegNo = MI->getOperand(0).getReg();

  SmallString<128> Str;
  raw_svector_ostream OS(Str);
  OS << "implicit-def: "
     << printReg(RegNo, MF->getSubtarget().getRegisterInfo());

  if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
    OS << " : SGPR spill to VGPR lane";

  OutStreamer->AddComment(OS.str());
  OutStreamer->addBlankLine();
}
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    AsmPrinter::emitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction()),
    getTargetStreamer()->EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }
  if (DumpCodeInstEmitter) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }

  AsmPrinter::emitFunctionEntryLabel();
}
void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
  if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.emplace_back("");
  }
  AsmPrinter::emitBasicBlockStart(MBB);
}
void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
      OutContext.reportError({},
                             Twine(GV->getName()) +
                                 ": unsupported initializer for address space");
      return;
    }

    // LDS variables aren't emitted in HSA or PAL yet.
    const Triple::OSType OS = TM.getTargetTriple().getOS();
    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
      return;

    MCSymbol *GVSym = getSymbol(GV);

    GVSym->redefineIfPossible();
    if (GVSym->isDefined() || GVSym->isVariable())
      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
                         "' is already defined");

    const DataLayout &DL = GV->getDataLayout();
    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
    Align Alignment = GV->getAlign().value_or(Align(4));

    emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
    emitLinkage(GV, GVSym);
    auto TS = getTargetStreamer();
    TS->emitAMDGPULDS(GVSym, Size, Alignment);
    return;
  }

  AsmPrinter::emitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::doInitialization(Module &M) {
  CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    switch (CodeObjectVersion) {
    case AMDGPU::AMDHSA_COV4:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
      break;
    case AMDGPU::AMDHSA_COV5:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
      break;
    case AMDGPU::AMDHSA_COV6:
      HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
      break;
    default:
      report_fatal_error("Unexpected code object version");
    }
  }

  return AsmPrinter::doInitialization(M);
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  // Pad with s_code_end to help tools and guard against instruction prefetch
  // causing stale data in caches. Arguably this should be done by the linker,
  // which is why this isn't done for Mesa.
  const MCSubtargetInfo &STI = *getGlobalSTI();
  if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
    OutStreamer->switchSection(getObjFileLowering().getTextSection());
    getTargetStreamer()->EmitCodeEnd(STI);
  }

  return AsmPrinter::doFinalization(M);
}
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
    uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  if (NumAGPR) {
    OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
    OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
                                false);
  }
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}
SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
  SmallString<128> Str;
  raw_svector_ostream OSS(Str);
  int64_t IVal;
  if (Value->evaluateAsAbsolute(IVal)) {
    OSS << static_cast<uint64_t>(IVal);
  } else {
    Value->print(OSS, MAI);
  }
  return Str;
}
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
    const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
  if (NumAGPR && TotalNumVGPR) {
    OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
    OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
                                false);
  }
  OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
                              false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}
const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();
  uint16_t KernelCodeProperties = 0;
  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();

  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (UserSGPRInfo.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (UserSGPRInfo.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (UserSGPRInfo.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (UserSGPRInfo.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
  if (UserSGPRInfo.hasPrivateSegmentSize()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
  }
  if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
  }

  // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
  // un-evaluatable at this point so it cannot be conditionally checked here.
  // Instead, we'll directly shift the possibly unknown MCExpr into its place
  // and bitwise-or it into KernelCodeProperties.
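  // In effect this computes:
  //   KernelCodeProperties | (DynamicCallStack << USES_DYNAMIC_STACK_SHIFT)
  // with both the shift and the or built as MCExprs.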
  const MCExpr *KernelCodePropExpr =
      MCConstantExpr::create(KernelCodeProperties, Ctx);
  const MCExpr *OrValue = MCConstantExpr::create(
      amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
  OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
                                    OrValue, Ctx);
  KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);

  return KernelCodePropExpr;
}
MCKernelDescriptor
AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
                                            const SIProgramInfo &PI) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  MCContext &Ctx = MF.getContext();

  MCKernelDescriptor KernelDescriptor;

  KernelDescriptor.group_segment_fixed_size =
      MCConstantExpr::create(PI.LDSSize, Ctx);
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;

  Align MaxKernArgAlign;
  KernelDescriptor.kernarg_size = MCConstantExpr::create(
      STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);

  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  int64_t PGRM_Rsrc3 = 1;
  bool EvaluatableRsrc3 =
      CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
  (void)PGRM_Rsrc3;
  (void)EvaluatableRsrc3;
  assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
         static_cast<uint64_t>(PGRM_Rsrc3) == 0);
  KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;

  KernelDescriptor.kernarg_preload = MCConstantExpr::create(
      AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
      Ctx);

  return KernelDescriptor;
}
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // Init target streamer lazily on the first function so that previous passes
  // can set metadata.
  if (!IsTargetStreamerInitialized)
    initTargetStreamer(*MF.getFunction().getParent());

  ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
  CurrentProgramInfo.reset(MF);

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
  MCContext &Ctx = MF.getContext();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(ConfigSection);
  }

  if (MFI->isModuleEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  }

  if (STM.isAmdPalOS()) {
    if (MFI->isEntryFunction())
      EmitPALMetadata(MF, CurrentProgramInfo);
    else if (MFI->isModuleEntryFunction())
      emitPALFunctionMetadata(MF);
  } else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DumpCodeInstEmitter = nullptr;
  if (STM.dumpCode()) {
    // For -dumpcode, get the assembler out of the streamer. This only works
    // with -filetype=obj.
    MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
    if (Assembler)
      DumpCodeInstEmitter = Assembler->getEmitterPtr();
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  emitFunctionBody();

  emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
                           STM.hasMAIInsts());

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->switchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);
      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
          ResourceUsage->getResourceInfo(&MF.getFunction());
      emitCommonFunctionComments(
          Info.NumVGPR,
          STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
          Info.getTotalNumVGPRs(STM),
          Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
          Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(
        CurrentProgramInfo.NumArchVGPR,
        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
        CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
        " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);

    OutStreamer->emitRawComment(
        " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
        false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
            getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
        false);

    if (STM.hasGFX90AInsts()) {
      const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
          CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
      AdjustedAccum = MCBinaryExpr::createMul(
          AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
      OutStreamer->emitRawComment(
          " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
    }

    OutStreamer->emitRawComment(
        " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
            getMCExprStr(CurrentProgramInfo.ScratchEnable),
        false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                    Twine(CurrentProgramInfo.UserSGPR),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
                                    Twine(CurrentProgramInfo.TrapHandlerEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                    Twine(CurrentProgramInfo.TGIdXEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
                                    Twine(CurrentProgramInfo.TGIdYEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
                                    Twine(CurrentProgramInfo.TGIdZEnable),
                                false);
    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
                                    Twine(CurrentProgramInfo.TIdIGCompCount),
                                false);

    [[maybe_unused]] int64_t PGMRSrc3;
    assert(STM.hasGFX90AInsts() ||
           (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
                PGMRSrc3) &&
            static_cast<uint64_t>(PGMRSrc3) == 0));
    if (STM.hasGFX90AInsts()) {
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
          false);
      OutStreamer->emitRawComment(
          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
              getMCExprStr(MCKernelDescriptor::bits_get(
                  CurrentProgramInfo.ComputePGMRSrc3GFX90A,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
          false);
    }
  }

  if (DumpCodeInstEmitter) {
    OutStreamer->switchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->emitBytes(StringRef(DisasmLines[i]));
      OutStreamer->emitBytes(StringRef(Comment));
    }
  }

  return false;
}
// TODO: Fold this into emitFunctionBodyStart.
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
  // In the beginning all features are either 'Any' or 'NotSupported',
  // depending on global target features. This will cover empty modules.
  getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
                                          getGlobalSTI()->getFeatureString());

  // If module is empty, we are done.
  if (M.empty())
    return;

  // If module is not empty, need to find first 'Off' or 'On' feature
  // setting per feature from functions in module.
  for (auto &F : M) {
    auto &TSTargetID = getTargetStreamer()->getTargetID();
    if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
        (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
      break;

    const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
    const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    if (TSTargetID->isXnackSupported())
      if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
    if (TSTargetID->isSramEccSupported())
      if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
        TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
  }
}
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugInstr())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
      ResourceUsage->getResourceInfo(&MF.getFunction());
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  auto CreateExpr = [&Ctx](int64_t Value) {
    return MCConstantExpr::create(Value, Ctx);
  };

  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val)) {
      Res = Val;
      return true;
    }
    return false;
  };

  ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
  ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
  ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
  ProgInfo.AccumOffset =
      CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
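  // Worked example: 17 arch VGPRs give alignTo(max(1, 17), 4) / 4 - 1
  // == 20 / 4 - 1 == 4, i.e. the AGPR base offset in four-VGPR granules,
  // biased down by one as the hardware field expects.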
  ProgInfo.TgSplit = STM.isTgSplitEnabled();
  ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
  ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
  ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
  ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
  ProgInfo.DynamicCallStack =
      CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);

  const uint64_t MaxScratchPerWorkitem =
      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
  uint64_t ScratchSize;
  if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
      ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
                                          MaxScratchPerWorkitem, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
      ProgInfo.VCCUsed, ProgInfo.FlatUsed,
      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(
          MF.getFunction(), "addressable scalar registers", NumSgpr,
          MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);

  const Function &F = MF.getFunction();

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;

  if (isShader(F.getCallingConv())) {
    bool IsPixelShader =
        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();

    // Calculate the number of VGPR registers based on the SPI input registers
    uint32_t InputEna = 0;
    uint32_t InputAddr = 0;
    unsigned LastEna = 0;

    if (IsPixelShader) {
      // Note for IsPixelShader:
      // By this stage, all enabled inputs are tagged in InputAddr as well.
      // We will use InputAddr to determine whether the input counts against the
      // vgpr total and only use the InputEnable to determine the last input
      // that is relevant - if extra arguments are used, then we have to honour
      // the InputAddr for any intermediate non-enabled inputs.
      InputEna = MFI->getPSInputEnable();
      InputAddr = MFI->getPSInputAddr();

      // We only need to consider input args up to the last used arg.
      assert((InputEna || InputAddr) &&
             "PSInputAddr and PSInputEnable should "
             "never both be 0 for AMDGPU_PS shaders");
      // There are some rare circumstances where InputAddr is non-zero and
      // InputEna can be set to 0. In this case we default to setting LastEna
      // to 1.
      LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
    }
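    // Illustrative example: InputEna == 0b1011 yields LastEna ==
    // Log2_32(0b1011) + 1 == 4, so inputs 0..3 are treated as live by the
    // argument walk below.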
    // FIXME: We should be using the number of registers determined during
    // calling convention lowering to legalize the types.
    const DataLayout &DL = F.getDataLayout();
    unsigned PSArgCount = 0;
    unsigned IntermediateVGPR = 0;
    for (auto &Arg : F.args()) {
      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
      if (Arg.hasAttribute(Attribute::InReg)) {
        WaveDispatchNumSGPR += NumRegs;
      } else {
        // If this is a PS shader and we're processing the PS Input args (first
        // 16 VGPR), use the InputEna and InputAddr bits to define how many
        // VGPRs are actually used.
        // Any extra VGPR arguments are handled as normal arguments (and
        // contribute to the VGPR count whether they're used or not).
        if (IsPixelShader && PSArgCount < 16) {
          if ((1 << PSArgCount) & InputAddr) {
            if (PSArgCount < LastEna)
              WaveDispatchNumVGPR += NumRegs;
            else
              IntermediateVGPR += NumRegs;
          }
          PSArgCount++;
        } else {
          // If there are extra arguments we have to include the allocation for
          // the non-used (but enabled with InputAddr) input arguments
          if (IntermediateVGPR) {
            WaveDispatchNumVGPR += IntermediateVGPR;
            IntermediateVGPR = 0;
          }
          WaveDispatchNumVGPR += NumRegs;
        }
      }
    }

    ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);

    ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);

    ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
        ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
  }
  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  unsigned MaxWaves = MFI->getMaxWavesPerEU();
  ProgInfo.NumSGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
                              Ctx);
  ProgInfo.NumVGPRsForWavesPerEU =
      AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
                               CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
                              Ctx);

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    uint64_t NumSgpr;
    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
        NumSgpr > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
                                       NumSgpr, MaxAddressableNumSGPRs,
                                       DS_Error, DK_ResourceLimit);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
    ProgInfo.NumSGPRsForWavesPerEU =
        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(),
                                     STM.getMaxNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() >
      static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(
        MF.getFunction(), "local memory", MFI->getLDSSize(),
        STM.getAddressableLocalMemorySize(), DS_Error);
    Ctx.diagnose(Diag);
  }
  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
                                             unsigned Granule) {
    const MCExpr *OneConst = CreateExpr(1ul);
    const MCExpr *GranuleConst = CreateExpr(Granule);
    const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
    const MCExpr *AlignToGPR =
        AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
    const MCExpr *DivGPR =
        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
    return SubGPR;
  };
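  // Sketch of the arithmetic, assuming (for illustration only) an encoding
  // granule of 8: 10 GPRs -> alignTo(max(1, 10), 8) / 8 - 1 == 16 / 8 - 1
  // == 1 block.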
  ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
                                        IsaInfo::getSGPREncodingGranule(&STM));
  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
                                        IsaInfo::getVGPREncodingGranule(&STM));

  const SIModeRegisterDefaults Mode = MFI->getMode();

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(Mode);

  ProgInfo.IEEEMode = Mode.IEEE;

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = Mode.DX10Clamp;

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
  ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();

  ProgInfo.LDSSize = MFI->getLDSSize();
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
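  // Worked example (post-SI, 128-dword == 512-byte granule): an LDSSize of
  // 1000 bytes gives alignTo(1000, 512) >> 9 == 1024 >> 9 == 2 blocks.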
  // The MCExpr equivalent of divideCeil.
  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
    const MCExpr *Ceil =
        AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
  };

  // Scratch is allocated in 64-dword or 256-dword blocks.
  unsigned ScratchAlignShift =
      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks = DivideCeil(
      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
                              CreateExpr(STM.getWavefrontSize()), Ctx),
      CreateExpr(1ULL << ScratchAlignShift));
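  // Worked example (pre-GFX11, 256-dword == 1024-byte granule): 16 scratch
  // bytes per lane at wave64 is 16 * 64 == 1024 bytes per wave, i.e. exactly
  // one scratch block.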
  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
    ProgInfo.MemOrdered = 1;
  }

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // The private segment wave byte offset is the last of the system SGPRs. We
  // initially assumed it was allocated, and may have used it. It shouldn't harm
  // anything to disable it if we know the stack isn't used here. We may still
  // have emitted code reading it to initialize scratch, but if that's unused
  // reading garbage should be OK.
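  // In MC terms: ScratchEnable = (ScratchBlocks > 0) || DynamicCallStack,
  // kept symbolic because DynamicCallStack may not be resolvable yet.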
  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
                             MCConstantExpr::create(0, Ctx), Ctx),
      ProgInfo.DynamicCallStack, Ctx);

  ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
  // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
  ProgInfo.TrapHandlerEnable =
      STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
  ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
  ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
  ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
  ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
  ProgInfo.TIdIGCompCount = TIDIGCompCnt;
  ProgInfo.EXCPEnMSB = 0;
  // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
  ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
  ProgInfo.EXCPEnable = 0;

  if (STM.hasGFX90AInsts()) {
    // return ((Dst & ~Mask) | (Value << Shift))
    auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
                          uint32_t Shift) {
      auto Shft = MCConstantExpr::create(Shift, Ctx);
      auto Msk = MCConstantExpr::create(Mask, Ctx);
      Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
      Dst = MCBinaryExpr::createOr(
          Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
      return Dst;
    };

    ProgInfo.ComputePGMRSrc3GFX90A =
        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
    ProgInfo.ComputePGMRSrc3GFX90A =
        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
  }

  ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
      STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
      ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);

  const auto [MinWEU, MaxWEU] =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
  uint64_t Occupancy;
  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
    DiagnosticInfoOptimizationFailure Diag(
        F, F.getSubprogram(),
        "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
        "'" +
            F.getName() + "': desired occupancy was " + Twine(MinWEU) +
            ", final occupancy is " + Twine(Occupancy));
    F.getContext().diagnose(Diag);
  }
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: [[fallthrough]];
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
  MCContext &Ctx = MF.getContext();

  // (((Value) & Mask) << Shift)
  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
    return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
                                   shft, Ctx);
  };

  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
    int64_t Val;
    if (Value->evaluateAsAbsolute(Val))
      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
    else
      OutStreamer->emitValue(Value, Size);
  };
  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);

    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
                       /*Size=*/4);

    OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->emitInt32(RsrcReg);

    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
        Ctx);
    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);

    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
    // appropriate generation.
    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
                         /*Size=*/4);
    } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x7FFF, /*Shift=*/12),
                         /*Size=*/4);
    } else {
      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
                                 /*Mask=*/0x1FFF, /*Shift=*/12),
                         /*Size=*/4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
    OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
    OutStreamer->emitInt32(MFI->getPSInputEnable());
    OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
    OutStreamer->emitInt32(MFI->getPSInputAddr());
  }

  OutStreamer->emitInt32(R_SPILLED_SGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
  OutStreamer->emitInt32(R_SPILLED_VGPRS);
  OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}
// Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
                                  const SIProgramInfo &CurrentProgramInfo,
                                  CallingConv::ID CC, const GCNSubtarget &ST) {
  if (ST.hasIEEEMode())
    MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);

  MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
  MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);

  if (AMDGPU::isCompute(CC)) {
    MD->setHwStage(CC, ".trap_present",
                   (bool)CurrentProgramInfo.TrapHandlerEnable);
    MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
  }

  MD->setHwStage(CC, ".lds_size",
                 (unsigned)(CurrentProgramInfo.LdsSize *
                            getLdsDwGranularity(ST) * sizeof(uint32_t)));
}
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMD::Metadata, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, the PAL metadata
// is then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  auto CC = MF.getFunction().getCallingConv();
  auto MD = getTargetStreamer()->getPALMetadata();
  auto &Ctx = MF.getContext();

  MD->setEntryPoint(CC, MF.getFunction().getName());
  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);

  // Only set AGPRs for supported devices
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  if (STM.hasMAIInsts()) {
    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
  }

  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
  if (MD->getPALMajorVersion() < 3) {
    MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
    if (AMDGPU::isCompute(CC)) {
      MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
    } else {
      const MCExpr *HasScratchBlocks =
          MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
                                 MCConstantExpr::create(0, Ctx), Ctx);
      auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
      MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
    }
  } else {
    MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
    MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
                   CurrentProgramInfo.ScratchEnable);
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
  }

  // ScratchSize is in bytes, 16 aligned.
  MD->setScratchSize(
      CC,
      AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
                                  MCConstantExpr::create(16, Ctx), Ctx),
      Ctx);

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
                                : CurrentProgramInfo.LDSBlocks;
    if (MD->getPALMajorVersion() < 3) {
      MD->setRsrc2(
          CC,
          MCConstantExpr::create(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize), Ctx),
          Ctx);
      MD->setSpiPsInputEna(MFI->getPSInputEnable());
      MD->setSpiPsInputAddr(MFI->getPSInputAddr());
    } else {
      // Graphics registers
      const unsigned ExtraLdsDwGranularity =
          STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
      MD->setGraphicsRegisters(
          ".ps_extra_lds_size",
          (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));

      // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
      static StringLiteral const PsInputFields[] = {
          ".persp_sample_ena",    ".persp_center_ena",
          ".persp_centroid_ena",  ".persp_pull_model_ena",
          ".linear_sample_ena",   ".linear_center_ena",
          ".linear_centroid_ena", ".line_stipple_tex_ena",
          ".pos_x_float_ena",     ".pos_y_float_ena",
          ".pos_z_float_ena",     ".pos_w_float_ena",
          ".front_face_ena",      ".ancillary_ena",
          ".sample_coverage_ena", ".pos_fixed_pt_ena"};
      unsigned PSInputEna = MFI->getPSInputEnable();
      unsigned PSInputAddr = MFI->getPSInputAddr();
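      // Bit i of PSInputEna/PSInputAddr corresponds to PsInputFields[i]
      // (bit 0 is ".persp_sample_ena"), so the loop below mirrors each bit
      // into both metadata maps.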
      for (auto [Idx, Field] : enumerate(PsInputFields)) {
        MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
                                 (bool)((PSInputEna >> Idx) & 1));
        MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
                                 (bool)((PSInputAddr >> Idx) & 1));
      }
    }
  }

  // For version 3 and above the wave front size is already set in the metadata
  if (MD->getPALMajorVersion() < 3 && STM.isWave32())
    MD->setWave32(MF.getFunction().getCallingConv());
}
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
  auto *MD = getTargetStreamer()->getPALMetadata();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  StringRef FnName = MF.getFunction().getName();
  MD->setFunctionScratchSize(FnName, MFI.getStackSize());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  if (MD->getPALMajorVersion() < 3) {
    // Set compute registers
    MD->setRsrc1(
        CallingConv::AMDGPU_CS,
        CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
    MD->setRsrc2(CallingConv::AMDGPU_CS,
                 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
  } else {
    EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
  }

  // Set optional info
  MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}
void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Ctx = MF.getContext();

  Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);

  Out.compute_pgm_resource1_registers =
      CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
  Out.compute_pgm_resource2_registers =
      CurrentProgramInfo.getComputePGMRSrc2(Ctx);
  Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;

  Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;

  AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

  const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
  if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (UserSGPRInfo.hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (UserSGPRInfo.hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (UserSGPRInfo.hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (UserSGPRInfo.hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (UserSGPRInfo.hasPrivateSegmentSize())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;

  if (UserSGPRInfo.hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  Align MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // kernarg_segment_alignment is specified as log of the alignment.
  // The minimum alignment is 16.
  // FIXME: The metadata treats the minimum as 4?
  Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }
  if (MO.isImm()) {
    int64_t Val = MO.getImm();
    if (AMDGPU::isInlinableIntLiteral(Val)) {
      O << Val;
    } else if (isUInt<16>(Val)) {
      O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
    } else if (isUInt<32>(Val)) {
      O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
    } else {
      O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
    }
    return false;
  }
  return true;
}
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AMDGPUResourceUsageAnalysis>();
  AU.addPreserved<AMDGPUResourceUsageAnalysis>();
  AsmPrinter::getAnalysisUsage(AU);
}
void AMDGPUAsmPrinter::emitResourceUsageRemarks(
    const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
    bool isModuleEntryFunction, bool hasMAIInsts) {
  if (!ORE)
    return;

  const char *Name = "kernel-resource-usage";
  const char *Indent = "    ";

  // If the remark is not specifically enabled, do not output to yaml
  LLVMContext &Ctx = MF.getFunction().getContext();
  if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
    return;

  // Currently non-kernel functions have no resources to emit.
  if (!isEntryFunctionCC(MF.getFunction().getCallingConv()))
    return;

  auto EmitResourceUsageRemark = [&](StringRef RemarkName,
                                     StringRef RemarkLabel, auto Argument) {
    // Add an indent for every line besides the line with the kernel name. This
    // makes it easier to tell which resource usage go with which kernel since
    // the kernel name will always be displayed first.
    std::string LabelStr = RemarkLabel.str() + ": ";
    if (RemarkName != "FunctionName")
      LabelStr = Indent + LabelStr;

    ORE->emit([&]() {
      return MachineOptimizationRemarkAnalysis(Name, RemarkName,
                                               MF.getFunction().getSubprogram(),
                                               &MF.front())
             << LabelStr << ore::NV(RemarkName, Argument);
    });
  };

  // FIXME: Formatting here is pretty nasty because clang does not accept
  // newlines from diagnostics. This forces us to emit multiple diagnostic
  // remarks to simulate newlines. If and when clang does accept newlines, this
  // formatting should be aggregated into one remark with newlines to avoid
  // printing multiple diagnostic location and diag opts.
  EmitResourceUsageRemark("FunctionName", "Function Name",
                          MF.getFunction().getName());
  EmitResourceUsageRemark("NumSGPR", "SGPRs",
                          getMCExprStr(CurrentProgramInfo.NumSGPR));
  EmitResourceUsageRemark("NumVGPR", "VGPRs",
                          getMCExprStr(CurrentProgramInfo.NumArchVGPR));
  if (hasMAIInsts)
    EmitResourceUsageRemark("NumAGPR", "AGPRs",
                            getMCExprStr(CurrentProgramInfo.NumAccVGPR));
  EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
                          getMCExprStr(CurrentProgramInfo.ScratchSize));
  int64_t DynStack;
  bool DynStackEvaluatable =
      CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
  StringRef DynamicStackStr =
      DynStackEvaluatable && DynStack ? "True" : "False";
  EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
  EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
                          getMCExprStr(CurrentProgramInfo.Occupancy));
  EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                          CurrentProgramInfo.SGPRSpill);
  EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
                          CurrentProgramInfo.VGPRSpill);
  if (isModuleEntryFunction)
    EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
                            CurrentProgramInfo.LDSSize);
}