//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
/// code. When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"

using namespace llvm;
using namespace llvm::AMDGPU;

// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
// are used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32, and
// sin_f32, cos_f32 on most parts).
//
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
  const GCNSubtarget &ST = F.getSubtarget<GCNSubtarget>();
  // TODO: Is there any real use for the flush in only / flush out only modes?

  uint32_t FP32Denormals =
      ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  uint32_t FP64Denormals =
      ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
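
  // Illustrative example only, assuming the SIDefines.h encoding where the
  // round mode occupies the low bits and the denormal mode the next bits of
  // the MODE register image: a target with fp64 denormals enabled but fp32
  // denormals flushed returns round-to-nearest for both precisions,
  // FP_DENORM_FLUSH_IN_FLUSH_OUT for single and FP_DENORM_FLUSH_NONE for
  // double precision.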

  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(FP32Denormals) |
         FP_DENORM_MODE_DP(FP64Denormals);
}

static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}

extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}

AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
}

StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}

const MCSubtargetInfo *AMDGPUAsmPrinter::getSTI() const {
  return TM.getMCSubtargetInfo();
}

AMDGPUTargetStreamer *AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;

  return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}

void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA)
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    HSAMetadataStream.begin(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    readPALMetadata(M);

  // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);

  // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
  IsaVersion Version = getIsaVersion(getSTI()->getCPU());
  getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
      Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
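
// With an MCAsmStreamer, the two directive emissions above print along the
// lines of (illustrative values; the ISA tuple depends on the subtarget):
//   .hsa_code_object_version 2,1
//   .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"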

void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  // TODO: Add metadata to code object v3.
  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA)
    return;

  // Following code requires TargetStreamer to be present.
  if (!getTargetStreamer())
    return;

  // Emit ISA Version (NT_AMD_AMDGPU_ISA).
  std::string ISAVersionString;
  raw_string_ostream ISAVersionStream(ISAVersionString);
  IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
  getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream.end();
    getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
  }

  // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
    // Copy the PAL metadata from the map where we collected it into a vector,
    // then write it as a .note.
    PALMD::Metadata PALMetadataVector;
    for (auto i : PALMetadataMap) {
      PALMetadataVector.push_back(i.first);
      PALMetadataVector.push_back(i.second);
    }
    getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
  }
}

bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
    const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}

void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();
  if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  if (!STM.hasCodeObjectV3() && STM.isAmdHsaOS())
    HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
}

void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
      TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.PushSection();
  Streamer.SwitchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.EmitValueToAlignment(64, 0, 1, 0);
  if (ReadOnlySection.getAlignment() < 64)
    ReadOnlySection.setAlignment(64);

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(getSTI(),
                                    CurrentProgramInfo.VCCUsed,
                                    CurrentProgramInfo.FlatUsed),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
      hasXNACK(*getSTI()));

  Streamer.PopSection();
}

void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    AsmPrinter::EmitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction());
    getTargetStreamer()->EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }
  const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
  if (STI.dumpCode()) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }

  AsmPrinter::EmitFunctionEntryLabel();
}

void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
  const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
  if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }

  AsmPrinter::EmitBasicBlockStart(MBB);
}

void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
  // Group segment variables aren't emitted in HSA.
  if (AMDGPU::isGroupSegment(GV))
    return;

  AsmPrinter::EmitGlobalVariable(GV);
}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  CallGraphResourceInfo.clear();
  return AsmPrinter::doFinalization(M);
}

// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
// frontend into our PALMetadataMap, ready for per-function modification. It
// is a NamedMD containing an MDTuple containing a number of MDNodes each of
// which is an integer value, and each two integer values forms a key=value
// pair that we store as PALMetadataMap[key]=value in the map.
void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
  auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
  if (!NamedMD || !NamedMD->getNumOperands())
    return;

  auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
  if (!Tuple)
    return;

  for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
    auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
    auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
    if (!Key || !Val)
      continue;

    PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
  }
}
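
// The named metadata read above typically has the following shape in IR
// (the key/value integers here are hypothetical, for illustration only):
//   !amdgpu.pal.metadata = !{!0}
//   !0 = !{i32 41411, i32 1, i32 41412, i32 0}
// which would populate PALMetadataMap[41411] = 1 and PALMetadataMap[41412] = 0.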

// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    uint32_t NumVGPR,
    uint32_t NumSGPR,
    uint64_t ScratchSize,
    uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}

uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  uint16_t KernelCodeProperties = 0;

  if (MFI.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (MFI.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (MFI.hasQueuePtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (MFI.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (MFI.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (MFI.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }
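
  // For example, a kernel that uses both the dispatch pointer and the kernarg
  // segment pointer gets ENABLE_SGPR_DISPATCH_PTR and
  // ENABLE_SGPR_KERNARG_SEGMENT_PTR set; the runtime is expected to consult
  // these bits in the kernel descriptor when deciding which user SGPRs to set
  // up at dispatch.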

  return KernelCodeProperties;
}

amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
    const MachineFunction &MF,
    const SIProgramInfo &PI) const {
  amdhsa::kernel_descriptor_t KernelDescriptor;
  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));

  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.ComputePGMRSrc1));
  assert(isUInt<32>(PI.ComputePGMRSrc2));

  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
  KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  return KernelDescriptor;
}

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  if (MFI->isEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  } else {
    auto I = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = I.first->second;
    assert(I.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS())
    EmitPALMetadata(MF, CurrentProgramInfo);
  else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);
      SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
      emitCommonFunctionComments(
          Info.NumVGPR,
          Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
          Info.PrivateSegmentSize,
          getFunctionCodeSize(MF), MFI);
      return false;
    }

    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                               CurrentProgramInfo.NumSGPR,
                               CurrentProgramInfo.ScratchSize,
                               getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
        " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
        " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
        " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
        " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
        " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
        " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
        " NumSGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
        " NumVGPRsForWavesPerEU: " +
        Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    OutStreamer->emitRawComment(
        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
      OutStreamer->emitRawComment(
          " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
          Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR),
          false);
      OutStreamer->emitRawComment(
          " DebuggerPrivateSegmentBufferSGPR: s" +
          Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
    }

    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:USER_SGPR: " +
        Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
        Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
        Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
        Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
        Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
        " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
        Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
        false);
  }

  if (STM.dumpCode()) {
    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}

uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugInstr())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}

static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII,
                                  unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
                                                     UsesVCC, UsesFlatScratch);
}

AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
    const MachineFunction &MF) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();

  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
        TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
        TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;

        if (!MO.isReg())
          continue;

        unsigned Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr());
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else {
          llvm_unreachable("Unknown register class");
        }

        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
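        // Worked example: an operand such as v[6:7] is in VReg_64, so Width
        // is 2 and HWReg is 6, giving MaxUsed = 7; the final NumVGPR computed
        // below then becomes at least 8.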
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?
        const MachineOperand *CalleeOp
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
        if (Callee->isDeclaration()) {
          // If this is a call to an external function, we can't do much. Make
          // conservative guesses.

          // 48 SGPRs - vcc, - flat_scr, -xnack
          int MaxSGPRGuess =
              47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
                                             ST.hasFlatAddressSpace());
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
          MaxVGPR = std::max(MaxVGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          auto I = CallGraphResourceInfo.find(Callee);
          assert(I != CallGraphResourceInfo.end() &&
                 "callee should have been handled before caller");

          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          CalleeFrameSize
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
        }

        if (!Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

  ProgInfo.NumVGPR = Info.NumVGPR;
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  if (!isUInt<32>(ProgInfo.ScratchSize)) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = STM.getInstrInfo();
  const SIRegisterInfo *RI = &TII->getRegisterInfo();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
  for (auto &Arg : MF.getFunction().args()) {
    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
    if (Arg.hasAttribute(Attribute::InReg))
      WaveDispatchNumSGPR += NumRegs;
    else
      WaveDispatchNumVGPR += NumRegs;
  }
  ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
  ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
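
  // For example, a <4 x float> argument needs (128 + 31) / 32 = 4 registers:
  // 4 SGPRs if it is marked inreg, otherwise 4 VGPRs.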

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      getSTI(), ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      getSTI(), ProgInfo.NumVGPRsForWavesPerEU);

  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
  // attribute was requested.
  if (STM.debuggerEmitPrologue()) {
    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
        RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
        RI->getHWRegIndex(MFI->getScratchRSrcReg());
  }

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);

  // Make the clamp modifier on NaN input return 0.
  ProgInfo.DX10Clamp = STM.enableDX10Clamp();

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
      MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;
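
  // Worked example (illustrative): with a 64-thread wavefront and 1024 bytes
  // of scratch per thread, the wave uses 64 * 1024 = 65536 bytes; aligning to
  // the 1 << 10 = 1024 byte block size and shifting right by
  // ScratchAlignShift yields 64 blocks.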

  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}

static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: LLVM_FALLTHROUGH;
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
  }

  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}

// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMetadataMap, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is
// then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Given the calling convention, calculate the register number for rsrc1. In
  // principle the register number could change in future hardware, but we know
  // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
  // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
  // that we use a register number rather than a byte offset, so we need to
  // divide the byte offset by 4.
  unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
  unsigned Rsrc2Reg = Rsrc1Reg + 1;
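
  // For example (illustrative): R_0286CC_SPI_PS_INPUT_ENA is byte offset
  // 0x286CC, so the corresponding register-number key used below is
  // 0x286CC / 4 = 0xA1B3.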
  // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
  // with a constant offset to access any non-register shader-specific PAL
  // metadata.
  unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_VS:
    ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_GS:
    ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_ES:
    ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_HS:
    ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_LS:
    ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
    break;
  }
  unsigned NumUsedVgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  unsigned NumUsedSgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
  PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
    PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  } else {
    PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
        S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
    if (CurrentProgramInfo.ScratchBlocks > 0)
      PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  }
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    PALMetadataMap[Rsrc2Reg] |=
        S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
    PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
    PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
  }
}

// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}

void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();

  AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());

  Out.compute_pgm_resource_registers =
      CurrentProgramInfo.ComputePGMRSrc1 |
      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
  Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

  if (CurrentProgramInfo.DynamicCallStack)
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;

  AMD_HSA_BITS_SET(Out.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));

  if (MFI->hasPrivateSegmentBuffer()) {
    Out.code_properties |=
        AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (MFI->hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (MFI->hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (MFI->hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (MFI->hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (MFI->hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (STM.debuggerSupported())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  unsigned MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // These alignment values are specified in powers of two, so alignment =
  // 2^n. The minimum alignment is 2^4 = 16.
  Out.kernarg_segment_alignment = std::max((size_t)4,
      countTrailingZeros(MaxKernArgAlign));
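
  // For example, a MaxKernArgAlign of 8 gives countTrailingZeros(8) = 3,
  // which is clamped up to the minimum of 4 (i.e. 2^4 = 16 byte alignment).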

  if (STM.debuggerEmitPrologue()) {
    Out.debug_wavefront_private_segment_offset_sgpr =
        CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
    Out.debug_private_segment_buffer_sgpr =
        CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
  }
}

bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       unsigned AsmVariant,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }

  return true;
}