//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
/// code. When passed an MCAsmStreamer it prints assembly and when passed
/// an MCObjectStreamer it outputs binary code.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
#include "R600Defines.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
using namespace llvm::AMDGPU;
// TODO: This should get the default rounding mode from the kernel. We just set
// the default here, but this could change if the OpenCL rounding mode pragmas
// are used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
//
// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
// precision, and leaves single precision to flush all and does not report
// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
// CL_FP_DENORM for both.
//
// FIXME: It seems some instructions do not support single precision denormals
// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f64, sqrt_f32,
// and sin_f32, cos_f32 on most parts).
//
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(const MachineFunction &F) {
  const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
  // TODO: Is there any real use for the flush in only / flush out only modes?

  uint32_t FP32Denormals =
    ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  uint32_t FP64Denormals =
    ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;

  return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
         FP_DENORM_MODE_SP(FP32Denormals) |
         FP_DENORM_MODE_DP(FP64Denormals);
}
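
// Worked example for getFPMode() above (illustrative, assuming the SIDefines.h
// encodings: FP_ROUND in bits [3:0], FP_DENORM in bits [7:4], round-to-nearest
// = 0, FP_DENORM_FLUSH_IN_FLUSH_OUT = 0, FP_DENORM_FLUSH_NONE = 3): a target
// with fp64 denormals enabled but fp32 denormals flushed packs to
//   FP_ROUND_MODE_SP(0) | FP_ROUND_MODE_DP(0) |
//   FP_DENORM_MODE_SP(0) | FP_DENORM_MODE_DP(3) == 3 << 6 == 0xC0.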
static AsmPrinter *
createAMDGPUAsmPrinterPass(TargetMachine &tm,
                           std::unique_ptr<MCStreamer> &&Streamer) {
  return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}
extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
                                     llvm::createR600AsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
                                     createAMDGPUAsmPrinterPass);
}
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
}
StringRef AMDGPUAsmPrinter::getPassName() const {
  return "AMDGPU Assembly Printer";
}
const MCSubtargetInfo *AMDGPUAsmPrinter::getSTI() const {
  return TM.getMCSubtargetInfo();
}
AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
  if (!OutStreamer)
    return nullptr;
  return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}
void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA)
    return;

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
      TM.getTargetTriple().getOS() != Triple::AMDPAL)
    return;

  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    HSAMetadataStream.begin(M);

  if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
    readPALMetadata(M);

  // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
    getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);

  // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
  IsaVersion Version = getIsaVersion(getSTI()->getCPU());
  getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
      Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  // TODO: Add metadata to code object v3.
  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA)
    return;

  // Following code requires TargetStreamer to be present.
  if (!getTargetStreamer())
    return;

  // Emit ISA Version (NT_AMD_AMDGPU_ISA).
  std::string ISAVersionString;
  raw_string_ostream ISAVersionStream(ISAVersionString);
  IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
  getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream.end();
    getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
  }

  // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
    // Copy the PAL metadata from the map where we collected it into a vector,
    // then write it as a .note.
    PALMD::Metadata PALMetadataVector;
    for (auto i : PALMetadataMap) {
      PALMetadataVector.push_back(i.first);
      PALMetadataVector.push_back(i.second);
    }
    getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
  }
}
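
// The resulting PAL .note payload is a flat sequence of 32-bit words that
// alternates keys and values: [key0, value0, key1, value1, ...].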
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
    const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA)
    return;

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();
  if (STM.isAmdCodeObjectV2(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
}
void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
      TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  Streamer.PushSection();
  Streamer.SwitchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.EmitValueToAlignment(64, 0, 1, 0);
  if (ReadOnlySection.getAlignment() < 64)
    ReadOnlySection.setAlignment(64);

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(getSTI(),
                                    CurrentProgramInfo.VCCUsed,
                                    CurrentProgramInfo.FlatUsed),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
      hasXNACK(*getSTI()));

  Streamer.PopSection();
}
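
// Note on the EmitAmdhsaKernelDescriptor() call above (as I read the streamer
// interface): the SGPR count passed in has the extra SGPRs for VCC, flat
// scratch and XNACK subtracted back out, and the VCCUsed/FlatUsed/XNACK flags
// are passed alongside so the streamer can re-account for them when computing
// the granulated register counts in the descriptor.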
void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    AsmPrinter::EmitFunctionEntryLabel();
    return;
  }

  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(MF->getFunction())) {
    SmallString<128> SymbolName;
    getNameWithPrefix(SymbolName, &MF->getFunction());
    getTargetStreamer()->EmitAMDGPUSymbolType(
        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
  }

  const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
  if (STI.dumpCode()) {
    // Disassemble function name label to text.
    DisasmLines.push_back(MF->getName().str() + ":");
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }

  AsmPrinter::EmitFunctionEntryLabel();
}
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
  const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
  if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    HexLines.push_back("");
  }
  AsmPrinter::EmitBasicBlockStart(MBB);
}
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
  // Group segment variables aren't emitted in HSA.
  if (AMDGPU::isGroupSegment(GV))
    return;

  AsmPrinter::EmitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
  CallGraphResourceInfo.clear();
  return AsmPrinter::doFinalization(M);
}
// For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
// frontend into our PALMetadataMap, ready for per-function modification. It
// is a NamedMD containing an MDTuple containing a number of MDNodes each of
// which is an integer value, and each two integer values forms a key=value
// pair that we store as PALMetadataMap[key]=value in the map.
void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
  auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
  if (!NamedMD || !NamedMD->getNumOperands())
    return;
  auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
  if (!Tuple)
    return;
  for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
    auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
    auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
    if (!Key || !Val)
      continue;
    PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
  }
}
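
// Illustrative input (shape assumed for exposition, not taken from a real
// frontend): a module carrying
//   !amdgpu.pal.metadata = !{!0}
//   !0 = !{i32 11274, i32 16, i32 11275, i32 32}
// is recorded as PALMetadataMap[11274] = 16 and PALMetadataMap[11275] = 32.
// The "& -2" in the loop bound simply ignores a trailing unpaired operand.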
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
    uint32_t NumVGPR,
    uint32_t NumSGPR,
    uint64_t ScratchSize,
    uint64_t CodeSize,
    const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}
uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
    const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  uint16_t KernelCodeProperties = 0;

  if (MFI.hasPrivateSegmentBuffer()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }
  if (MFI.hasDispatchPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
  }
  if (MFI.hasQueuePtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
  }
  if (MFI.hasKernargSegmentPtr()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
  }
  if (MFI.hasDispatchID()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
  }
  if (MFI.hasFlatScratchInit()) {
    KernelCodeProperties |=
        amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
  }

  return KernelCodeProperties;
}
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
    const MachineFunction &MF,
    const SIProgramInfo &PI) const {
  amdhsa::kernel_descriptor_t KernelDescriptor;
  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));

  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.ComputePGMRSrc1));
  assert(isUInt<32>(PI.ComputePGMRSrc2));

  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
  KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  return KernelDescriptor;
}
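
// Note: compute_pgm_rsrc1/rsrc2 in the descriptor hold the same 32-bit values
// that are programmed into the COMPUTE_PGM_RSRC1/RSRC2 hardware registers,
// which is what the isUInt<32> asserts above are guarding.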
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  if (MFI->isEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  } else {
    auto I = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = I.first->second;
    assert(I.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS())
    EmitPALMetadata(MF, CurrentProgramInfo);
  else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);
    if (!MFI->isEntryFunction()) {
      OutStreamer->emitRawComment(" Function info:", false);
      SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
      emitCommonFunctionComments(
          Info.NumVGPR,
          Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
          Info.PrivateSegmentSize,
          getFunctionCodeSize(MF), MFI);
      return false;
    }
    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                               CurrentProgramInfo.NumSGPR,
                               CurrentProgramInfo.ScratchSize,
                               getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
      " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
      " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
      " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
      " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
      " NumSGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
      " NumVGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    OutStreamer->emitRawComment(
      " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
    if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
      OutStreamer->emitRawComment(
        " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
        Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
      OutStreamer->emitRawComment(
        " DebuggerPrivateSegmentBufferSGPR: s" +
        Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
    }
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
      false);
  }
  if (STM.dumpCode()) {

    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}
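
// With comments enabled, the verbose block above produces output of roughly
// this shape (values illustrative only):
//   ; Kernel info:
//   ; codeLenInByte = 316
//   ; NumSgprs: 24
//   ; NumVgprs: 8
//   ; ScratchSize: 0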
uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = STM.getInstrInfo();

  uint64_t CodeSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: CodeSize should account for multiple functions.

      // TODO: Should we count size of debug info?
      if (MI.isDebugInstr())
        continue;

      CodeSize += TII->getInstSizeInBytes(MI);
    }
  }

  return CodeSize;
}
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII,
                                  unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}
int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
                                                     UsesVCC, UsesFlatScratch);
}
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
    const MachineFunction &MF) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();

  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }
  int32_t MaxVGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;

        if (!MO.isReg())
          continue;

        unsigned Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr());
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        default:
          break;
        }

        if (AMDGPU::SReg_32RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else {
          llvm_unreachable("Unknown register class");
        }

        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
        if (Callee->isDeclaration()) {
          // If this is a call to an external function, we can't do much. Make
          // conservative guesses.

          // 48 SGPRs - vcc, - flat_scr, -xnack
          int MaxSGPRGuess =
            47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
                                           ST.hasFlatAddressSpace());
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
          MaxVGPR = std::max(MaxVGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          auto I = CallGraphResourceInfo.find(Callee);
          assert(I != CallGraphResourceInfo.end() &&
                 "callee should have been handled before caller");

          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          CalleeFrameSize
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
        }

        if (!Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
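
// Note on the call handling above: callees are compiled before their callers
// (SCC order), so a caller can look up each known callee in
// CallGraphResourceInfo and fold in the callee's maxima; only calls to
// external declarations fall back to the fixed conservative guesses.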
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

  ProgInfo.NumVGPR = Info.NumVGPR;
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  if (!isUInt<32>(ProgInfo.ScratchSize)) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = STM.getInstrInfo();
  const SIRegisterInfo *RI = &TII->getRegisterInfo();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }
  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
  for (auto &Arg : MF.getFunction().args()) {
    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
    if (Arg.hasAttribute(Attribute::InReg))
      WaveDispatchNumSGPR += NumRegs;
    else
      WaveDispatchNumVGPR += NumRegs;
  }
  ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
  ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
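
  // Roughly: getMinNumSGPRs/getMinNumVGPRs return the smallest register count
  // that still caps occupancy at the requested maximum waves per EU, so the
  // two statements above round the raw usage *up* when "amdgpu-waves-per-eu"
  // asks for fewer concurrent waves than the raw usage alone would permit.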
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }
  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }
  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      getSTI(), ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      getSTI(), ProgInfo.NumVGPRsForWavesPerEU);
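
  // The block values are the granulated encodings used by COMPUTE_PGM_RSRC1.
  // Illustrative example, assuming the common GCN granularity of 8 SGPRs and
  // 4 VGPRs per block with a bias of -1: 30 SGPRs -> alignTo(30, 8) / 8 - 1 = 3
  // and 24 VGPRs -> alignTo(24, 4) / 4 - 1 = 5.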
  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
  // attribute was requested.
  if (STM.debuggerEmitPrologue()) {
    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
        RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
        RI->getHWRegIndex(MFI->getScratchRSrcReg());
  }
  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);

  // Make the clamp modifier on NaN input return 0.
  ProgInfo.DX10Clamp = STM.enableDX10Clamp();
  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  unsigned LDSSpillSize =
      MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;
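
  // Illustrative example: at a wavefront size of 64, 1024 bytes of scratch
  // per thread becomes 64 * 1024 = 65536 bytes per wave, i.e.
  // 65536 >> 10 = 64 blocks of 256 dwords each.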
  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
  switch (CallConv) {
  default: LLVM_FALLTHROUGH;
  case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
  case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
  case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
  case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
  case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
  case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
  case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
  }
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);

    OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);

    OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
  } else {
    OutStreamer->EmitIntValue(RsrcReg, 4);
    OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
      OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
      OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
    }
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
    OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
  }

  OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
  OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
  OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
}
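
// The .AMDGPU.config section written above is consumed by non-HSA, non-PAL
// loaders such as Mesa; it is simply a list of 32-bit pairs, each a register
// address followed by the value to program into it.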
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMetadataMap, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is
// then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
                                       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Given the calling convention, calculate the register number for rsrc1. In
  // principle the register number could change in future hardware, but we know
  // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
  // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
  // that we use a register number rather than a byte offset, so we need to
  // divide by 4.
  unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
  unsigned Rsrc2Reg = Rsrc1Reg + 1;
  // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
  // with a constant offset to access any non-register shader-specific PAL
  // metadata key.
  unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_VS:
    ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_GS:
    ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_ES:
    ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_HS:
    ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_LS:
    ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
    break;
  }
  unsigned NumUsedVgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  unsigned NumUsedSgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
  PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
    PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  } else {
    PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
        S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
    if (CurrentProgramInfo.ScratchBlocks > 0)
      PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  }
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    PALMetadataMap[Rsrc2Reg] |=
        S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
    PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
    PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
  }
}
// This is supposed to be log2(Size)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
  switch (Size) {
  case 4:
    return AMD_ELEMENT_4_BYTES;
  case 8:
    return AMD_ELEMENT_8_BYTES;
  case 16:
    return AMD_ELEMENT_16_BYTES;
  default:
    llvm_unreachable("invalid private_element_size");
  }
}
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                        const SIProgramInfo &CurrentProgramInfo,
                                        const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();

  AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());

  Out.compute_pgm_resource_registers =
      CurrentProgramInfo.ComputePGMRSrc1 |
      (CurrentProgramInfo.ComputePGMRSrc2 << 32);
  Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

  if (CurrentProgramInfo.DynamicCallStack)
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;

  AMD_HSA_BITS_SET(Out.code_properties,
                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));

  if (MFI->hasPrivateSegmentBuffer()) {
    Out.code_properties |=
        AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
  }

  if (MFI->hasDispatchPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

  if (MFI->hasQueuePtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

  if (MFI->hasKernargSegmentPtr())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

  if (MFI->hasDispatchID())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

  if (MFI->hasFlatScratchInit())
    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

  if (STM.debuggerSupported())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;

  if (STM.isXNACKEnabled())
    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;

  unsigned MaxKernArgAlign;
  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
  Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

  // These alignment values are specified in powers of two, so alignment =
  // 2^n. The minimum alignment is 2^4 = 16.
  Out.kernarg_segment_alignment = std::max((size_t)4,
      countTrailingZeros(MaxKernArgAlign));

  if (STM.debuggerEmitPrologue()) {
    Out.debug_wavefront_private_segment_offset_sgpr =
        CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
    Out.debug_private_segment_buffer_sgpr =
        CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
  }
}
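
// Worked example for kernarg_segment_alignment above: a maximum kernarg
// alignment of 16 bytes gives countTrailingZeros(16) = 4, and the std::max
// with 4 enforces the documented minimum of 2^4 = 16 bytes when the natural
// alignment is smaller.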
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       unsigned AsmVariant,
                                       const char *ExtraCode, raw_ostream &O) {
  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
    return false;

  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    case 'r':
      break;
    default:
      return true;
    }
  }

  // TODO: Should be able to support other operand types like globals.
  const MachineOperand &MO = MI->getOperand(OpNo);
  if (MO.isReg()) {
    AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
                                       *MF->getSubtarget().getRegisterInfo());
    return false;
  }

  return true;
}