//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
32 #define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-address-space,+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for
  // denormals, but should be checked. Should we issue a warning somewhere
  // if someone tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);
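
  // The user feature string FS is appended after the defaults above; since
  // later entries take precedence, a user-specified "-load-store-opt" (for
  // example) overrides the "+load-store-opt" default.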

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
  // variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    Gen(SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    MaxPrivateElementSize(0),

    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    DebuggerInsertNops(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnablePRTStrictNull(false),

    HasSMemRealTime(false),
    HasFmaMixInsts(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAScalar(false),
    HasSDWAOutModsVOPC(false),
    EnableSRAMECC(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),

    ScalarizeGlobal(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
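  // As a concrete illustration: with the usual wavefront size of 64, compute
  // kernels default to a flat work group size range of [128, 256], graphics
  // shader stages to [1, 64], and everything else to [1, 1024].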
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4);
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  Default.second = AMDGPU::getIntegerAttribute(
    F, "amdgpu-max-work-group-size", Default.second);
  Default.first = std::min(Default.first, Default.second);

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  // TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
  // starts using "amdgpu-flat-work-group-size" attribute.
  if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
      F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
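  // For example, a workitem.id.x query with MaxSize of 256 gets !range [0, 256),
  // while a local-size query keeps MinSize and uses MaxSize + 1 as the bound.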
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;
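
  // Arguments are laid out back to back at their ABI alignment; e.g. a
  // hypothetical (i32, double) kernel places the i32 at offset 0 and the
  // double at offset 8, giving 16 explicit bytes with MaxAlign of 8.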

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  HasVertexCache(false),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit()
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us less
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);

    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-process SUnits are in the original order of
    // the instructions before scheduling.
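    // For instance, two back-to-back DS loads in the input order end up with
    // an artificial edge between them, so nothing is scheduled in between.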
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
  std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}