[ARM] Better patterns for fp <> predicate vectors
[llvm-complete.git] / lib / Target / AMDGPU / R600MachineScheduler.cpp
blob34267a909b5e6f58825509241a9dfd64400c3033
1 //===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// R600 Machine Scheduler interface
12 //===----------------------------------------------------------------------===//
14 #include "R600MachineScheduler.h"
15 #include "AMDGPUSubtarget.h"
16 #include "R600InstrInfo.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "llvm/CodeGen/MachineRegisterInfo.h"
19 #include "llvm/IR/LegacyPassManager.h"
20 #include "llvm/Pass.h"
21 #include "llvm/Support/raw_ostream.h"
23 using namespace llvm;
25 #define DEBUG_TYPE "machine-scheduler"
27 void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
28 assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
29 DAG = static_cast<ScheduleDAGMILive*>(dag);
30 const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
31 TII = static_cast<const R600InstrInfo*>(DAG->TII);
32 TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
33 VLIW5 = !ST.hasCaymanISA();
34 MRI = &DAG->MRI;
35 CurInstKind = IDOther;
36 CurEmitted = 0;
37 OccupedSlotsMask = 31;
38 InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
39 InstKindLimit[IDOther] = 32;
40 InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
41 AluInstCount = 0;
42 FetchInstCount = 0;
45 void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
46 std::vector<SUnit *> &QDst)
48 QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
49 QSrc.clear();
52 static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
53 assert (GPRCount && "GPRCount cannot be 0");
54 return 248 / GPRCount;
57 SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
58 SUnit *SU = nullptr;
59 NextInstKind = IDOther;
61 IsTopNode = false;
63 // check if we might want to switch current clause type
64 bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
65 (Available[CurInstKind].empty());
66 bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
67 (!Available[IDFetch].empty() || !Available[IDOther].empty());
69 if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
70 // We use the heuristic provided by AMD Accelerated Parallel Processing
71 // OpenCL Programming Guide :
72 // The approx. number of WF that allows TEX inst to hide ALU inst is :
73 // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
74 float ALUFetchRationEstimate =
75 (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
76 (FetchInstCount + Available[IDFetch].size());
77 if (ALUFetchRationEstimate == 0) {
78 AllowSwitchFromAlu = true;
79 } else {
80 unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
81 LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
82 // We assume the local GPR requirements to be "dominated" by the requirement
83 // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
84 // after TEX are indeed likely to consume or generate values from/for the
85 // TEX clause.
86 // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
87 // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
88 // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
89 // (TODO : use RegisterPressure)
90 // If we are going too use too many GPR, we flush Fetch instruction to lower
91 // register pressure on 128 bits regs.
92 unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
93 if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
94 AllowSwitchFromAlu = true;
98 if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
99 (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
100 // try to pick ALU
101 SU = pickAlu();
102 if (!SU && !PhysicalRegCopy.empty()) {
103 SU = PhysicalRegCopy.front();
104 PhysicalRegCopy.erase(PhysicalRegCopy.begin());
106 if (SU) {
107 if (CurEmitted >= InstKindLimit[IDAlu])
108 CurEmitted = 0;
109 NextInstKind = IDAlu;
113 if (!SU) {
114 // try to pick FETCH
115 SU = pickOther(IDFetch);
116 if (SU)
117 NextInstKind = IDFetch;
120 // try to pick other
121 if (!SU) {
122 SU = pickOther(IDOther);
123 if (SU)
124 NextInstKind = IDOther;
127 LLVM_DEBUG(if (SU) {
128 dbgs() << " ** Pick node **\n";
129 DAG->dumpNode(*SU);
130 } else {
131 dbgs() << "NO NODE \n";
132 for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
133 const SUnit &S = DAG->SUnits[i];
134 if (!S.isScheduled)
135 DAG->dumpNode(S);
139 return SU;
142 void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
143 if (NextInstKind != CurInstKind) {
144 LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
145 if (NextInstKind != IDAlu)
146 OccupedSlotsMask |= 31;
147 CurEmitted = 0;
148 CurInstKind = NextInstKind;
151 if (CurInstKind == IDAlu) {
152 AluInstCount ++;
153 switch (getAluKind(SU)) {
154 case AluT_XYZW:
155 CurEmitted += 4;
156 break;
157 case AluDiscarded:
158 break;
159 default: {
160 ++CurEmitted;
161 for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
162 E = SU->getInstr()->operands_end(); It != E; ++It) {
163 MachineOperand &MO = *It;
164 if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
165 ++CurEmitted;
169 } else {
170 ++CurEmitted;
173 LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
175 if (CurInstKind != IDFetch) {
176 MoveUnits(Pending[IDFetch], Available[IDFetch]);
177 } else
178 FetchInstCount++;
181 static bool
182 isPhysicalRegCopy(MachineInstr *MI) {
183 if (MI->getOpcode() != R600::COPY)
184 return false;
186 return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
189 void R600SchedStrategy::releaseTopNode(SUnit *SU) {
190 LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
193 void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
194 LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
195 if (isPhysicalRegCopy(SU->getInstr())) {
196 PhysicalRegCopy.push_back(SU);
197 return;
200 int IK = getInstKind(SU);
202 // There is no export clause, we can schedule one as soon as its ready
203 if (IK == IDOther)
204 Available[IDOther].push_back(SU);
205 else
206 Pending[IK].push_back(SU);
210 bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
211 const TargetRegisterClass *RC) const {
212 if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
213 return RC->contains(Reg);
214 } else {
215 return MRI->getRegClass(Reg) == RC;
219 R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
220 MachineInstr *MI = SU->getInstr();
222 if (TII->isTransOnly(*MI))
223 return AluTrans;
225 switch (MI->getOpcode()) {
226 case R600::PRED_X:
227 return AluPredX;
228 case R600::INTERP_PAIR_XY:
229 case R600::INTERP_PAIR_ZW:
230 case R600::INTERP_VEC_LOAD:
231 case R600::DOT_4:
232 return AluT_XYZW;
233 case R600::COPY:
234 if (MI->getOperand(1).isUndef()) {
235 // MI will become a KILL, don't considers it in scheduling
236 return AluDiscarded;
238 break;
239 default:
240 break;
243 // Does the instruction take a whole IG ?
244 // XXX: Is it possible to add a helper function in R600InstrInfo that can
245 // be used here and in R600PacketizerList::isSoloInstruction() ?
246 if(TII->isVector(*MI) ||
247 TII->isCubeOp(MI->getOpcode()) ||
248 TII->isReductionOp(MI->getOpcode()) ||
249 MI->getOpcode() == R600::GROUP_BARRIER) {
250 return AluT_XYZW;
253 if (TII->isLDSInstr(MI->getOpcode())) {
254 return AluT_X;
257 // Is the result already assigned to a channel ?
258 unsigned DestSubReg = MI->getOperand(0).getSubReg();
259 switch (DestSubReg) {
260 case R600::sub0:
261 return AluT_X;
262 case R600::sub1:
263 return AluT_Y;
264 case R600::sub2:
265 return AluT_Z;
266 case R600::sub3:
267 return AluT_W;
268 default:
269 break;
272 // Is the result already member of a X/Y/Z/W class ?
273 unsigned DestReg = MI->getOperand(0).getReg();
274 if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
275 regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
276 return AluT_X;
277 if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
278 return AluT_Y;
279 if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
280 return AluT_Z;
281 if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
282 return AluT_W;
283 if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
284 return AluT_XYZW;
286 // LDS src registers cannot be used in the Trans slot.
287 if (TII->readsLDSSrcReg(*MI))
288 return AluT_XYZW;
290 return AluAny;
293 int R600SchedStrategy::getInstKind(SUnit* SU) {
294 int Opcode = SU->getInstr()->getOpcode();
296 if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
297 return IDFetch;
299 if (TII->isALUInstr(Opcode)) {
300 return IDAlu;
303 switch (Opcode) {
304 case R600::PRED_X:
305 case R600::COPY:
306 case R600::CONST_COPY:
307 case R600::INTERP_PAIR_XY:
308 case R600::INTERP_PAIR_ZW:
309 case R600::INTERP_VEC_LOAD:
310 case R600::DOT_4:
311 return IDAlu;
312 default:
313 return IDOther;
317 SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
318 if (Q.empty())
319 return nullptr;
320 for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
321 It != E; ++It) {
322 SUnit *SU = *It;
323 InstructionsGroupCandidate.push_back(SU->getInstr());
324 if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
325 (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
326 InstructionsGroupCandidate.pop_back();
327 Q.erase((It + 1).base());
328 return SU;
329 } else {
330 InstructionsGroupCandidate.pop_back();
333 return nullptr;
336 void R600SchedStrategy::LoadAlu() {
337 std::vector<SUnit *> &QSrc = Pending[IDAlu];
338 for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
339 AluKind AK = getAluKind(QSrc[i]);
340 AvailableAlus[AK].push_back(QSrc[i]);
342 QSrc.clear();
345 void R600SchedStrategy::PrepareNextSlot() {
346 LLVM_DEBUG(dbgs() << "New Slot\n");
347 assert (OccupedSlotsMask && "Slot wasn't filled");
348 OccupedSlotsMask = 0;
349 // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
350 // OccupedSlotsMask |= 16;
351 InstructionsGroupCandidate.clear();
352 LoadAlu();
355 void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
356 int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
357 if (DstIndex == -1) {
358 return;
360 unsigned DestReg = MI->getOperand(DstIndex).getReg();
361 // PressureRegister crashes if an operand is def and used in the same inst
362 // and we try to constraint its regclass
363 for (MachineInstr::mop_iterator It = MI->operands_begin(),
364 E = MI->operands_end(); It != E; ++It) {
365 MachineOperand &MO = *It;
366 if (MO.isReg() && !MO.isDef() &&
367 MO.getReg() == DestReg)
368 return;
370 // Constrains the regclass of DestReg to assign it to Slot
371 switch (Slot) {
372 case 0:
373 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
374 break;
375 case 1:
376 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
377 break;
378 case 2:
379 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
380 break;
381 case 3:
382 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
383 break;
387 SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
388 static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
389 SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
390 if (SlotedSU)
391 return SlotedSU;
392 SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
393 if (UnslotedSU)
394 AssignSlot(UnslotedSU->getInstr(), Slot);
395 return UnslotedSU;
398 unsigned R600SchedStrategy::AvailablesAluCount() const {
399 return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
400 AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
401 AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
402 AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
403 AvailableAlus[AluPredX].size();
406 SUnit* R600SchedStrategy::pickAlu() {
407 while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
408 if (!OccupedSlotsMask) {
409 // Bottom up scheduling : predX must comes first
410 if (!AvailableAlus[AluPredX].empty()) {
411 OccupedSlotsMask |= 31;
412 return PopInst(AvailableAlus[AluPredX], false);
414 // Flush physical reg copies (RA will discard them)
415 if (!AvailableAlus[AluDiscarded].empty()) {
416 OccupedSlotsMask |= 31;
417 return PopInst(AvailableAlus[AluDiscarded], false);
419 // If there is a T_XYZW alu available, use it
420 if (!AvailableAlus[AluT_XYZW].empty()) {
421 OccupedSlotsMask |= 15;
422 return PopInst(AvailableAlus[AluT_XYZW], false);
425 bool TransSlotOccuped = OccupedSlotsMask & 16;
426 if (!TransSlotOccuped && VLIW5) {
427 if (!AvailableAlus[AluTrans].empty()) {
428 OccupedSlotsMask |= 16;
429 return PopInst(AvailableAlus[AluTrans], false);
431 SUnit *SU = AttemptFillSlot(3, true);
432 if (SU) {
433 OccupedSlotsMask |= 16;
434 return SU;
437 for (int Chan = 3; Chan > -1; --Chan) {
438 bool isOccupied = OccupedSlotsMask & (1 << Chan);
439 if (!isOccupied) {
440 SUnit *SU = AttemptFillSlot(Chan, false);
441 if (SU) {
442 OccupedSlotsMask |= (1 << Chan);
443 InstructionsGroupCandidate.push_back(SU->getInstr());
444 return SU;
448 PrepareNextSlot();
450 return nullptr;
453 SUnit* R600SchedStrategy::pickOther(int QID) {
454 SUnit *SU = nullptr;
455 std::vector<SUnit *> &AQ = Available[QID];
457 if (AQ.empty()) {
458 MoveUnits(Pending[QID], AQ);
460 if (!AQ.empty()) {
461 SU = AQ.back();
462 AQ.pop_back();
464 return SU;