1 //===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the Hexagon specific subclass of TargetSubtarget.
11 //===----------------------------------------------------------------------===//
14 #include "HexagonInstrInfo.h"
15 #include "HexagonRegisterInfo.h"
16 #include "HexagonSubtarget.h"
17 #include "MCTargetDesc/HexagonMCTargetDesc.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallSet.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineOperand.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/ErrorHandling.h"
35 #define DEBUG_TYPE "hexagon-subtarget"
37 #define GET_SUBTARGETINFO_CTOR
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #include "HexagonGenSubtargetInfo.inc"
42 static cl::opt
<bool> EnableBSBSched("enable-bsb-sched",
43 cl::Hidden
, cl::ZeroOrMore
, cl::init(true));
45 static cl::opt
<bool> EnableTCLatencySched("enable-tc-latency-sched",
46 cl::Hidden
, cl::ZeroOrMore
, cl::init(false));
48 static cl::opt
<bool> EnableDotCurSched("enable-cur-sched",
49 cl::Hidden
, cl::ZeroOrMore
, cl::init(true),
50 cl::desc("Enable the scheduler to generate .cur"));
52 static cl::opt
<bool> DisableHexagonMISched("disable-hexagon-misched",
53 cl::Hidden
, cl::ZeroOrMore
, cl::init(false),
54 cl::desc("Disable Hexagon MI Scheduling"));
56 static cl::opt
<bool> EnableSubregLiveness("hexagon-subreg-liveness",
57 cl::Hidden
, cl::ZeroOrMore
, cl::init(true),
58 cl::desc("Enable subregister liveness tracking for Hexagon"));
60 static cl::opt
<bool> OverrideLongCalls("hexagon-long-calls",
61 cl::Hidden
, cl::ZeroOrMore
, cl::init(false),
62 cl::desc("If present, forces/disables the use of long calls"));
64 static cl::opt
<bool> EnablePredicatedCalls("hexagon-pred-calls",
65 cl::Hidden
, cl::ZeroOrMore
, cl::init(false),
66 cl::desc("Consider calls to be predicable"));
68 static cl::opt
<bool> SchedPredsCloser("sched-preds-closer",
69 cl::Hidden
, cl::ZeroOrMore
, cl::init(true));
71 static cl::opt
<bool> SchedRetvalOptimization("sched-retval-optimization",
72 cl::Hidden
, cl::ZeroOrMore
, cl::init(true));
74 static cl::opt
<bool> EnableCheckBankConflict("hexagon-check-bank-conflict",
75 cl::Hidden
, cl::ZeroOrMore
, cl::init(true),
76 cl::desc("Enable checking for cache bank conflicts"));
79 HexagonSubtarget::HexagonSubtarget(const Triple
&TT
, StringRef CPU
,
80 StringRef FS
, const TargetMachine
&TM
)
81 : HexagonGenSubtargetInfo(TT
, CPU
, FS
), OptLevel(TM
.getOptLevel()),
82 CPUString(Hexagon_MC::selectHexagonCPU(CPU
)),
83 InstrInfo(initializeSubtargetDependencies(CPU
, FS
)),
84 RegInfo(getHwMode()), TLInfo(TM
, *this),
85 InstrItins(getInstrItineraryForCPU(CPUString
)) {
86 // Beware of the default constructor of InstrItineraryData: it will
87 // reset all members to 0.
88 assert(InstrItins
.Itineraries
!= nullptr && "InstrItins not initialized");
92 HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU
, StringRef FS
) {
93 static std::map
<StringRef
, Hexagon::ArchEnum
> CpuTable
{
94 {"generic", Hexagon::ArchEnum::V60
},
95 {"hexagonv5", Hexagon::ArchEnum::V5
},
96 {"hexagonv55", Hexagon::ArchEnum::V55
},
97 {"hexagonv60", Hexagon::ArchEnum::V60
},
98 {"hexagonv62", Hexagon::ArchEnum::V62
},
99 {"hexagonv65", Hexagon::ArchEnum::V65
},
100 {"hexagonv66", Hexagon::ArchEnum::V66
},
103 auto FoundIt
= CpuTable
.find(CPUString
);
104 if (FoundIt
!= CpuTable
.end())
105 HexagonArchVersion
= FoundIt
->second
;
107 llvm_unreachable("Unrecognized Hexagon processor version");
109 UseHVX128BOps
= false;
110 UseHVX64BOps
= false;
111 UseLongCalls
= false;
113 UseBSBScheduling
= hasV60Ops() && EnableBSBSched
;
115 ParseSubtargetFeatures(CPUString
, FS
);
117 if (OverrideLongCalls
.getPosition())
118 UseLongCalls
= OverrideLongCalls
;
120 FeatureBitset Features
= getFeatureBits();
121 if (HexagonDisableDuplex
)
122 setFeatureBits(Features
.set(Hexagon::FeatureDuplex
, false));
123 setFeatureBits(Hexagon_MC::completeHVXFeatures(Features
));
128 void HexagonSubtarget::UsrOverflowMutation::apply(ScheduleDAGInstrs
*DAG
) {
129 for (SUnit
&SU
: DAG
->SUnits
) {
132 SmallVector
<SDep
, 4> Erase
;
133 for (auto &D
: SU
.Preds
)
134 if (D
.getKind() == SDep::Output
&& D
.getReg() == Hexagon::USR_OVF
)
136 for (auto &E
: Erase
)
141 void HexagonSubtarget::HVXMemLatencyMutation::apply(ScheduleDAGInstrs
*DAG
) {
142 for (SUnit
&SU
: DAG
->SUnits
) {
143 // Update the latency of chain edges between v60 vector load or store
144 // instructions to be 1. These instructions cannot be scheduled in the
146 MachineInstr
&MI1
= *SU
.getInstr();
147 auto *QII
= static_cast<const HexagonInstrInfo
*>(DAG
->TII
);
148 bool IsStoreMI1
= MI1
.mayStore();
149 bool IsLoadMI1
= MI1
.mayLoad();
150 if (!QII
->isHVXVec(MI1
) || !(IsStoreMI1
|| IsLoadMI1
))
152 for (SDep
&SI
: SU
.Succs
) {
153 if (SI
.getKind() != SDep::Order
|| SI
.getLatency() != 0)
155 MachineInstr
&MI2
= *SI
.getSUnit()->getInstr();
156 if (!QII
->isHVXVec(MI2
))
158 if ((IsStoreMI1
&& MI2
.mayStore()) || (IsLoadMI1
&& MI2
.mayLoad())) {
161 // Change the dependence in the opposite direction too.
162 for (SDep
&PI
: SI
.getSUnit()->Preds
) {
163 if (PI
.getSUnit() != &SU
|| PI
.getKind() != SDep::Order
)
166 SI
.getSUnit()->setDepthDirty();
173 // Check if a call and subsequent A2_tfrpi instructions should maintain
174 // scheduling affinity. We are looking for the TFRI to be consumed in
175 // the next instruction. This should help reduce the instances of
176 // double register pairs being allocated and scheduled before a call
177 // when not used until after the call. This situation is exacerbated
178 // by the fact that we allocate the pair from the callee saves list,
179 // leading to excess spills and restores.
180 bool HexagonSubtarget::CallMutation::shouldTFRICallBind(
181 const HexagonInstrInfo
&HII
, const SUnit
&Inst1
,
182 const SUnit
&Inst2
) const {
183 if (Inst1
.getInstr()->getOpcode() != Hexagon::A2_tfrpi
)
186 // TypeXTYPE are 64 bit operations.
187 unsigned Type
= HII
.getType(*Inst2
.getInstr());
188 return Type
== HexagonII::TypeS_2op
|| Type
== HexagonII::TypeS_3op
||
189 Type
== HexagonII::TypeALU64
|| Type
== HexagonII::TypeM
;
192 void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs
*DAGInstrs
) {
193 ScheduleDAGMI
*DAG
= static_cast<ScheduleDAGMI
*>(DAGInstrs
);
194 SUnit
* LastSequentialCall
= nullptr;
195 // Map from virtual register to physical register from the copy.
196 DenseMap
<unsigned, unsigned> VRegHoldingReg
;
197 // Map from the physical register to the instruction that uses virtual
198 // register. This is used to create the barrier edge.
199 DenseMap
<unsigned, SUnit
*> LastVRegUse
;
200 auto &TRI
= *DAG
->MF
.getSubtarget().getRegisterInfo();
201 auto &HII
= *DAG
->MF
.getSubtarget
<HexagonSubtarget
>().getInstrInfo();
203 // Currently we only catch the situation when compare gets scheduled
204 // before preceding call.
205 for (unsigned su
= 0, e
= DAG
->SUnits
.size(); su
!= e
; ++su
) {
206 // Remember the call.
207 if (DAG
->SUnits
[su
].getInstr()->isCall())
208 LastSequentialCall
= &DAG
->SUnits
[su
];
209 // Look for a compare that defines a predicate.
210 else if (DAG
->SUnits
[su
].getInstr()->isCompare() && LastSequentialCall
)
211 DAG
->addEdge(&DAG
->SUnits
[su
], SDep(LastSequentialCall
, SDep::Barrier
));
212 // Look for call and tfri* instructions.
213 else if (SchedPredsCloser
&& LastSequentialCall
&& su
> 1 && su
< e
-1 &&
214 shouldTFRICallBind(HII
, DAG
->SUnits
[su
], DAG
->SUnits
[su
+1]))
215 DAG
->addEdge(&DAG
->SUnits
[su
], SDep(&DAG
->SUnits
[su
-1], SDep::Barrier
));
216 // Prevent redundant register copies due to reads and writes of physical
217 // registers. The original motivation for this was the code generated
218 // between two calls, which is caused by both the return value and the
219 // argument for the next call being in %r0.
222 // 2: %vreg = COPY %r0
226 // The scheduler would often swap 3 and 4, so an additional register is
227 // needed. This code inserts a Barrier dependence between 3 & 4 to prevent
229 // The code below checks for all the physical registers, not just R0/D0/V0.
230 else if (SchedRetvalOptimization
) {
231 const MachineInstr
*MI
= DAG
->SUnits
[su
].getInstr();
233 Register::isPhysicalRegister(MI
->getOperand(1).getReg())) {
235 VRegHoldingReg
[MI
->getOperand(0).getReg()] = MI
->getOperand(1).getReg();
236 LastVRegUse
.erase(MI
->getOperand(1).getReg());
238 for (unsigned i
= 0, e
= MI
->getNumOperands(); i
!= e
; ++i
) {
239 const MachineOperand
&MO
= MI
->getOperand(i
);
242 if (MO
.isUse() && !MI
->isCopy() &&
243 VRegHoldingReg
.count(MO
.getReg())) {
245 LastVRegUse
[VRegHoldingReg
[MO
.getReg()]] = &DAG
->SUnits
[su
];
246 } else if (MO
.isDef() && Register::isPhysicalRegister(MO
.getReg())) {
247 for (MCRegAliasIterator
AI(MO
.getReg(), &TRI
, true); AI
.isValid();
249 if (LastVRegUse
.count(*AI
) &&
250 LastVRegUse
[*AI
] != &DAG
->SUnits
[su
])
252 DAG
->addEdge(&DAG
->SUnits
[su
], SDep(LastVRegUse
[*AI
], SDep::Barrier
));
253 LastVRegUse
.erase(*AI
);
262 void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs
*DAG
) {
263 if (!EnableCheckBankConflict
)
266 const auto &HII
= static_cast<const HexagonInstrInfo
&>(*DAG
->TII
);
268 // Create artificial edges between loads that could likely cause a bank
269 // conflict. Since such loads would normally not have any dependency
270 // between them, we cannot rely on existing edges.
271 for (unsigned i
= 0, e
= DAG
->SUnits
.size(); i
!= e
; ++i
) {
272 SUnit
&S0
= DAG
->SUnits
[i
];
273 MachineInstr
&L0
= *S0
.getInstr();
274 if (!L0
.mayLoad() || L0
.mayStore() ||
275 HII
.getAddrMode(L0
) != HexagonII::BaseImmOffset
)
279 MachineOperand
*BaseOp0
= HII
.getBaseAndOffset(L0
, Offset0
, Size0
);
280 // If the access size is longer than the L1 cache line, skip the check.
281 if (BaseOp0
== nullptr || !BaseOp0
->isReg() || Size0
>= 32)
283 // Scan only up to 32 instructions ahead (to avoid n^2 complexity).
284 for (unsigned j
= i
+1, m
= std::min(i
+32, e
); j
!= m
; ++j
) {
285 SUnit
&S1
= DAG
->SUnits
[j
];
286 MachineInstr
&L1
= *S1
.getInstr();
287 if (!L1
.mayLoad() || L1
.mayStore() ||
288 HII
.getAddrMode(L1
) != HexagonII::BaseImmOffset
)
292 MachineOperand
*BaseOp1
= HII
.getBaseAndOffset(L1
, Offset1
, Size1
);
293 if (BaseOp1
== nullptr || !BaseOp1
->isReg() || Size1
>= 32 ||
294 BaseOp0
->getReg() != BaseOp1
->getReg())
296 // Check bits 3 and 4 of the offset: if they differ, a bank conflict
298 if (((Offset0
^ Offset1
) & 0x18) != 0)
300 // Bits 3 and 4 are the same, add an artificial edge and set extra
302 SDep
A(&S0
, SDep::Artificial
);
309 /// Enable use of alias analysis during code generation (during MI
310 /// scheduling, DAGCombine, etc.).
311 bool HexagonSubtarget::useAA() const {
312 if (OptLevel
!= CodeGenOpt::None
)
317 /// Perform target specific adjustments to the latency of a schedule
319 void HexagonSubtarget::adjustSchedDependency(SUnit
*Src
, SUnit
*Dst
,
321 MachineInstr
*SrcInst
= Src
->getInstr();
322 MachineInstr
*DstInst
= Dst
->getInstr();
323 if (!Src
->isInstr() || !Dst
->isInstr())
326 const HexagonInstrInfo
*QII
= getInstrInfo();
328 // Instructions with .new operands have zero latency.
329 SmallSet
<SUnit
*, 4> ExclSrc
;
330 SmallSet
<SUnit
*, 4> ExclDst
;
331 if (QII
->canExecuteInBundle(*SrcInst
, *DstInst
) &&
332 isBestZeroLatency(Src
, Dst
, QII
, ExclSrc
, ExclDst
)) {
340 // Set the latency for a copy to zero since we hope that it will get removed.
341 if (DstInst
->isCopy())
344 // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
345 // the correct latency.
346 if ((DstInst
->isRegSequence() || DstInst
->isCopy()) && Dst
->NumSuccs
== 1) {
347 unsigned DReg
= DstInst
->getOperand(0).getReg();
348 MachineInstr
*DDst
= Dst
->Succs
[0].getSUnit()->getInstr();
349 unsigned UseIdx
= -1;
350 for (unsigned OpNum
= 0; OpNum
< DDst
->getNumOperands(); OpNum
++) {
351 const MachineOperand
&MO
= DDst
->getOperand(OpNum
);
352 if (MO
.isReg() && MO
.getReg() && MO
.isUse() && MO
.getReg() == DReg
) {
357 int DLatency
= (InstrInfo
.getOperandLatency(&InstrItins
, *SrcInst
,
359 DLatency
= std::max(DLatency
, 0);
360 Dep
.setLatency((unsigned)DLatency
);
363 // Try to schedule uses near definitions to generate .cur.
366 if (EnableDotCurSched
&& QII
->isToBeScheduledASAP(*SrcInst
, *DstInst
) &&
367 isBestZeroLatency(Src
, Dst
, QII
, ExclSrc
, ExclDst
)) {
372 updateLatency(*SrcInst
, *DstInst
, Dep
);
375 void HexagonSubtarget::getPostRAMutations(
376 std::vector
<std::unique_ptr
<ScheduleDAGMutation
>> &Mutations
) const {
377 Mutations
.push_back(llvm::make_unique
<UsrOverflowMutation
>());
378 Mutations
.push_back(llvm::make_unique
<HVXMemLatencyMutation
>());
379 Mutations
.push_back(llvm::make_unique
<BankConflictMutation
>());
382 void HexagonSubtarget::getSMSMutations(
383 std::vector
<std::unique_ptr
<ScheduleDAGMutation
>> &Mutations
) const {
384 Mutations
.push_back(llvm::make_unique
<UsrOverflowMutation
>());
385 Mutations
.push_back(llvm::make_unique
<HVXMemLatencyMutation
>());
388 // Pin the vtable to this file.
389 void HexagonSubtarget::anchor() {}
391 bool HexagonSubtarget::enableMachineScheduler() const {
392 if (DisableHexagonMISched
.getNumOccurrences())
393 return !DisableHexagonMISched
;
397 bool HexagonSubtarget::usePredicatedCalls() const {
398 return EnablePredicatedCalls
;
401 void HexagonSubtarget::updateLatency(MachineInstr
&SrcInst
,
402 MachineInstr
&DstInst
, SDep
&Dep
) const {
403 if (Dep
.isArtificial()) {
411 auto &QII
= static_cast<const HexagonInstrInfo
&>(*getInstrInfo());
414 if (QII
.isHVXVec(SrcInst
) || useBSBScheduling())
415 Dep
.setLatency((Dep
.getLatency() + 1) >> 1);
418 void HexagonSubtarget::restoreLatency(SUnit
*Src
, SUnit
*Dst
) const {
419 MachineInstr
*SrcI
= Src
->getInstr();
420 for (auto &I
: Src
->Succs
) {
421 if (!I
.isAssignedRegDep() || I
.getSUnit() != Dst
)
423 unsigned DepR
= I
.getReg();
425 for (unsigned OpNum
= 0; OpNum
< SrcI
->getNumOperands(); OpNum
++) {
426 const MachineOperand
&MO
= SrcI
->getOperand(OpNum
);
427 if (MO
.isReg() && MO
.isDef() && MO
.getReg() == DepR
)
430 assert(DefIdx
>= 0 && "Def Reg not found in Src MI");
431 MachineInstr
*DstI
= Dst
->getInstr();
433 for (unsigned OpNum
= 0; OpNum
< DstI
->getNumOperands(); OpNum
++) {
434 const MachineOperand
&MO
= DstI
->getOperand(OpNum
);
435 if (MO
.isReg() && MO
.isUse() && MO
.getReg() == DepR
) {
436 int Latency
= (InstrInfo
.getOperandLatency(&InstrItins
, *SrcI
,
437 DefIdx
, *DstI
, OpNum
));
439 // For some instructions (ex: COPY), we might end up with < 0 latency
440 // as they don't have any Itinerary class associated with them.
441 Latency
= std::max(Latency
, 0);
443 I
.setLatency(Latency
);
444 updateLatency(*SrcI
, *DstI
, I
);
448 // Update the latency of opposite edge too.
450 auto F
= std::find(Dst
->Preds
.begin(), Dst
->Preds
.end(), T
);
451 assert(F
!= Dst
->Preds
.end());
452 F
->setLatency(I
.getLatency());
456 /// Change the latency between the two SUnits.
457 void HexagonSubtarget::changeLatency(SUnit
*Src
, SUnit
*Dst
, unsigned Lat
)
459 for (auto &I
: Src
->Succs
) {
460 if (!I
.isAssignedRegDep() || I
.getSUnit() != Dst
)
465 // Update the latency of opposite edge too.
467 auto F
= std::find(Dst
->Preds
.begin(), Dst
->Preds
.end(), T
);
468 assert(F
!= Dst
->Preds
.end());
473 /// If the SUnit has a zero latency edge, return the other SUnit.
474 static SUnit
*getZeroLatency(SUnit
*N
, SmallVector
<SDep
, 4> &Deps
) {
476 if (I
.isAssignedRegDep() && I
.getLatency() == 0 &&
477 !I
.getSUnit()->getInstr()->isPseudo())
482 // Return true if these are the best two instructions to schedule
483 // together with a zero latency. Only one dependence should have a zero
484 // latency. If there are multiple choices, choose the best, and change
485 // the others, if needed.
486 bool HexagonSubtarget::isBestZeroLatency(SUnit
*Src
, SUnit
*Dst
,
487 const HexagonInstrInfo
*TII
, SmallSet
<SUnit
*, 4> &ExclSrc
,
488 SmallSet
<SUnit
*, 4> &ExclDst
) const {
489 MachineInstr
&SrcInst
= *Src
->getInstr();
490 MachineInstr
&DstInst
= *Dst
->getInstr();
492 // Ignore Boundary SU nodes as these have null instructions.
493 if (Dst
->isBoundaryNode())
496 if (SrcInst
.isPHI() || DstInst
.isPHI())
499 if (!TII
->isToBeScheduledASAP(SrcInst
, DstInst
) &&
500 !TII
->canExecuteInBundle(SrcInst
, DstInst
))
503 // The architecture doesn't allow three dependent instructions in the same
504 // packet. So, if the destination has a zero latency successor, then it's
505 // not a candidate for a zero latency predecessor.
506 if (getZeroLatency(Dst
, Dst
->Succs
) != nullptr)
509 // Check if the Dst instruction is the best candidate first.
510 SUnit
*Best
= nullptr;
511 SUnit
*DstBest
= nullptr;
512 SUnit
*SrcBest
= getZeroLatency(Dst
, Dst
->Preds
);
513 if (SrcBest
== nullptr || Src
->NodeNum
>= SrcBest
->NodeNum
) {
514 // Check that Src doesn't have a better candidate.
515 DstBest
= getZeroLatency(Src
, Src
->Succs
);
516 if (DstBest
== nullptr || Dst
->NodeNum
<= DstBest
->NodeNum
)
522 // The caller frequently adds the same dependence twice. If so, then
523 // return true for this case too.
524 if ((Src
== SrcBest
&& Dst
== DstBest
) ||
525 (SrcBest
== nullptr && Dst
== DstBest
) ||
526 (Src
== SrcBest
&& Dst
== nullptr))
529 // Reassign the latency for the previous bests, which requires setting
530 // the dependence edge in both directions.
531 if (SrcBest
!= nullptr) {
533 changeLatency(SrcBest
, Dst
, 1);
535 restoreLatency(SrcBest
, Dst
);
537 if (DstBest
!= nullptr) {
539 changeLatency(Src
, DstBest
, 1);
541 restoreLatency(Src
, DstBest
);
544 // Attempt to find another opportunity for zero latency in a different
546 if (SrcBest
&& DstBest
)
547 // If there is an edge from SrcBest to DstBest, then try to change that
549 changeLatency(SrcBest
, DstBest
, 0);
551 // Check if the previous best destination instruction has a new zero
552 // latency dependence opportunity.
554 for (auto &I
: DstBest
->Preds
)
555 if (ExclSrc
.count(I
.getSUnit()) == 0 &&
556 isBestZeroLatency(I
.getSUnit(), DstBest
, TII
, ExclSrc
, ExclDst
))
557 changeLatency(I
.getSUnit(), DstBest
, 0);
558 } else if (SrcBest
) {
559 // Check if previous best source instruction has a new zero latency
560 // dependence opportunity.
562 for (auto &I
: SrcBest
->Succs
)
563 if (ExclDst
.count(I
.getSUnit()) == 0 &&
564 isBestZeroLatency(SrcBest
, I
.getSUnit(), TII
, ExclSrc
, ExclDst
))
565 changeLatency(SrcBest
, I
.getSUnit(), 0);
571 unsigned HexagonSubtarget::getL1CacheLineSize() const {
575 unsigned HexagonSubtarget::getL1PrefetchDistance() const {
579 bool HexagonSubtarget::enableSubRegLiveness() const {
580 return EnableSubregLiveness
;