1 //===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the Hexagon specific subclass of TargetSubtarget.
11 //===----------------------------------------------------------------------===//
14 #include "HexagonInstrInfo.h"
15 #include "HexagonRegisterInfo.h"
16 #include "HexagonSubtarget.h"
17 #include "MCTargetDesc/HexagonMCTargetDesc.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallSet.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineOperand.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/ErrorHandling.h"
35 #define DEBUG_TYPE "hexagon-subtarget"
37 #define GET_SUBTARGETINFO_CTOR
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #include "HexagonGenSubtargetInfo.inc"
// Hidden flag -enable-bsb-sched (default: true). Combined with hasV60Ops()
// in initializeSubtargetDependencies to compute UseBSBScheduling.
42 static cl::opt
<bool> EnableBSBSched("enable-bsb-sched",
43 cl::Hidden
, cl::ZeroOrMore
, cl::init(true));
// Hidden flag -enable-tc-latency-sched (default: false).
45 static cl::opt
<bool> EnableTCLatencySched("enable-tc-latency-sched",
46 cl::Hidden
, cl::ZeroOrMore
, cl::init(false));
// Hidden flag -enable-cur-sched (default: true). Consulted by
// adjustSchedDependency before it tries to schedule a use next to its
// definition.
48 static cl::opt
<bool> EnableDotCurSched("enable-cur-sched",
49 cl::Hidden
, cl::ZeroOrMore
, cl::init(true),
50 cl::desc("Enable the scheduler to generate .cur"));
// Hidden flag -disable-hexagon-misched (default: false). Only honoured by
// enableMachineScheduler when it was actually given on the command line.
52 static cl::opt
<bool> DisableHexagonMISched("disable-hexagon-misched",
53 cl::Hidden
, cl::ZeroOrMore
, cl::init(false),
54 cl::desc("Disable Hexagon MI Scheduling"));
// Hidden flag -hexagon-subreg-liveness (default: true). Its value is what
// enableSubRegLiveness returns.
56 static cl::opt
<bool> EnableSubregLiveness("hexagon-subreg-liveness",
57 cl::Hidden
, cl::ZeroOrMore
, cl::init(true),
58 cl::desc("Enable subregister liveness tracking for Hexagon"));
// Hidden flag -hexagon-long-calls (default: false). It overrides UseLongCalls
// in initializeSubtargetDependencies only when getPosition() is non-zero.
60 static cl::opt
<bool> OverrideLongCalls("hexagon-long-calls",
61 cl::Hidden
, cl::ZeroOrMore
, cl::init(false),
62 cl::desc("If present, forces/disables the use of long calls"));
// Hidden flag -hexagon-pred-calls (default: false). Its value is what
// usePredicatedCalls returns.
64 static cl::opt
<bool> EnablePredicatedCalls("hexagon-pred-calls",
65 cl::Hidden
, cl::ZeroOrMore
, cl::init(false),
66 cl::desc("Consider calls to be predicable"));
// Hidden flag -sched-preds-closer (default: true). Gates the A2_tfrpi/call
// binding check in CallMutation::apply.
68 static cl::opt
<bool> SchedPredsCloser("sched-preds-closer",
69 cl::Hidden
, cl::ZeroOrMore
, cl::init(true));
// Hidden flag -sched-retval-optimization (default: true). Gates the
// physical-register copy handling in CallMutation::apply.
71 static cl::opt
<bool> SchedRetvalOptimization("sched-retval-optimization",
72 cl::Hidden
, cl::ZeroOrMore
, cl::init(true));
// Hidden flag -hexagon-check-bank-conflict (default: true). Checked first
// thing in BankConflictMutation::apply.
74 static cl::opt
<bool> EnableCheckBankConflict("hexagon-check-bank-conflict",
75 cl::Hidden
, cl::ZeroOrMore
, cl::init(true),
76 cl::desc("Enable checking for cache bank conflicts"));
/// Construct the subtarget from the target triple, CPU name, feature string
/// and target machine. The member initializers record the optimization level
/// of TM, resolve the CPU name through Hexagon_MC::selectHexagonCPU, call
/// initializeSubtargetDependencies (its result initializes InstrInfo), and
/// look up the instruction itineraries for the resolved CPU string.
79 HexagonSubtarget::HexagonSubtarget(const Triple
&TT
, StringRef CPU
,
80 StringRef FS
, const TargetMachine
&TM
)
81 : HexagonGenSubtargetInfo(TT
, CPU
, FS
), OptLevel(TM
.getOptLevel()),
82 CPUString(Hexagon_MC::selectHexagonCPU(CPU
)),
83 InstrInfo(initializeSubtargetDependencies(CPU
, FS
)),
84 RegInfo(getHwMode()), TLInfo(TM
, *this),
85 InstrItins(getInstrItineraryForCPU(CPUString
)) {
86 // Beware of the default constructor of InstrItineraryData: it will
87 // reset all members to 0.
// Hence the check that the itinerary table pointer was really set.
88 assert(InstrItins
.Itineraries
!= nullptr && "InstrItins not initialized");
/// Derive the architecture version from CPUString, reset the HVX and
/// long-call flags, parse the feature string FS, and finally apply the
/// command-line overrides and feature-bit adjustments.
/// Note that the lookup and ParseSubtargetFeatures use the member CPUString,
/// not the CPU parameter.
92 HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU
, StringRef FS
) {
// Table translating a CPU name to its architecture enumerator. It is a
// function-local static, so it is built once.
93 static std::map
<StringRef
, Hexagon::ArchEnum
> CpuTable
{
// "generic" is mapped to the same architecture as "hexagonv60".
94 {"generic", Hexagon::ArchEnum::V60
},
95 {"hexagonv5", Hexagon::ArchEnum::V5
},
96 {"hexagonv55", Hexagon::ArchEnum::V55
},
97 {"hexagonv60", Hexagon::ArchEnum::V60
},
98 {"hexagonv62", Hexagon::ArchEnum::V62
},
99 {"hexagonv65", Hexagon::ArchEnum::V65
},
100 {"hexagonv66", Hexagon::ArchEnum::V66
},
103 auto FoundIt
= CpuTable
.find(CPUString
);
104 if (FoundIt
!= CpuTable
.end())
105 HexagonArchVersion
= FoundIt
->second
;
// NOTE(review): the unreachable below is presumably the else-branch of the
// lookup above (a CPU name missing from the table) — confirm.
107 llvm_unreachable("Unrecognized Hexagon processor version");
// Start from all-false flags before the feature string is parsed.
109 UseHVX128BOps
= false;
110 UseHVX64BOps
= false;
111 UseLongCalls
= false;
// BSB scheduling needs both V60 operations and the -enable-bsb-sched flag.
113 UseBSBScheduling
= hasV60Ops() && EnableBSBSched
;
115 ParseSubtargetFeatures(CPUString
, FS
);
// The command-line flag wins over the parsed feature, but only when
// getPosition() reports a non-zero position for it.
117 if (OverrideLongCalls
.getPosition())
118 UseLongCalls
= OverrideLongCalls
;
120 FeatureBitset Features
= getFeatureBits();
// Clear the Duplex feature bit when HexagonDisableDuplex is set.
121 if (HexagonDisableDuplex
)
122 setFeatureBits(Features
.set(Hexagon::FeatureDuplex
, false));
// Store the feature set as returned by Hexagon_MC::completeHVXFeatures.
123 setFeatureBits(Hexagon_MC::completeHVXFeatures(Features
));
/// DAG mutation that visits every scheduling unit and looks at its
/// predecessor edges for output dependencies on the USR_OVF register.
/// NOTE(review): judging by the name of the local vector, the matching edges
/// are presumably collected and then removed from the unit — confirm.
128 void HexagonSubtarget::UsrOverflowMutation::apply(ScheduleDAGInstrs
*DAG
) {
129 for (SUnit
&SU
: DAG
->SUnits
) {
// Edges selected by the scan below.
132 SmallVector
<SDep
, 4> Erase
;
133 for (auto &D
: SU
.Preds
)
// Select output dependencies whose register is USR_OVF.
134 if (D
.getKind() == SDep::Output
&& D
.getReg() == Hexagon::USR_OVF
)
// Second pass over the selected edges.
136 for (auto &E
: Erase
)
/// DAG mutation that adjusts order (chain) edges between pairs of HVX vector
/// memory instructions of the same kind (load/load or store/store).
141 void HexagonSubtarget::HVXMemLatencyMutation::apply(ScheduleDAGInstrs
*DAG
) {
142 for (SUnit
&SU
: DAG
->SUnits
) {
143 // Update the latency of chain edges between v60 vector load or store
144 // instructions to be 1. These instructions cannot be scheduled in the
// same packet.
146 MachineInstr
&MI1
= *SU
.getInstr();
147 auto *QII
= static_cast<const HexagonInstrInfo
*>(DAG
->TII
);
148 bool IsStoreMI1
= MI1
.mayStore();
149 bool IsLoadMI1
= MI1
.mayLoad();
// The source must be an HVX vector instruction that may load or store.
150 if (!QII
->isHVXVec(MI1
) || !(IsStoreMI1
|| IsLoadMI1
))
152 for (SDep
&SI
: SU
.Succs
) {
// Only order edges that currently have zero latency are of interest.
153 if (SI
.getKind() != SDep::Order
|| SI
.getLatency() != 0)
155 MachineInstr
&MI2
= *SI
.getSUnit()->getInstr();
// The successor must be an HVX vector instruction as well.
156 if (!QII
->isHVXVec(MI2
))
// Store followed by store, or load followed by load.
158 if ((IsStoreMI1
&& MI2
.mayStore()) || (IsLoadMI1
&& MI2
.mayLoad())) {
161 // Change the dependence in the opposite direction too.
162 for (SDep
&PI
: SI
.getSUnit()->Preds
) {
// Find the order edge in the successor's Preds that points back to SU.
163 if (PI
.getSUnit() != &SU
|| PI
.getKind() != SDep::Order
)
// The successor's cached depth is stale after the change.
166 SI
.getSUnit()->setDepthDirty();
173 // Check if a call and subsequent A2_tfrpi instructions should maintain
174 // scheduling affinity. We are looking for the TFRI to be consumed in
175 // the next instruction. This should help reduce the instances of
176 // double register pairs being allocated and scheduled before a call
177 // when not used until after the call. This situation is exacerbated
178 // by the fact that we allocate the pair from the callee saves list,
179 // leading to excess spills and restores.
//
// HII is used to query the instruction type of Inst2. Inst1 is the candidate
// A2_tfrpi and Inst2 the instruction that follows it. The final result is
// true when the type of Inst2 is one of S_2op, S_3op, ALU64 or M.
180 bool HexagonSubtarget::CallMutation::shouldTFRICallBind(
181 const HexagonInstrInfo
&HII
, const SUnit
&Inst1
,
182 const SUnit
&Inst2
) const {
// Anything other than A2_tfrpi as the first instruction is not a candidate.
183 if (Inst1
.getInstr()->getOpcode() != Hexagon::A2_tfrpi
)
186 // TypeXTYPE are 64 bit operations.
187 unsigned Type
= HII
.getType(*Inst2
.getInstr());
188 return Type
== HexagonII::TypeS_2op
|| Type
== HexagonII::TypeS_3op
||
189 Type
== HexagonII::TypeALU64
|| Type
== HexagonII::TypeM
;
/// DAG mutation that walks the scheduling units in order and adds Barrier
/// edges in three situations: a compare that follows a call, an instruction
/// accepted by shouldTFRICallBind together with its successor, and a
/// definition of a physical register (or one of its aliases) for which a use
/// of a virtual-register copy has been recorded.
192 void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs
*DAGInstrs
) {
193 ScheduleDAGMI
*DAG
= static_cast<ScheduleDAGMI
*>(DAGInstrs
);
// Most recent call seen so far while walking the units.
194 SUnit
* LastSequentialCall
= nullptr;
195 // Map from virtual register to physical register from the copy.
196 DenseMap
<unsigned, unsigned> VRegHoldingReg
;
197 // Map from the physical register to the instruction that uses virtual
198 // register. This is used to create the barrier edge.
199 DenseMap
<unsigned, SUnit
*> LastVRegUse
;
200 auto &TRI
= *DAG
->MF
.getSubtarget().getRegisterInfo();
201 auto &HII
= *DAG
->MF
.getSubtarget
<HexagonSubtarget
>().getInstrInfo();
203 // Currently we only catch the situation when compare gets scheduled
204 // before preceding call.
205 for (unsigned su
= 0, e
= DAG
->SUnits
.size(); su
!= e
; ++su
) {
206 // Remember the call.
207 if (DAG
->SUnits
[su
].getInstr()->isCall())
208 LastSequentialCall
= &DAG
->SUnits
[su
];
209 // Look for a compare that defines a predicate.
210 else if (DAG
->SUnits
[su
].getInstr()->isCompare() && LastSequentialCall
)
// Barrier edge from the remembered call to this compare.
211 DAG
->addEdge(&DAG
->SUnits
[su
], SDep(LastSequentialCall
, SDep::Barrier
));
212 // Look for call and tfri* instructions.
// The index bounds keep both su-1 and su+1 inside SUnits.
213 else if (SchedPredsCloser
&& LastSequentialCall
&& su
> 1 && su
< e
-1 &&
214 shouldTFRICallBind(HII
, DAG
->SUnits
[su
], DAG
->SUnits
[su
+1]))
// Barrier edge from the preceding unit to this one.
215 DAG
->addEdge(&DAG
->SUnits
[su
], SDep(&DAG
->SUnits
[su
-1], SDep::Barrier
));
216 // Prevent redundant register copies due to reads and writes of physical
217 // registers. The original motivation for this was the code generated
218 // between two calls, which is caused by both the return value and the
219 // argument for the next call being in %r0.
222 // 2: %vreg = COPY %r0
226 // The scheduler would often swap 3 and 4, so an additional register is
227 // needed. This code inserts a Barrier dependence between 3 & 4 to prevent
// this.
229 // The code below checks for all the physical registers, not just R0/D0/V0.
230 else if (SchedRetvalOptimization
) {
231 const MachineInstr
*MI
= DAG
->SUnits
[su
].getInstr();
233 TargetRegisterInfo::isPhysicalRegister(MI
->getOperand(1).getReg())) {
// Record which physical register (operand 1) the destination (operand 0)
// now holds, and forget any use recorded for that physical register.
235 VRegHoldingReg
[MI
->getOperand(0).getReg()] = MI
->getOperand(1).getReg();
236 LastVRegUse
.erase(MI
->getOperand(1).getReg());
// Note: this `e` shadows the unit count `e` of the enclosing loop.
238 for (unsigned i
= 0, e
= MI
->getNumOperands(); i
!= e
; ++i
) {
239 const MachineOperand
&MO
= MI
->getOperand(i
);
// A non-copy use of a tracked register: remember this unit as the latest
// user, keyed by the physical register the tracked register came from.
242 if (MO
.isUse() && !MI
->isCopy() &&
243 VRegHoldingReg
.count(MO
.getReg())) {
245 LastVRegUse
[VRegHoldingReg
[MO
.getReg()]] = &DAG
->SUnits
[su
];
246 } else if (MO
.isDef() &&
247 TargetRegisterInfo::isPhysicalRegister(MO
.getReg())) {
// A physical register is defined: visit it and all of its aliases.
248 for (MCRegAliasIterator
AI(MO
.getReg(), &TRI
, true); AI
.isValid();
// Only act when a different unit was recorded as the last user.
250 if (LastVRegUse
.count(*AI
) &&
251 LastVRegUse
[*AI
] != &DAG
->SUnits
[su
])
// Barrier edge from the recorded user to this defining unit.
253 DAG
->addEdge(&DAG
->SUnits
[su
], SDep(LastVRegUse
[*AI
], SDep::Barrier
));
254 LastVRegUse
.erase(*AI
);
/// DAG mutation that looks for pairs of nearby pure loads (base + immediate
/// offset addressing, same base register) whose offsets agree in bits 3 and
/// 4, and links them with an artificial edge. Controlled by the
/// -hexagon-check-bank-conflict flag.
263 void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs
*DAG
) {
264 if (!EnableCheckBankConflict
)
267 const auto &HII
= static_cast<const HexagonInstrInfo
&>(*DAG
->TII
);
269 // Create artificial edges between loads that could likely cause a bank
270 // conflict. Since such loads would normally not have any dependency
271 // between them, we cannot rely on existing edges.
272 for (unsigned i
= 0, e
= DAG
->SUnits
.size(); i
!= e
; ++i
) {
273 SUnit
&S0
= DAG
->SUnits
[i
];
274 MachineInstr
&L0
= *S0
.getInstr();
// First candidate: loads only, no store, base + immediate offset addressing.
275 if (!L0
.mayLoad() || L0
.mayStore() ||
276 HII
.getAddrMode(L0
) != HexagonII::BaseImmOffset
)
280 MachineOperand
*BaseOp0
= HII
.getBaseAndOffset(L0
, Offset0
, Size0
);
281 // If the access size is longer than the L1 cache line, skip the check.
// (The base must be a register, and sizes of 32 or more are rejected.)
282 if (BaseOp0
== nullptr || !BaseOp0
->isReg() || Size0
>= 32)
284 // Scan only up to 32 instructions ahead (to avoid n^2 complexity).
285 for (unsigned j
= i
+1, m
= std::min(i
+32, e
); j
!= m
; ++j
) {
286 SUnit
&S1
= DAG
->SUnits
[j
];
287 MachineInstr
&L1
= *S1
.getInstr();
// Second candidate: same filter as for the first load.
288 if (!L1
.mayLoad() || L1
.mayStore() ||
289 HII
.getAddrMode(L1
) != HexagonII::BaseImmOffset
)
293 MachineOperand
*BaseOp1
= HII
.getBaseAndOffset(L1
, Offset1
, Size1
);
// Both loads must use the same base register.
294 if (BaseOp1
== nullptr || !BaseOp1
->isReg() || Size1
>= 32 ||
295 BaseOp0
->getReg() != BaseOp1
->getReg())
297 // Check bits 3 and 4 of the offset: if they differ, a bank conflict
// is unlikely. (Mask 0x18 selects exactly bits 3 and 4.)
299 if (((Offset0
^ Offset1
) & 0x18) != 0)
301 // Bits 3 and 4 are the same, add an artificial edge and set extra
// latency.
303 SDep
A(&S0
, SDep::Artificial
);
310 /// Enable use of alias analysis during code generation (during MI
311 /// scheduling, DAGCombine, etc.).
/// The decision is based on whether OptLevel differs from CodeGenOpt::None.
312 bool HexagonSubtarget::useAA() const {
313 if (OptLevel
!= CodeGenOpt::None
)
318 /// Perform target specific adjustments to the latency of a schedule
/// dependency.
/// Src and Dst are the two ends of the dependence; the latency is written to
/// Dep through setLatency / updateLatency.
320 void HexagonSubtarget::adjustSchedDependency(SUnit
*Src
, SUnit
*Dst
,
322 MachineInstr
*SrcInst
= Src
->getInstr();
323 MachineInstr
*DstInst
= Dst
->getInstr();
// Both ends must be real instructions.
324 if (!Src
->isInstr() || !Dst
->isInstr())
327 const HexagonInstrInfo
*QII
= getInstrInfo();
329 // Instructions with .new operands have zero latency.
// Exclusion sets handed to isBestZeroLatency; both start out empty.
330 SmallSet
<SUnit
*, 4> ExclSrc
;
331 SmallSet
<SUnit
*, 4> ExclDst
;
332 if (QII
->canExecuteInBundle(*SrcInst
, *DstInst
) &&
333 isBestZeroLatency(Src
, Dst
, QII
, ExclSrc
, ExclDst
)) {
341 // Set the latency for a copy to zero since we hope that it will get removed.
342 if (DstInst
->isCopy())
345 // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
346 // the correct latency.
// This path requires Dst to have exactly one successor.
347 if ((DstInst
->isRegSequence() || DstInst
->isCopy()) && Dst
->NumSuccs
== 1) {
348 unsigned DReg
= DstInst
->getOperand(0).getReg();
349 MachineInstr
*DDst
= Dst
->Succs
[0].getSUnit()->getInstr();
// -1 converts to the largest unsigned value here.
350 unsigned UseIdx
= -1;
// Look for the operand of the successor that uses the register defined by
// DstInst.
351 for (unsigned OpNum
= 0; OpNum
< DDst
->getNumOperands(); OpNum
++) {
352 const MachineOperand
&MO
= DDst
->getOperand(OpNum
);
353 if (MO
.isReg() && MO
.getReg() && MO
.isUse() && MO
.getReg() == DReg
) {
358 int DLatency
= (InstrInfo
.getOperandLatency(&InstrItins
, *SrcInst
,
// Clamp a negative operand latency to zero before storing it.
360 DLatency
= std::max(DLatency
, 0);
361 Dep
.setLatency((unsigned)DLatency
);
364 // Try to schedule uses near definitions to generate .cur.
// Only attempted when -enable-cur-sched is on.
367 if (EnableDotCurSched
&& QII
->isToBeScheduledASAP(*SrcInst
, *DstInst
) &&
368 isBestZeroLatency(Src
, Dst
, QII
, ExclSrc
, ExclDst
)) {
373 updateLatency(*SrcInst
, *DstInst
, Dep
);
/// Append the DAG mutations for post-RA scheduling to Mutations, in this
/// order: UsrOverflowMutation, HVXMemLatencyMutation, BankConflictMutation.
/// Existing entries of Mutations are left in place.
376 void HexagonSubtarget::getPostRAMutations(
377 std::vector
<std::unique_ptr
<ScheduleDAGMutation
>> &Mutations
) const {
378 Mutations
.push_back(llvm::make_unique
<UsrOverflowMutation
>());
379 Mutations
.push_back(llvm::make_unique
<HVXMemLatencyMutation
>());
380 Mutations
.push_back(llvm::make_unique
<BankConflictMutation
>());
/// Append the DAG mutations for SMS to Mutations, in this order:
/// UsrOverflowMutation, HVXMemLatencyMutation. Unlike getPostRAMutations,
/// no BankConflictMutation is added here.
383 void HexagonSubtarget::getSMSMutations(
384 std::vector
<std::unique_ptr
<ScheduleDAGMutation
>> &Mutations
) const {
385 Mutations
.push_back(llvm::make_unique
<UsrOverflowMutation
>());
386 Mutations
.push_back(llvm::make_unique
<HVXMemLatencyMutation
>());
389 // Pin the vtable to this file.
// The body is intentionally empty; only the out-of-line definition matters.
390 void HexagonSubtarget::anchor() {}
/// Decide whether the machine scheduler is enabled. When
/// -disable-hexagon-misched appeared on the command line at least once, the
/// result is the negation of that flag's value.
392 bool HexagonSubtarget::enableMachineScheduler() const {
393 if (DisableHexagonMISched
.getNumOccurrences())
394 return !DisableHexagonMISched
;
/// Report whether calls are considered predicable; the answer is the current
/// value of the -hexagon-pred-calls flag.
398 bool HexagonSubtarget::usePredicatedCalls() const {
399 return EnablePredicatedCalls
;
/// Adjust the latency stored in Dep for the dependence from SrcInst to
/// DstInst. Artificial edges are handled separately; otherwise the latency is
/// halved (rounding up) when the source is an HVX vector instruction or BSB
/// scheduling is in use.
402 void HexagonSubtarget::updateLatency(MachineInstr
&SrcInst
,
403 MachineInstr
&DstInst
, SDep
&Dep
) const {
// Artificial edges get their own treatment.
404 if (Dep
.isArtificial()) {
412 auto &QII
= static_cast<const HexagonInstrInfo
&>(*getInstrInfo());
415 if (QII
.isHVXVec(SrcInst
) || useBSBScheduling())
// (Latency + 1) >> 1 is Latency / 2 rounded up.
416 Dep
.setLatency((Dep
.getLatency() + 1) >> 1);
/// Recompute the latency of the assigned-register edge(s) from Src to Dst
/// from the itinerary operand latency, pass the edge through updateLatency,
/// and copy the resulting latency onto the matching edge in Dst->Preds.
419 void HexagonSubtarget::restoreLatency(SUnit
*Src
, SUnit
*Dst
) const {
420 MachineInstr
*SrcI
= Src
->getInstr();
421 for (auto &I
: Src
->Succs
) {
// Only assigned-register dependencies that lead to Dst are processed.
422 if (!I
.isAssignedRegDep() || I
.getSUnit() != Dst
)
424 unsigned DepR
= I
.getReg();
// Locate the operand of the source instruction that defines DepR.
426 for (unsigned OpNum
= 0; OpNum
< SrcI
->getNumOperands(); OpNum
++) {
427 const MachineOperand
&MO
= SrcI
->getOperand(OpNum
);
428 if (MO
.isReg() && MO
.isDef() && MO
.getReg() == DepR
)
431 assert(DefIdx
>= 0 && "Def Reg not found in Src MI");
432 MachineInstr
*DstI
= Dst
->getInstr();
// Visit every operand of the destination instruction that uses DepR.
434 for (unsigned OpNum
= 0; OpNum
< DstI
->getNumOperands(); OpNum
++) {
435 const MachineOperand
&MO
= DstI
->getOperand(OpNum
);
436 if (MO
.isReg() && MO
.isUse() && MO
.getReg() == DepR
) {
437 int Latency
= (InstrInfo
.getOperandLatency(&InstrItins
, *SrcI
,
438 DefIdx
, *DstI
, OpNum
));
440 // For some instructions (ex: COPY), we might end up with < 0 latency
441 // as they don't have any Itinerary class associated with them.
442 Latency
= std::max(Latency
, 0);
444 I
.setLatency(Latency
);
445 updateLatency(*SrcI
, *DstI
, I
);
449 // Update the latency of opposite edge too.
// The mirrored edge must exist in Dst->Preds (asserted below).
451 auto F
= std::find(Dst
->Preds
.begin(), Dst
->Preds
.end(), T
);
452 assert(F
!= Dst
->Preds
.end());
453 F
->setLatency(I
.getLatency());
457 /// Change the latency between the two SUnits.
/// Only assigned-register edges from Src to Dst are considered. The matching
/// edge in Dst->Preds is looked up as well, and it is asserted to exist.
/// Lat is the new latency value.
458 void HexagonSubtarget::changeLatency(SUnit
*Src
, SUnit
*Dst
, unsigned Lat
)
460 for (auto &I
: Src
->Succs
) {
// Only assigned-register dependencies that lead to Dst are processed.
461 if (!I
.isAssignedRegDep() || I
.getSUnit() != Dst
)
466 // Update the latency of opposite edge too.
468 auto F
= std::find(Dst
->Preds
.begin(), Dst
->Preds
.end(), T
);
469 assert(F
!= Dst
->Preds
.end());
474 /// If the SUnit has a zero latency edge, return the other SUnit.
/// Deps is the edge list to inspect (callers pass either Preds or Succs of
/// N). Only assigned-register dependencies count, and edges whose other end
/// is a pseudo instruction are ignored.
475 static SUnit
*getZeroLatency(SUnit
*N
, SmallVector
<SDep
, 4> &Deps
) {
477 if (I
.isAssignedRegDep() && I
.getLatency() == 0 &&
478 !I
.getSUnit()->getInstr()->isPseudo())
483 // Return true if these are the best two instructions to schedule
484 // together with a zero latency. Only one dependence should have a zero
485 // latency. If there are multiple choices, choose the best, and change
486 // the others, if needed.
//
// Src and Dst are the two ends of the candidate edge, and TII is used for
// the isToBeScheduledASAP / canExecuteInBundle queries. ExclSrc and ExclDst
// are consulted before recursing so that units contained in them are not
// examined again. The function may modify latencies of other edges through
// changeLatency and restoreLatency.
487 bool HexagonSubtarget::isBestZeroLatency(SUnit
*Src
, SUnit
*Dst
,
488 const HexagonInstrInfo
*TII
, SmallSet
<SUnit
*, 4> &ExclSrc
,
489 SmallSet
<SUnit
*, 4> &ExclDst
) const {
490 MachineInstr
&SrcInst
= *Src
->getInstr();
491 MachineInstr
&DstInst
= *Dst
->getInstr();
493 // Ignore Boundary SU nodes as these have null instructions.
494 if (Dst
->isBoundaryNode())
// PHIs on either side are not candidates.
497 if (SrcInst
.isPHI() || DstInst
.isPHI())
// The pair must qualify under at least one of the two TII queries.
500 if (!TII
->isToBeScheduledASAP(SrcInst
, DstInst
) &&
501 !TII
->canExecuteInBundle(SrcInst
, DstInst
))
504 // The architecture doesn't allow three dependent instructions in the same
505 // packet. So, if the destination has a zero latency successor, then it's
506 // not a candidate for a zero latency predecessor.
507 if (getZeroLatency(Dst
, Dst
->Succs
) != nullptr)
510 // Check if the Dst instruction is the best candidate first.
511 SUnit
*Best
= nullptr;
512 SUnit
*DstBest
= nullptr;
// Existing zero-latency predecessor of Dst, if any.
513 SUnit
*SrcBest
= getZeroLatency(Dst
, Dst
->Preds
);
// Src qualifies when Dst has no such predecessor yet, or when Src's node
// number is not smaller than that of the current one.
514 if (SrcBest
== nullptr || Src
->NodeNum
>= SrcBest
->NodeNum
) {
515 // Check that Src doesn't have a better candidate.
516 DstBest
= getZeroLatency(Src
, Src
->Succs
);
517 if (DstBest
== nullptr || Dst
->NodeNum
<= DstBest
->NodeNum
)
523 // The caller frequently adds the same dependence twice. If so, then
524 // return true for this case too.
// NOTE(review): Dst has already been dereferenced above, so the
// `Dst == nullptr` test in the last clause cannot hold; by symmetry with
// the second clause it was presumably meant to be `DstBest == nullptr` —
// confirm.
525 if ((Src
== SrcBest
&& Dst
== DstBest
) ||
526 (SrcBest
== nullptr && Dst
== DstBest
) ||
527 (Src
== SrcBest
&& Dst
== nullptr))
530 // Reassign the latency for the previous bests, which requires setting
531 // the dependence edge in both directions.
532 if (SrcBest
!= nullptr) {
534 changeLatency(SrcBest
, Dst
, 1);
536 restoreLatency(SrcBest
, Dst
);
538 if (DstBest
!= nullptr) {
540 changeLatency(Src
, DstBest
, 1);
542 restoreLatency(Src
, DstBest
);
545 // Attempt to find another opportunity for zero latency in a different
// dependence.
547 if (SrcBest
&& DstBest
)
548 // If there is an edge from SrcBest to DstBest, then try to change that
// to 0 now.
550 changeLatency(SrcBest
, DstBest
, 0);
552 // Check if the previous best destination instruction has a new zero
553 // latency dependence opportunity.
555 for (auto &I
: DstBest
->Preds
)
// Predecessors contained in ExclSrc are not reconsidered.
556 if (ExclSrc
.count(I
.getSUnit()) == 0 &&
557 isBestZeroLatency(I
.getSUnit(), DstBest
, TII
, ExclSrc
, ExclDst
))
558 changeLatency(I
.getSUnit(), DstBest
, 0);
559 } else if (SrcBest
) {
560 // Check if previous best source instruction has a new zero latency
561 // dependence opportunity.
563 for (auto &I
: SrcBest
->Succs
)
// Successors contained in ExclDst are not reconsidered.
564 if (ExclDst
.count(I
.getSUnit()) == 0 &&
565 isBestZeroLatency(SrcBest
, I
.getSUnit(), TII
, ExclSrc
, ExclDst
))
566 changeLatency(SrcBest
, I
.getSUnit(), 0);
/// Returns the L1 cache line size as an unsigned value.
/// NOTE(review): the unit is presumably bytes — confirm against the body.
572 unsigned HexagonSubtarget::getL1CacheLineSize() const {
/// Returns the L1 prefetch distance as an unsigned value.
/// NOTE(review): the unit of the distance is not evident here — confirm
/// against the body.
576 unsigned HexagonSubtarget::getL1PrefetchDistance() const {
/// Report whether subregister liveness tracking is enabled; the answer is
/// the current value of the -hexagon-subreg-liveness flag.
580 bool HexagonSubtarget::enableSubRegLiveness() const {
581 return EnableSubregLiveness
;