//===- HexagonSubtarget.cpp - Hexagon Subtarget Information ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Hexagon specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "HexagonSubtarget.h"
#include "HexagonInstrInfo.h"
#include "HexagonRegisterInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "hexagon-subtarget"

#define GET_SUBTARGETINFO_CTOR
#define GET_SUBTARGETINFO_TARGET_DESC
#include "HexagonGenSubtargetInfo.inc"
static cl::opt<bool> EnableBSBSched("enable-bsb-sched", cl::Hidden,
                                    cl::init(true));

static cl::opt<bool> EnableTCLatencySched("enable-tc-latency-sched", cl::Hidden,
                                          cl::init(false));

static cl::opt<bool>
    EnableDotCurSched("enable-cur-sched", cl::Hidden, cl::init(true),
                      cl::desc("Enable the scheduler to generate .cur"));

static cl::opt<bool>
    DisableHexagonMISched("disable-hexagon-misched", cl::Hidden,
                          cl::desc("Disable Hexagon MI Scheduling"));

static cl::opt<bool> OverrideLongCalls(
    "hexagon-long-calls", cl::Hidden,
    cl::desc("If present, forces/disables the use of long calls"));

static cl::opt<bool>
    EnablePredicatedCalls("hexagon-pred-calls", cl::Hidden,
                          cl::desc("Consider calls to be predicable"));

static cl::opt<bool> SchedPredsCloser("sched-preds-closer", cl::Hidden,
                                      cl::init(true));

static cl::opt<bool> SchedRetvalOptimization("sched-retval-optimization",
                                             cl::Hidden, cl::init(true));

static cl::opt<bool> EnableCheckBankConflict(
    "hexagon-check-bank-conflict", cl::Hidden, cl::init(true),
    cl::desc("Enable checking for cache bank conflicts"));
HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
                                   StringRef FS, const TargetMachine &TM)
    : HexagonGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
      OptLevel(TM.getOptLevel()),
      CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))),
      TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
      RegInfo(getHwMode()), TLInfo(TM, *this),
      InstrItins(getInstrItineraryForCPU(CPUString)) {
  Hexagon_MC::addArchSubtarget(this, FS);
  // Beware of the default constructor of InstrItineraryData: it will
  // reset all members to 0.
  assert(InstrItins.Itineraries != nullptr && "InstrItins not initialized");
}
HexagonSubtarget &
HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
  std::optional<Hexagon::ArchEnum> ArchVer = Hexagon::getCpu(CPUString);
  if (ArchVer)
    HexagonArchVersion = *ArchVer;
  else
    llvm_unreachable("Unrecognized Hexagon processor version");

  UseHVX128BOps = false;
  UseHVX64BOps = false;
  UseLongCalls = false;

  SubtargetFeatures Features(FS);

  // Turn on QFloat if the HVX version is v68+.
  // The function ParseSubtargetFeatures will set feature bits and initialize
  // subtarget's variables all in one, so there isn't a good way to preprocess
  // the feature string, other than by tinkering with it directly.
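  // For example (illustrative feature strings only): an FS of
  // "+hvxv68,+hvx-length128b" carries no explicit hvx-qfloat setting, so
  // "+hvx-qfloat" is appended below, while "+hvxv66" or an explicit
  // "-hvx-qfloat" is left untouched.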
  auto IsQFloatFS = [](StringRef F) {
    return F == "+hvx-qfloat" || F == "-hvx-qfloat";
  };
  if (!llvm::count_if(Features.getFeatures(), IsQFloatFS)) {
    auto getHvxVersion = [&Features](StringRef FS) -> StringRef {
      for (StringRef F : llvm::reverse(Features.getFeatures())) {
        if (F.starts_with("+hvxv"))
          return F;
      }
      for (StringRef F : llvm::reverse(Features.getFeatures())) {
        if (F.starts_with("+hvx") || F == "-hvx")
          return F.take_front(4); // Return "+hvx" or "-hvx".
      }
      return StringRef();
    };

    bool AddQFloat = false;
    StringRef HvxVer = getHvxVersion(FS);
    if (HvxVer.starts_with("+hvxv")) {
      int Ver = 0;
      if (!HvxVer.drop_front(5).consumeInteger(10, Ver) && Ver >= 68)
        AddQFloat = true;
    } else if (HvxVer == "+hvx") {
      if (hasV68Ops())
        AddQFloat = true;
    }

    if (AddQFloat)
      Features.AddFeature("+hvx-qfloat");
  }

  std::string FeatureString = Features.getString();
  ParseSubtargetFeatures(CPUString, /*TuneCPU*/ CPUString, FeatureString);
  UseHVXFloatingPoint = UseHVXIEEEFPOps || UseHVXQFloatOps;

  if (UseHVXQFloatOps && UseHVXIEEEFPOps && UseHVXFloatingPoint)
    LLVM_DEBUG(
        dbgs() << "Behavior is undefined for simultaneous qfloat and ieee hvx codegen...");

  if (OverrideLongCalls.getPosition())
    UseLongCalls = OverrideLongCalls;

  UseBSBScheduling = hasV60Ops() && EnableBSBSched;

  if (isTinyCore()) {
    // Tiny core has a single thread, so back-to-back scheduling is enabled by
    // default.
    if (!EnableBSBSched.getPosition())
      UseBSBScheduling = false;
  }

  FeatureBitset FeatureBits = getFeatureBits();
  if (HexagonDisableDuplex)
    setFeatureBits(FeatureBits.reset(Hexagon::FeatureDuplex));
  setFeatureBits(Hexagon_MC::completeHVXFeatures(FeatureBits));

  return *this;
}
bool HexagonSubtarget::isHVXElementType(MVT Ty, bool IncludeBool) const {
  if (!useHVXOps())
    return false;
  if (Ty.isVector())
    Ty = Ty.getVectorElementType();
  if (IncludeBool && Ty == MVT::i1)
    return true;
  ArrayRef<MVT> ElemTypes = getHVXElementTypes();
  return llvm::is_contained(ElemTypes, Ty);
}
bool HexagonSubtarget::isHVXVectorType(EVT VecTy, bool IncludeBool) const {
  if (!VecTy.isSimple())
    return false;
  if (!VecTy.isVector() || !useHVXOps() || VecTy.isScalableVector())
    return false;
  MVT ElemTy = VecTy.getSimpleVT().getVectorElementType();
  if (!IncludeBool && ElemTy == MVT::i1)
    return false;

  unsigned HwLen = getVectorLength();
  unsigned NumElems = VecTy.getVectorNumElements();
  ArrayRef<MVT> ElemTypes = getHVXElementTypes();

  if (IncludeBool && ElemTy == MVT::i1) {
    // Boolean HVX vector types are formed from regular HVX vector types
    // by replacing the element type with i1.
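    // For example, with a 128-byte vector length (HwLen == 128), v128i1 is
    // accepted because v128i8 is a regular HVX type: 128 * 8 == 8 * HwLen.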
    for (MVT T : ElemTypes)
      if (NumElems * T.getSizeInBits() == 8 * HwLen)
        return true;
    return false;
  }

  unsigned VecWidth = VecTy.getSizeInBits();
  if (VecWidth != 8 * HwLen && VecWidth != 16 * HwLen)
    return false;
  return llvm::is_contained(ElemTypes, ElemTy);
}
bool HexagonSubtarget::isTypeForHVX(Type *VecTy, bool IncludeBool) const {
  if (!VecTy->isVectorTy() || isa<ScalableVectorType>(VecTy))
    return false;
  // Avoid types like <2 x i32*>.
  Type *ScalTy = VecTy->getScalarType();
  if (!ScalTy->isIntegerTy() &&
      !(ScalTy->isFloatingPointTy() && useHVXFloatingPoint()))
    return false;
  // The given type may be something like <17 x i32>, which is not MVT,
  // but can be represented as (non-simple) EVT.
  EVT Ty = EVT::getEVT(VecTy, /*HandleUnknown*/ false);
  if (!Ty.getVectorElementType().isSimple())
    return false;

  auto isHvxTy = [this, IncludeBool](MVT SimpleTy) {
    if (isHVXVectorType(SimpleTy, IncludeBool))
      return true;
    auto Action = getTargetLowering()->getPreferredVectorAction(SimpleTy);
    return Action == TargetLoweringBase::TypeWidenVector;
  };

  // Round up EVT to have power-of-2 elements, and keep checking if it
  // qualifies for HVX, dividing it in half after each step.
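  // For example, <17 x i32> is first tried as <32 x i32>, then <16 x i32>,
  // <8 x i32>, and so on, until one of the candidates qualifies (or none do).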
  MVT ElemTy = Ty.getVectorElementType().getSimpleVT();
  unsigned VecLen = PowerOf2Ceil(Ty.getVectorNumElements());
  while (VecLen > 1) {
    MVT SimpleTy = MVT::getVectorVT(ElemTy, VecLen);
    if (SimpleTy.isValid() && isHvxTy(SimpleTy))
      return true;
    VecLen /= 2;
  }

  return false;
}
void HexagonSubtarget::UsrOverflowMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    if (!SU.isInstr())
      continue;
    SmallVector<SDep, 4> Erase;
    for (auto &D : SU.Preds)
      if (D.getKind() == SDep::Output && D.getReg() == Hexagon::USR_OVF)
        Erase.push_back(D);
    for (auto &E : Erase)
      SU.removePred(E);
  }
}
void HexagonSubtarget::HVXMemLatencyMutation::apply(ScheduleDAGInstrs *DAG) {
  for (SUnit &SU : DAG->SUnits) {
    // Update the latency of chain edges between v60 vector load or store
    // instructions to be 1. These instructions cannot be scheduled in the
    // same packet.
    MachineInstr &MI1 = *SU.getInstr();
    auto *QII = static_cast<const HexagonInstrInfo *>(DAG->TII);
    bool IsStoreMI1 = MI1.mayStore();
    bool IsLoadMI1 = MI1.mayLoad();
    if (!QII->isHVXVec(MI1) || !(IsStoreMI1 || IsLoadMI1))
      continue;
    for (SDep &SI : SU.Succs) {
      if (SI.getKind() != SDep::Order || SI.getLatency() != 0)
        continue;
      MachineInstr &MI2 = *SI.getSUnit()->getInstr();
      if (!QII->isHVXVec(MI2))
        continue;
      if ((IsStoreMI1 && MI2.mayStore()) || (IsLoadMI1 && MI2.mayLoad())) {
        SI.setLatency(1);
        SU.setHeightDirty();
        // Change the dependence in the opposite direction too.
        for (SDep &PI : SI.getSUnit()->Preds) {
          if (PI.getSUnit() != &SU || PI.getKind() != SDep::Order)
            continue;
          PI.setLatency(1);
          SI.getSUnit()->setDepthDirty();
        }
      }
    }
  }
}
// Check if a call and subsequent A2_tfrpi instructions should maintain
// scheduling affinity. We are looking for the TFRI to be consumed in
// the next instruction. This should help reduce the instances of
// double register pairs being allocated and scheduled before a call
// when not used until after the call. This situation is exacerbated
// by the fact that we allocate the pair from the callee saves list,
// leading to excess spills and restores.
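// An illustrative (hypothetical) sequence: a call followed by "r1:0 = #0"
// (A2_tfrpi) whose result feeds an ALU64-type instruction in the very next
// slot; shouldTFRICallBind() below recognizes exactly this pairing.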
bool HexagonSubtarget::CallMutation::shouldTFRICallBind(
    const HexagonInstrInfo &HII, const SUnit &Inst1,
    const SUnit &Inst2) const {
  if (Inst1.getInstr()->getOpcode() != Hexagon::A2_tfrpi)
    return false;

  // TypeXTYPE are 64 bit operations.
  unsigned Type = HII.getType(*Inst2.getInstr());
  return Type == HexagonII::TypeS_2op || Type == HexagonII::TypeS_3op ||
         Type == HexagonII::TypeALU64 || Type == HexagonII::TypeM;
}
void HexagonSubtarget::CallMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  SUnit *LastSequentialCall = nullptr;
  // Map from virtual register to physical register from the copy.
  DenseMap<unsigned, unsigned> VRegHoldingReg;
  // Map from the physical register to the instruction that uses virtual
  // register. This is used to create the barrier edge.
  DenseMap<unsigned, SUnit *> LastVRegUse;
  auto &TRI = *DAG->MF.getSubtarget().getRegisterInfo();
  auto &HII = *DAG->MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
  // Currently we only catch the situation when a compare gets scheduled
  // before the preceding call.
  for (unsigned su = 0, e = DAG->SUnits.size(); su != e; ++su) {
    // Remember the call.
    if (DAG->SUnits[su].getInstr()->isCall())
      LastSequentialCall = &DAG->SUnits[su];
    // Look for a compare that defines a predicate.
    else if (DAG->SUnits[su].getInstr()->isCompare() && LastSequentialCall)
      DAG->addEdge(&DAG->SUnits[su], SDep(LastSequentialCall, SDep::Barrier));
    // Look for call and tfri* instructions.
    else if (SchedPredsCloser && LastSequentialCall && su > 1 && su < e-1 &&
             shouldTFRICallBind(HII, DAG->SUnits[su], DAG->SUnits[su+1]))
      DAG->addEdge(&DAG->SUnits[su], SDep(&DAG->SUnits[su-1], SDep::Barrier));
    // Prevent redundant register copies due to reads and writes of physical
    // registers. The original motivation for this was the code generated
    // between two calls, which is caused by both the return value and the
    // argument for the next call being in %r0.
    // Example:
    //   1: <call1>
    //   2: %vregX = COPY %r0
    //   3: <use of %vregX>
    //   4: %r0 = ...
    //   5: <call2>
    // The scheduler would often swap 3 and 4, so an additional register is
    // needed. This code inserts a Barrier dependence between 3 & 4 to prevent
    // this.
    // The code below checks for all the physical registers, not just R0/D0/V0.
    else if (SchedRetvalOptimization) {
      const MachineInstr *MI = DAG->SUnits[su].getInstr();
      if (MI->isCopy() && MI->getOperand(1).getReg().isPhysical()) {
        // %vregX = COPY %r0
        VRegHoldingReg[MI->getOperand(0).getReg()] = MI->getOperand(1).getReg();
        LastVRegUse.erase(MI->getOperand(1).getReg());
      } else {
        for (const MachineOperand &MO : MI->operands()) {
          if (!MO.isReg())
            continue;
          if (MO.isUse() && !MI->isCopy() &&
              VRegHoldingReg.count(MO.getReg())) {
            // <use of %vregX>
            LastVRegUse[VRegHoldingReg[MO.getReg()]] = &DAG->SUnits[su];
          } else if (MO.isDef() && MO.getReg().isPhysical()) {
            for (MCRegAliasIterator AI(MO.getReg(), &TRI, true); AI.isValid();
                 ++AI) {
              if (LastVRegUse.count(*AI) &&
                  LastVRegUse[*AI] != &DAG->SUnits[su])
                // %r0 = ...
                DAG->addEdge(&DAG->SUnits[su],
                             SDep(LastVRegUse[*AI], SDep::Barrier));
              LastVRegUse.erase(*AI);
            }
          }
        }
      }
    }
  }
}
void HexagonSubtarget::BankConflictMutation::apply(ScheduleDAGInstrs *DAG) {
  if (!EnableCheckBankConflict)
    return;

  const auto &HII = static_cast<const HexagonInstrInfo &>(*DAG->TII);
  // Create artificial edges between loads that could likely cause a bank
  // conflict. Since such loads would normally not have any dependency
  // between them, we cannot rely on existing edges.
  for (unsigned i = 0, e = DAG->SUnits.size(); i != e; ++i) {
    SUnit &S0 = DAG->SUnits[i];
    MachineInstr &L0 = *S0.getInstr();
    if (!L0.mayLoad() || L0.mayStore() ||
        HII.getAddrMode(L0) != HexagonII::BaseImmOffset)
      continue;
    int64_t Offset0;
    LocationSize Size0 = 0;
    MachineOperand *BaseOp0 = HII.getBaseAndOffset(L0, Offset0, Size0);
    // If the access size is longer than the L1 cache line, skip the check.
    if (BaseOp0 == nullptr || !BaseOp0->isReg() || !Size0.hasValue() ||
        Size0.getValue() >= 32)
      continue;
    // Scan only up to 32 instructions ahead (to avoid n^2 complexity).
    for (unsigned j = i+1, m = std::min(i+32, e); j != m; ++j) {
      SUnit &S1 = DAG->SUnits[j];
      MachineInstr &L1 = *S1.getInstr();
      if (!L1.mayLoad() || L1.mayStore() ||
          HII.getAddrMode(L1) != HexagonII::BaseImmOffset)
        continue;
      int64_t Offset1;
      LocationSize Size1 = 0;
      MachineOperand *BaseOp1 = HII.getBaseAndOffset(L1, Offset1, Size1);
      if (BaseOp1 == nullptr || !BaseOp1->isReg() || !Size1.hasValue() ||
          Size1.getValue() >= 32 || BaseOp0->getReg() != BaseOp1->getReg())
        continue;
      // Check bits 3 and 4 of the offset: if they differ, a bank conflict
      // is unlikely.
      if (((Offset0 ^ Offset1) & 0x18) != 0)
        continue;
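      // For example, offsets 0x0 and 0x8 differ in bit 3 and are skipped
      // above, since they likely hit different banks; offsets 0x0 and 0x20
      // agree in bits 3-4, so the artificial edge below is added.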
      // Bits 3 and 4 are the same, add an artificial edge and set extra
      // latency on the edge.
      SDep A(&S0, SDep::Artificial);
      A.setLatency(1);
      S1.addPred(A, true);
    }
  }
}
/// Enable use of alias analysis during code generation (during MI
/// scheduling, DAGCombine, etc.).
bool HexagonSubtarget::useAA() const {
  if (OptLevel != CodeGenOptLevel::None)
    return true;
  return false;
}
/// Perform target specific adjustments to the latency of a schedule
/// dependency.
void HexagonSubtarget::adjustSchedDependency(
    SUnit *Src, int SrcOpIdx, SUnit *Dst, int DstOpIdx, SDep &Dep,
    const TargetSchedModel *SchedModel) const {
  if (!Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcInst = Src->getInstr();
  MachineInstr *DstInst = Dst->getInstr();
  const HexagonInstrInfo *QII = getInstrInfo();
  // Instructions with .new operands have zero latency.
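  // For example, when a producer and its consumer can be placed in the same
  // packet and the consumer can read the value through a .new operand, the
  // def-use edge is given a latency of 0 below.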
  SmallSet<SUnit *, 4> ExclSrc;
  SmallSet<SUnit *, 4> ExclDst;
  if (QII->canExecuteInBundle(*SrcInst, *DstInst) &&
      isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) {
    Dep.setLatency(0);
    return;
  }
  // Set the latency for a copy to zero since we hope that it will get
  // removed.
  if (DstInst->isCopy())
    Dep.setLatency(0);

  // If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
  // the correct latency.
  // If there are multiple uses of the def of COPY/REG_SEQUENCE, set the latency
  // only if the latencies on all the uses are equal, otherwise set it to
  // default.
  if ((DstInst->isRegSequence() || DstInst->isCopy())) {
    Register DReg = DstInst->getOperand(0).getReg();
    std::optional<unsigned> DLatency;
    for (const auto &DDep : Dst->Succs) {
      MachineInstr *DDst = DDep.getSUnit()->getInstr();
      int UseIdx = -1;
      for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) {
        const MachineOperand &MO = DDst->getOperand(OpNum);
        if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) {
          UseIdx = OpNum;
          break;
        }
      }

      if (UseIdx == -1)
        continue;

      std::optional<unsigned> Latency =
          InstrInfo.getOperandLatency(&InstrItins, *SrcInst, 0, *DDst, UseIdx);

      // Set DLatency for the first time.
      if (!DLatency)
        DLatency = Latency;

      // For multiple uses, if the Latency is different across uses, reset
      // DLatency.
      if (DLatency != Latency) {
        DLatency = std::nullopt;
        break;
      }
    }
    Dep.setLatency(DLatency.value_or(0));
  }
  // Try to schedule uses near definitions to generate .cur.
  ExclSrc.clear();
  ExclDst.clear();
  if (EnableDotCurSched && QII->isToBeScheduledASAP(*SrcInst, *DstInst) &&
      isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) {
    Dep.setLatency(0);
    return;
  }

  int Latency = Dep.getLatency();
  bool IsArtificial = Dep.isArtificial();
  Latency = updateLatency(*SrcInst, *DstInst, IsArtificial, Latency);
  Dep.setLatency(Latency);
}
void HexagonSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<UsrOverflowMutation>());
  Mutations.push_back(std::make_unique<HVXMemLatencyMutation>());
  Mutations.push_back(std::make_unique<BankConflictMutation>());
}
void HexagonSubtarget::getSMSMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<UsrOverflowMutation>());
  Mutations.push_back(std::make_unique<HVXMemLatencyMutation>());
}
// Pin the vtable to this file.
void HexagonSubtarget::anchor() {}
bool HexagonSubtarget::enableMachineScheduler() const {
  if (DisableHexagonMISched.getNumOccurrences())
    return !DisableHexagonMISched;
  return true;
}

bool HexagonSubtarget::usePredicatedCalls() const {
  return EnablePredicatedCalls;
}
int HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
                                    MachineInstr &DstInst, bool IsArtificial,
                                    int Latency) const {
  if (IsArtificial)
    return 1;
  if (!hasV60Ops())
    return Latency;

  auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo());
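  // Halve the latency (rounding up) for HVX instructions and for BSB
  // scheduling; e.g. a raw latency of 3 becomes 2, and 1 stays 1.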
  if (QII.isHVXVec(SrcInst) || useBSBScheduling())
    Latency = (Latency + 1) >> 1;
  return Latency;
}
void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
  MachineInstr *SrcI = Src->getInstr();
  for (auto &I : Src->Succs) {
    if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
      continue;
    Register DepR = I.getReg();
    int DefIdx = -1;
    for (unsigned OpNum = 0; OpNum < SrcI->getNumOperands(); OpNum++) {
      const MachineOperand &MO = SrcI->getOperand(OpNum);
      bool IsSameOrSubReg = false;
      if (MO.isReg()) {
        Register MOReg = MO.getReg();
        if (DepR.isVirtual()) {
          IsSameOrSubReg = (MOReg == DepR);
        } else {
          IsSameOrSubReg = getRegisterInfo()->isSubRegisterEq(DepR, MOReg);
        }
        if (MO.isDef() && IsSameOrSubReg)
          DefIdx = OpNum;
      }
    }
    assert(DefIdx >= 0 && "Def Reg not found in Src MI");
    MachineInstr *DstI = Dst->getInstr();
    SDep T = I;
    for (unsigned OpNum = 0; OpNum < DstI->getNumOperands(); OpNum++) {
      const MachineOperand &MO = DstI->getOperand(OpNum);
      if (MO.isReg() && MO.isUse() && MO.getReg() == DepR) {
        std::optional<unsigned> Latency = InstrInfo.getOperandLatency(
            &InstrItins, *SrcI, DefIdx, *DstI, OpNum);

        // For some instructions (ex: COPY), we might end up with < 0 latency
        // as they don't have any Itinerary class associated with them.
        if (!Latency)
          Latency = 0;
        bool IsArtificial = I.isArtificial();
        Latency = updateLatency(*SrcI, *DstI, IsArtificial, *Latency);
        I.setLatency(*Latency);
      }
    }

    // Update the latency of opposite edge too.
    T.setSUnit(Src);
    auto F = find(Dst->Preds, T);
    assert(F != Dst->Preds.end());
    F->setLatency(I.getLatency());
  }
}
/// Change the latency between the two SUnits.
void HexagonSubtarget::changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat)
      const {
  for (auto &I : Src->Succs) {
    if (!I.isAssignedRegDep() || I.getSUnit() != Dst)
      continue;
    SDep T = I;
    I.setLatency(Lat);

    // Update the latency of opposite edge too.
    T.setSUnit(Src);
    auto F = find(Dst->Preds, T);
    assert(F != Dst->Preds.end());
    F->setLatency(Lat);
  }
}
/// If the SUnit has a zero latency edge, return the other SUnit.
static SUnit *getZeroLatency(SUnit *N, SmallVector<SDep, 4> &Deps) {
  for (auto &I : Deps)
    if (I.isAssignedRegDep() && I.getLatency() == 0 &&
        !I.getSUnit()->getInstr()->isPseudo())
      return I.getSUnit();
  return nullptr;
}
// Return true if these are the best two instructions to schedule
// together with a zero latency. Only one dependence should have a zero
// latency. If there are multiple choices, choose the best, and change
// the others, if needed.
bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst,
      const HexagonInstrInfo *TII, SmallSet<SUnit*, 4> &ExclSrc,
      SmallSet<SUnit*, 4> &ExclDst) const {
  MachineInstr &SrcInst = *Src->getInstr();
  MachineInstr &DstInst = *Dst->getInstr();

  // Ignore Boundary SU nodes as these have null instructions.
  if (Dst->isBoundaryNode())
    return false;

  if (SrcInst.isPHI() || DstInst.isPHI())
    return false;

  if (!TII->isToBeScheduledASAP(SrcInst, DstInst) &&
      !TII->canExecuteInBundle(SrcInst, DstInst))
    return false;

  // The architecture doesn't allow three dependent instructions in the same
  // packet. So, if the destination has a zero latency successor, then it's
  // not a candidate for a zero latency predecessor.
  if (getZeroLatency(Dst, Dst->Succs) != nullptr)
    return false;

  // Check if the Dst instruction is the best candidate first.
  SUnit *Best = nullptr;
  SUnit *DstBest = nullptr;
  SUnit *SrcBest = getZeroLatency(Dst, Dst->Preds);
  if (SrcBest == nullptr || Src->NodeNum >= SrcBest->NodeNum) {
    // Check that Src doesn't have a better candidate.
    DstBest = getZeroLatency(Src, Src->Succs);
    if (DstBest == nullptr || Dst->NodeNum <= DstBest->NodeNum)
      Best = Dst;
  }
  if (Best != Dst)
    return false;

  // The caller frequently adds the same dependence twice. If so, then
  // return true for this case too.
  if ((Src == SrcBest && Dst == DstBest) ||
      (SrcBest == nullptr && Dst == DstBest) ||
      (Src == SrcBest && Dst == nullptr))
    return true;
  // Reassign the latency for the previous bests, which requires setting
  // the dependence edge in both directions.
  if (SrcBest != nullptr) {
    if (!hasV60Ops())
      changeLatency(SrcBest, Dst, 1);
    else
      restoreLatency(SrcBest, Dst);
  }
  if (DstBest != nullptr) {
    if (!hasV60Ops())
      changeLatency(Src, DstBest, 1);
    else
      restoreLatency(Src, DstBest);
  }

  // Attempt to find another opportunity for zero latency in a different
  // dependence.
  if (SrcBest && DstBest)
    // If there is an edge from SrcBest to DstBest, then try to change that
    // to 0 now.
    changeLatency(SrcBest, DstBest, 0);
  else if (DstBest) {
    // Check if the previous best destination instruction has a new zero
    // latency dependence opportunity.
    ExclSrc.insert(Src);
    for (auto &I : DstBest->Preds)
      if (ExclSrc.count(I.getSUnit()) == 0 &&
          isBestZeroLatency(I.getSUnit(), DstBest, TII, ExclSrc, ExclDst))
        changeLatency(I.getSUnit(), DstBest, 0);
  } else if (SrcBest) {
    // Check if previous best source instruction has a new zero latency
    // dependence opportunity.
    ExclDst.insert(Dst);
    for (auto &I : SrcBest->Succs)
      if (ExclDst.count(I.getSUnit()) == 0 &&
          isBestZeroLatency(SrcBest, I.getSUnit(), TII, ExclSrc, ExclDst))
        changeLatency(SrcBest, I.getSUnit(), 0);
  }

  return true;
}
unsigned HexagonSubtarget::getL1CacheLineSize() const {
  return 32;
}

unsigned HexagonSubtarget::getL1PrefetchDistance() const {
  return 32;
}

bool HexagonSubtarget::enableSubRegLiveness() const { return true; }
Intrinsic::ID HexagonSubtarget::getIntrinsicId(unsigned Opc) const {
  struct Scalar {
    unsigned Opcode;
    Intrinsic::ID IntId;
  };
  struct Hvx {
    unsigned Opcode;
    Intrinsic::ID Int64Id, Int128Id;
  };

  static Scalar ScalarInts[] = {
#define GET_SCALAR_INTRINSICS
#include "HexagonDepInstrIntrinsics.inc"
#undef GET_SCALAR_INTRINSICS
  };

  static Hvx HvxInts[] = {
#define GET_HVX_INTRINSICS
#include "HexagonDepInstrIntrinsics.inc"
#undef GET_HVX_INTRINSICS
  };

  const auto CmpOpcode = [](auto A, auto B) { return A.Opcode < B.Opcode; };
  [[maybe_unused]] static bool SortedScalar =
      (llvm::sort(ScalarInts, CmpOpcode), true);
  [[maybe_unused]] static bool SortedHvx =
      (llvm::sort(HvxInts, CmpOpcode), true);

  auto [BS, ES] = std::make_pair(std::begin(ScalarInts), std::end(ScalarInts));
  auto [BH, EH] = std::make_pair(std::begin(HvxInts), std::end(HvxInts));

  auto FoundScalar = std::lower_bound(BS, ES, Scalar{Opc, 0}, CmpOpcode);
  if (FoundScalar != ES && FoundScalar->Opcode == Opc)
    return FoundScalar->IntId;

  auto FoundHvx = std::lower_bound(BH, EH, Hvx{Opc, 0, 0}, CmpOpcode);
  if (FoundHvx != EH && FoundHvx->Opcode == Opc) {
    unsigned HwLen = getVectorLength();
    if (HwLen == 64)
      return FoundHvx->Int64Id;
    if (HwLen == 128)
      return FoundHvx->Int128Id;
  }

  std::string error = "Invalid opcode (" + std::to_string(Opc) + ")";
  llvm_unreachable(error.c_str());
}