//===- ARMLatencyMutations.cpp - ARM Latency Mutations -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This file contains the ARM DAG scheduling mutations which change
/// inter-instruction latencies.
//
//===----------------------------------------------------------------------===//

#include "ARMLatencyMutations.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include <algorithm>
#include <array>
#include <initializer_list>
#include <memory>

namespace llvm {

namespace {

// Precompute information about opcodes to speed up the pass

class InstructionInformation {
protected:
  struct IInfo {
    bool HasBRegAddr : 1;      // B-side of addr gen is a register
    bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
    bool IsDivide : 1;         // Some form of integer divide
    bool IsInlineShiftALU : 1; // Inline shift+ALU
    bool IsMultiply : 1;       // Some form of integer multiply
    bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
    bool IsNonSubwordLoad : 1; // Load which is a word or larger
    bool IsShift : 1;          // Shift operation
    bool IsRev : 1;            // REV operation
    bool ProducesQP : 1;       // Produces a vector register result
    bool ProducesDP : 1;       // Produces a double-precision register result
    bool ProducesSP : 1;       // Produces a single-precision register result
    bool ConsumesQP : 1;       // Consumes a vector register result
    bool ConsumesDP : 1;       // Consumes a double-precision register result
    bool ConsumesSP : 1;       // Consumes a single-precision register result
    unsigned MVEIntMACMatched; // Matched operand type (for MVE)
    unsigned AddressOpMask;    // Mask indicating which operands go into AGU
    IInfo()
        : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
          IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
          IsNonSubwordLoad(false), IsShift(false), IsRev(false),
          ProducesQP(false), ProducesDP(false), ProducesSP(false),
          ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
          MVEIntMACMatched(0), AddressOpMask(0) {}
  };
  typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
  IInfoArray Info;

public:
  // Always available information
  unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
  bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
  bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
  bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
  bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
  bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
  bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
  bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
  bool isRev(unsigned Op) { return Info[Op].IsRev; }
  bool isShift(unsigned Op) { return Info[Op].IsShift; }

  // Information below is only valid after markDPProducersConsumers has been
  // called (the Cortex-M85 variant does this in its constructor).
  bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
  bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
  bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
  bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
  bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
  bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }

  bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
    return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
  }

  InstructionInformation(const ARMBaseInstrInfo *TII);

protected:
  void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
};

InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  using namespace ARM;

  std::initializer_list<unsigned> hasBRegAddrList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
      tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
  };
  for (auto op : hasBRegAddrList) {
    Info[op].HasBRegAddr = true;
  }

  std::initializer_list<unsigned> hasBRegAddrShiftList = {
      t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  };
  for (auto op : hasBRegAddrShiftList) {
    Info[op].HasBRegAddrShift = true;
  }

  Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;

  std::initializer_list<unsigned> isInlineShiftALUList = {
      t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
      t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
      t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
  };
  for (auto op : isInlineShiftALUList) {
    Info[op].IsInlineShiftALU = true;
  }

  std::initializer_list<unsigned> isMultiplyList = {
      t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
      t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
      t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
      t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
      t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
      t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
  };
  for (auto op : isMultiplyList) {
    Info[op].IsMultiply = true;
  }

  std::initializer_list<unsigned> isMVEIntMACList = {
      MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
      MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
      MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
      MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
      MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
      MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
      MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
      MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
      MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
      MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
      MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
      MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
      MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
      MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
  };
  for (auto op : isMVEIntMACList) {
    Info[op].IsMVEIntMAC = true;
  }

  std::initializer_list<unsigned> isNonSubwordLoadList = {
      t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
      t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
      tLDRpci,  tLDRr,    tLDRspi,
  };
  for (auto op : isNonSubwordLoadList) {
    Info[op].IsNonSubwordLoad = true;
  }

  std::initializer_list<unsigned> isRevList = {
      t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
  };
  for (auto op : isRevList) {
    Info[op].IsRev = true;
  }

  std::initializer_list<unsigned> isShiftList = {
      t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
      tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
  };
  for (auto op : isShiftList) {
    Info[op].IsShift = true;
  }

  std::initializer_list<unsigned> Address1List = {

  std::initializer_list<unsigned> Address2List = {

  std::initializer_list<unsigned> Address3List = {
  // Compute a mask of which operands are involved in address computation
  for (auto &op : Address1List) {
    Info[op].AddressOpMask = 0x6;
  }
  for (auto &op : Address2List) {
    Info[op].AddressOpMask = 0xc;
  }
  for (auto &op : Address3List) {
    Info[op].AddressOpMask = 0x18;
  }
  for (auto &op : hasBRegAddrShiftList) {
    Info[op].AddressOpMask |= 0x8;
  }
}
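
// A note on the AddressOpMask encoding above: bit N set means that machine
// operand N of the instruction feeds the address-generation unit, so 0x6
// covers operands 1-2, 0xc operands 2-3, and 0x18 operands 3-4.
// CortexM7Overrides::modifyBypasses below shifts the mask right by one and
// walks operand indices starting at 1 (skipping non-register operands) when
// restricting multiply->address-generation bypasses.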

void InstructionInformation::markDPProducersConsumers(
    const ARMBaseInstrInfo *TII) {
  // Learn about all instructions which have FP source/dest registers
  for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
    const MCInstrDesc &MID = TII->get(MI);
    auto Operands = MID.operands();
    for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
      bool MarkQP = false, MarkDP = false, MarkSP = false;
      switch (Operands[OI].RegClass) {
      case ARM::MQPRRegClassID:
      case ARM::DPRRegClassID:
      case ARM::DPR_8RegClassID:
      case ARM::DPR_VFP2RegClassID:
      case ARM::DPairRegClassID:
      case ARM::DPairSpcRegClassID:
      case ARM::DQuadRegClassID:
      case ARM::DQuadSpcRegClassID:
      case ARM::DTripleRegClassID:
      case ARM::DTripleSpcRegClassID:
        MarkDP = true;
        break;
      case ARM::QPRRegClassID:
      case ARM::QPR_8RegClassID:
      case ARM::QPR_VFP2RegClassID:
      case ARM::QQPRRegClassID:
      case ARM::QQQQPRRegClassID:
        MarkQP = true;
        break;
      case ARM::SPRRegClassID:
      case ARM::SPR_8RegClassID:
      case ARM::FPWithVPRRegClassID:
        MarkSP = true;
        break;
      default:
        break;
      }
      if (MarkQP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesQP = true;
        else
          Info[MI].ConsumesQP = true;
      }
      if (MarkDP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesDP = true;
        else
          Info[MI].ConsumesDP = true;
      }
      if (MarkSP) {
        if (OI < MID.getNumDefs())
          Info[MI].ProducesSP = true;
        else
          Info[MI].ConsumesSP = true;
      }
    }
  }
}

} // anonymous namespace

static bool hasImplicitCPSRUse(const MachineInstr *MI) {
  return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
}

void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
                                            unsigned latency) {
  SDep Reverse = SrcDep;
  Reverse.setSUnit(&SrcSU);
  for (SDep &PDep : SrcDep.getSUnit()->Preds) {
    if (PDep == Reverse) {
      PDep.setLatency(latency);
      SrcDep.getSUnit()->setDepthDirty();
      break;
    }
  }
  SrcDep.setLatency(latency);
  SrcSU.setHeightDirty();
}
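
// Each successor edge in a unit's Succs list has a mirrored copy in the
// successor's Preds list; setBidirLatencies above updates both copies and
// marks the affected depth/height values dirty so later critical-path
// queries see the overridden latency.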

static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
  return (a & 0xe) != (b & 0xe);
}

// Set output dependences to zero latency for processors which can
// simultaneously issue to the same register. Returns true if a change
// was made.
bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
  if (Dep.getKind() == SDep::Output) {
    setBidirLatencies(ISU, Dep, 0);
    return true;
  }
  return false;
}

// The graph doesn't look inside of bundles to determine their
// scheduling boundaries and reports zero latency into and out of them
// (except for CPSR into the bundle, which has latency 1).
// Make some better scheduling assumptions:
// 1) CPSR uses have zero latency; other uses have incoming latency 1.
// 2) CPSR defs retain a latency of zero; others have a latency of 1.
//
// Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {

  SUnit &DepSU = *Dep.getSUnit();
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  const MachineInstr *DstMI = DepSU.getInstr();
  unsigned DstOpcode = DstMI->getOpcode();

  if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
    setBidirLatencies(
        ISU, Dep,
        (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
    return 1;
  }
  if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
      Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
    setBidirLatencies(ISU, Dep, 1);
    return 2;
  }
  return 0;
}

// Determine whether there is a memory RAW hazard here and set up latency
// accordingly.
bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
                                          unsigned latency) {
  if (!Dep.isNormalMemory())
    return false;
  auto &SrcInst = *ISU.getInstr();
  auto &DstInst = *Dep.getSUnit()->getInstr();
  if (!SrcInst.mayStore() || !DstInst.mayLoad())
    return false;

  auto SrcMO = *SrcInst.memoperands().begin();
  auto DstMO = *DstInst.memoperands().begin();
  auto SrcVal = SrcMO->getValue();
  auto DstVal = DstMO->getValue();
  auto SrcPseudoVal = SrcMO->getPseudoValue();
  auto DstPseudoVal = DstMO->getPseudoValue();
  if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
      SrcMO->getOffset() == DstMO->getOffset()) {
    setBidirLatencies(ISU, Dep, latency);
    return true;
  } else if (SrcPseudoVal && DstPseudoVal &&
             SrcPseudoVal->kind() == DstPseudoVal->kind() &&
             SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
    // Spills/fills
    auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
    auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
    if (FS0->getFrameIndex() == FS1->getFrameIndex()) {
      setBidirLatencies(ISU, Dep, latency);
      return true;
    }
  }
  return false;
}

namespace {

std::unique_ptr<InstructionInformation> II;
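
// The shared table above is built lazily, at most once, by the first
// subtarget-specific override object constructed (see the "if (!II)" checks
// in the constructors below), so the per-opcode scan is not repeated for
// every scheduling region.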

class CortexM7InstructionInformation : public InstructionInformation {
public:
  CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
      : InstructionInformation(TII) {}
};

class CortexM7Overrides : public ARMOverrideBypasses {
public:
  CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {
    if (!II)
      II.reset(new CortexM7InstructionInformation(TII));
  }

  void modifyBypasses(SUnit &) override;
};

void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as M7 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 4))
      continue;

    // Ignore dependencies other than data
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into any multiply or divide instruction are considered
    // unable to bypass their scheduling stage. We didn't do this in the .td
    // file because we cannot easily create a read advance that is 0 from
    // certain writer classes and 1 from all the rest.
    // (The other way around would have been easy.)
    if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Word loads into the B operand of a load/store are considered unable to
    // bypass their scheduling stage. This cannot be done in the .td file
    // because we need to decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddr(DstOpcode) &&
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    // Multiplies into any address generation cannot bypass from EX3. This
    // cannot be done in the .td file because we need to decide between -1
    // and -2 for ReadAdvance.
    if (II->isMultiply(SrcOpcode)) {
      unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
      for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
        if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
            DstMI->getOperand(i).getReg() == Dep.getReg()) {
          setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
          break;
        }
      }
    }

    // Mismatched conditional producers take longer on M7; they end up looking
    // like they were produced at EX3 and read at IS.
    if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
        (SrcOpcode == ARM::BUNDLE ||
         mismatchedPred(TII->getPredicate(*SrcMI),
                        TII->getPredicate(*DstMI)))) {
      unsigned Lat = 1;
      // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
      if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
          DstMI->getOperand(1).getReg() == Dep.getReg())
        Lat = 2;
      Lat = std::min(3u, Dep.getLatency() + Lat);
      setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
    }

    // CC setter into conditional producer shouldn't have a latency of more
    // than 1 unless it's due to an implicit read. (All the "true" readers
    // of the condition code use an implicit read, and predicates use an
    // explicit one.)
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 1);

    // REV instructions cannot bypass directly into the EX1 shifter. The
    // code is slightly inexact as it doesn't attempt to ensure that the
    // bypass is to the shifter operands.
    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 2);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

class M85InstructionInformation : public InstructionInformation {
public:
  M85InstructionInformation(const ARMBaseInstrInfo *t)
      : InstructionInformation(t) {
    markDPProducersConsumers(t);
  }
};

class M85Overrides : public ARMOverrideBypasses {
public:
  M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
      : ARMOverrideBypasses(t, a) {
    if (!II)
      II.reset(new M85InstructionInformation(t));
  }

  void modifyBypasses(SUnit &) override;

private:
  unsigned computeBypassStage(const MCSchedClassDesc *SCD);
  signed modifyMixedWidthFP(const MachineInstr *SrcMI,
                            const MachineInstr *DstMI, unsigned RegID,
                            const MCSchedClassDesc *SCD);
};

unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
  auto SM = DAG->getSchedModel();
  unsigned DefIdx = 0; // just look for the first output's timing
  if (DefIdx < SCDesc->NumWriteLatencyEntries) {
    // Lookup the definition's write latency in SubtargetInfo.
    const MCWriteLatencyEntry *WLEntry =
        SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
    unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
    if (Latency == 4)
      return 2;
    else if (Latency == 5)
      return 3;
    else if (Latency > 3)
      return 3;
    else
      return 2;
  }
  return 2;
}
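
// The value returned above is used as the pipeline stage at which the
// producer's result becomes available for forwarding; modifyMixedWidthFP
// below uses "5 - computeBypassStage(...)" as the latency adjustment for a
// consumer that reads the value at a different FP register width.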

// Latency changes for bypassing between FP registers of different sizes:
//
// Note that mixed DP/SP are unlikely because of the semantics
// of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
                                        const MachineInstr *DstMI,
                                        unsigned RegID,
                                        const MCSchedClassDesc *SCD) {

  if (!II->producesSP(SrcMI->getOpcode()) &&
      !II->producesDP(SrcMI->getOpcode()) &&
      !II->producesQP(SrcMI->getOpcode()))
    return 0;

  if (Register::isVirtualRegister(RegID)) {
    if (II->producesSP(SrcMI->getOpcode()) &&
        II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) -
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
          return 5 - computeBypassStage(SCD) +
                 ((OP.getSubReg() == ARM::ssub_2 ||
                   OP.getSubReg() == ARM::ssub_3)
                      ? 1
                      : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      for (auto &OP : DstMI->operands())
        if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
            OP.getSubReg() == ARM::ssub_1)
          return 1;
    }
)) {
806 // Note that when the producer is narrower, not all of the producers
807 // may be present in the scheduling graph; somewhere earlier in the
808 // compiler, an implicit def/use of the aliased full register gets
809 // added to the producer, and so only that producer is seen as *the*
810 // single producer. This behavior also has the unfortunate effect of
811 // serializing the producers in the compiler's view of things.
812 if (II
->producesSP(SrcMI
->getOpcode()) &&
813 II
->consumesDP(DstMI
->getOpcode())) {
814 for (auto &OP
: SrcMI
->operands())
815 if (OP
.isReg() && OP
.isDef() && OP
.getReg() >= ARM::S1
&&
816 OP
.getReg() <= ARM::S31
&& (OP
.getReg() - ARM::S0
) % 2 &&
817 (OP
.getReg() == RegID
||
818 (OP
.getReg() - ARM::S0
) / 2 + ARM::D0
== RegID
||
819 (OP
.getReg() - ARM::S0
) / 4 + ARM::Q0
== RegID
))
820 return 5 - computeBypassStage(SCD
);
    } else if (II->producesSP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
            OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
             (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
          return 5 - computeBypassStage(SCD) -
                 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesQP(DstMI->getOpcode())) {
      for (auto &OP : SrcMI->operands())
        if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
            OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
            (OP.getReg() == RegID ||
             (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
          return -1;
    } else if (II->producesDP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesSP(DstMI->getOpcode())) {
      if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
        return 5 - computeBypassStage(SCD) +
               (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
    } else if (II->producesQP(SrcMI->getOpcode()) &&
               II->consumesDP(DstMI->getOpcode())) {
      if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
        return 1;
    }
  }
  return 0;
}
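
// The result of modifyMixedWidthFP is a signed latency adjustment: positive
// values lengthen the dependence, negative values shorten it, and 0 leaves
// the scheduling model's latency untouched. M85Overrides::modifyBypasses
// below adds the adjustment to the existing edge latency, clamping at zero.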

void M85Overrides::modifyBypasses(SUnit &ISU) {
  const MachineInstr *SrcMI = ISU.getInstr();
  unsigned SrcOpcode = SrcMI->getOpcode();
  bool isNSWload = II->isNonSubwordLoad(SrcOpcode);

  // Walk the successors looking for latency overrides that are needed
  for (SDep &Dep : ISU.Succs) {

    // Output dependences should have 0 latency, as CortexM85 is able to
    // schedule writers to the same register for simultaneous issue.
    if (zeroOutputDependences(ISU, Dep))
      continue;

    if (memoryRAWHazard(ISU, Dep, 3))
      continue;

    // Ignore dependencies other than data or strong ordering.
    if (Dep.getKind() != SDep::Data)
      continue;

    SUnit &DepSU = *Dep.getSUnit();
    if (DepSU.isBoundaryNode())
      continue;

    if (makeBundleAssumptions(ISU, Dep) == 1)
      continue;

    const MachineInstr *DstMI = DepSU.getInstr();
    unsigned DstOpcode = DstMI->getOpcode();

    // Word loads into the B operand of a load/store with a shifted index
    // cannot bypass their scheduling stage. This cannot be done in the .td
    // file because we need to decide between -1 and -2 for ReadAdvance.
    if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
        DstMI->getOperand(3).getImm() != 0 && // shift operand
        DstMI->getOperand(2).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);

    if (isNSWload && isMVEVectorInstruction(DstMI)) {
      setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
    }

    if (II->isMVEIntMAC(DstOpcode) &&
        II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
        DstMI->getOperand(0).isReg() &&
        DstMI->getOperand(0).getReg() == Dep.getReg())
      setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);

    // CC setter into conditional producer shouldn't have a latency of more
    // than 0 unless it's due to an implicit read.
    if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
        TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
      setBidirLatencies(ISU, Dep, 0);

    if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
                                         DAG->getSchedClass(&ISU)))
      setBidirLatencies(ISU, Dep,
                        std::max(0, signed(Dep.getLatency()) + ALat));

    if (II->isRev(SrcOpcode)) {
      if (II->isInlineShiftALU(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
      else if (II->isShift(DstOpcode))
        setBidirLatencies(ISU, Dep, 1);
    }
  }
}

// Add M55-specific overrides for latencies between instructions. Currently it:
//  - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
class CortexM55Overrides : public ARMOverrideBypasses {
public:
  CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
      : ARMOverrideBypasses(TII, AA) {}

  void modifyBypasses(SUnit &SU) override {
    MachineInstr *SrcMI = SU.getInstr();
    if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
      return;

    for (SDep &Dep : SU.Succs) {
      if (Dep.getKind() != SDep::Data)
        continue;
      SUnit &DepSU = *Dep.getSUnit();
      if (DepSU.isBoundaryNode())
        continue;
      MachineInstr *DstMI = DepSU.getInstr();

      if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
        setBidirLatencies(SU, Dep, 3);
    }
  }
};

} // end anonymous namespace

void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
  DAG = DAGInstrs;
  for (SUnit &ISU : DAGInstrs->SUnits) {
    if (ISU.isBoundaryNode())
      continue;
    modifyBypasses(ISU);
  }
  if (DAGInstrs->ExitSU.getInstr())
    modifyBypasses(DAGInstrs->ExitSU);
}

std::unique_ptr<ScheduleDAGMutation>
createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
  if (ST.isCortexM85())
    return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM7())
    return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
  else if (ST.isCortexM55())
    return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
  return nullptr;
}

} // end namespace llvm