Add gfx950 mfma instructions to ROCDL dialect (#123361)
[llvm-project.git] / llvm / lib / Target / ARM / ARMLatencyMutations.cpp
blob85bad4f1925a43b9744a178bbf23e6cdddb446aa
1 //===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file contains the ARM definition DAG scheduling mutations which
10 /// change inter-instruction latencies
12 //===----------------------------------------------------------------------===//
14 #include "ARMLatencyMutations.h"
15 #include "ARMSubtarget.h"
16 #include "Thumb2InstrInfo.h"
17 #include "llvm/Analysis/AliasAnalysis.h"
18 #include "llvm/CodeGen/ScheduleDAG.h"
19 #include "llvm/CodeGen/ScheduleDAGMutation.h"
20 #include "llvm/CodeGen/TargetInstrInfo.h"
21 #include <algorithm>
22 #include <array>
23 #include <initializer_list>
24 #include <memory>
26 namespace llvm {
28 namespace {
30 // Precompute information about opcodes to speed up pass
32 class InstructionInformation {
33 protected:
34 struct IInfo {
35 bool HasBRegAddr : 1; // B-side of addr gen is a register
36 bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
37 bool IsDivide : 1; // Some form of integer divide
38 bool IsInlineShiftALU : 1; // Inline shift+ALU
39 bool IsMultiply : 1; // Some form of integer multiply
40 bool IsMVEIntMAC : 1; // MVE 8/16/32-bit integer MAC operation
41 bool IsNonSubwordLoad : 1; // Load which is a word or larger
42 bool IsShift : 1; // Shift operation
43 bool IsRev : 1; // REV operation
44 bool ProducesQP : 1; // Produces a vector register result
45 bool ProducesDP : 1; // Produces a double-precision register result
46 bool ProducesSP : 1; // Produces a single-precision register result
47 bool ConsumesQP : 1; // Consumes a vector register result
48 bool ConsumesDP : 1; // Consumes a double-precision register result
49 bool ConsumesSP : 1; // Consumes a single-precision register result
50 unsigned MVEIntMACMatched; // Matched operand type (for MVE)
51 unsigned AddressOpMask; // Mask indicating which operands go into AGU
52 IInfo()
53 : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
54 IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
55 IsNonSubwordLoad(false), IsShift(false), IsRev(false),
56 ProducesQP(false), ProducesDP(false), ProducesSP(false),
57 ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
58 MVEIntMACMatched(0), AddressOpMask(0) {}
60 typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
61 IInfoArray Info;
63 public:
64 // Always available information
65 unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
66 bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
67 bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
68 bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
69 bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
70 bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
71 bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
72 bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
73 bool isRev(unsigned Op) { return Info[Op].IsRev; }
74 bool isShift(unsigned Op) { return Info[Op].IsShift; }
76 // information available if markDPConsumers is called.
77 bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
78 bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
79 bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
80 bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
81 bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
82 bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
84 bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
85 return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
88 InstructionInformation(const ARMBaseInstrInfo *TII);
90 protected:
91 void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
94 InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
95 using namespace ARM;
97 std::initializer_list<unsigned> hasBRegAddrList = {
98 t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
99 tLDRr, tLDRBr, tLDRHr, tSTRr, tSTRBr, tSTRHr,
101 for (auto op : hasBRegAddrList) {
102 Info[op].HasBRegAddr = true;
105 std::initializer_list<unsigned> hasBRegAddrShiftList = {
106 t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
108 for (auto op : hasBRegAddrShiftList) {
109 Info[op].HasBRegAddrShift = true;
112 Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
114 std::initializer_list<unsigned> isInlineShiftALUList = {
115 t2ADCrs, t2ADDSrs, t2ADDrs, t2BICrs, t2EORrs,
116 t2ORNrs, t2RSBSrs, t2RSBrs, t2SBCrs, t2SUBrs,
117 t2SUBSrs, t2CMPrs, t2CMNzrs, t2TEQrs, t2TSTrs,
119 for (auto op : isInlineShiftALUList) {
120 Info[op].IsInlineShiftALU = true;
123 Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
125 std::initializer_list<unsigned> isMultiplyList = {
126 t2MUL, t2MLA, t2MLS, t2SMLABB, t2SMLABT, t2SMLAD, t2SMLADX,
127 t2SMLAL, t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
128 t2SMLATB, t2SMLATT, t2SMLAWT, t2SMLSD, t2SMLSDX, t2SMLSLD, t2SMLSLDX,
129 t2SMMLA, t2SMMLAR, t2SMMLS, t2SMMLSR, t2SMMUL, t2SMMULR, t2SMUAD,
130 t2SMUADX, t2SMULBB, t2SMULBT, t2SMULL, t2SMULTB, t2SMULTT, t2SMULWT,
131 t2SMUSD, t2SMUSDX, t2UMAAL, t2UMLAL, t2UMULL, tMUL,
133 for (auto op : isMultiplyList) {
134 Info[op].IsMultiply = true;
137 std::initializer_list<unsigned> isMVEIntMACList = {
138 MVE_VMLAS_qr_i16, MVE_VMLAS_qr_i32, MVE_VMLAS_qr_i8,
139 MVE_VMLA_qr_i16, MVE_VMLA_qr_i32, MVE_VMLA_qr_i8,
140 MVE_VQDMLAH_qrs16, MVE_VQDMLAH_qrs32, MVE_VQDMLAH_qrs8,
141 MVE_VQDMLASH_qrs16, MVE_VQDMLASH_qrs32, MVE_VQDMLASH_qrs8,
142 MVE_VQRDMLAH_qrs16, MVE_VQRDMLAH_qrs32, MVE_VQRDMLAH_qrs8,
143 MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
144 MVE_VQDMLADHXs16, MVE_VQDMLADHXs32, MVE_VQDMLADHXs8,
145 MVE_VQDMLADHs16, MVE_VQDMLADHs32, MVE_VQDMLADHs8,
146 MVE_VQDMLSDHXs16, MVE_VQDMLSDHXs32, MVE_VQDMLSDHXs8,
147 MVE_VQDMLSDHs16, MVE_VQDMLSDHs32, MVE_VQDMLSDHs8,
148 MVE_VQRDMLADHXs16, MVE_VQRDMLADHXs32, MVE_VQRDMLADHXs8,
149 MVE_VQRDMLADHs16, MVE_VQRDMLADHs32, MVE_VQRDMLADHs8,
150 MVE_VQRDMLSDHXs16, MVE_VQRDMLSDHXs32, MVE_VQRDMLSDHXs8,
151 MVE_VQRDMLSDHs16, MVE_VQRDMLSDHs32, MVE_VQRDMLSDHs8,
153 for (auto op : isMVEIntMACList) {
154 Info[op].IsMVEIntMAC = true;
157 std::initializer_list<unsigned> isNonSubwordLoadList = {
158 t2LDRi12, t2LDRi8, t2LDR_POST, t2LDR_PRE, t2LDRpci,
159 t2LDRs, t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
160 tLDRpci, tLDRr, tLDRspi,
162 for (auto op : isNonSubwordLoadList) {
163 Info[op].IsNonSubwordLoad = true;
166 std::initializer_list<unsigned> isRevList = {
167 t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
169 for (auto op : isRevList) {
170 Info[op].IsRev = true;
173 std::initializer_list<unsigned> isShiftList = {
174 t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
175 tASRri, tASRrr, tLSLSri, tLSLri, tLSLrr, tLSRri, tLSRrr, tROR,
177 for (auto op : isShiftList) {
178 Info[op].IsShift = true;
181 std::initializer_list<unsigned> Address1List = {
182 t2LDRBi12,
183 t2LDRBi8,
184 t2LDRBpci,
185 t2LDRBs,
186 t2LDRHi12,
187 t2LDRHi8,
188 t2LDRHpci,
189 t2LDRHs,
190 t2LDRSBi12,
191 t2LDRSBi8,
192 t2LDRSBpci,
193 t2LDRSBs,
194 t2LDRSHi12,
195 t2LDRSHi8,
196 t2LDRSHpci,
197 t2LDRSHs,
198 t2LDRi12,
199 t2LDRi8,
200 t2LDRpci,
201 t2LDRs,
202 tLDRBi,
203 tLDRBr,
204 tLDRHi,
205 tLDRHr,
206 tLDRSB,
207 tLDRSH,
208 tLDRi,
209 tLDRpci,
210 tLDRr,
211 tLDRspi,
212 t2STRBi12,
213 t2STRBi8,
214 t2STRBs,
215 t2STRHi12,
216 t2STRHi8,
217 t2STRHs,
218 t2STRi12,
219 t2STRi8,
220 t2STRs,
221 tSTRBi,
222 tSTRBr,
223 tSTRHi,
224 tSTRHr,
225 tSTRi,
226 tSTRr,
227 tSTRspi,
228 VLDRD,
229 VLDRH,
230 VLDRS,
231 VSTRD,
232 VSTRH,
233 VSTRS,
234 MVE_VLD20_16,
235 MVE_VLD20_32,
236 MVE_VLD20_8,
237 MVE_VLD21_16,
238 MVE_VLD21_32,
239 MVE_VLD21_8,
240 MVE_VLD40_16,
241 MVE_VLD40_32,
242 MVE_VLD40_8,
243 MVE_VLD41_16,
244 MVE_VLD41_32,
245 MVE_VLD41_8,
246 MVE_VLD42_16,
247 MVE_VLD42_32,
248 MVE_VLD42_8,
249 MVE_VLD43_16,
250 MVE_VLD43_32,
251 MVE_VLD43_8,
252 MVE_VLDRBS16,
253 MVE_VLDRBS16_rq,
254 MVE_VLDRBS32,
255 MVE_VLDRBS32_rq,
256 MVE_VLDRBU16,
257 MVE_VLDRBU16_rq,
258 MVE_VLDRBU32,
259 MVE_VLDRBU32_rq,
260 MVE_VLDRBU8,
261 MVE_VLDRBU8_rq,
262 MVE_VLDRDU64_qi,
263 MVE_VLDRDU64_rq,
264 MVE_VLDRDU64_rq_u,
265 MVE_VLDRHS32,
266 MVE_VLDRHS32_rq,
267 MVE_VLDRHS32_rq_u,
268 MVE_VLDRHU16,
269 MVE_VLDRHU16_rq,
270 MVE_VLDRHU16_rq_u,
271 MVE_VLDRHU32,
272 MVE_VLDRHU32_rq,
273 MVE_VLDRHU32_rq_u,
274 MVE_VLDRWU32,
275 MVE_VLDRWU32_qi,
276 MVE_VLDRWU32_rq,
277 MVE_VLDRWU32_rq_u,
278 MVE_VST20_16,
279 MVE_VST20_32,
280 MVE_VST20_8,
281 MVE_VST21_16,
282 MVE_VST21_32,
283 MVE_VST21_8,
284 MVE_VST40_16,
285 MVE_VST40_32,
286 MVE_VST40_8,
287 MVE_VST41_16,
288 MVE_VST41_32,
289 MVE_VST41_8,
290 MVE_VST42_16,
291 MVE_VST42_32,
292 MVE_VST42_8,
293 MVE_VST43_16,
294 MVE_VST43_32,
295 MVE_VST43_8,
296 MVE_VSTRB16,
297 MVE_VSTRB16_rq,
298 MVE_VSTRB32,
299 MVE_VSTRB32_rq,
300 MVE_VSTRBU8,
301 MVE_VSTRB8_rq,
302 MVE_VSTRD64_qi,
303 MVE_VSTRD64_rq,
304 MVE_VSTRD64_rq_u,
305 MVE_VSTRH32,
306 MVE_VSTRH32_rq,
307 MVE_VSTRH32_rq_u,
308 MVE_VSTRHU16,
309 MVE_VSTRH16_rq,
310 MVE_VSTRH16_rq_u,
311 MVE_VSTRWU32,
312 MVE_VSTRW32_qi,
313 MVE_VSTRW32_rq,
314 MVE_VSTRW32_rq_u,
316 std::initializer_list<unsigned> Address2List = {
317 t2LDRB_POST,
318 t2LDRB_PRE,
319 t2LDRDi8,
320 t2LDRH_POST,
321 t2LDRH_PRE,
322 t2LDRSB_POST,
323 t2LDRSB_PRE,
324 t2LDRSH_POST,
325 t2LDRSH_PRE,
326 t2LDR_POST,
327 t2LDR_PRE,
328 t2STRB_POST,
329 t2STRB_PRE,
330 t2STRDi8,
331 t2STRH_POST,
332 t2STRH_PRE,
333 t2STR_POST,
334 t2STR_PRE,
335 MVE_VLD20_16_wb,
336 MVE_VLD20_32_wb,
337 MVE_VLD20_8_wb,
338 MVE_VLD21_16_wb,
339 MVE_VLD21_32_wb,
340 MVE_VLD21_8_wb,
341 MVE_VLD40_16_wb,
342 MVE_VLD40_32_wb,
343 MVE_VLD40_8_wb,
344 MVE_VLD41_16_wb,
345 MVE_VLD41_32_wb,
346 MVE_VLD41_8_wb,
347 MVE_VLD42_16_wb,
348 MVE_VLD42_32_wb,
349 MVE_VLD42_8_wb,
350 MVE_VLD43_16_wb,
351 MVE_VLD43_32_wb,
352 MVE_VLD43_8_wb,
353 MVE_VLDRBS16_post,
354 MVE_VLDRBS16_pre,
355 MVE_VLDRBS32_post,
356 MVE_VLDRBS32_pre,
357 MVE_VLDRBU16_post,
358 MVE_VLDRBU16_pre,
359 MVE_VLDRBU32_post,
360 MVE_VLDRBU32_pre,
361 MVE_VLDRBU8_post,
362 MVE_VLDRBU8_pre,
363 MVE_VLDRDU64_qi_pre,
364 MVE_VLDRHS32_post,
365 MVE_VLDRHS32_pre,
366 MVE_VLDRHU16_post,
367 MVE_VLDRHU16_pre,
368 MVE_VLDRHU32_post,
369 MVE_VLDRHU32_pre,
370 MVE_VLDRWU32_post,
371 MVE_VLDRWU32_pre,
372 MVE_VLDRWU32_qi_pre,
373 MVE_VST20_16_wb,
374 MVE_VST20_32_wb,
375 MVE_VST20_8_wb,
376 MVE_VST21_16_wb,
377 MVE_VST21_32_wb,
378 MVE_VST21_8_wb,
379 MVE_VST40_16_wb,
380 MVE_VST40_32_wb,
381 MVE_VST40_8_wb,
382 MVE_VST41_16_wb,
383 MVE_VST41_32_wb,
384 MVE_VST41_8_wb,
385 MVE_VST42_16_wb,
386 MVE_VST42_32_wb,
387 MVE_VST42_8_wb,
388 MVE_VST43_16_wb,
389 MVE_VST43_32_wb,
390 MVE_VST43_8_wb,
391 MVE_VSTRB16_post,
392 MVE_VSTRB16_pre,
393 MVE_VSTRB32_post,
394 MVE_VSTRB32_pre,
395 MVE_VSTRBU8_post,
396 MVE_VSTRBU8_pre,
397 MVE_VSTRD64_qi_pre,
398 MVE_VSTRH32_post,
399 MVE_VSTRH32_pre,
400 MVE_VSTRHU16_post,
401 MVE_VSTRHU16_pre,
402 MVE_VSTRWU32_post,
403 MVE_VSTRWU32_pre,
404 MVE_VSTRW32_qi_pre,
406 std::initializer_list<unsigned> Address3List = {
407 t2LDRD_POST,
408 t2LDRD_PRE,
409 t2STRD_POST,
410 t2STRD_PRE,
412 // Compute a mask of which operands are involved in address computation
413 for (auto &op : Address1List) {
414 Info[op].AddressOpMask = 0x6;
416 for (auto &op : Address2List) {
417 Info[op].AddressOpMask = 0xc;
419 for (auto &op : Address3List) {
420 Info[op].AddressOpMask = 0x18;
422 for (auto &op : hasBRegAddrShiftList) {
423 Info[op].AddressOpMask |= 0x8;
427 void InstructionInformation::markDPProducersConsumers(
428 const ARMBaseInstrInfo *TII) {
429 // Learn about all instructions which have FP source/dest registers
430 for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
431 const MCInstrDesc &MID = TII->get(MI);
432 auto Operands = MID.operands();
433 for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
434 bool MarkQP = false, MarkDP = false, MarkSP = false;
435 switch (Operands[OI].RegClass) {
436 case ARM::MQPRRegClassID:
437 case ARM::DPRRegClassID:
438 case ARM::DPR_8RegClassID:
439 case ARM::DPR_VFP2RegClassID:
440 case ARM::DPairRegClassID:
441 case ARM::DPairSpcRegClassID:
442 case ARM::DQuadRegClassID:
443 case ARM::DQuadSpcRegClassID:
444 case ARM::DTripleRegClassID:
445 case ARM::DTripleSpcRegClassID:
446 MarkDP = true;
447 break;
448 case ARM::QPRRegClassID:
449 case ARM::QPR_8RegClassID:
450 case ARM::QPR_VFP2RegClassID:
451 case ARM::QQPRRegClassID:
452 case ARM::QQQQPRRegClassID:
453 MarkQP = true;
454 break;
455 case ARM::SPRRegClassID:
456 case ARM::SPR_8RegClassID:
457 case ARM::FPWithVPRRegClassID:
458 MarkSP = true;
459 break;
460 default:
461 break;
463 if (MarkQP) {
464 if (OI < MID.getNumDefs())
465 Info[MI].ProducesQP = true;
466 else
467 Info[MI].ConsumesQP = true;
469 if (MarkDP) {
470 if (OI < MID.getNumDefs())
471 Info[MI].ProducesDP = true;
472 else
473 Info[MI].ConsumesDP = true;
475 if (MarkSP) {
476 if (OI < MID.getNumDefs())
477 Info[MI].ProducesSP = true;
478 else
479 Info[MI].ConsumesSP = true;
485 } // anonymous namespace
487 static bool hasImplicitCPSRUse(const MachineInstr *MI) {
488 return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
491 void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
492 unsigned latency) {
493 SDep Reverse = SrcDep;
494 Reverse.setSUnit(&SrcSU);
495 for (SDep &PDep : SrcDep.getSUnit()->Preds) {
496 if (PDep == Reverse) {
497 PDep.setLatency(latency);
498 SrcDep.getSUnit()->setDepthDirty();
499 break;
502 SrcDep.setLatency(latency);
503 SrcSU.setHeightDirty();
506 static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
507 return (a & 0xe) != (b & 0xe);
510 // Set output dependences to zero latency for processors which can
511 // simultaneously issue to the same register. Returns true if a change
512 // was made.
513 bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
514 if (Dep.getKind() == SDep::Output) {
515 setBidirLatencies(ISU, Dep, 0);
516 return true;
518 return false;
521 // The graph doesn't look inside of bundles to determine their
522 // scheduling boundaries and reports zero latency into and out of them
523 // (except for CPSR into the bundle, which has latency 1).
524 // Make some better scheduling assumptions:
525 // 1) CPSR uses have zero latency; other uses have incoming latency 1
526 // 2) CPSR defs retain a latency of zero; others have a latency of 1.
528 // Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
529 unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
531 SUnit &DepSU = *Dep.getSUnit();
532 const MachineInstr *SrcMI = ISU.getInstr();
533 unsigned SrcOpcode = SrcMI->getOpcode();
534 const MachineInstr *DstMI = DepSU.getInstr();
535 unsigned DstOpcode = DstMI->getOpcode();
537 if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
538 setBidirLatencies(
539 ISU, Dep,
540 (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
541 return 1;
543 if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
544 Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
545 setBidirLatencies(ISU, Dep, 1);
546 return 2;
548 return 0;
551 // Determine whether there is a memory RAW hazard here and set up latency
552 // accordingly
553 bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
554 unsigned latency) {
555 if (!Dep.isNormalMemory())
556 return false;
557 auto &SrcInst = *ISU.getInstr();
558 auto &DstInst = *Dep.getSUnit()->getInstr();
559 if (!SrcInst.mayStore() || !DstInst.mayLoad())
560 return false;
562 auto SrcMO = *SrcInst.memoperands().begin();
563 auto DstMO = *DstInst.memoperands().begin();
564 auto SrcVal = SrcMO->getValue();
565 auto DstVal = DstMO->getValue();
566 auto SrcPseudoVal = SrcMO->getPseudoValue();
567 auto DstPseudoVal = DstMO->getPseudoValue();
568 if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
569 SrcMO->getOffset() == DstMO->getOffset()) {
570 setBidirLatencies(ISU, Dep, latency);
571 return true;
572 } else if (SrcPseudoVal && DstPseudoVal &&
573 SrcPseudoVal->kind() == DstPseudoVal->kind() &&
574 SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
575 // Spills/fills
576 auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
577 auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
578 if (FS0 == FS1) {
579 setBidirLatencies(ISU, Dep, latency);
580 return true;
583 return false;
586 namespace {
588 std::unique_ptr<InstructionInformation> II;
590 class CortexM7InstructionInformation : public InstructionInformation {
591 public:
592 CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
593 : InstructionInformation(TII) {}
596 class CortexM7Overrides : public ARMOverrideBypasses {
597 public:
598 CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
599 : ARMOverrideBypasses(TII, AA) {
600 if (!II)
601 II.reset(new CortexM7InstructionInformation(TII));
604 void modifyBypasses(SUnit &) override;
607 void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
608 const MachineInstr *SrcMI = ISU.getInstr();
609 unsigned SrcOpcode = SrcMI->getOpcode();
610 bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
612 // Walk the successors looking for latency overrides that are needed
613 for (SDep &Dep : ISU.Succs) {
615 // Output dependences should have 0 latency, as M7 is able to
616 // schedule writers to the same register for simultaneous issue.
617 if (zeroOutputDependences(ISU, Dep))
618 continue;
620 if (memoryRAWHazard(ISU, Dep, 4))
621 continue;
623 // Ignore dependencies other than data
624 if (Dep.getKind() != SDep::Data)
625 continue;
627 SUnit &DepSU = *Dep.getSUnit();
628 if (DepSU.isBoundaryNode())
629 continue;
631 if (makeBundleAssumptions(ISU, Dep) == 1)
632 continue;
634 const MachineInstr *DstMI = DepSU.getInstr();
635 unsigned DstOpcode = DstMI->getOpcode();
637 // Word loads into any multiply or divide instruction are considered
638 // cannot bypass their scheduling stage. Didn't do this in the .td file
639 // because we cannot easily create a read advance that is 0 from certain
640 // writer classes and 1 from all the rest.
641 // (The other way around would have been easy.)
642 if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
643 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
645 // Word loads into B operand of a load/store are considered cannot bypass
646 // their scheduling stage. Cannot do in the .td file because
647 // need to decide between -1 and -2 for ReadAdvance
648 if (isNSWload && II->hasBRegAddr(DstOpcode) &&
649 DstMI->getOperand(2).getReg() == Dep.getReg())
650 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
652 // Multiplies into any address generation cannot bypass from EX3. Cannot do
653 // in the .td file because need to decide between -1 and -2 for ReadAdvance
654 if (II->isMultiply(SrcOpcode)) {
655 unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
656 for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
657 if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
658 DstMI->getOperand(i).getReg() == Dep.getReg()) {
659 setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
660 break;
665 // Mismatched conditional producers take longer on M7; they end up looking
666 // like they were produced at EX3 and read at IS.
667 if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
668 (SrcOpcode == ARM::BUNDLE ||
669 mismatchedPred(TII->getPredicate(*SrcMI),
670 TII->getPredicate(*DstMI)))) {
671 unsigned Lat = 1;
672 // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
673 if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
674 DstMI->getOperand(1).getReg() == Dep.getReg())
675 Lat = 2;
676 Lat = std::min(3u, Dep.getLatency() + Lat);
677 setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
680 // CC setter into conditional producer shouldn't have a latency of more
681 // than 1 unless it's due to an implicit read. (All the "true" readers
682 // of the condition code use an implicit read, and predicates use an
683 // explicit.)
684 if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
685 TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
686 setBidirLatencies(ISU, Dep, 1);
688 // REV instructions cannot bypass directly into the EX1 shifter. The
689 // code is slightly inexact as it doesn't attempt to ensure that the bypass
690 // is to the shifter operands.
691 if (II->isRev(SrcOpcode)) {
692 if (II->isInlineShiftALU(DstOpcode))
693 setBidirLatencies(ISU, Dep, 2);
694 else if (II->isShift(DstOpcode))
695 setBidirLatencies(ISU, Dep, 1);
700 class M85InstructionInformation : public InstructionInformation {
701 public:
702 M85InstructionInformation(const ARMBaseInstrInfo *t)
703 : InstructionInformation(t) {
704 markDPProducersConsumers(t);
708 class M85Overrides : public ARMOverrideBypasses {
709 public:
710 M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
711 : ARMOverrideBypasses(t, a) {
712 if (!II)
713 II.reset(new M85InstructionInformation(t));
716 void modifyBypasses(SUnit &) override;
718 private:
719 unsigned computeBypassStage(const MCSchedClassDesc *SCD);
720 signed modifyMixedWidthFP(const MachineInstr *SrcMI,
721 const MachineInstr *DstMI, unsigned RegID,
722 const MCSchedClassDesc *SCD);
725 unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
726 auto SM = DAG->getSchedModel();
727 unsigned DefIdx = 0; // just look for the first output's timing
728 if (DefIdx < SCDesc->NumWriteLatencyEntries) {
729 // Lookup the definition's write latency in SubtargetInfo.
730 const MCWriteLatencyEntry *WLEntry =
731 SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
732 unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
733 if (Latency == 4)
734 return 2;
735 else if (Latency == 5)
736 return 3;
737 else if (Latency > 3)
738 return 3;
739 else
740 return Latency;
742 return 2;
745 // Latency changes for bypassing between FP registers of different sizes:
747 // Note that mixed DP/SP are unlikely because of the semantics
748 // of C. Mixed MVE/SP are quite common when MVE intrinsics are used.
749 signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
750 const MachineInstr *DstMI,
751 unsigned RegID,
752 const MCSchedClassDesc *SCD) {
754 if (!II->producesSP(SrcMI->getOpcode()) &&
755 !II->producesDP(SrcMI->getOpcode()) &&
756 !II->producesQP(SrcMI->getOpcode()))
757 return 0;
759 if (Register::isVirtualRegister(RegID)) {
760 if (II->producesSP(SrcMI->getOpcode()) &&
761 II->consumesDP(DstMI->getOpcode())) {
762 for (auto &OP : SrcMI->operands())
763 if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
764 OP.getSubReg() == ARM::ssub_1)
765 return 5 - computeBypassStage(SCD);
766 } else if (II->producesSP(SrcMI->getOpcode()) &&
767 II->consumesQP(DstMI->getOpcode())) {
768 for (auto &OP : SrcMI->operands())
769 if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
770 (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
771 return 5 - computeBypassStage(SCD) -
772 ((OP.getSubReg() == ARM::ssub_2 ||
773 OP.getSubReg() == ARM::ssub_3)
775 : 0);
776 } else if (II->producesDP(SrcMI->getOpcode()) &&
777 II->consumesQP(DstMI->getOpcode())) {
778 for (auto &OP : SrcMI->operands())
779 if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
780 OP.getSubReg() == ARM::ssub_1)
781 return -1;
782 } else if (II->producesDP(SrcMI->getOpcode()) &&
783 II->consumesSP(DstMI->getOpcode())) {
784 for (auto &OP : DstMI->operands())
785 if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
786 OP.getSubReg() == ARM::ssub_1)
787 return 5 - computeBypassStage(SCD);
788 } else if (II->producesQP(SrcMI->getOpcode()) &&
789 II->consumesSP(DstMI->getOpcode())) {
790 for (auto &OP : DstMI->operands())
791 if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
792 (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
793 return 5 - computeBypassStage(SCD) +
794 ((OP.getSubReg() == ARM::ssub_2 ||
795 OP.getSubReg() == ARM::ssub_3)
797 : 0);
798 } else if (II->producesQP(SrcMI->getOpcode()) &&
799 II->consumesDP(DstMI->getOpcode())) {
800 for (auto &OP : DstMI->operands())
801 if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
802 OP.getSubReg() == ARM::ssub_1)
803 return 1;
805 } else if (Register::isPhysicalRegister(RegID)) {
806 // Note that when the producer is narrower, not all of the producers
807 // may be present in the scheduling graph; somewhere earlier in the
808 // compiler, an implicit def/use of the aliased full register gets
809 // added to the producer, and so only that producer is seen as *the*
810 // single producer. This behavior also has the unfortunate effect of
811 // serializing the producers in the compiler's view of things.
812 if (II->producesSP(SrcMI->getOpcode()) &&
813 II->consumesDP(DstMI->getOpcode())) {
814 for (auto &OP : SrcMI->operands())
815 if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
816 OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
817 (OP.getReg() == RegID ||
818 (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
819 (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
820 return 5 - computeBypassStage(SCD);
821 } else if (II->producesSP(SrcMI->getOpcode()) &&
822 II->consumesQP(DstMI->getOpcode())) {
823 for (auto &OP : SrcMI->operands())
824 if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
825 OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
826 (OP.getReg() == RegID ||
827 (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
828 (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
829 return 5 - computeBypassStage(SCD) -
830 (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
831 } else if (II->producesDP(SrcMI->getOpcode()) &&
832 II->consumesQP(DstMI->getOpcode())) {
833 for (auto &OP : SrcMI->operands())
834 if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
835 OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
836 (OP.getReg() == RegID ||
837 (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
838 return -1;
839 } else if (II->producesDP(SrcMI->getOpcode()) &&
840 II->consumesSP(DstMI->getOpcode())) {
841 if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
842 return 5 - computeBypassStage(SCD);
843 } else if (II->producesQP(SrcMI->getOpcode()) &&
844 II->consumesSP(DstMI->getOpcode())) {
845 if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
846 return 5 - computeBypassStage(SCD) +
847 (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
848 } else if (II->producesQP(SrcMI->getOpcode()) &&
849 II->consumesDP(DstMI->getOpcode())) {
850 if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
851 return 1;
854 return 0;
857 void M85Overrides::modifyBypasses(SUnit &ISU) {
858 const MachineInstr *SrcMI = ISU.getInstr();
859 unsigned SrcOpcode = SrcMI->getOpcode();
860 bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
862 // Walk the successors looking for latency overrides that are needed
863 for (SDep &Dep : ISU.Succs) {
865 // Output dependences should have 0 latency, as CortexM85 is able to
866 // schedule writers to the same register for simultaneous issue.
867 if (zeroOutputDependences(ISU, Dep))
868 continue;
870 if (memoryRAWHazard(ISU, Dep, 3))
871 continue;
873 // Ignore dependencies other than data or strong ordering.
874 if (Dep.getKind() != SDep::Data)
875 continue;
877 SUnit &DepSU = *Dep.getSUnit();
878 if (DepSU.isBoundaryNode())
879 continue;
881 if (makeBundleAssumptions(ISU, Dep) == 1)
882 continue;
884 const MachineInstr *DstMI = DepSU.getInstr();
885 unsigned DstOpcode = DstMI->getOpcode();
887 // Word loads into B operand of a load/store with cannot bypass their
888 // scheduling stage. Cannot do in the .td file because need to decide
889 // between -1 and -2 for ReadAdvance
891 if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
892 DstMI->getOperand(3).getImm() != 0 && // shift operand
893 DstMI->getOperand(2).getReg() == Dep.getReg())
894 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
896 if (isNSWload && isMVEVectorInstruction(DstMI)) {
897 setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
900 if (II->isMVEIntMAC(DstOpcode) &&
901 II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
902 DstMI->getOperand(0).isReg() &&
903 DstMI->getOperand(0).getReg() == Dep.getReg())
904 setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);
906 // CC setter into conditional producer shouldn't have a latency of more
907 // than 0 unless it's due to an implicit read.
908 if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
909 TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
910 setBidirLatencies(ISU, Dep, 0);
912 if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
913 DAG->getSchedClass(&ISU)))
914 setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));
916 if (II->isRev(SrcOpcode)) {
917 if (II->isInlineShiftALU(DstOpcode))
918 setBidirLatencies(ISU, Dep, 1);
919 else if (II->isShift(DstOpcode))
920 setBidirLatencies(ISU, Dep, 1);
925 // Add M55 specific overrides for latencies between instructions. Currently it:
926 // - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
927 class CortexM55Overrides : public ARMOverrideBypasses {
928 public:
929 CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
930 : ARMOverrideBypasses(TII, AA) {}
932 void modifyBypasses(SUnit &SU) override {
933 MachineInstr *SrcMI = SU.getInstr();
934 if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
935 return;
937 for (SDep &Dep : SU.Succs) {
938 if (Dep.getKind() != SDep::Data)
939 continue;
940 SUnit &DepSU = *Dep.getSUnit();
941 if (DepSU.isBoundaryNode())
942 continue;
943 MachineInstr *DstMI = DepSU.getInstr();
945 if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
946 setBidirLatencies(SU, Dep, 3);
951 } // end anonymous namespace
953 void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
954 DAG = DAGInstrs;
955 for (SUnit &ISU : DAGInstrs->SUnits) {
956 if (ISU.isBoundaryNode())
957 continue;
958 modifyBypasses(ISU);
960 if (DAGInstrs->ExitSU.getInstr())
961 modifyBypasses(DAGInstrs->ExitSU);
964 std::unique_ptr<ScheduleDAGMutation>
965 createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
966 if (ST.isCortexM85())
967 return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
968 else if (ST.isCortexM7())
969 return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
970 else if (ST.isCortexM55())
971 return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
973 return nullptr;
976 } // end namespace llvm