llvm/lib/Target/ARM/ARMLatencyMutations.cpp

   1 //===- ARMLatencyMutations.cpp - ARM Latency Mutations --------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file This file contains the ARM definition DAG scheduling mutations which
  10 /// change inter-instruction latencies
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "ARMLatencyMutations.h"
  15 #include "ARMSubtarget.h"
  16 #include "Thumb2InstrInfo.h"
  17 #include "llvm/Analysis/AliasAnalysis.h"
  18 #include "llvm/CodeGen/ScheduleDAG.h"
  19 #include "llvm/CodeGen/ScheduleDAGMutation.h"
  20 #include "llvm/CodeGen/TargetInstrInfo.h"
  21 #include <algorithm>
  22 #include <array>
  23 #include <initializer_list>
  24 #include <memory>
  25
  26 namespace llvm {
  27
  28 namespace {
  29
  30 // Precompute information about opcodes to speed up pass
  31
  32 class InstructionInformation {
  33 protected:
  34   struct IInfo {
  35     bool HasBRegAddr : 1;      // B-side of addr gen is a register
  36     bool HasBRegAddrShift : 1; // B-side of addr gen has a shift
  37     bool IsDivide : 1;         // Some form of integer divide
  38     bool IsInlineShiftALU : 1; // Inline shift+ALU
  39     bool IsMultiply : 1;       // Some form of integer multiply
  40     bool IsMVEIntMAC : 1;      // MVE 8/16/32-bit integer MAC operation
  41     bool IsNonSubwordLoad : 1; // Load which is a word or larger
  42     bool IsShift : 1;          // Shift operation
  43     bool IsRev : 1;            // REV operation
  44     bool ProducesQP : 1;       // Produces a vector register result
  45     bool ProducesDP : 1;       // Produces a double-precision register result
  46     bool ProducesSP : 1;       // Produces a single-precision register result
  47     bool ConsumesQP : 1;       // Consumes a vector register result
  48     bool ConsumesDP : 1;       // Consumes a double-precision register result
  49     bool ConsumesSP : 1;       // Consumes a single-precision register result
  50     unsigned MVEIntMACMatched; // Matched operand type (for MVE)
  51     unsigned AddressOpMask;    // Mask indicating which operands go into AGU
  52     IInfo()
  53         : HasBRegAddr(false), HasBRegAddrShift(false), IsDivide(false),
  54           IsInlineShiftALU(false), IsMultiply(false), IsMVEIntMAC(false),
  55           IsNonSubwordLoad(false), IsShift(false), IsRev(false),
  56           ProducesQP(false), ProducesDP(false), ProducesSP(false),
  57           ConsumesQP(false), ConsumesDP(false), ConsumesSP(false),
  58           MVEIntMACMatched(0), AddressOpMask(0) {}
  59   };
  60   typedef std::array<IInfo, ARM::INSTRUCTION_LIST_END> IInfoArray;
  61   IInfoArray Info;
  62
  63 public:
  64   // Always available information
  65   unsigned getAddressOpMask(unsigned Op) { return Info[Op].AddressOpMask; }
  66   bool hasBRegAddr(unsigned Op) { return Info[Op].HasBRegAddr; }
  67   bool hasBRegAddrShift(unsigned Op) { return Info[Op].HasBRegAddrShift; }
  68   bool isDivide(unsigned Op) { return Info[Op].IsDivide; }
  69   bool isInlineShiftALU(unsigned Op) { return Info[Op].IsInlineShiftALU; }
  70   bool isMultiply(unsigned Op) { return Info[Op].IsMultiply; }
  71   bool isMVEIntMAC(unsigned Op) { return Info[Op].IsMVEIntMAC; }
  72   bool isNonSubwordLoad(unsigned Op) { return Info[Op].IsNonSubwordLoad; }
  73   bool isRev(unsigned Op) { return Info[Op].IsRev; }
  74   bool isShift(unsigned Op) { return Info[Op].IsShift; }
  75
  76   // information available if markDPConsumers is called.
  77   bool producesQP(unsigned Op) { return Info[Op].ProducesQP; }
  78   bool producesDP(unsigned Op) { return Info[Op].ProducesDP; }
  79   bool producesSP(unsigned Op) { return Info[Op].ProducesSP; }
  80   bool consumesQP(unsigned Op) { return Info[Op].ConsumesQP; }
  81   bool consumesDP(unsigned Op) { return Info[Op].ConsumesDP; }
  82   bool consumesSP(unsigned Op) { return Info[Op].ConsumesSP; }
  83
  84   bool isMVEIntMACMatched(unsigned SrcOp, unsigned DstOp) {
  85     return SrcOp == DstOp || Info[DstOp].MVEIntMACMatched == SrcOp;
  86   }
  87
  88   InstructionInformation(const ARMBaseInstrInfo *TII);
  89
  90 protected:
  91   void markDPProducersConsumers(const ARMBaseInstrInfo *TII);
  92 };
  93
  94 InstructionInformation::InstructionInformation(const ARMBaseInstrInfo *TII) {
  95   using namespace ARM;
  96
  97   std::initializer_list<unsigned> hasBRegAddrList = {
  98       t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
  99       tLDRr,  tLDRBr,  tLDRHr,  tSTRr,  tSTRBr,  tSTRHr,
 100   };
 101   for (auto op : hasBRegAddrList) {
 102     Info[op].HasBRegAddr = true;
 103   }
 104
 105   std::initializer_list<unsigned> hasBRegAddrShiftList = {
 106       t2LDRs, t2LDRBs, t2LDRHs, t2STRs, t2STRBs, t2STRHs,
 107   };
 108   for (auto op : hasBRegAddrShiftList) {
 109     Info[op].HasBRegAddrShift = true;
 110   }
 111
 112   Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
 113
 114   std::initializer_list<unsigned> isInlineShiftALUList = {
 115       t2ADCrs,  t2ADDSrs, t2ADDrs,  t2BICrs, t2EORrs,
 116       t2ORNrs,  t2RSBSrs, t2RSBrs,  t2SBCrs, t2SUBrs,
 117       t2SUBSrs, t2CMPrs,  t2CMNzrs, t2TEQrs, t2TSTrs,
 118   };
 119   for (auto op : isInlineShiftALUList) {
 120     Info[op].IsInlineShiftALU = true;
 121   }
 122
 123   Info[t2SDIV].IsDivide = Info[t2UDIV].IsDivide = true;
 124
 125   std::initializer_list<unsigned> isMultiplyList = {
 126       t2MUL,    t2MLA,     t2MLS,     t2SMLABB, t2SMLABT,  t2SMLAD,   t2SMLADX,
 127       t2SMLAL,  t2SMLALBB, t2SMLALBT, t2SMLALD, t2SMLALDX, t2SMLALTB, t2SMLALTT,
 128       t2SMLATB, t2SMLATT,  t2SMLAWT,  t2SMLSD,  t2SMLSDX,  t2SMLSLD,  t2SMLSLDX,
 129       t2SMMLA,  t2SMMLAR,  t2SMMLS,   t2SMMLSR, t2SMMUL,   t2SMMULR,  t2SMUAD,
 130       t2SMUADX, t2SMULBB,  t2SMULBT,  t2SMULL,  t2SMULTB,  t2SMULTT,  t2SMULWT,
 131       t2SMUSD,  t2SMUSDX,  t2UMAAL,   t2UMLAL,  t2UMULL,   tMUL,
 132   };
 133   for (auto op : isMultiplyList) {
 134     Info[op].IsMultiply = true;
 135   }
 136
 137   std::initializer_list<unsigned> isMVEIntMACList = {
 138       MVE_VMLAS_qr_i16,    MVE_VMLAS_qr_i32,    MVE_VMLAS_qr_i8,
 139       MVE_VMLA_qr_i16,     MVE_VMLA_qr_i32,     MVE_VMLA_qr_i8,
 140       MVE_VQDMLAH_qrs16,   MVE_VQDMLAH_qrs32,   MVE_VQDMLAH_qrs8,
 141       MVE_VQDMLASH_qrs16,  MVE_VQDMLASH_qrs32,  MVE_VQDMLASH_qrs8,
 142       MVE_VQRDMLAH_qrs16,  MVE_VQRDMLAH_qrs32,  MVE_VQRDMLAH_qrs8,
 143       MVE_VQRDMLASH_qrs16, MVE_VQRDMLASH_qrs32, MVE_VQRDMLASH_qrs8,
 144       MVE_VQDMLADHXs16,    MVE_VQDMLADHXs32,    MVE_VQDMLADHXs8,
 145       MVE_VQDMLADHs16,     MVE_VQDMLADHs32,     MVE_VQDMLADHs8,
 146       MVE_VQDMLSDHXs16,    MVE_VQDMLSDHXs32,    MVE_VQDMLSDHXs8,
 147       MVE_VQDMLSDHs16,     MVE_VQDMLSDHs32,     MVE_VQDMLSDHs8,
 148       MVE_VQRDMLADHXs16,   MVE_VQRDMLADHXs32,   MVE_VQRDMLADHXs8,
 149       MVE_VQRDMLADHs16,    MVE_VQRDMLADHs32,    MVE_VQRDMLADHs8,
 150       MVE_VQRDMLSDHXs16,   MVE_VQRDMLSDHXs32,   MVE_VQRDMLSDHXs8,
 151       MVE_VQRDMLSDHs16,    MVE_VQRDMLSDHs32,    MVE_VQRDMLSDHs8,
 152   };
 153   for (auto op : isMVEIntMACList) {
 154     Info[op].IsMVEIntMAC = true;
 155   }
 156
 157   std::initializer_list<unsigned> isNonSubwordLoadList = {
 158       t2LDRi12, t2LDRi8,  t2LDR_POST,  t2LDR_PRE,  t2LDRpci,
 159       t2LDRs,   t2LDRDi8, t2LDRD_POST, t2LDRD_PRE, tLDRi,
 160       tLDRpci,  tLDRr,    tLDRspi,
 161   };
 162   for (auto op : isNonSubwordLoadList) {
 163     Info[op].IsNonSubwordLoad = true;
 164   }
 165
 166   std::initializer_list<unsigned> isRevList = {
 167       t2REV, t2REV16, t2REVSH, t2RBIT, tREV, tREV16, tREVSH,
 168   };
 169   for (auto op : isRevList) {
 170     Info[op].IsRev = true;
 171   }
 172
 173   std::initializer_list<unsigned> isShiftList = {
 174       t2ASRri, t2ASRrr, t2LSLri, t2LSLrr, t2LSRri, t2LSRrr, t2RORri, t2RORrr,
 175       tASRri,  tASRrr,  tLSLSri, tLSLri,  tLSLrr,  tLSRri,  tLSRrr,  tROR,
 176   };
 177   for (auto op : isShiftList) {
 178     Info[op].IsShift = true;
 179   }
 180
 181   std::initializer_list<unsigned> Address1List = {
 182       t2LDRBi12,
 183       t2LDRBi8,
 184       t2LDRBpci,
 185       t2LDRBs,
 186       t2LDRHi12,
 187       t2LDRHi8,
 188       t2LDRHpci,
 189       t2LDRHs,
 190       t2LDRSBi12,
 191       t2LDRSBi8,
 192       t2LDRSBpci,
 193       t2LDRSBs,
 194       t2LDRSHi12,
 195       t2LDRSHi8,
 196       t2LDRSHpci,
 197       t2LDRSHs,
 198       t2LDRi12,
 199       t2LDRi8,
 200       t2LDRpci,
 201       t2LDRs,
 202       tLDRBi,
 203       tLDRBr,
 204       tLDRHi,
 205       tLDRHr,
 206       tLDRSB,
 207       tLDRSH,
 208       tLDRi,
 209       tLDRpci,
 210       tLDRr,
 211       tLDRspi,
 212       t2STRBi12,
 213       t2STRBi8,
 214       t2STRBs,
 215       t2STRHi12,
 216       t2STRHi8,
 217       t2STRHs,
 218       t2STRi12,
 219       t2STRi8,
 220       t2STRs,
 221       tSTRBi,
 222       tSTRBr,
 223       tSTRHi,
 224       tSTRHr,
 225       tSTRi,
 226       tSTRr,
 227       tSTRspi,
 228       VLDRD,
 229       VLDRH,
 230       VLDRS,
 231       VSTRD,
 232       VSTRH,
 233       VSTRS,
 234       MVE_VLD20_16,
 235       MVE_VLD20_32,
 236       MVE_VLD20_8,
 237       MVE_VLD21_16,
 238       MVE_VLD21_32,
 239       MVE_VLD21_8,
 240       MVE_VLD40_16,
 241       MVE_VLD40_32,
 242       MVE_VLD40_8,
 243       MVE_VLD41_16,
 244       MVE_VLD41_32,
 245       MVE_VLD41_8,
 246       MVE_VLD42_16,
 247       MVE_VLD42_32,
 248       MVE_VLD42_8,
 249       MVE_VLD43_16,
 250       MVE_VLD43_32,
 251       MVE_VLD43_8,
 252       MVE_VLDRBS16,
 253       MVE_VLDRBS16_rq,
 254       MVE_VLDRBS32,
 255       MVE_VLDRBS32_rq,
 256       MVE_VLDRBU16,
 257       MVE_VLDRBU16_rq,
 258       MVE_VLDRBU32,
 259       MVE_VLDRBU32_rq,
 260       MVE_VLDRBU8,
 261       MVE_VLDRBU8_rq,
 262       MVE_VLDRDU64_qi,
 263       MVE_VLDRDU64_rq,
 264       MVE_VLDRDU64_rq_u,
 265       MVE_VLDRHS32,
 266       MVE_VLDRHS32_rq,
 267       MVE_VLDRHS32_rq_u,
 268       MVE_VLDRHU16,
 269       MVE_VLDRHU16_rq,
 270       MVE_VLDRHU16_rq_u,
 271       MVE_VLDRHU32,
 272       MVE_VLDRHU32_rq,
 273       MVE_VLDRHU32_rq_u,
 274       MVE_VLDRWU32,
 275       MVE_VLDRWU32_qi,
 276       MVE_VLDRWU32_rq,
 277       MVE_VLDRWU32_rq_u,
 278       MVE_VST20_16,
 279       MVE_VST20_32,
 280       MVE_VST20_8,
 281       MVE_VST21_16,
 282       MVE_VST21_32,
 283       MVE_VST21_8,
 284       MVE_VST40_16,
 285       MVE_VST40_32,
 286       MVE_VST40_8,
 287       MVE_VST41_16,
 288       MVE_VST41_32,
 289       MVE_VST41_8,
 290       MVE_VST42_16,
 291       MVE_VST42_32,
 292       MVE_VST42_8,
 293       MVE_VST43_16,
 294       MVE_VST43_32,
 295       MVE_VST43_8,
 296       MVE_VSTRB16,
 297       MVE_VSTRB16_rq,
 298       MVE_VSTRB32,
 299       MVE_VSTRB32_rq,
 300       MVE_VSTRBU8,
 301       MVE_VSTRB8_rq,
 302       MVE_VSTRD64_qi,
 303       MVE_VSTRD64_rq,
 304       MVE_VSTRD64_rq_u,
 305       MVE_VSTRH32,
 306       MVE_VSTRH32_rq,
 307       MVE_VSTRH32_rq_u,
 308       MVE_VSTRHU16,
 309       MVE_VSTRH16_rq,
 310       MVE_VSTRH16_rq_u,
 311       MVE_VSTRWU32,
 312       MVE_VSTRW32_qi,
 313       MVE_VSTRW32_rq,
 314       MVE_VSTRW32_rq_u,
 315   };
 316   std::initializer_list<unsigned> Address2List = {
 317       t2LDRB_POST,
 318       t2LDRB_PRE,
 319       t2LDRDi8,
 320       t2LDRH_POST,
 321       t2LDRH_PRE,
 322       t2LDRSB_POST,
 323       t2LDRSB_PRE,
 324       t2LDRSH_POST,
 325       t2LDRSH_PRE,
 326       t2LDR_POST,
 327       t2LDR_PRE,
 328       t2STRB_POST,
 329       t2STRB_PRE,
 330       t2STRDi8,
 331       t2STRH_POST,
 332       t2STRH_PRE,
 333       t2STR_POST,
 334       t2STR_PRE,
 335       MVE_VLD20_16_wb,
 336       MVE_VLD20_32_wb,
 337       MVE_VLD20_8_wb,
 338       MVE_VLD21_16_wb,
 339       MVE_VLD21_32_wb,
 340       MVE_VLD21_8_wb,
 341       MVE_VLD40_16_wb,
 342       MVE_VLD40_32_wb,
 343       MVE_VLD40_8_wb,
 344       MVE_VLD41_16_wb,
 345       MVE_VLD41_32_wb,
 346       MVE_VLD41_8_wb,
 347       MVE_VLD42_16_wb,
 348       MVE_VLD42_32_wb,
 349       MVE_VLD42_8_wb,
 350       MVE_VLD43_16_wb,
 351       MVE_VLD43_32_wb,
 352       MVE_VLD43_8_wb,
 353       MVE_VLDRBS16_post,
 354       MVE_VLDRBS16_pre,
 355       MVE_VLDRBS32_post,
 356       MVE_VLDRBS32_pre,
 357       MVE_VLDRBU16_post,
 358       MVE_VLDRBU16_pre,
 359       MVE_VLDRBU32_post,
 360       MVE_VLDRBU32_pre,
 361       MVE_VLDRBU8_post,
 362       MVE_VLDRBU8_pre,
 363       MVE_VLDRDU64_qi_pre,
 364       MVE_VLDRHS32_post,
 365       MVE_VLDRHS32_pre,
 366       MVE_VLDRHU16_post,
 367       MVE_VLDRHU16_pre,
 368       MVE_VLDRHU32_post,
 369       MVE_VLDRHU32_pre,
 370       MVE_VLDRWU32_post,
 371       MVE_VLDRWU32_pre,
 372       MVE_VLDRWU32_qi_pre,
 373       MVE_VST20_16_wb,
 374       MVE_VST20_32_wb,
 375       MVE_VST20_8_wb,
 376       MVE_VST21_16_wb,
 377       MVE_VST21_32_wb,
 378       MVE_VST21_8_wb,
 379       MVE_VST40_16_wb,
 380       MVE_VST40_32_wb,
 381       MVE_VST40_8_wb,
 382       MVE_VST41_16_wb,
 383       MVE_VST41_32_wb,
 384       MVE_VST41_8_wb,
 385       MVE_VST42_16_wb,
 386       MVE_VST42_32_wb,
 387       MVE_VST42_8_wb,
 388       MVE_VST43_16_wb,
 389       MVE_VST43_32_wb,
 390       MVE_VST43_8_wb,
 391       MVE_VSTRB16_post,
 392       MVE_VSTRB16_pre,
 393       MVE_VSTRB32_post,
 394       MVE_VSTRB32_pre,
 395       MVE_VSTRBU8_post,
 396       MVE_VSTRBU8_pre,
 397       MVE_VSTRD64_qi_pre,
 398       MVE_VSTRH32_post,
 399       MVE_VSTRH32_pre,
 400       MVE_VSTRHU16_post,
 401       MVE_VSTRHU16_pre,
 402       MVE_VSTRWU32_post,
 403       MVE_VSTRWU32_pre,
 404       MVE_VSTRW32_qi_pre,
 405   };
 406   std::initializer_list<unsigned> Address3List = {
 407       t2LDRD_POST,
 408       t2LDRD_PRE,
 409       t2STRD_POST,
 410       t2STRD_PRE,
 411   };
 412   // Compute a mask of which operands are involved in address computation
 413   for (auto &op : Address1List) {
 414     Info[op].AddressOpMask = 0x6;
 415   }
 416   for (auto &op : Address2List) {
 417     Info[op].AddressOpMask = 0xc;
 418   }
 419   for (auto &op : Address3List) {
 420     Info[op].AddressOpMask = 0x18;
 421   }
 422   for (auto &op : hasBRegAddrShiftList) {
 423     Info[op].AddressOpMask |= 0x8;
 424   }
 425 }
 426
 427 void InstructionInformation::markDPProducersConsumers(
 428     const ARMBaseInstrInfo *TII) {
 429   // Learn about all instructions which have FP source/dest registers
 430   for (unsigned MI = 0; MI < ARM::INSTRUCTION_LIST_END; ++MI) {
 431     const MCInstrDesc &MID = TII->get(MI);
 432     auto Operands = MID.operands();
 433     for (unsigned OI = 0, OIE = MID.getNumOperands(); OI != OIE; ++OI) {
 434       bool MarkQP = false, MarkDP = false, MarkSP = false;
 435       switch (Operands[OI].RegClass) {
 436       case ARM::MQPRRegClassID:
 437       case ARM::DPRRegClassID:
 438       case ARM::DPR_8RegClassID:
 439       case ARM::DPR_VFP2RegClassID:
 440       case ARM::DPairRegClassID:
 441       case ARM::DPairSpcRegClassID:
 442       case ARM::DQuadRegClassID:
 443       case ARM::DQuadSpcRegClassID:
 444       case ARM::DTripleRegClassID:
 445       case ARM::DTripleSpcRegClassID:
 446         MarkDP = true;
 447         break;
 448       case ARM::QPRRegClassID:
 449       case ARM::QPR_8RegClassID:
 450       case ARM::QPR_VFP2RegClassID:
 451       case ARM::QQPRRegClassID:
 452       case ARM::QQQQPRRegClassID:
 453         MarkQP = true;
 454         break;
 455       case ARM::SPRRegClassID:
 456       case ARM::SPR_8RegClassID:
 457       case ARM::FPWithVPRRegClassID:
 458         MarkSP = true;
 459         break;
 460       default:
 461         break;
 462       }
 463       if (MarkQP) {
 464         if (OI < MID.getNumDefs())
 465           Info[MI].ProducesQP = true;
 466         else
 467           Info[MI].ConsumesQP = true;
 468       }
 469       if (MarkDP) {
 470         if (OI < MID.getNumDefs())
 471           Info[MI].ProducesDP = true;
 472         else
 473           Info[MI].ConsumesDP = true;
 474       }
 475       if (MarkSP) {
 476         if (OI < MID.getNumDefs())
 477           Info[MI].ProducesSP = true;
 478         else
 479           Info[MI].ConsumesSP = true;
 480       }
 481     }
 482   }
 483 }
 484
 485 } // anonymous namespace
 486
 487 static bool hasImplicitCPSRUse(const MachineInstr *MI) {
 488   return MI->getDesc().hasImplicitUseOfPhysReg(ARM::CPSR);
 489 }
 490
 491 void ARMOverrideBypasses::setBidirLatencies(SUnit &SrcSU, SDep &SrcDep,
 492                                             unsigned latency) {
 493   SDep Reverse = SrcDep;
 494   Reverse.setSUnit(&SrcSU);
 495   for (SDep &PDep : SrcDep.getSUnit()->Preds) {
 496     if (PDep == Reverse) {
 497       PDep.setLatency(latency);
 498       SrcDep.getSUnit()->setDepthDirty();
 499       break;
 500     }
 501   }
 502   SrcDep.setLatency(latency);
 503   SrcSU.setHeightDirty();
 504 }
 505
 506 static bool mismatchedPred(ARMCC::CondCodes a, ARMCC::CondCodes b) {
 507   return (a & 0xe) != (b & 0xe);
 508 }
 509
 510 // Set output dependences to zero latency for processors which can
 511 // simultaneously issue to the same register.  Returns true if a change
 512 // was made.
 513 bool ARMOverrideBypasses::zeroOutputDependences(SUnit &ISU, SDep &Dep) {
 514   if (Dep.getKind() == SDep::Output) {
 515     setBidirLatencies(ISU, Dep, 0);
 516     return true;
 517   }
 518   return false;
 519 }
 520
 521 // The graph doesn't look inside of bundles to determine their
 522 // scheduling boundaries and reports zero latency into and out of them
 523 // (except for CPSR into the bundle, which has latency 1).
 524 // Make some better scheduling assumptions:
 525 // 1) CPSR uses have zero latency; other uses have incoming latency 1
 526 // 2) CPSR defs retain a latency of zero; others have a latency of 1.
 527 //
 528 // Returns 1 if a use change was made; 2 if a def change was made; 0 otherwise
 529 unsigned ARMOverrideBypasses::makeBundleAssumptions(SUnit &ISU, SDep &Dep) {
 530
 531   SUnit &DepSU = *Dep.getSUnit();
 532   const MachineInstr *SrcMI = ISU.getInstr();
 533   unsigned SrcOpcode = SrcMI->getOpcode();
 534   const MachineInstr *DstMI = DepSU.getInstr();
 535   unsigned DstOpcode = DstMI->getOpcode();
 536
 537   if (DstOpcode == ARM::BUNDLE && TII->isPredicated(*DstMI)) {
 538     setBidirLatencies(
 539         ISU, Dep,
 540         (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR) ? 0 : 1);
 541     return 1;
 542   }
 543   if (SrcOpcode == ARM::BUNDLE && TII->isPredicated(*SrcMI) &&
 544       Dep.isAssignedRegDep() && Dep.getReg() != ARM::CPSR) {
 545     setBidirLatencies(ISU, Dep, 1);
 546     return 2;
 547   }
 548   return 0;
 549 }
 550
 551 // Determine whether there is a memory RAW hazard here and set up latency
 552 // accordingly
 553 bool ARMOverrideBypasses::memoryRAWHazard(SUnit &ISU, SDep &Dep,
 554                                           unsigned latency) {
 555   if (!Dep.isNormalMemory())
 556     return false;
 557   auto &SrcInst = *ISU.getInstr();
 558   auto &DstInst = *Dep.getSUnit()->getInstr();
 559   if (!SrcInst.mayStore() || !DstInst.mayLoad())
 560     return false;
 561
 562   auto SrcMO = *SrcInst.memoperands().begin();
 563   auto DstMO = *DstInst.memoperands().begin();
 564   auto SrcVal = SrcMO->getValue();
 565   auto DstVal = DstMO->getValue();
 566   auto SrcPseudoVal = SrcMO->getPseudoValue();
 567   auto DstPseudoVal = DstMO->getPseudoValue();
 568   if (SrcVal && DstVal && AA->alias(SrcVal, DstVal) == AliasResult::MustAlias &&
 569       SrcMO->getOffset() == DstMO->getOffset()) {
 570     setBidirLatencies(ISU, Dep, latency);
 571     return true;
 572   } else if (SrcPseudoVal && DstPseudoVal &&
 573              SrcPseudoVal->kind() == DstPseudoVal->kind() &&
 574              SrcPseudoVal->kind() == PseudoSourceValue::FixedStack) {
 575     // Spills/fills
 576     auto FS0 = cast<FixedStackPseudoSourceValue>(SrcPseudoVal);
 577     auto FS1 = cast<FixedStackPseudoSourceValue>(DstPseudoVal);
 578     if (FS0 == FS1) {
 579       setBidirLatencies(ISU, Dep, latency);
 580       return true;
 581     }
 582   }
 583   return false;
 584 }
 585
 586 namespace {
 587
 588 std::unique_ptr<InstructionInformation> II;
 589
 590 class CortexM7InstructionInformation : public InstructionInformation {
 591 public:
 592   CortexM7InstructionInformation(const ARMBaseInstrInfo *TII)
 593       : InstructionInformation(TII) {}
 594 };
 595
 596 class CortexM7Overrides : public ARMOverrideBypasses {
 597 public:
 598   CortexM7Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
 599       : ARMOverrideBypasses(TII, AA) {
 600     if (!II)
 601       II.reset(new CortexM7InstructionInformation(TII));
 602   }
 603
 604   void modifyBypasses(SUnit &) override;
 605 };
 606
 607 void CortexM7Overrides::modifyBypasses(SUnit &ISU) {
 608   const MachineInstr *SrcMI = ISU.getInstr();
 609   unsigned SrcOpcode = SrcMI->getOpcode();
 610   bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
 611
 612   // Walk the successors looking for latency overrides that are needed
 613   for (SDep &Dep : ISU.Succs) {
 614
 615     // Output dependences should have 0 latency, as M7 is able to
 616     // schedule writers to the same register for simultaneous issue.
 617     if (zeroOutputDependences(ISU, Dep))
 618       continue;
 619
 620     if (memoryRAWHazard(ISU, Dep, 4))
 621       continue;
 622
 623     // Ignore dependencies other than data
 624     if (Dep.getKind() != SDep::Data)
 625       continue;
 626
 627     SUnit &DepSU = *Dep.getSUnit();
 628     if (DepSU.isBoundaryNode())
 629       continue;
 630
 631     if (makeBundleAssumptions(ISU, Dep) == 1)
 632       continue;
 633
 634     const MachineInstr *DstMI = DepSU.getInstr();
 635     unsigned DstOpcode = DstMI->getOpcode();
 636
 637     // Word loads into any multiply or divide instruction are considered
 638     // cannot bypass their scheduling stage. Didn't do this in the .td file
 639     // because we cannot easily create a read advance that is 0 from certain
 640     // writer classes and 1 from all the rest.
 641     // (The other way around would have been easy.)
 642     if (isNSWload && (II->isMultiply(DstOpcode) || II->isDivide(DstOpcode)))
 643       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
 644
 645     // Word loads into B operand of a load/store are considered cannot bypass
 646     // their scheduling stage. Cannot do in the .td file because
 647     // need to decide between -1 and -2 for ReadAdvance
 648     if (isNSWload && II->hasBRegAddr(DstOpcode) &&
 649         DstMI->getOperand(2).getReg() == Dep.getReg())
 650       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
 651
 652     // Multiplies into any address generation cannot bypass from EX3.  Cannot do
 653     // in the .td file because need to decide between -1 and -2 for ReadAdvance
 654     if (II->isMultiply(SrcOpcode)) {
 655       unsigned OpMask = II->getAddressOpMask(DstOpcode) >> 1;
 656       for (unsigned i = 1; OpMask; ++i, OpMask >>= 1) {
 657         if ((OpMask & 1) && DstMI->getOperand(i).isReg() &&
 658             DstMI->getOperand(i).getReg() == Dep.getReg()) {
 659           setBidirLatencies(ISU, Dep, 4); // first legal bypass is EX4->EX1
 660           break;
 661         }
 662       }
 663     }
 664
 665     // Mismatched conditional producers take longer on M7; they end up looking
 666     // like they were produced at EX3 and read at IS.
 667     if (TII->isPredicated(*SrcMI) && Dep.isAssignedRegDep() &&
 668         (SrcOpcode == ARM::BUNDLE ||
 669          mismatchedPred(TII->getPredicate(*SrcMI),
 670                         TII->getPredicate(*DstMI)))) {
 671       unsigned Lat = 1;
 672       // Operand A of shift+ALU is treated as an EX1 read instead of EX2.
 673       if (II->isInlineShiftALU(DstOpcode) && DstMI->getOperand(3).getImm() &&
 674           DstMI->getOperand(1).getReg() == Dep.getReg())
 675         Lat = 2;
 676       Lat = std::min(3u, Dep.getLatency() + Lat);
 677       setBidirLatencies(ISU, Dep, std::max(Dep.getLatency(), Lat));
 678     }
 679
 680     // CC setter into conditional producer shouldn't have a latency of more
 681     // than 1 unless it's due to an implicit read. (All the "true" readers
 682     // of the condition code use an implicit read, and predicates use an
 683     // explicit.)
 684     if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
 685         TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
 686       setBidirLatencies(ISU, Dep, 1);
 687
 688     // REV instructions cannot bypass directly into the EX1 shifter.  The
 689     // code is slightly inexact as it doesn't attempt to ensure that the bypass
 690     // is to the shifter operands.
 691     if (II->isRev(SrcOpcode)) {
 692       if (II->isInlineShiftALU(DstOpcode))
 693         setBidirLatencies(ISU, Dep, 2);
 694       else if (II->isShift(DstOpcode))
 695         setBidirLatencies(ISU, Dep, 1);
 696     }
 697   }
 698 }
 699
 700 class M85InstructionInformation : public InstructionInformation {
 701 public:
 702   M85InstructionInformation(const ARMBaseInstrInfo *t)
 703       : InstructionInformation(t) {
 704     markDPProducersConsumers(t);
 705   }
 706 };
 707
 708 class M85Overrides : public ARMOverrideBypasses {
 709 public:
 710   M85Overrides(const ARMBaseInstrInfo *t, AAResults *a)
 711       : ARMOverrideBypasses(t, a) {
 712     if (!II)
 713       II.reset(new M85InstructionInformation(t));
 714   }
 715
 716   void modifyBypasses(SUnit &) override;
 717
 718 private:
 719   unsigned computeBypassStage(const MCSchedClassDesc *SCD);
 720   signed modifyMixedWidthFP(const MachineInstr *SrcMI,
 721                             const MachineInstr *DstMI, unsigned RegID,
 722                             const MCSchedClassDesc *SCD);
 723 };
 724
 725 unsigned M85Overrides::computeBypassStage(const MCSchedClassDesc *SCDesc) {
 726   auto SM = DAG->getSchedModel();
 727   unsigned DefIdx = 0; // just look for the first output's timing
 728   if (DefIdx < SCDesc->NumWriteLatencyEntries) {
 729     // Lookup the definition's write latency in SubtargetInfo.
 730     const MCWriteLatencyEntry *WLEntry =
 731         SM->getSubtargetInfo()->getWriteLatencyEntry(SCDesc, DefIdx);
 732     unsigned Latency = WLEntry->Cycles >= 0 ? WLEntry->Cycles : 1000;
 733     if (Latency == 4)
 734       return 2;
 735     else if (Latency == 5)
 736       return 3;
 737     else if (Latency > 3)
 738       return 3;
 739     else
 740       return Latency;
 741   }
 742   return 2;
 743 }
 744
 745 // Latency changes for bypassing between FP registers of different sizes:
 746 //
 747 // Note that mixed DP/SP are unlikely because of the semantics
 748 // of C.  Mixed MVE/SP are quite common when MVE intrinsics are used.
 749 signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
 750                                         const MachineInstr *DstMI,
 751                                         unsigned RegID,
 752                                         const MCSchedClassDesc *SCD) {
 753
 754   if (!II->producesSP(SrcMI->getOpcode()) &&
 755       !II->producesDP(SrcMI->getOpcode()) &&
 756       !II->producesQP(SrcMI->getOpcode()))
 757     return 0;
 758
 759   if (Register::isVirtualRegister(RegID)) {
 760     if (II->producesSP(SrcMI->getOpcode()) &&
 761         II->consumesDP(DstMI->getOpcode())) {
 762       for (auto &OP : SrcMI->operands())
 763         if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
 764             OP.getSubReg() == ARM::ssub_1)
 765           return 5 - computeBypassStage(SCD);
 766     } else if (II->producesSP(SrcMI->getOpcode()) &&
 767                II->consumesQP(DstMI->getOpcode())) {
 768       for (auto &OP : SrcMI->operands())
 769         if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
 770             (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
 771           return 5 - computeBypassStage(SCD) -
 772                  ((OP.getSubReg() == ARM::ssub_2 ||
 773                    OP.getSubReg() == ARM::ssub_3)
 774                       ? 1
 775                       : 0);
 776     } else if (II->producesDP(SrcMI->getOpcode()) &&
 777                II->consumesQP(DstMI->getOpcode())) {
 778       for (auto &OP : SrcMI->operands())
 779         if (OP.isReg() && OP.isDef() && OP.getReg() == RegID &&
 780             OP.getSubReg() == ARM::ssub_1)
 781           return -1;
 782     } else if (II->producesDP(SrcMI->getOpcode()) &&
 783                II->consumesSP(DstMI->getOpcode())) {
 784       for (auto &OP : DstMI->operands())
 785         if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
 786             OP.getSubReg() == ARM::ssub_1)
 787           return 5 - computeBypassStage(SCD);
 788     } else if (II->producesQP(SrcMI->getOpcode()) &&
 789                II->consumesSP(DstMI->getOpcode())) {
 790       for (auto &OP : DstMI->operands())
 791         if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
 792             (OP.getSubReg() == ARM::ssub_1 || OP.getSubReg() == ARM::ssub_3))
 793           return 5 - computeBypassStage(SCD) +
 794                  ((OP.getSubReg() == ARM::ssub_2 ||
 795                    OP.getSubReg() == ARM::ssub_3)
 796                       ? 1
 797                       : 0);
 798     } else if (II->producesQP(SrcMI->getOpcode()) &&
 799                II->consumesDP(DstMI->getOpcode())) {
 800       for (auto &OP : DstMI->operands())
 801         if (OP.isReg() && OP.isUse() && OP.getReg() == RegID &&
 802             OP.getSubReg() == ARM::ssub_1)
 803           return 1;
 804     }
 805   } else if (Register::isPhysicalRegister(RegID)) {
 806     // Note that when the producer is narrower, not all of the producers
 807     // may be present in the scheduling graph; somewhere earlier in the
 808     // compiler, an implicit def/use of the aliased full register gets
 809     // added to the producer, and so only that producer is seen as *the*
 810     // single producer.  This behavior also has the unfortunate effect of
 811     // serializing the producers in the compiler's view of things.
 812     if (II->producesSP(SrcMI->getOpcode()) &&
 813         II->consumesDP(DstMI->getOpcode())) {
 814       for (auto &OP : SrcMI->operands())
 815         if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
 816             OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
 817             (OP.getReg() == RegID ||
 818              (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
 819              (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
 820           return 5 - computeBypassStage(SCD);
 821     } else if (II->producesSP(SrcMI->getOpcode()) &&
 822                II->consumesQP(DstMI->getOpcode())) {
 823       for (auto &OP : SrcMI->operands())
 824         if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::S1 &&
 825             OP.getReg() <= ARM::S31 && (OP.getReg() - ARM::S0) % 2 &&
 826             (OP.getReg() == RegID ||
 827              (OP.getReg() - ARM::S0) / 2 + ARM::D0 == RegID ||
 828              (OP.getReg() - ARM::S0) / 4 + ARM::Q0 == RegID))
 829           return 5 - computeBypassStage(SCD) -
 830                  (((OP.getReg() - ARM::S0) / 2) % 2 ? 1 : 0);
 831     } else if (II->producesDP(SrcMI->getOpcode()) &&
 832                II->consumesQP(DstMI->getOpcode())) {
 833       for (auto &OP : SrcMI->operands())
 834         if (OP.isReg() && OP.isDef() && OP.getReg() >= ARM::D0 &&
 835             OP.getReg() <= ARM::D15 && (OP.getReg() - ARM::D0) % 2 &&
 836             (OP.getReg() == RegID ||
 837              (OP.getReg() - ARM::D0) / 2 + ARM::Q0 == RegID))
 838           return -1;
 839     } else if (II->producesDP(SrcMI->getOpcode()) &&
 840                II->consumesSP(DstMI->getOpcode())) {
 841       if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
 842         return 5 - computeBypassStage(SCD);
 843     } else if (II->producesQP(SrcMI->getOpcode()) &&
 844                II->consumesSP(DstMI->getOpcode())) {
 845       if (RegID >= ARM::S1 && RegID <= ARM::S31 && (RegID - ARM::S0) % 2)
 846         return 5 - computeBypassStage(SCD) +
 847                (((RegID - ARM::S0) / 2) % 2 ? 1 : 0);
 848     } else if (II->producesQP(SrcMI->getOpcode()) &&
 849                II->consumesDP(DstMI->getOpcode())) {
 850       if (RegID >= ARM::D1 && RegID <= ARM::D15 && (RegID - ARM::D0) % 2)
 851         return 1;
 852     }
 853   }
 854   return 0;
 855 }
 856
 857 void M85Overrides::modifyBypasses(SUnit &ISU) {
 858   const MachineInstr *SrcMI = ISU.getInstr();
 859   unsigned SrcOpcode = SrcMI->getOpcode();
 860   bool isNSWload = II->isNonSubwordLoad(SrcOpcode);
 861
 862   // Walk the successors looking for latency overrides that are needed
 863   for (SDep &Dep : ISU.Succs) {
 864
 865     // Output dependences should have 0 latency, as CortexM85 is able to
 866     // schedule writers to the same register for simultaneous issue.
 867     if (zeroOutputDependences(ISU, Dep))
 868       continue;
 869
 870     if (memoryRAWHazard(ISU, Dep, 3))
 871       continue;
 872
 873     // Ignore dependencies other than data or strong ordering.
 874     if (Dep.getKind() != SDep::Data)
 875       continue;
 876
 877     SUnit &DepSU = *Dep.getSUnit();
 878     if (DepSU.isBoundaryNode())
 879       continue;
 880
 881     if (makeBundleAssumptions(ISU, Dep) == 1)
 882       continue;
 883
 884     const MachineInstr *DstMI = DepSU.getInstr();
 885     unsigned DstOpcode = DstMI->getOpcode();
 886
 887     // Word loads into B operand of a load/store with cannot bypass their
 888     // scheduling stage. Cannot do in the .td file because need to decide
 889     // between -1 and -2 for ReadAdvance
 890
 891     if (isNSWload && II->hasBRegAddrShift(DstOpcode) &&
 892         DstMI->getOperand(3).getImm() != 0 && // shift operand
 893         DstMI->getOperand(2).getReg() == Dep.getReg())
 894       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
 895
 896     if (isNSWload && isMVEVectorInstruction(DstMI)) {
 897       setBidirLatencies(ISU, Dep, Dep.getLatency() + 1);
 898     }
 899
 900     if (II->isMVEIntMAC(DstOpcode) &&
 901         II->isMVEIntMACMatched(SrcOpcode, DstOpcode) &&
 902         DstMI->getOperand(0).isReg() &&
 903         DstMI->getOperand(0).getReg() == Dep.getReg())
 904       setBidirLatencies(ISU, Dep, Dep.getLatency() - 1);
 905
 906     // CC setter into conditional producer shouldn't have a latency of more
 907     // than 0 unless it's due to an implicit read.
 908     if (Dep.isAssignedRegDep() && Dep.getReg() == ARM::CPSR &&
 909         TII->isPredicated(*DstMI) && !hasImplicitCPSRUse(DstMI))
 910       setBidirLatencies(ISU, Dep, 0);
 911
 912     if (signed ALat = modifyMixedWidthFP(SrcMI, DstMI, Dep.getReg(),
 913                                          DAG->getSchedClass(&ISU)))
 914       setBidirLatencies(ISU, Dep, std::max(0, signed(Dep.getLatency()) + ALat));
 915
 916     if (II->isRev(SrcOpcode)) {
 917       if (II->isInlineShiftALU(DstOpcode))
 918         setBidirLatencies(ISU, Dep, 1);
 919       else if (II->isShift(DstOpcode))
 920         setBidirLatencies(ISU, Dep, 1);
 921     }
 922   }
 923 }
 924
 925 // Add M55 specific overrides for latencies between instructions. Currently it:
 926 //  - Adds an extra cycle latency between MVE VMLAV and scalar instructions.
 927 class CortexM55Overrides : public ARMOverrideBypasses {
 928 public:
 929   CortexM55Overrides(const ARMBaseInstrInfo *TII, AAResults *AA)
 930       : ARMOverrideBypasses(TII, AA) {}
 931
 932   void modifyBypasses(SUnit &SU) override {
 933     MachineInstr *SrcMI = SU.getInstr();
 934     if (!(SrcMI->getDesc().TSFlags & ARMII::HorizontalReduction))
 935       return;
 936
 937     for (SDep &Dep : SU.Succs) {
 938       if (Dep.getKind() != SDep::Data)
 939         continue;
 940       SUnit &DepSU = *Dep.getSUnit();
 941       if (DepSU.isBoundaryNode())
 942         continue;
 943       MachineInstr *DstMI = DepSU.getInstr();
 944
 945       if (!isMVEVectorInstruction(DstMI) && !DstMI->mayStore())
 946         setBidirLatencies(SU, Dep, 3);
 947     }
 948   }
 949 };
 950
 951 } // end anonymous namespace
 952
 953 void ARMOverrideBypasses::apply(ScheduleDAGInstrs *DAGInstrs) {
 954   DAG = DAGInstrs;
 955   for (SUnit &ISU : DAGInstrs->SUnits) {
 956     if (ISU.isBoundaryNode())
 957       continue;
 958     modifyBypasses(ISU);
 959   }
 960   if (DAGInstrs->ExitSU.getInstr())
 961     modifyBypasses(DAGInstrs->ExitSU);
 962 }
 963
 964 std::unique_ptr<ScheduleDAGMutation>
 965 createARMLatencyMutations(const ARMSubtarget &ST, AAResults *AA) {
 966   if (ST.isCortexM85())
 967     return std::make_unique<M85Overrides>(ST.getInstrInfo(), AA);
 968   else if (ST.isCortexM7())
 969     return std::make_unique<CortexM7Overrides>(ST.getInstrInfo(), AA);
 970   else if (ST.isCortexM55())
 971     return std::make_unique<CortexM55Overrides>(ST.getInstrInfo(), AA);
 972
 973   return nullptr;
 974 }
 975
 976 } // end namespace llvm