llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

   1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// Memory legalizer - implements memory model. More information can be
  11 /// found here:
  12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "AMDGPU.h"
  17 #include "AMDGPUMachineModuleInfo.h"
  18 #include "GCNSubtarget.h"
  19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  20 #include "llvm/ADT/BitmaskEnum.h"
  21 #include "llvm/CodeGen/MachineBasicBlock.h"
  22 #include "llvm/IR/DiagnosticInfo.h"
  23 #include "llvm/Support/AtomicOrdering.h"
  24 #include "llvm/Support/TargetParser.h"
  25
  26 using namespace llvm;
  27 using namespace llvm::AMDGPU;
  28
  29 #define DEBUG_TYPE "si-memory-legalizer"
  30 #define PASS_NAME "SI Memory Legalizer"
  31
  32 static cl::opt<bool> AmdgcnSkipCacheInvalidations(
  33     "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
  34     cl::desc("Use this to skip inserting cache invalidating instructions."));
  35
  36 namespace {
  37
  38 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
  39
  40 /// Memory operation flags. Can be ORed together.
  41 enum class SIMemOp {
  42   NONE = 0u,
  43   LOAD = 1u << 0,
  44   STORE = 1u << 1,
  45   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
  46 };
  47
  48 /// Position to insert a new instruction relative to an existing
  49 /// instruction.
  50 enum class Position {
  51   BEFORE,
  52   AFTER
  53 };
  54
  55 /// The atomic synchronization scopes supported by the AMDGPU target.
  56 enum class SIAtomicScope {
  57   NONE,
  58   SINGLETHREAD,
  59   WAVEFRONT,
  60   WORKGROUP,
  61   AGENT,
  62   SYSTEM
  63 };
  64
  65 /// The distinct address spaces supported by the AMDGPU target for
  66 /// atomic memory operation. Can be ORed toether.
  67 enum class SIAtomicAddrSpace {
  68   NONE = 0u,
  69   GLOBAL = 1u << 0,
  70   LDS = 1u << 1,
  71   SCRATCH = 1u << 2,
  72   GDS = 1u << 3,
  73   OTHER = 1u << 4,
  74
  75   /// The address spaces that can be accessed by a FLAT instruction.
  76   FLAT = GLOBAL | LDS | SCRATCH,
  77
  78   /// The address spaces that support atomic instructions.
  79   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
  80
  81   /// All address spaces.
  82   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
  83
  84   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
  85 };
  86
  87 class SIMemOpInfo final {
  88 private:
  89
  90   friend class SIMemOpAccess;
  91
  92   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  93   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  94   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  95   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  96   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  97   bool IsCrossAddressSpaceOrdering = false;
  98   bool IsVolatile = false;
  99   bool IsNonTemporal = false;
 100
 101   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
 102               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
 103               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
 104               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
 105               bool IsCrossAddressSpaceOrdering = true,
 106               AtomicOrdering FailureOrdering =
 107                 AtomicOrdering::SequentiallyConsistent,
 108               bool IsVolatile = false,
 109               bool IsNonTemporal = false)
 110     : Ordering(Ordering), FailureOrdering(FailureOrdering),
 111       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
 112       InstrAddrSpace(InstrAddrSpace),
 113       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
 114       IsVolatile(IsVolatile),
 115       IsNonTemporal(IsNonTemporal) {
 116
 117     if (Ordering == AtomicOrdering::NotAtomic) {
 118       assert(Scope == SIAtomicScope::NONE &&
 119              OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
 120              !IsCrossAddressSpaceOrdering &&
 121              FailureOrdering == AtomicOrdering::NotAtomic);
 122       return;
 123     }
 124
 125     assert(Scope != SIAtomicScope::NONE &&
 126            (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
 127                SIAtomicAddrSpace::NONE &&
 128            (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
 129                SIAtomicAddrSpace::NONE);
 130
 131     // There is also no cross address space ordering if the ordering
 132     // address space is the same as the instruction address space and
 133     // only contains a single address space.
 134     if ((OrderingAddrSpace == InstrAddrSpace) &&
 135         isPowerOf2_32(uint32_t(InstrAddrSpace)))
 136       this->IsCrossAddressSpaceOrdering = false;
 137
 138     // Limit the scope to the maximum supported by the instruction's address
 139     // spaces.
 140     if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
 141         SIAtomicAddrSpace::NONE) {
 142       this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
 143     } else if ((InstrAddrSpace &
 144                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
 145                SIAtomicAddrSpace::NONE) {
 146       this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
 147     } else if ((InstrAddrSpace &
 148                 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
 149                   SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
 150       this->Scope = std::min(Scope, SIAtomicScope::AGENT);
 151     }
 152   }
 153
 154 public:
 155   /// \returns Atomic synchronization scope of the machine instruction used to
 156   /// create this SIMemOpInfo.
 157   SIAtomicScope getScope() const {
 158     return Scope;
 159   }
 160
 161   /// \returns Ordering constraint of the machine instruction used to
 162   /// create this SIMemOpInfo.
 163   AtomicOrdering getOrdering() const {
 164     return Ordering;
 165   }
 166
 167   /// \returns Failure ordering constraint of the machine instruction used to
 168   /// create this SIMemOpInfo.
 169   AtomicOrdering getFailureOrdering() const {
 170     return FailureOrdering;
 171   }
 172
 173   /// \returns The address spaces be accessed by the machine
 174   /// instruction used to create this SiMemOpInfo.
 175   SIAtomicAddrSpace getInstrAddrSpace() const {
 176     return InstrAddrSpace;
 177   }
 178
 179   /// \returns The address spaces that must be ordered by the machine
 180   /// instruction used to create this SiMemOpInfo.
 181   SIAtomicAddrSpace getOrderingAddrSpace() const {
 182     return OrderingAddrSpace;
 183   }
 184
 185   /// \returns Return true iff memory ordering of operations on
 186   /// different address spaces is required.
 187   bool getIsCrossAddressSpaceOrdering() const {
 188     return IsCrossAddressSpaceOrdering;
 189   }
 190
 191   /// \returns True if memory access of the machine instruction used to
 192   /// create this SIMemOpInfo is volatile, false otherwise.
 193   bool isVolatile() const {
 194     return IsVolatile;
 195   }
 196
 197   /// \returns True if memory access of the machine instruction used to
 198   /// create this SIMemOpInfo is nontemporal, false otherwise.
 199   bool isNonTemporal() const {
 200     return IsNonTemporal;
 201   }
 202
 203   /// \returns True if ordering constraint of the machine instruction used to
 204   /// create this SIMemOpInfo is unordered or higher, false otherwise.
 205   bool isAtomic() const {
 206     return Ordering != AtomicOrdering::NotAtomic;
 207   }
 208
 209 };
 210
 211 class SIMemOpAccess final {
 212 private:
 213   AMDGPUMachineModuleInfo *MMI = nullptr;
 214
 215   /// Reports unsupported message \p Msg for \p MI to LLVM context.
 216   void reportUnsupported(const MachineBasicBlock::iterator &MI,
 217                          const char *Msg) const;
 218
 219   /// Inspects the target synchronization scope \p SSID and determines
 220   /// the SI atomic scope it corresponds to, the address spaces it
 221   /// covers, and whether the memory ordering applies between address
 222   /// spaces.
 223   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
 224   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
 225
 226   /// \return Return a bit set of the address spaces accessed by \p AS.
 227   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
 228
 229   /// \returns Info constructed from \p MI, which has at least machine memory
 230   /// operand.
 231   Optional<SIMemOpInfo> constructFromMIWithMMO(
 232       const MachineBasicBlock::iterator &MI) const;
 233
 234 public:
 235   /// Construct class to support accessing the machine memory operands
 236   /// of instructions in the machine function \p MF.
 237   SIMemOpAccess(MachineFunction &MF);
 238
 239   /// \returns Load info if \p MI is a load operation, "None" otherwise.
 240   Optional<SIMemOpInfo> getLoadInfo(
 241       const MachineBasicBlock::iterator &MI) const;
 242
 243   /// \returns Store info if \p MI is a store operation, "None" otherwise.
 244   Optional<SIMemOpInfo> getStoreInfo(
 245       const MachineBasicBlock::iterator &MI) const;
 246
 247   /// \returns Atomic fence info if \p MI is an atomic fence operation,
 248   /// "None" otherwise.
 249   Optional<SIMemOpInfo> getAtomicFenceInfo(
 250       const MachineBasicBlock::iterator &MI) const;
 251
 252   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
 253   /// rmw operation, "None" otherwise.
 254   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
 255       const MachineBasicBlock::iterator &MI) const;
 256 };
 257
 258 class SICacheControl {
 259 protected:
 260
 261   /// AMDGPU subtarget info.
 262   const GCNSubtarget &ST;
 263
 264   /// Instruction info.
 265   const SIInstrInfo *TII = nullptr;
 266
 267   IsaVersion IV;
 268
 269   /// Whether to insert cache invalidating instructions.
 270   bool InsertCacheInv;
 271
 272   SICacheControl(const GCNSubtarget &ST);
 273
 274   /// Sets named bit \p BitName to "true" if present in instruction \p MI.
 275   /// \returns Returns true if \p MI is modified, false otherwise.
 276   bool enableNamedBit(const MachineBasicBlock::iterator MI,
 277                       AMDGPU::CPol::CPol Bit) const;
 278
 279 public:
 280
 281   /// Create a cache control for the subtarget \p ST.
 282   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
 283
 284   /// Update \p MI memory load instruction to bypass any caches up to
 285   /// the \p Scope memory scope for address spaces \p
 286   /// AddrSpace. Return true iff the instruction was modified.
 287   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
 288                                      SIAtomicScope Scope,
 289                                      SIAtomicAddrSpace AddrSpace) const = 0;
 290
 291   /// Update \p MI memory store instruction to bypass any caches up to
 292   /// the \p Scope memory scope for address spaces \p
 293   /// AddrSpace. Return true iff the instruction was modified.
 294   virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
 295                                       SIAtomicScope Scope,
 296                                       SIAtomicAddrSpace AddrSpace) const = 0;
 297
 298   /// Update \p MI memory read-modify-write instruction to bypass any caches up
 299   /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
 300   /// iff the instruction was modified.
 301   virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
 302                                     SIAtomicScope Scope,
 303                                     SIAtomicAddrSpace AddrSpace) const = 0;
 304
 305   /// Update \p MI memory instruction of kind \p Op associated with address
 306   /// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
 307   /// true iff the instruction was modified.
 308   virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
 309                                               SIAtomicAddrSpace AddrSpace,
 310                                               SIMemOp Op, bool IsVolatile,
 311                                               bool IsNonTemporal) const = 0;
 312
 313   /// Inserts any necessary instructions at position \p Pos relative
 314   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
 315   /// \p Op associated with address spaces \p AddrSpace have completed. Used
 316   /// between memory instructions to enforce the order they become visible as
 317   /// observed by other memory instructions executing in memory scope \p Scope.
 318   /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
 319   /// address spaces. Returns true iff any instructions inserted.
 320   virtual bool insertWait(MachineBasicBlock::iterator &MI,
 321                           SIAtomicScope Scope,
 322                           SIAtomicAddrSpace AddrSpace,
 323                           SIMemOp Op,
 324                           bool IsCrossAddrSpaceOrdering,
 325                           Position Pos) const = 0;
 326
 327   /// Inserts any necessary instructions at position \p Pos relative to
 328   /// instruction \p MI to ensure any subsequent memory instructions of this
 329   /// thread with address spaces \p AddrSpace will observe the previous memory
 330   /// operations by any thread for memory scopes up to memory scope \p Scope .
 331   /// Returns true iff any instructions inserted.
 332   virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
 333                              SIAtomicScope Scope,
 334                              SIAtomicAddrSpace AddrSpace,
 335                              Position Pos) const = 0;
 336
 337   /// Inserts any necessary instructions at position \p Pos relative to
 338   /// instruction \p MI to ensure previous memory instructions by this thread
 339   /// with address spaces \p AddrSpace have completed and can be observed by
 340   /// subsequent memory instructions by any thread executing in memory scope \p
 341   /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
 342   /// between address spaces. Returns true iff any instructions inserted.
 343   virtual bool insertRelease(MachineBasicBlock::iterator &MI,
 344                              SIAtomicScope Scope,
 345                              SIAtomicAddrSpace AddrSpace,
 346                              bool IsCrossAddrSpaceOrdering,
 347                              Position Pos) const = 0;
 348
 349   /// Virtual destructor to allow derivations to be deleted.
 350   virtual ~SICacheControl() = default;
 351
 352 };
 353
 354 class SIGfx6CacheControl : public SICacheControl {
 355 protected:
 356
 357   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
 358   /// is modified, false otherwise.
 359   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
 360     return enableNamedBit(MI, AMDGPU::CPol::GLC);
 361   }
 362
 363   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
 364   /// is modified, false otherwise.
 365   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
 366     return enableNamedBit(MI, AMDGPU::CPol::SLC);
 367   }
 368
 369 public:
 370
 371   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
 372
 373   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
 374                              SIAtomicScope Scope,
 375                              SIAtomicAddrSpace AddrSpace) const override;
 376
 377   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
 378                               SIAtomicScope Scope,
 379                               SIAtomicAddrSpace AddrSpace) const override;
 380
 381   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
 382                             SIAtomicScope Scope,
 383                             SIAtomicAddrSpace AddrSpace) const override;
 384
 385   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
 386                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
 387                                       bool IsVolatile,
 388                                       bool IsNonTemporal) const override;
 389
 390   bool insertWait(MachineBasicBlock::iterator &MI,
 391                   SIAtomicScope Scope,
 392                   SIAtomicAddrSpace AddrSpace,
 393                   SIMemOp Op,
 394                   bool IsCrossAddrSpaceOrdering,
 395                   Position Pos) const override;
 396
 397   bool insertAcquire(MachineBasicBlock::iterator &MI,
 398                      SIAtomicScope Scope,
 399                      SIAtomicAddrSpace AddrSpace,
 400                      Position Pos) const override;
 401
 402   bool insertRelease(MachineBasicBlock::iterator &MI,
 403                      SIAtomicScope Scope,
 404                      SIAtomicAddrSpace AddrSpace,
 405                      bool IsCrossAddrSpaceOrdering,
 406                      Position Pos) const override;
 407 };
 408
 409 class SIGfx7CacheControl : public SIGfx6CacheControl {
 410 public:
 411
 412   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
 413
 414   bool insertAcquire(MachineBasicBlock::iterator &MI,
 415                      SIAtomicScope Scope,
 416                      SIAtomicAddrSpace AddrSpace,
 417                      Position Pos) const override;
 418
 419 };
 420
 421 class SIGfx90ACacheControl : public SIGfx7CacheControl {
 422 public:
 423
 424   SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
 425
 426   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
 427                              SIAtomicScope Scope,
 428                              SIAtomicAddrSpace AddrSpace) const override;
 429
 430   bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
 431                               SIAtomicScope Scope,
 432                               SIAtomicAddrSpace AddrSpace) const override;
 433
 434   bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
 435                             SIAtomicScope Scope,
 436                             SIAtomicAddrSpace AddrSpace) const override;
 437
 438   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
 439                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
 440                                       bool IsVolatile,
 441                                       bool IsNonTemporal) const override;
 442
 443   bool insertWait(MachineBasicBlock::iterator &MI,
 444                   SIAtomicScope Scope,
 445                   SIAtomicAddrSpace AddrSpace,
 446                   SIMemOp Op,
 447                   bool IsCrossAddrSpaceOrdering,
 448                   Position Pos) const override;
 449
 450   bool insertAcquire(MachineBasicBlock::iterator &MI,
 451                      SIAtomicScope Scope,
 452                      SIAtomicAddrSpace AddrSpace,
 453                      Position Pos) const override;
 454
 455   bool insertRelease(MachineBasicBlock::iterator &MI,
 456                      SIAtomicScope Scope,
 457                      SIAtomicAddrSpace AddrSpace,
 458                      bool IsCrossAddrSpaceOrdering,
 459                      Position Pos) const override;
 460 };
 461
 462 class SIGfx10CacheControl : public SIGfx7CacheControl {
 463 protected:
 464
 465   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
 466   /// is modified, false otherwise.
 467   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
 468     return enableNamedBit(MI, AMDGPU::CPol::DLC);
 469   }
 470
 471 public:
 472
 473   SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
 474
 475   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
 476                              SIAtomicScope Scope,
 477                              SIAtomicAddrSpace AddrSpace) const override;
 478
 479   bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
 480                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
 481                                       bool IsVolatile,
 482                                       bool IsNonTemporal) const override;
 483
 484   bool insertWait(MachineBasicBlock::iterator &MI,
 485                   SIAtomicScope Scope,
 486                   SIAtomicAddrSpace AddrSpace,
 487                   SIMemOp Op,
 488                   bool IsCrossAddrSpaceOrdering,
 489                   Position Pos) const override;
 490
 491   bool insertAcquire(MachineBasicBlock::iterator &MI,
 492                      SIAtomicScope Scope,
 493                      SIAtomicAddrSpace AddrSpace,
 494                      Position Pos) const override;
 495 };
 496
 497 class SIMemoryLegalizer final : public MachineFunctionPass {
 498 private:
 499
 500   /// Cache Control.
 501   std::unique_ptr<SICacheControl> CC = nullptr;
 502
 503   /// List of atomic pseudo instructions.
 504   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
 505
 506   /// Return true iff instruction \p MI is a atomic instruction that
 507   /// returns a result.
 508   bool isAtomicRet(const MachineInstr &MI) const {
 509     return SIInstrInfo::isAtomicRet(MI);
 510   }
 511
 512   /// Removes all processed atomic pseudo instructions from the current
 513   /// function. Returns true if current function is modified, false otherwise.
 514   bool removeAtomicPseudoMIs();
 515
 516   /// Expands load operation \p MI. Returns true if instructions are
 517   /// added/deleted or \p MI is modified, false otherwise.
 518   bool expandLoad(const SIMemOpInfo &MOI,
 519                   MachineBasicBlock::iterator &MI);
 520   /// Expands store operation \p MI. Returns true if instructions are
 521   /// added/deleted or \p MI is modified, false otherwise.
 522   bool expandStore(const SIMemOpInfo &MOI,
 523                    MachineBasicBlock::iterator &MI);
 524   /// Expands atomic fence operation \p MI. Returns true if
 525   /// instructions are added/deleted or \p MI is modified, false otherwise.
 526   bool expandAtomicFence(const SIMemOpInfo &MOI,
 527                          MachineBasicBlock::iterator &MI);
 528   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
 529   /// instructions are added/deleted or \p MI is modified, false otherwise.
 530   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 531                                 MachineBasicBlock::iterator &MI);
 532
 533 public:
 534   static char ID;
 535
 536   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
 537
 538   void getAnalysisUsage(AnalysisUsage &AU) const override {
 539     AU.setPreservesCFG();
 540     MachineFunctionPass::getAnalysisUsage(AU);
 541   }
 542
 543   StringRef getPassName() const override {
 544     return PASS_NAME;
 545   }
 546
 547   bool runOnMachineFunction(MachineFunction &MF) override;
 548 };
 549
 550 } // end namespace anonymous
 551
 552 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
 553                                       const char *Msg) const {
 554   const Function &Func = MI->getParent()->getParent()->getFunction();
 555   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
 556   Func.getContext().diagnose(Diag);
 557 }
 558
 559 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
 560 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
 561                                SIAtomicAddrSpace InstrAddrSpace) const {
 562   if (SSID == SyncScope::System)
 563     return std::make_tuple(SIAtomicScope::SYSTEM,
 564                            SIAtomicAddrSpace::ATOMIC,
 565                            true);
 566   if (SSID == MMI->getAgentSSID())
 567     return std::make_tuple(SIAtomicScope::AGENT,
 568                            SIAtomicAddrSpace::ATOMIC,
 569                            true);
 570   if (SSID == MMI->getWorkgroupSSID())
 571     return std::make_tuple(SIAtomicScope::WORKGROUP,
 572                            SIAtomicAddrSpace::ATOMIC,
 573                            true);
 574   if (SSID == MMI->getWavefrontSSID())
 575     return std::make_tuple(SIAtomicScope::WAVEFRONT,
 576                            SIAtomicAddrSpace::ATOMIC,
 577                            true);
 578   if (SSID == SyncScope::SingleThread)
 579     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
 580                            SIAtomicAddrSpace::ATOMIC,
 581                            true);
 582   if (SSID == MMI->getSystemOneAddressSpaceSSID())
 583     return std::make_tuple(SIAtomicScope::SYSTEM,
 584                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
 585                            false);
 586   if (SSID == MMI->getAgentOneAddressSpaceSSID())
 587     return std::make_tuple(SIAtomicScope::AGENT,
 588                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
 589                            false);
 590   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
 591     return std::make_tuple(SIAtomicScope::WORKGROUP,
 592                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
 593                            false);
 594   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
 595     return std::make_tuple(SIAtomicScope::WAVEFRONT,
 596                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
 597                            false);
 598   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
 599     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
 600                            SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
 601                            false);
 602   return None;
 603 }
 604
 605 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
 606   if (AS == AMDGPUAS::FLAT_ADDRESS)
 607     return SIAtomicAddrSpace::FLAT;
 608   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
 609     return SIAtomicAddrSpace::GLOBAL;
 610   if (AS == AMDGPUAS::LOCAL_ADDRESS)
 611     return SIAtomicAddrSpace::LDS;
 612   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
 613     return SIAtomicAddrSpace::SCRATCH;
 614   if (AS == AMDGPUAS::REGION_ADDRESS)
 615     return SIAtomicAddrSpace::GDS;
 616
 617   return SIAtomicAddrSpace::OTHER;
 618 }
 619
 620 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
 621   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
 622 }
 623
 624 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
 625     const MachineBasicBlock::iterator &MI) const {
 626   assert(MI->getNumMemOperands() > 0);
 627
 628   SyncScope::ID SSID = SyncScope::SingleThread;
 629   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
 630   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
 631   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
 632   bool IsNonTemporal = true;
 633   bool IsVolatile = false;
 634
 635   // Validator should check whether or not MMOs cover the entire set of
 636   // locations accessed by the memory instruction.
 637   for (const auto &MMO : MI->memoperands()) {
 638     IsNonTemporal &= MMO->isNonTemporal();
 639     IsVolatile |= MMO->isVolatile();
 640     InstrAddrSpace |=
 641       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
 642     AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
 643     if (OpOrdering != AtomicOrdering::NotAtomic) {
 644       const auto &IsSyncScopeInclusion =
 645           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
 646       if (!IsSyncScopeInclusion) {
 647         reportUnsupported(MI,
 648           "Unsupported non-inclusive atomic synchronization scope");
 649         return None;
 650       }
 651
 652       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
 653       Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
 654       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
 655              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
 656       FailureOrdering =
 657           getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
 658     }
 659   }
 660
 661   SIAtomicScope Scope = SIAtomicScope::NONE;
 662   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
 663   bool IsCrossAddressSpaceOrdering = false;
 664   if (Ordering != AtomicOrdering::NotAtomic) {
 665     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
 666     if (!ScopeOrNone) {
 667       reportUnsupported(MI, "Unsupported atomic synchronization scope");
 668       return None;
 669     }
 670     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
 671       ScopeOrNone.getValue();
 672     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
 673         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
 674         ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
 675       reportUnsupported(MI, "Unsupported atomic address space");
 676       return None;
 677     }
 678   }
 679   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
 680                      IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
 681                      IsNonTemporal);
 682 }
 683
 684 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
 685     const MachineBasicBlock::iterator &MI) const {
 686   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 687
 688   if (!(MI->mayLoad() && !MI->mayStore()))
 689     return None;
 690
 691   // Be conservative if there are no memory operands.
 692   if (MI->getNumMemOperands() == 0)
 693     return SIMemOpInfo();
 694
 695   return constructFromMIWithMMO(MI);
 696 }
 697
 698 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
 699     const MachineBasicBlock::iterator &MI) const {
 700   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 701
 702   if (!(!MI->mayLoad() && MI->mayStore()))
 703     return None;
 704
 705   // Be conservative if there are no memory operands.
 706   if (MI->getNumMemOperands() == 0)
 707     return SIMemOpInfo();
 708
 709   return constructFromMIWithMMO(MI);
 710 }
 711
 712 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
 713     const MachineBasicBlock::iterator &MI) const {
 714   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 715
 716   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
 717     return None;
 718
 719   AtomicOrdering Ordering =
 720     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
 721
 722   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
 723   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
 724   if (!ScopeOrNone) {
 725     reportUnsupported(MI, "Unsupported atomic synchronization scope");
 726     return None;
 727   }
 728
 729   SIAtomicScope Scope = SIAtomicScope::NONE;
 730   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
 731   bool IsCrossAddressSpaceOrdering = false;
 732   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
 733     ScopeOrNone.getValue();
 734
 735   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
 736       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
 737     reportUnsupported(MI, "Unsupported atomic address space");
 738     return None;
 739   }
 740
 741   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
 742                      IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
 743 }
 744
 745 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
 746     const MachineBasicBlock::iterator &MI) const {
 747   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 748
 749   if (!(MI->mayLoad() && MI->mayStore()))
 750     return None;
 751
 752   // Be conservative if there are no memory operands.
 753   if (MI->getNumMemOperands() == 0)
 754     return SIMemOpInfo();
 755
 756   return constructFromMIWithMMO(MI);
 757 }
 758
 759 SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
 760   TII = ST.getInstrInfo();
 761   IV = getIsaVersion(ST.getCPU());
 762   InsertCacheInv = !AmdgcnSkipCacheInvalidations;
 763 }
 764
 765 bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
 766                                     AMDGPU::CPol::CPol Bit) const {
 767   MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
 768   if (!CPol)
 769     return false;
 770
 771   CPol->setImm(CPol->getImm() | Bit);
 772   return true;
 773 }
 774
 775 /* static */
 776 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
 777   GCNSubtarget::Generation Generation = ST.getGeneration();
 778   if (ST.hasGFX90AInsts())
 779     return std::make_unique<SIGfx90ACacheControl>(ST);
 780   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
 781     return std::make_unique<SIGfx6CacheControl>(ST);
 782   if (Generation < AMDGPUSubtarget::GFX10)
 783     return std::make_unique<SIGfx7CacheControl>(ST);
 784   return std::make_unique<SIGfx10CacheControl>(ST);
 785 }
 786
 787 bool SIGfx6CacheControl::enableLoadCacheBypass(
 788     const MachineBasicBlock::iterator &MI,
 789     SIAtomicScope Scope,
 790     SIAtomicAddrSpace AddrSpace) const {
 791   assert(MI->mayLoad() && !MI->mayStore());
 792   bool Changed = false;
 793
 794   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 795     switch (Scope) {
 796     case SIAtomicScope::SYSTEM:
 797     case SIAtomicScope::AGENT:
 798       Changed |= enableGLCBit(MI);
 799       break;
 800     case SIAtomicScope::WORKGROUP:
 801     case SIAtomicScope::WAVEFRONT:
 802     case SIAtomicScope::SINGLETHREAD:
 803       // No cache to bypass.
 804       break;
 805     default:
 806       llvm_unreachable("Unsupported synchronization scope");
 807     }
 808   }
 809
 810   /// The scratch address space does not need the global memory caches
 811   /// to be bypassed as all memory operations by the same thread are
 812   /// sequentially consistent, and no other thread can access scratch
 813   /// memory.
 814
 815   /// Other address spaces do not have a cache.
 816
 817   return Changed;
 818 }
 819
 820 bool SIGfx6CacheControl::enableStoreCacheBypass(
 821     const MachineBasicBlock::iterator &MI,
 822     SIAtomicScope Scope,
 823     SIAtomicAddrSpace AddrSpace) const {
 824   assert(!MI->mayLoad() && MI->mayStore());
 825   bool Changed = false;
 826
 827   /// The L1 cache is write through so does not need to be bypassed. There is no
 828   /// bypass control for the L2 cache at the isa level.
 829
 830   return Changed;
 831 }
 832
 833 bool SIGfx6CacheControl::enableRMWCacheBypass(
 834     const MachineBasicBlock::iterator &MI,
 835     SIAtomicScope Scope,
 836     SIAtomicAddrSpace AddrSpace) const {
 837   assert(MI->mayLoad() && MI->mayStore());
 838   bool Changed = false;
 839
 840   /// The L1 cache is write through so does not need to be bypassed. There is no
 841   /// bypass control for the L2 cache at the isa level.
 842
 843   return Changed;
 844 }
 845
 846 bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
 847     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
 848     bool IsVolatile, bool IsNonTemporal) const {
 849   // Only handle load and store, not atomic read-modify-write insructions. The
 850   // latter use glc to indicate if the atomic returns a result and so must not
 851   // be used for cache control.
 852   assert(MI->mayLoad() ^ MI->mayStore());
 853
 854   // Only update load and store, not LLVM IR atomic read-modify-write
 855   // instructions. The latter are always marked as volatile so cannot sensibly
 856   // handle it as do not want to pessimize all atomics. Also they do not support
 857   // the nontemporal attribute.
 858   assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
 859
 860   bool Changed = false;
 861
 862   if (IsVolatile) {
 863     if (Op == SIMemOp::LOAD)
 864       Changed |= enableGLCBit(MI);
 865
 866     // Ensure operation has completed at system scope to cause all volatile
 867     // operations to be visible outside the program in a global order. Do not
 868     // request cross address space as only the global address space can be
 869     // observable outside the program, so no need to cause a waitcnt for LDS
 870     // address space operations.
 871     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
 872                           Position::AFTER);
 873
 874     return Changed;
 875   }
 876
 877   if (IsNonTemporal) {
 878     // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
 879     Changed |= enableGLCBit(MI);
 880     Changed |= enableSLCBit(MI);
 881     return Changed;
 882   }
 883
 884   return Changed;
 885 }
 886
 887 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
 888                                     SIAtomicScope Scope,
 889                                     SIAtomicAddrSpace AddrSpace,
 890                                     SIMemOp Op,
 891                                     bool IsCrossAddrSpaceOrdering,
 892                                     Position Pos) const {
 893   bool Changed = false;
 894
 895   MachineBasicBlock &MBB = *MI->getParent();
 896   DebugLoc DL = MI->getDebugLoc();
 897
 898   if (Pos == Position::AFTER)
 899     ++MI;
 900
 901   bool VMCnt = false;
 902   bool LGKMCnt = false;
 903
 904   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
 905       SIAtomicAddrSpace::NONE) {
 906     switch (Scope) {
 907     case SIAtomicScope::SYSTEM:
 908     case SIAtomicScope::AGENT:
 909       VMCnt |= true;
 910       break;
 911     case SIAtomicScope::WORKGROUP:
 912     case SIAtomicScope::WAVEFRONT:
 913     case SIAtomicScope::SINGLETHREAD:
 914       // The L1 cache keeps all memory operations in order for
 915       // wavefronts in the same work-group.
 916       break;
 917     default:
 918       llvm_unreachable("Unsupported synchronization scope");
 919     }
 920   }
 921
 922   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
 923     switch (Scope) {
 924     case SIAtomicScope::SYSTEM:
 925     case SIAtomicScope::AGENT:
 926     case SIAtomicScope::WORKGROUP:
 927       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
 928       // not needed as LDS operations for all waves are executed in a total
 929       // global ordering as observed by all waves. Required if also
 930       // synchronizing with global/GDS memory as LDS operations could be
 931       // reordered with respect to later global/GDS memory operations of the
 932       // same wave.
 933       LGKMCnt |= IsCrossAddrSpaceOrdering;
 934       break;
 935     case SIAtomicScope::WAVEFRONT:
 936     case SIAtomicScope::SINGLETHREAD:
 937       // The LDS keeps all memory operations in order for
 938       // the same wavesfront.
 939       break;
 940     default:
 941       llvm_unreachable("Unsupported synchronization scope");
 942     }
 943   }
 944
 945   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
 946     switch (Scope) {
 947     case SIAtomicScope::SYSTEM:
 948     case SIAtomicScope::AGENT:
 949       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
 950       // is not needed as GDS operations for all waves are executed in a total
 951       // global ordering as observed by all waves. Required if also
 952       // synchronizing with global/LDS memory as GDS operations could be
 953       // reordered with respect to later global/LDS memory operations of the
 954       // same wave.
 955       LGKMCnt |= IsCrossAddrSpaceOrdering;
 956       break;
 957     case SIAtomicScope::WORKGROUP:
 958     case SIAtomicScope::WAVEFRONT:
 959     case SIAtomicScope::SINGLETHREAD:
 960       // The GDS keeps all memory operations in order for
 961       // the same work-group.
 962       break;
 963     default:
 964       llvm_unreachable("Unsupported synchronization scope");
 965     }
 966   }
 967
 968   if (VMCnt || LGKMCnt) {
 969     unsigned WaitCntImmediate =
 970       AMDGPU::encodeWaitcnt(IV,
 971                             VMCnt ? 0 : getVmcntBitMask(IV),
 972                             getExpcntBitMask(IV),
 973                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
 974     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
 975     Changed = true;
 976   }
 977
 978   if (Pos == Position::AFTER)
 979     --MI;
 980
 981   return Changed;
 982 }
 983
 984 bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
 985                                        SIAtomicScope Scope,
 986                                        SIAtomicAddrSpace AddrSpace,
 987                                        Position Pos) const {
 988   if (!InsertCacheInv)
 989     return false;
 990
 991   bool Changed = false;
 992
 993   MachineBasicBlock &MBB = *MI->getParent();
 994   DebugLoc DL = MI->getDebugLoc();
 995
 996   if (Pos == Position::AFTER)
 997     ++MI;
 998
 999   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1000     switch (Scope) {
1001     case SIAtomicScope::SYSTEM:
1002     case SIAtomicScope::AGENT:
1003       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1004       Changed = true;
1005       break;
1006     case SIAtomicScope::WORKGROUP:
1007     case SIAtomicScope::WAVEFRONT:
1008     case SIAtomicScope::SINGLETHREAD:
1009       // No cache to invalidate.
1010       break;
1011     default:
1012       llvm_unreachable("Unsupported synchronization scope");
1013     }
1014   }
1015
1016   /// The scratch address space does not need the global memory cache
1017   /// to be flushed as all memory operations by the same thread are
1018   /// sequentially consistent, and no other thread can access scratch
1019   /// memory.
1020
1021   /// Other address spaces do not have a cache.
1022
1023   if (Pos == Position::AFTER)
1024     --MI;
1025
1026   return Changed;
1027 }
1028
1029 bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1030                                        SIAtomicScope Scope,
1031                                        SIAtomicAddrSpace AddrSpace,
1032                                        bool IsCrossAddrSpaceOrdering,
1033                                        Position Pos) const {
1034     return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1035                       IsCrossAddrSpaceOrdering, Pos);
1036 }
1037
1038 bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1039                                        SIAtomicScope Scope,
1040                                        SIAtomicAddrSpace AddrSpace,
1041                                        Position Pos) const {
1042   if (!InsertCacheInv)
1043     return false;
1044
1045   bool Changed = false;
1046
1047   MachineBasicBlock &MBB = *MI->getParent();
1048   DebugLoc DL = MI->getDebugLoc();
1049
1050   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1051
1052   const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1053                                     ? AMDGPU::BUFFER_WBINVL1
1054                                     : AMDGPU::BUFFER_WBINVL1_VOL;
1055
1056   if (Pos == Position::AFTER)
1057     ++MI;
1058
1059   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1060     switch (Scope) {
1061     case SIAtomicScope::SYSTEM:
1062     case SIAtomicScope::AGENT:
1063       BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1064       Changed = true;
1065       break;
1066     case SIAtomicScope::WORKGROUP:
1067     case SIAtomicScope::WAVEFRONT:
1068     case SIAtomicScope::SINGLETHREAD:
1069       // No cache to invalidate.
1070       break;
1071     default:
1072       llvm_unreachable("Unsupported synchronization scope");
1073     }
1074   }
1075
1076   /// The scratch address space does not need the global memory cache
1077   /// to be flushed as all memory operations by the same thread are
1078   /// sequentially consistent, and no other thread can access scratch
1079   /// memory.
1080
1081   /// Other address spaces do not have a cache.
1082
1083   if (Pos == Position::AFTER)
1084     --MI;
1085
1086   return Changed;
1087 }
1088
1089 bool SIGfx90ACacheControl::enableLoadCacheBypass(
1090     const MachineBasicBlock::iterator &MI,
1091     SIAtomicScope Scope,
1092     SIAtomicAddrSpace AddrSpace) const {
1093   assert(MI->mayLoad() && !MI->mayStore());
1094   bool Changed = false;
1095
1096   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1097     switch (Scope) {
1098     case SIAtomicScope::SYSTEM:
1099     case SIAtomicScope::AGENT:
1100       Changed |= enableGLCBit(MI);
1101       break;
1102     case SIAtomicScope::WORKGROUP:
1103       // In threadgroup split mode the waves of a work-group can be executing on
1104       // different CUs. Therefore need to bypass the L1 which is per CU.
1105       // Otherwise in non-threadgroup split mode all waves of a work-group are
1106       // on the same CU, and so the L1 does not need to be bypassed.
1107       if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
1108       break;
1109     case SIAtomicScope::WAVEFRONT:
1110     case SIAtomicScope::SINGLETHREAD:
1111       // No cache to bypass.
1112       break;
1113     default:
1114       llvm_unreachable("Unsupported synchronization scope");
1115     }
1116   }
1117
1118   /// The scratch address space does not need the global memory caches
1119   /// to be bypassed as all memory operations by the same thread are
1120   /// sequentially consistent, and no other thread can access scratch
1121   /// memory.
1122
1123   /// Other address spaces do not have a cache.
1124
1125   return Changed;
1126 }
1127
1128 bool SIGfx90ACacheControl::enableStoreCacheBypass(
1129     const MachineBasicBlock::iterator &MI,
1130     SIAtomicScope Scope,
1131     SIAtomicAddrSpace AddrSpace) const {
1132   assert(!MI->mayLoad() && MI->mayStore());
1133   bool Changed = false;
1134
1135   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1136     switch (Scope) {
1137     case SIAtomicScope::SYSTEM:
1138     case SIAtomicScope::AGENT:
1139       /// Do not set glc for store atomic operations as they implicitly write
1140       /// through the L1 cache.
1141       break;
1142     case SIAtomicScope::WORKGROUP:
1143     case SIAtomicScope::WAVEFRONT:
1144     case SIAtomicScope::SINGLETHREAD:
1145       // No cache to bypass. Store atomics implicitly write through the L1
1146       // cache.
1147       break;
1148     default:
1149       llvm_unreachable("Unsupported synchronization scope");
1150     }
1151   }
1152
1153   /// The scratch address space does not need the global memory caches
1154   /// to be bypassed as all memory operations by the same thread are
1155   /// sequentially consistent, and no other thread can access scratch
1156   /// memory.
1157
1158   /// Other address spaces do not have a cache.
1159
1160   return Changed;
1161 }
1162
1163 bool SIGfx90ACacheControl::enableRMWCacheBypass(
1164     const MachineBasicBlock::iterator &MI,
1165     SIAtomicScope Scope,
1166     SIAtomicAddrSpace AddrSpace) const {
1167   assert(MI->mayLoad() && MI->mayStore());
1168   bool Changed = false;
1169
1170   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1171     switch (Scope) {
1172     case SIAtomicScope::SYSTEM:
1173     case SIAtomicScope::AGENT:
1174       /// Do not set glc for RMW atomic operations as they implicitly bypass
1175       /// the L1 cache, and the glc bit is instead used to indicate if they are
1176       /// return or no-return.
1177       break;
1178     case SIAtomicScope::WORKGROUP:
1179     case SIAtomicScope::WAVEFRONT:
1180     case SIAtomicScope::SINGLETHREAD:
1181       // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1182       break;
1183     default:
1184       llvm_unreachable("Unsupported synchronization scope");
1185     }
1186   }
1187
1188   return Changed;
1189 }
1190
1191 bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1192     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1193     bool IsVolatile, bool IsNonTemporal) const {
1194   // Only handle load and store, not atomic read-modify-write insructions. The
1195   // latter use glc to indicate if the atomic returns a result and so must not
1196   // be used for cache control.
1197   assert(MI->mayLoad() ^ MI->mayStore());
1198
1199   // Only update load and store, not LLVM IR atomic read-modify-write
1200   // instructions. The latter are always marked as volatile so cannot sensibly
1201   // handle it as do not want to pessimize all atomics. Also they do not support
1202   // the nontemporal attribute.
1203   assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1204
1205   bool Changed = false;
1206
1207   if (IsVolatile) {
1208     if (Op == SIMemOp::LOAD) {
1209       Changed |= enableGLCBit(MI);
1210     }
1211
1212     // Ensure operation has completed at system scope to cause all volatile
1213     // operations to be visible outside the program in a global order. Do not
1214     // request cross address space as only the global address space can be
1215     // observable outside the program, so no need to cause a waitcnt for LDS
1216     // address space operations.
1217     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1218                           Position::AFTER);
1219
1220     return Changed;
1221   }
1222
1223   if (IsNonTemporal) {
1224     // Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
1225     Changed |= enableGLCBit(MI);
1226     Changed |= enableSLCBit(MI);
1227     return Changed;
1228   }
1229
1230   return Changed;
1231 }
1232
1233 bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1234                                       SIAtomicScope Scope,
1235                                       SIAtomicAddrSpace AddrSpace,
1236                                       SIMemOp Op,
1237                                       bool IsCrossAddrSpaceOrdering,
1238                                       Position Pos) const {
1239   if (ST.isTgSplitEnabled()) {
1240     // In threadgroup split mode the waves of a work-group can be executing on
1241     // different CUs. Therefore need to wait for global or GDS memory operations
1242     // to complete to ensure they are visible to waves in the other CUs.
1243     // Otherwise in non-threadgroup split mode all waves of a work-group are on
1244     // the same CU, so no need to wait for global memory as all waves in the
1245     // work-group access the same the L1, nor wait for GDS as access are ordered
1246     // on a CU.
1247     if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1248                        SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1249         (Scope == SIAtomicScope::WORKGROUP)) {
1250       // Same as GFX7 using agent scope.
1251       Scope = SIAtomicScope::AGENT;
1252     }
1253     // In threadgroup split mode LDS cannot be allocated so no need to wait for
1254     // LDS memory operations.
1255     AddrSpace &= ~SIAtomicAddrSpace::LDS;
1256   }
1257   return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1258                                         IsCrossAddrSpaceOrdering, Pos);
1259 }
1260
1261 bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1262                                          SIAtomicScope Scope,
1263                                          SIAtomicAddrSpace AddrSpace,
1264                                          Position Pos) const {
1265   if (!InsertCacheInv)
1266     return false;
1267
1268   bool Changed = false;
1269
1270   MachineBasicBlock &MBB = *MI->getParent();
1271   DebugLoc DL = MI->getDebugLoc();
1272
1273   if (Pos == Position::AFTER)
1274     ++MI;
1275
1276   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1277     switch (Scope) {
1278     case SIAtomicScope::SYSTEM:
1279       // Ensures that following loads will not see stale remote VMEM data or
1280       // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1281       // CC will never be stale due to the local memory probes.
1282       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1283       // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1284       // hardware does not reorder memory operations by the same wave with
1285       // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1286       // remove any cache lines of earlier writes by the same wave and ensures
1287       // later reads by the same wave will refetch the cache lines.
1288       Changed = true;
1289       break;
1290     case SIAtomicScope::AGENT:
1291       // Same as GFX7.
1292       break;
1293     case SIAtomicScope::WORKGROUP:
1294       // In threadgroup split mode the waves of a work-group can be executing on
1295       // different CUs. Therefore need to invalidate the L1 which is per CU.
1296       // Otherwise in non-threadgroup split mode all waves of a work-group are
1297       // on the same CU, and so the L1 does not need to be invalidated.
1298       if (ST.isTgSplitEnabled()) {
1299         // Same as GFX7 using agent scope.
1300         Scope = SIAtomicScope::AGENT;
1301       }
1302       break;
1303     case SIAtomicScope::WAVEFRONT:
1304     case SIAtomicScope::SINGLETHREAD:
1305       // Same as GFX7.
1306       break;
1307     default:
1308       llvm_unreachable("Unsupported synchronization scope");
1309     }
1310   }
1311
1312   /// The scratch address space does not need the global memory cache
1313   /// to be flushed as all memory operations by the same thread are
1314   /// sequentially consistent, and no other thread can access scratch
1315   /// memory.
1316
1317   /// Other address spaces do not have a cache.
1318
1319   if (Pos == Position::AFTER)
1320     --MI;
1321
1322   Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1323
1324   return Changed;
1325 }
1326
1327 bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1328                                          SIAtomicScope Scope,
1329                                          SIAtomicAddrSpace AddrSpace,
1330                                          bool IsCrossAddrSpaceOrdering,
1331                                          Position Pos) const {
1332   bool Changed = false;
1333
1334   MachineBasicBlock &MBB = *MI->getParent();
1335   DebugLoc DL = MI->getDebugLoc();
1336
1337   if (Pos == Position::AFTER)
1338     ++MI;
1339
1340   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1341     switch (Scope) {
1342     case SIAtomicScope::SYSTEM:
1343       // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1344       // hardware does not reorder memory operations by the same wave with
1345       // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1346       // to initiate writeback of any dirty cache lines of earlier writes by the
1347       // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1348       // writeback has completed.
1349       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
1350       // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1351       // vmcnt(0)" needed by the "BUFFER_WBL2".
1352       Changed = true;
1353       break;
1354     case SIAtomicScope::AGENT:
1355     case SIAtomicScope::WORKGROUP:
1356     case SIAtomicScope::WAVEFRONT:
1357     case SIAtomicScope::SINGLETHREAD:
1358       // Same as GFX7.
1359       break;
1360     default:
1361       llvm_unreachable("Unsupported synchronization scope");
1362     }
1363   }
1364
1365   if (Pos == Position::AFTER)
1366     --MI;
1367
1368   Changed |=
1369       SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1370                                         IsCrossAddrSpaceOrdering, Pos);
1371
1372   return Changed;
1373 }
1374
1375 bool SIGfx10CacheControl::enableLoadCacheBypass(
1376     const MachineBasicBlock::iterator &MI,
1377     SIAtomicScope Scope,
1378     SIAtomicAddrSpace AddrSpace) const {
1379   assert(MI->mayLoad() && !MI->mayStore());
1380   bool Changed = false;
1381
1382   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1383     /// TODO Do not set glc for rmw atomic operations as they
1384     /// implicitly bypass the L0/L1 caches.
1385
1386     switch (Scope) {
1387     case SIAtomicScope::SYSTEM:
1388     case SIAtomicScope::AGENT:
1389       Changed |= enableGLCBit(MI);
1390       Changed |= enableDLCBit(MI);
1391       break;
1392     case SIAtomicScope::WORKGROUP:
1393       // In WGP mode the waves of a work-group can be executing on either CU of
1394       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1395       // CU mode all waves of a work-group are on the same CU, and so the L0
1396       // does not need to be bypassed.
1397       if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
1398       break;
1399     case SIAtomicScope::WAVEFRONT:
1400     case SIAtomicScope::SINGLETHREAD:
1401       // No cache to bypass.
1402       break;
1403     default:
1404       llvm_unreachable("Unsupported synchronization scope");
1405     }
1406   }
1407
1408   /// The scratch address space does not need the global memory caches
1409   /// to be bypassed as all memory operations by the same thread are
1410   /// sequentially consistent, and no other thread can access scratch
1411   /// memory.
1412
1413   /// Other address spaces do not have a cache.
1414
1415   return Changed;
1416 }
1417
1418 bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1419     MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1420     bool IsVolatile, bool IsNonTemporal) const {
1421
1422   // Only handle load and store, not atomic read-modify-write insructions. The
1423   // latter use glc to indicate if the atomic returns a result and so must not
1424   // be used for cache control.
1425   assert(MI->mayLoad() ^ MI->mayStore());
1426
1427   // Only update load and store, not LLVM IR atomic read-modify-write
1428   // instructions. The latter are always marked as volatile so cannot sensibly
1429   // handle it as do not want to pessimize all atomics. Also they do not support
1430   // the nontemporal attribute.
1431   assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1432
1433   bool Changed = false;
1434
1435   if (IsVolatile) {
1436
1437     if (Op == SIMemOp::LOAD) {
1438       Changed |= enableGLCBit(MI);
1439       Changed |= enableDLCBit(MI);
1440     }
1441
1442     // Ensure operation has completed at system scope to cause all volatile
1443     // operations to be visible outside the program in a global order. Do not
1444     // request cross address space as only the global address space can be
1445     // observable outside the program, so no need to cause a waitcnt for LDS
1446     // address space operations.
1447     Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1448                           Position::AFTER);
1449     return Changed;
1450   }
1451
1452   if (IsNonTemporal) {
1453     // Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
1454     Changed |= enableSLCBit(MI);
1455     return Changed;
1456   }
1457
1458   return Changed;
1459 }
1460
1461 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1462                                      SIAtomicScope Scope,
1463                                      SIAtomicAddrSpace AddrSpace,
1464                                      SIMemOp Op,
1465                                      bool IsCrossAddrSpaceOrdering,
1466                                      Position Pos) const {
1467   bool Changed = false;
1468
1469   MachineBasicBlock &MBB = *MI->getParent();
1470   DebugLoc DL = MI->getDebugLoc();
1471
1472   if (Pos == Position::AFTER)
1473     ++MI;
1474
1475   bool VMCnt = false;
1476   bool VSCnt = false;
1477   bool LGKMCnt = false;
1478
1479   if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1480       SIAtomicAddrSpace::NONE) {
1481     switch (Scope) {
1482     case SIAtomicScope::SYSTEM:
1483     case SIAtomicScope::AGENT:
1484       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1485         VMCnt |= true;
1486       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1487         VSCnt |= true;
1488       break;
1489     case SIAtomicScope::WORKGROUP:
1490       // In WGP mode the waves of a work-group can be executing on either CU of
1491       // the WGP. Therefore need to wait for operations to complete to ensure
1492       // they are visible to waves in the other CU as the L0 is per CU.
1493       // Otherwise in CU mode and all waves of a work-group are on the same CU
1494       // which shares the same L0.
1495       if (!ST.isCuModeEnabled()) {
1496         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1497           VMCnt |= true;
1498         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1499           VSCnt |= true;
1500       }
1501       break;
1502     case SIAtomicScope::WAVEFRONT:
1503     case SIAtomicScope::SINGLETHREAD:
1504       // The L0 cache keeps all memory operations in order for
1505       // work-items in the same wavefront.
1506       break;
1507     default:
1508       llvm_unreachable("Unsupported synchronization scope");
1509     }
1510   }
1511
1512   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1513     switch (Scope) {
1514     case SIAtomicScope::SYSTEM:
1515     case SIAtomicScope::AGENT:
1516     case SIAtomicScope::WORKGROUP:
1517       // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1518       // not needed as LDS operations for all waves are executed in a total
1519       // global ordering as observed by all waves. Required if also
1520       // synchronizing with global/GDS memory as LDS operations could be
1521       // reordered with respect to later global/GDS memory operations of the
1522       // same wave.
1523       LGKMCnt |= IsCrossAddrSpaceOrdering;
1524       break;
1525     case SIAtomicScope::WAVEFRONT:
1526     case SIAtomicScope::SINGLETHREAD:
1527       // The LDS keeps all memory operations in order for
1528       // the same wavesfront.
1529       break;
1530     default:
1531       llvm_unreachable("Unsupported synchronization scope");
1532     }
1533   }
1534
1535   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1536     switch (Scope) {
1537     case SIAtomicScope::SYSTEM:
1538     case SIAtomicScope::AGENT:
1539       // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1540       // is not needed as GDS operations for all waves are executed in a total
1541       // global ordering as observed by all waves. Required if also
1542       // synchronizing with global/LDS memory as GDS operations could be
1543       // reordered with respect to later global/LDS memory operations of the
1544       // same wave.
1545       LGKMCnt |= IsCrossAddrSpaceOrdering;
1546       break;
1547     case SIAtomicScope::WORKGROUP:
1548     case SIAtomicScope::WAVEFRONT:
1549     case SIAtomicScope::SINGLETHREAD:
1550       // The GDS keeps all memory operations in order for
1551       // the same work-group.
1552       break;
1553     default:
1554       llvm_unreachable("Unsupported synchronization scope");
1555     }
1556   }
1557
1558   if (VMCnt || LGKMCnt) {
1559     unsigned WaitCntImmediate =
1560       AMDGPU::encodeWaitcnt(IV,
1561                             VMCnt ? 0 : getVmcntBitMask(IV),
1562                             getExpcntBitMask(IV),
1563                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1564     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1565     Changed = true;
1566   }
1567
1568   if (VSCnt) {
1569     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1570       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1571       .addImm(0);
1572     Changed = true;
1573   }
1574
1575   if (Pos == Position::AFTER)
1576     --MI;
1577
1578   return Changed;
1579 }
1580
1581 bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1582                                         SIAtomicScope Scope,
1583                                         SIAtomicAddrSpace AddrSpace,
1584                                         Position Pos) const {
1585   if (!InsertCacheInv)
1586     return false;
1587
1588   bool Changed = false;
1589
1590   MachineBasicBlock &MBB = *MI->getParent();
1591   DebugLoc DL = MI->getDebugLoc();
1592
1593   if (Pos == Position::AFTER)
1594     ++MI;
1595
1596   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1597     switch (Scope) {
1598     case SIAtomicScope::SYSTEM:
1599     case SIAtomicScope::AGENT:
1600       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1601       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
1602       Changed = true;
1603       break;
1604     case SIAtomicScope::WORKGROUP:
1605       // In WGP mode the waves of a work-group can be executing on either CU of
1606       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
1607       // in CU mode and all waves of a work-group are on the same CU, and so the
1608       // L0 does not need to be invalidated.
1609       if (!ST.isCuModeEnabled()) {
1610         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
1611         Changed = true;
1612       }
1613       break;
1614     case SIAtomicScope::WAVEFRONT:
1615     case SIAtomicScope::SINGLETHREAD:
1616       // No cache to invalidate.
1617       break;
1618     default:
1619       llvm_unreachable("Unsupported synchronization scope");
1620     }
1621   }
1622
1623   /// The scratch address space does not need the global memory cache
1624   /// to be flushed as all memory operations by the same thread are
1625   /// sequentially consistent, and no other thread can access scratch
1626   /// memory.
1627
1628   /// Other address spaces do not have a cache.
1629
1630   if (Pos == Position::AFTER)
1631     --MI;
1632
1633   return Changed;
1634 }
1635
1636 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1637   if (AtomicPseudoMIs.empty())
1638     return false;
1639
1640   for (auto &MI : AtomicPseudoMIs)
1641     MI->eraseFromParent();
1642
1643   AtomicPseudoMIs.clear();
1644   return true;
1645 }
1646
1647 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1648                                    MachineBasicBlock::iterator &MI) {
1649   assert(MI->mayLoad() && !MI->mayStore());
1650
1651   bool Changed = false;
1652
1653   if (MOI.isAtomic()) {
1654     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1655         MOI.getOrdering() == AtomicOrdering::Acquire ||
1656         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1657       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1658                                            MOI.getOrderingAddrSpace());
1659     }
1660
1661     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1662       Changed |= CC->insertWait(MI, MOI.getScope(),
1663                                 MOI.getOrderingAddrSpace(),
1664                                 SIMemOp::LOAD | SIMemOp::STORE,
1665                                 MOI.getIsCrossAddressSpaceOrdering(),
1666                                 Position::BEFORE);
1667
1668     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1669         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1670       Changed |= CC->insertWait(MI, MOI.getScope(),
1671                                 MOI.getInstrAddrSpace(),
1672                                 SIMemOp::LOAD,
1673                                 MOI.getIsCrossAddressSpaceOrdering(),
1674                                 Position::AFTER);
1675       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1676                                    MOI.getOrderingAddrSpace(),
1677                                    Position::AFTER);
1678     }
1679
1680     return Changed;
1681   }
1682
1683   // Atomic instructions already bypass caches to the scope specified by the
1684   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1685   // need additional treatment.
1686   Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1687                                                 SIMemOp::LOAD, MOI.isVolatile(),
1688                                                 MOI.isNonTemporal());
1689   return Changed;
1690 }
1691
1692 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1693                                     MachineBasicBlock::iterator &MI) {
1694   assert(!MI->mayLoad() && MI->mayStore());
1695
1696   bool Changed = false;
1697
1698   if (MOI.isAtomic()) {
1699     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1700         MOI.getOrdering() == AtomicOrdering::Release ||
1701         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1702       Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
1703                                             MOI.getOrderingAddrSpace());
1704     }
1705
1706     if (MOI.getOrdering() == AtomicOrdering::Release ||
1707         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1708       Changed |= CC->insertRelease(MI, MOI.getScope(),
1709                                    MOI.getOrderingAddrSpace(),
1710                                    MOI.getIsCrossAddressSpaceOrdering(),
1711                                    Position::BEFORE);
1712
1713     return Changed;
1714   }
1715
1716   // Atomic instructions already bypass caches to the scope specified by the
1717   // SyncScope operand. Only non-atomic volatile and nontemporal instructions
1718   // need additional treatment.
1719   Changed |= CC->enableVolatileAndOrNonTemporal(
1720       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1721       MOI.isNonTemporal());
1722   return Changed;
1723 }
1724
1725 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1726                                           MachineBasicBlock::iterator &MI) {
1727   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1728
1729   AtomicPseudoMIs.push_back(MI);
1730   bool Changed = false;
1731
1732   if (MOI.isAtomic()) {
1733     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1734         MOI.getOrdering() == AtomicOrdering::Release ||
1735         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1736         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1737       /// TODO: This relies on a barrier always generating a waitcnt
1738       /// for LDS to ensure it is not reordered with the completion of
1739       /// the proceeding LDS operations. If barrier had a memory
1740       /// ordering and memory scope, then library does not need to
1741       /// generate a fence. Could add support in this file for
1742       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
1743       /// adding S_WAITCNT before a S_BARRIER.
1744       Changed |= CC->insertRelease(MI, MOI.getScope(),
1745                                    MOI.getOrderingAddrSpace(),
1746                                    MOI.getIsCrossAddressSpaceOrdering(),
1747                                    Position::BEFORE);
1748
1749     // TODO: If both release and invalidate are happening they could be combined
1750     // to use the single "BUFFER_WBINV*" instruction. This could be done by
1751     // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
1752     // track cache invalidate and write back instructions.
1753
1754     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1755         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1756         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1757       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1758                                    MOI.getOrderingAddrSpace(),
1759                                    Position::BEFORE);
1760
1761     return Changed;
1762   }
1763
1764   return Changed;
1765 }
1766
1767 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1768   MachineBasicBlock::iterator &MI) {
1769   assert(MI->mayLoad() && MI->mayStore());
1770
1771   bool Changed = false;
1772
1773   if (MOI.isAtomic()) {
1774     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1775         MOI.getOrdering() == AtomicOrdering::Acquire ||
1776         MOI.getOrdering() == AtomicOrdering::Release ||
1777         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1778         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1779       Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
1780                                           MOI.getInstrAddrSpace());
1781     }
1782
1783     if (MOI.getOrdering() == AtomicOrdering::Release ||
1784         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1785         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1786         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1787       Changed |= CC->insertRelease(MI, MOI.getScope(),
1788                                    MOI.getOrderingAddrSpace(),
1789                                    MOI.getIsCrossAddressSpaceOrdering(),
1790                                    Position::BEFORE);
1791
1792     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1793         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1794         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1795         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1796         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1797       Changed |= CC->insertWait(MI, MOI.getScope(),
1798                                 MOI.getInstrAddrSpace(),
1799                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1800                                                    SIMemOp::STORE,
1801                                 MOI.getIsCrossAddressSpaceOrdering(),
1802                                 Position::AFTER);
1803       Changed |= CC->insertAcquire(MI, MOI.getScope(),
1804                                    MOI.getOrderingAddrSpace(),
1805                                    Position::AFTER);
1806     }
1807
1808     return Changed;
1809   }
1810
1811   return Changed;
1812 }
1813
1814 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1815   bool Changed = false;
1816
1817   SIMemOpAccess MOA(MF);
1818   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1819
1820   for (auto &MBB : MF) {
1821     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1822
1823       // Unbundle instructions after the post-RA scheduler.
1824       if (MI->isBundle() && MI->mayLoadOrStore()) {
1825         MachineBasicBlock::instr_iterator II(MI->getIterator());
1826         for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
1827              I != E && I->isBundledWithPred(); ++I) {
1828           I->unbundleFromPred();
1829           for (MachineOperand &MO : I->operands())
1830             if (MO.isReg())
1831               MO.setIsInternalRead(false);
1832         }
1833
1834         MI->eraseFromParent();
1835         MI = II->getIterator();
1836       }
1837
1838       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1839         continue;
1840
1841       if (const auto &MOI = MOA.getLoadInfo(MI))
1842         Changed |= expandLoad(MOI.getValue(), MI);
1843       else if (const auto &MOI = MOA.getStoreInfo(MI))
1844         Changed |= expandStore(MOI.getValue(), MI);
1845       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1846         Changed |= expandAtomicFence(MOI.getValue(), MI);
1847       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1848         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1849     }
1850   }
1851
1852   Changed |= removeAtomicPseudoMIs();
1853   return Changed;
1854 }
1855
1856 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1857
1858 char SIMemoryLegalizer::ID = 0;
1859 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1860
1861 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1862   return new SIMemoryLegalizer();
1863 }