lib/Target/AMDGPU/SIMemoryLegalizer.cpp

   1 //===- SIMemoryLegalizer.cpp ----------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 /// \file
  10 /// Memory legalizer - implements memory model. More information can be
  11 /// found here:
  12 ///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
  13 //
  14 //===----------------------------------------------------------------------===//
  15
  16 #include "AMDGPU.h"
  17 #include "AMDGPUMachineModuleInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "SIDefines.h"
  20 #include "SIInstrInfo.h"
  21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  22 #include "Utils/AMDGPUBaseInfo.h"
  23 #include "llvm/ADT/BitmaskEnum.h"
  24 #include "llvm/ADT/None.h"
  25 #include "llvm/ADT/Optional.h"
  26 #include "llvm/CodeGen/MachineBasicBlock.h"
  27 #include "llvm/CodeGen/MachineFunction.h"
  28 #include "llvm/CodeGen/MachineFunctionPass.h"
  29 #include "llvm/CodeGen/MachineInstrBuilder.h"
  30 #include "llvm/CodeGen/MachineMemOperand.h"
  31 #include "llvm/CodeGen/MachineModuleInfo.h"
  32 #include "llvm/CodeGen/MachineOperand.h"
  33 #include "llvm/IR/DebugLoc.h"
  34 #include "llvm/IR/DiagnosticInfo.h"
  35 #include "llvm/IR/Function.h"
  36 #include "llvm/IR/LLVMContext.h"
  37 #include "llvm/MC/MCInstrDesc.h"
  38 #include "llvm/Pass.h"
  39 #include "llvm/Support/AtomicOrdering.h"
  40 #include "llvm/Support/MathExtras.h"
  41 #include <cassert>
  42 #include <list>
  43
  44 using namespace llvm;
  45 using namespace llvm::AMDGPU;
  46
  47 #define DEBUG_TYPE "si-memory-legalizer"
  48 #define PASS_NAME "SI Memory Legalizer"
  49
  50 namespace {
  51
  52 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
  53
  54 /// Memory operation flags. Can be ORed together.
  55 enum class SIMemOp {
  56   NONE = 0u,
  57   LOAD = 1u << 0,
  58   STORE = 1u << 1,
  59   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
  60 };
  61
  /// Side of an existing instruction on which a new instruction is
  /// inserted.
  enum class Position { BEFORE = 0, AFTER = 1 };
  68
  /// Atomic synchronization scopes supported by the AMDGPU target. The
  /// numeric values increase in declaration order, starting at zero.
  enum class SIAtomicScope {
    NONE = 0,
    SINGLETHREAD = 1,
    WAVEFRONT = 2,
    WORKGROUP = 3,
    AGENT = 4,
    SYSTEM = 5
  };
  78
  79 /// The distinct address spaces supported by the AMDGPU target for
  80 /// atomic memory operation. Can be ORed toether.
  81 enum class SIAtomicAddrSpace {
  82   NONE = 0u,
  83   GLOBAL = 1u << 0,
  84   LDS = 1u << 1,
  85   SCRATCH = 1u << 2,
  86   GDS = 1u << 3,
  87   OTHER = 1u << 4,
  88
  89   /// The address spaces that can be accessed by a FLAT instruction.
  90   FLAT = GLOBAL | LDS | SCRATCH,
  91
  92   /// The address spaces that support atomic instructions.
  93   ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
  94
  95   /// All address spaces.
  96   ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
  97
  98   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
  99 };
 100
 101 /// Sets named bit \p BitName to "true" if present in instruction \p MI.
 102 /// \returns Returns true if \p MI is modified, false otherwise.
 103 template <uint16_t BitName>
 104 bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
 105   int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
 106   if (BitIdx == -1)
 107     return false;
 108
 109   MachineOperand &Bit = MI->getOperand(BitIdx);
 110   if (Bit.getImm() != 0)
 111     return false;
 112
 113   Bit.setImm(1);
 114   return true;
 115 }
 116
 117 class SIMemOpInfo final {
 118 private:
 119
 120   friend class SIMemOpAccess;
 121
 122   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
 123   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
 124   SIAtomicScope Scope = SIAtomicScope::SYSTEM;
 125   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
 126   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
 127   bool IsCrossAddressSpaceOrdering = false;
 128   bool IsNonTemporal = false;
 129
 130   SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
 131               SIAtomicScope Scope = SIAtomicScope::SYSTEM,
 132               SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
 133               SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
 134               bool IsCrossAddressSpaceOrdering = true,
 135               AtomicOrdering FailureOrdering =
 136                 AtomicOrdering::SequentiallyConsistent,
 137               bool IsNonTemporal = false)
 138     : Ordering(Ordering), FailureOrdering(FailureOrdering),
 139       Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
 140       InstrAddrSpace(InstrAddrSpace),
 141       IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
 142       IsNonTemporal(IsNonTemporal) {
 143     // There is also no cross address space ordering if the ordering
 144     // address space is the same as the instruction address space and
 145     // only contains a single address space.
 146     if ((OrderingAddrSpace == InstrAddrSpace) &&
 147         isPowerOf2_32(uint32_t(InstrAddrSpace)))
 148       this->IsCrossAddressSpaceOrdering = false;
 149   }
 150
 151 public:
 152   /// \returns Atomic synchronization scope of the machine instruction used to
 153   /// create this SIMemOpInfo.
 154   SIAtomicScope getScope() const {
 155     return Scope;
 156   }
 157
 158   /// \returns Ordering constraint of the machine instruction used to
 159   /// create this SIMemOpInfo.
 160   AtomicOrdering getOrdering() const {
 161     return Ordering;
 162   }
 163
 164   /// \returns Failure ordering constraint of the machine instruction used to
 165   /// create this SIMemOpInfo.
 166   AtomicOrdering getFailureOrdering() const {
 167     return FailureOrdering;
 168   }
 169
 170   /// \returns The address spaces be accessed by the machine
 171   /// instruction used to create this SiMemOpInfo.
 172   SIAtomicAddrSpace getInstrAddrSpace() const {
 173     return InstrAddrSpace;
 174   }
 175
 176   /// \returns The address spaces that must be ordered by the machine
 177   /// instruction used to create this SiMemOpInfo.
 178   SIAtomicAddrSpace getOrderingAddrSpace() const {
 179     return OrderingAddrSpace;
 180   }
 181
 182   /// \returns Return true iff memory ordering of operations on
 183   /// different address spaces is required.
 184   bool getIsCrossAddressSpaceOrdering() const {
 185     return IsCrossAddressSpaceOrdering;
 186   }
 187
 188   /// \returns True if memory access of the machine instruction used to
 189   /// create this SIMemOpInfo is non-temporal, false otherwise.
 190   bool isNonTemporal() const {
 191     return IsNonTemporal;
 192   }
 193
 194   /// \returns True if ordering constraint of the machine instruction used to
 195   /// create this SIMemOpInfo is unordered or higher, false otherwise.
 196   bool isAtomic() const {
 197     return Ordering != AtomicOrdering::NotAtomic;
 198   }
 199
 200 };
 201
 202 class SIMemOpAccess final {
 203 private:
 204   AMDGPUMachineModuleInfo *MMI = nullptr;
 205
 206   /// Reports unsupported message \p Msg for \p MI to LLVM context.
 207   void reportUnsupported(const MachineBasicBlock::iterator &MI,
 208                          const char *Msg) const;
 209
 210   /// Inspects the target synchonization scope \p SSID and determines
 211   /// the SI atomic scope it corresponds to, the address spaces it
 212   /// covers, and whether the memory ordering applies between address
 213   /// spaces.
 214   Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
 215   toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;
 216
 217   /// \return Return a bit set of the address spaces accessed by \p AS.
 218   SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
 219
 220   /// \returns Info constructed from \p MI, which has at least machine memory
 221   /// operand.
 222   Optional<SIMemOpInfo> constructFromMIWithMMO(
 223       const MachineBasicBlock::iterator &MI) const;
 224
 225 public:
 226   /// Construct class to support accessing the machine memory operands
 227   /// of instructions in the machine function \p MF.
 228   SIMemOpAccess(MachineFunction &MF);
 229
 230   /// \returns Load info if \p MI is a load operation, "None" otherwise.
 231   Optional<SIMemOpInfo> getLoadInfo(
 232       const MachineBasicBlock::iterator &MI) const;
 233
 234   /// \returns Store info if \p MI is a store operation, "None" otherwise.
 235   Optional<SIMemOpInfo> getStoreInfo(
 236       const MachineBasicBlock::iterator &MI) const;
 237
 238   /// \returns Atomic fence info if \p MI is an atomic fence operation,
 239   /// "None" otherwise.
 240   Optional<SIMemOpInfo> getAtomicFenceInfo(
 241       const MachineBasicBlock::iterator &MI) const;
 242
 243   /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
 244   /// rmw operation, "None" otherwise.
 245   Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
 246       const MachineBasicBlock::iterator &MI) const;
 247 };
 248
 249 class SICacheControl {
 250 protected:
 251
 252   /// Instruction info.
 253   const SIInstrInfo *TII = nullptr;
 254
 255   IsaVersion IV;
 256
 257   SICacheControl(const GCNSubtarget &ST);
 258
 259 public:
 260
 261   /// Create a cache control for the subtarget \p ST.
 262   static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
 263
 264   /// Update \p MI memory load instruction to bypass any caches up to
 265   /// the \p Scope memory scope for address spaces \p
 266   /// AddrSpace. Return true iff the instruction was modified.
 267   virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
 268                                      SIAtomicScope Scope,
 269                                      SIAtomicAddrSpace AddrSpace) const = 0;
 270
 271   /// Update \p MI memory instruction to indicate it is
 272   /// nontemporal. Return true iff the instruction was modified.
 273   virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
 274     const = 0;
 275
 276   /// Inserts any necessary instructions at position \p Pos relative
 277   /// to instruction \p MI to ensure any caches associated with
 278   /// address spaces \p AddrSpace for memory scopes up to memory scope
 279   /// \p Scope are invalidated. Returns true iff any instructions
 280   /// inserted.
 281   virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
 282                                      SIAtomicScope Scope,
 283                                      SIAtomicAddrSpace AddrSpace,
 284                                      Position Pos) const = 0;
 285
 286   /// Inserts any necessary instructions at position \p Pos relative
 287   /// to instruction \p MI to ensure memory instructions of kind \p Op
 288   /// associated with address spaces \p AddrSpace have completed as
 289   /// observed by other memory instructions executing in memory scope
 290   /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
 291   /// ordering is between address spaces. Returns true iff any
 292   /// instructions inserted.
 293   virtual bool insertWait(MachineBasicBlock::iterator &MI,
 294                           SIAtomicScope Scope,
 295                           SIAtomicAddrSpace AddrSpace,
 296                           SIMemOp Op,
 297                           bool IsCrossAddrSpaceOrdering,
 298                           Position Pos) const = 0;
 299
 300   /// Virtual destructor to allow derivations to be deleted.
 301   virtual ~SICacheControl() = default;
 302
 303 };
 304
 305 class SIGfx6CacheControl : public SICacheControl {
 306 protected:
 307
 308   /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
 309   /// is modified, false otherwise.
 310   bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
 311     return enableNamedBit<AMDGPU::OpName::glc>(MI);
 312   }
 313
 314   /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
 315   /// is modified, false otherwise.
 316   bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
 317     return enableNamedBit<AMDGPU::OpName::slc>(MI);
 318   }
 319
 320 public:
 321
 322   SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
 323
 324   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
 325                              SIAtomicScope Scope,
 326                              SIAtomicAddrSpace AddrSpace) const override;
 327
 328   bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
 329
 330   bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
 331                              SIAtomicScope Scope,
 332                              SIAtomicAddrSpace AddrSpace,
 333                              Position Pos) const override;
 334
 335   bool insertWait(MachineBasicBlock::iterator &MI,
 336                   SIAtomicScope Scope,
 337                   SIAtomicAddrSpace AddrSpace,
 338                   SIMemOp Op,
 339                   bool IsCrossAddrSpaceOrdering,
 340                   Position Pos) const override;
 341 };
 342
 343 class SIGfx7CacheControl : public SIGfx6CacheControl {
 344 public:
 345
 346   SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
 347
 348   bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
 349                              SIAtomicScope Scope,
 350                              SIAtomicAddrSpace AddrSpace,
 351                              Position Pos) const override;
 352
 353 };
 354
 355 class SIGfx10CacheControl : public SIGfx7CacheControl {
 356 protected:
 357   bool CuMode = false;
 358
 359   /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
 360   /// is modified, false otherwise.
 361   bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
 362     return enableNamedBit<AMDGPU::OpName::dlc>(MI);
 363   }
 364
 365 public:
 366
 367   SIGfx10CacheControl(const GCNSubtarget &ST, bool CuMode) :
 368     SIGfx7CacheControl(ST), CuMode(CuMode) {};
 369
 370   bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
 371                              SIAtomicScope Scope,
 372                              SIAtomicAddrSpace AddrSpace) const override;
 373
 374   bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
 375
 376   bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
 377                              SIAtomicScope Scope,
 378                              SIAtomicAddrSpace AddrSpace,
 379                              Position Pos) const override;
 380
 381   bool insertWait(MachineBasicBlock::iterator &MI,
 382                   SIAtomicScope Scope,
 383                   SIAtomicAddrSpace AddrSpace,
 384                   SIMemOp Op,
 385                   bool IsCrossAddrSpaceOrdering,
 386                   Position Pos) const override;
 387 };
 388
 389 class SIMemoryLegalizer final : public MachineFunctionPass {
 390 private:
 391
 392   /// Cache Control.
 393   std::unique_ptr<SICacheControl> CC = nullptr;
 394
 395   /// List of atomic pseudo instructions.
 396   std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
 397
 398   /// Return true iff instruction \p MI is a atomic instruction that
 399   /// returns a result.
 400   bool isAtomicRet(const MachineInstr &MI) const {
 401     return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
 402   }
 403
 404   /// Removes all processed atomic pseudo instructions from the current
 405   /// function. Returns true if current function is modified, false otherwise.
 406   bool removeAtomicPseudoMIs();
 407
 408   /// Expands load operation \p MI. Returns true if instructions are
 409   /// added/deleted or \p MI is modified, false otherwise.
 410   bool expandLoad(const SIMemOpInfo &MOI,
 411                   MachineBasicBlock::iterator &MI);
 412   /// Expands store operation \p MI. Returns true if instructions are
 413   /// added/deleted or \p MI is modified, false otherwise.
 414   bool expandStore(const SIMemOpInfo &MOI,
 415                    MachineBasicBlock::iterator &MI);
 416   /// Expands atomic fence operation \p MI. Returns true if
 417   /// instructions are added/deleted or \p MI is modified, false otherwise.
 418   bool expandAtomicFence(const SIMemOpInfo &MOI,
 419                          MachineBasicBlock::iterator &MI);
 420   /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
 421   /// instructions are added/deleted or \p MI is modified, false otherwise.
 422   bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
 423                                 MachineBasicBlock::iterator &MI);
 424
 425 public:
 426   static char ID;
 427
 428   SIMemoryLegalizer() : MachineFunctionPass(ID) {}
 429
 430   void getAnalysisUsage(AnalysisUsage &AU) const override {
 431     AU.setPreservesCFG();
 432     MachineFunctionPass::getAnalysisUsage(AU);
 433   }
 434
 435   StringRef getPassName() const override {
 436     return PASS_NAME;
 437   }
 438
 439   bool runOnMachineFunction(MachineFunction &MF) override;
 440 };
 441
 442 } // end namespace anonymous
 443
 444 void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
 445                                       const char *Msg) const {
 446   const Function &Func = MI->getParent()->getParent()->getFunction();
 447   DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
 448   Func.getContext().diagnose(Diag);
 449 }
 450
 451 Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
 452 SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
 453                                SIAtomicAddrSpace InstrScope) const {
 454   if (SSID == SyncScope::System)
 455     return std::make_tuple(SIAtomicScope::SYSTEM,
 456                            SIAtomicAddrSpace::ATOMIC,
 457                            true);
 458   if (SSID == MMI->getAgentSSID())
 459     return std::make_tuple(SIAtomicScope::AGENT,
 460                            SIAtomicAddrSpace::ATOMIC,
 461                            true);
 462   if (SSID == MMI->getWorkgroupSSID())
 463     return std::make_tuple(SIAtomicScope::WORKGROUP,
 464                            SIAtomicAddrSpace::ATOMIC,
 465                            true);
 466   if (SSID == MMI->getWavefrontSSID())
 467     return std::make_tuple(SIAtomicScope::WAVEFRONT,
 468                            SIAtomicAddrSpace::ATOMIC,
 469                            true);
 470   if (SSID == SyncScope::SingleThread)
 471     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
 472                            SIAtomicAddrSpace::ATOMIC,
 473                            true);
 474   if (SSID == MMI->getSystemOneAddressSpaceSSID())
 475     return std::make_tuple(SIAtomicScope::SYSTEM,
 476                            SIAtomicAddrSpace::ATOMIC & InstrScope,
 477                            false);
 478   if (SSID == MMI->getAgentOneAddressSpaceSSID())
 479     return std::make_tuple(SIAtomicScope::AGENT,
 480                            SIAtomicAddrSpace::ATOMIC & InstrScope,
 481                            false);
 482   if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
 483     return std::make_tuple(SIAtomicScope::WORKGROUP,
 484                            SIAtomicAddrSpace::ATOMIC & InstrScope,
 485                            false);
 486   if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
 487     return std::make_tuple(SIAtomicScope::WAVEFRONT,
 488                            SIAtomicAddrSpace::ATOMIC & InstrScope,
 489                            false);
 490   if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
 491     return std::make_tuple(SIAtomicScope::SINGLETHREAD,
 492                            SIAtomicAddrSpace::ATOMIC & InstrScope,
 493                            false);
 494   return None;
 495 }
 496
 497 SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
 498   if (AS == AMDGPUAS::FLAT_ADDRESS)
 499     return SIAtomicAddrSpace::FLAT;
 500   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
 501     return SIAtomicAddrSpace::GLOBAL;
 502   if (AS == AMDGPUAS::LOCAL_ADDRESS)
 503     return SIAtomicAddrSpace::LDS;
 504   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
 505     return SIAtomicAddrSpace::SCRATCH;
 506   if (AS == AMDGPUAS::REGION_ADDRESS)
 507     return SIAtomicAddrSpace::GDS;
 508
 509   return SIAtomicAddrSpace::OTHER;
 510 }
 511
 512 SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
 513   MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
 514 }
 515
 516 Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
 517     const MachineBasicBlock::iterator &MI) const {
 518   assert(MI->getNumMemOperands() > 0);
 519
 520   SyncScope::ID SSID = SyncScope::SingleThread;
 521   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
 522   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
 523   SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
 524   bool IsNonTemporal = true;
 525
 526   // Validator should check whether or not MMOs cover the entire set of
 527   // locations accessed by the memory instruction.
 528   for (const auto &MMO : MI->memoperands()) {
 529     IsNonTemporal &= MMO->isNonTemporal();
 530     InstrAddrSpace |=
 531       toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
 532     AtomicOrdering OpOrdering = MMO->getOrdering();
 533     if (OpOrdering != AtomicOrdering::NotAtomic) {
 534       const auto &IsSyncScopeInclusion =
 535           MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
 536       if (!IsSyncScopeInclusion) {
 537         reportUnsupported(MI,
 538           "Unsupported non-inclusive atomic synchronization scope");
 539         return None;
 540       }
 541
 542       SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
 543       Ordering =
 544           isStrongerThan(Ordering, OpOrdering) ?
 545               Ordering : MMO->getOrdering();
 546       assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
 547              MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
 548       FailureOrdering =
 549           isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
 550               FailureOrdering : MMO->getFailureOrdering();
 551     }
 552   }
 553
 554   SIAtomicScope Scope = SIAtomicScope::NONE;
 555   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
 556   bool IsCrossAddressSpaceOrdering = false;
 557   if (Ordering != AtomicOrdering::NotAtomic) {
 558     auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
 559     if (!ScopeOrNone) {
 560       reportUnsupported(MI, "Unsupported atomic synchronization scope");
 561       return None;
 562     }
 563     std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
 564       ScopeOrNone.getValue();
 565     if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
 566         ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
 567       reportUnsupported(MI, "Unsupported atomic address space");
 568       return None;
 569     }
 570   }
 571   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
 572                      IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
 573 }
 574
 575 Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
 576     const MachineBasicBlock::iterator &MI) const {
 577   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 578
 579   if (!(MI->mayLoad() && !MI->mayStore()))
 580     return None;
 581
 582   // Be conservative if there are no memory operands.
 583   if (MI->getNumMemOperands() == 0)
 584     return SIMemOpInfo();
 585
 586   return constructFromMIWithMMO(MI);
 587 }
 588
 589 Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
 590     const MachineBasicBlock::iterator &MI) const {
 591   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 592
 593   if (!(!MI->mayLoad() && MI->mayStore()))
 594     return None;
 595
 596   // Be conservative if there are no memory operands.
 597   if (MI->getNumMemOperands() == 0)
 598     return SIMemOpInfo();
 599
 600   return constructFromMIWithMMO(MI);
 601 }
 602
 603 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
 604     const MachineBasicBlock::iterator &MI) const {
 605   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 606
 607   if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
 608     return None;
 609
 610   AtomicOrdering Ordering =
 611     static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
 612
 613   SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
 614   auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
 615   if (!ScopeOrNone) {
 616     reportUnsupported(MI, "Unsupported atomic synchronization scope");
 617     return None;
 618   }
 619
 620   SIAtomicScope Scope = SIAtomicScope::NONE;
 621   SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
 622   bool IsCrossAddressSpaceOrdering = false;
 623   std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
 624     ScopeOrNone.getValue();
 625
 626   if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
 627       ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
 628     reportUnsupported(MI, "Unsupported atomic address space");
 629     return None;
 630   }
 631
 632   return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
 633                      IsCrossAddressSpaceOrdering);
 634 }
 635
 636 Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
 637     const MachineBasicBlock::iterator &MI) const {
 638   assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
 639
 640   if (!(MI->mayLoad() && MI->mayStore()))
 641     return None;
 642
 643   // Be conservative if there are no memory operands.
 644   if (MI->getNumMemOperands() == 0)
 645     return SIMemOpInfo();
 646
 647   return constructFromMIWithMMO(MI);
 648 }
 649
 650 SICacheControl::SICacheControl(const GCNSubtarget &ST) {
 651   TII = ST.getInstrInfo();
 652   IV = getIsaVersion(ST.getCPU());
 653 }
 654
 655 /* static */
 656 std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
 657   GCNSubtarget::Generation Generation = ST.getGeneration();
 658   if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
 659     return std::make_unique<SIGfx6CacheControl>(ST);
 660   if (Generation < AMDGPUSubtarget::GFX10)
 661     return std::make_unique<SIGfx7CacheControl>(ST);
 662   return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
 663 }
 664
 665 bool SIGfx6CacheControl::enableLoadCacheBypass(
 666     const MachineBasicBlock::iterator &MI,
 667     SIAtomicScope Scope,
 668     SIAtomicAddrSpace AddrSpace) const {
 669   assert(MI->mayLoad() && !MI->mayStore());
 670   bool Changed = false;
 671
 672   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 673     /// TODO: Do not set glc for rmw atomic operations as they
 674     /// implicitly bypass the L1 cache.
 675
 676     switch (Scope) {
 677     case SIAtomicScope::SYSTEM:
 678     case SIAtomicScope::AGENT:
 679       Changed |= enableGLCBit(MI);
 680       break;
 681     case SIAtomicScope::WORKGROUP:
 682     case SIAtomicScope::WAVEFRONT:
 683     case SIAtomicScope::SINGLETHREAD:
 684       // No cache to bypass.
 685       break;
 686     default:
 687       llvm_unreachable("Unsupported synchronization scope");
 688     }
 689   }
 690
 691   /// The scratch address space does not need the global memory caches
 692   /// to be bypassed as all memory operations by the same thread are
 693   /// sequentially consistent, and no other thread can access scratch
 694   /// memory.
 695
 696   /// Other address spaces do not hava a cache.
 697
 698   return Changed;
 699 }
 700
 701 bool SIGfx6CacheControl::enableNonTemporal(
 702     const MachineBasicBlock::iterator &MI) const {
 703   assert(MI->mayLoad() ^ MI->mayStore());
 704   bool Changed = false;
 705
 706   /// TODO: Do not enableGLCBit if rmw atomic.
 707   Changed |= enableGLCBit(MI);
 708   Changed |= enableSLCBit(MI);
 709
 710   return Changed;
 711 }
 712
 713 bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
 714                                                SIAtomicScope Scope,
 715                                                SIAtomicAddrSpace AddrSpace,
 716                                                Position Pos) const {
 717   bool Changed = false;
 718
 719   MachineBasicBlock &MBB = *MI->getParent();
 720   DebugLoc DL = MI->getDebugLoc();
 721
 722   if (Pos == Position::AFTER)
 723     ++MI;
 724
 725   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 726     switch (Scope) {
 727     case SIAtomicScope::SYSTEM:
 728     case SIAtomicScope::AGENT:
 729       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
 730       Changed = true;
 731       break;
 732     case SIAtomicScope::WORKGROUP:
 733     case SIAtomicScope::WAVEFRONT:
 734     case SIAtomicScope::SINGLETHREAD:
 735       // No cache to invalidate.
 736       break;
 737     default:
 738       llvm_unreachable("Unsupported synchronization scope");
 739     }
 740   }
 741
 742   /// The scratch address space does not need the global memory cache
 743   /// to be flushed as all memory operations by the same thread are
 744   /// sequentially consistent, and no other thread can access scratch
 745   /// memory.
 746
 747   /// Other address spaces do not hava a cache.
 748
 749   if (Pos == Position::AFTER)
 750     --MI;
 751
 752   return Changed;
 753 }
 754
 755 bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
 756                                     SIAtomicScope Scope,
 757                                     SIAtomicAddrSpace AddrSpace,
 758                                     SIMemOp Op,
 759                                     bool IsCrossAddrSpaceOrdering,
 760                                     Position Pos) const {
 761   bool Changed = false;
 762
 763   MachineBasicBlock &MBB = *MI->getParent();
 764   DebugLoc DL = MI->getDebugLoc();
 765
 766   if (Pos == Position::AFTER)
 767     ++MI;
 768
 769   bool VMCnt = false;
 770   bool LGKMCnt = false;
 771
 772   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 773     switch (Scope) {
 774     case SIAtomicScope::SYSTEM:
 775     case SIAtomicScope::AGENT:
 776       VMCnt |= true;
 777       break;
 778     case SIAtomicScope::WORKGROUP:
 779     case SIAtomicScope::WAVEFRONT:
 780     case SIAtomicScope::SINGLETHREAD:
 781       // The L1 cache keeps all memory operations in order for
 782       // wavefronts in the same work-group.
 783       break;
 784     default:
 785       llvm_unreachable("Unsupported synchronization scope");
 786     }
 787   }
 788
 789   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
 790     switch (Scope) {
 791     case SIAtomicScope::SYSTEM:
 792     case SIAtomicScope::AGENT:
 793     case SIAtomicScope::WORKGROUP:
 794       // If no cross address space ordering then an LDS waitcnt is not
 795       // needed as LDS operations for all waves are executed in a
 796       // total global ordering as observed by all waves. Required if
 797       // also synchronizing with global/GDS memory as LDS operations
 798       // could be reordered with respect to later global/GDS memory
 799       // operations of the same wave.
 800       LGKMCnt |= IsCrossAddrSpaceOrdering;
 801       break;
 802     case SIAtomicScope::WAVEFRONT:
 803     case SIAtomicScope::SINGLETHREAD:
 804       // The LDS keeps all memory operations in order for
 805       // the same wavesfront.
 806       break;
 807     default:
 808       llvm_unreachable("Unsupported synchronization scope");
 809     }
 810   }
 811
 812   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
 813     switch (Scope) {
 814     case SIAtomicScope::SYSTEM:
 815     case SIAtomicScope::AGENT:
 816       // If no cross address space ordering then an GDS waitcnt is not
 817       // needed as GDS operations for all waves are executed in a
 818       // total global ordering as observed by all waves. Required if
 819       // also synchronizing with global/LDS memory as GDS operations
 820       // could be reordered with respect to later global/LDS memory
 821       // operations of the same wave.
 822       LGKMCnt |= IsCrossAddrSpaceOrdering;
 823       break;
 824     case SIAtomicScope::WORKGROUP:
 825     case SIAtomicScope::WAVEFRONT:
 826     case SIAtomicScope::SINGLETHREAD:
 827       // The GDS keeps all memory operations in order for
 828       // the same work-group.
 829       break;
 830     default:
 831       llvm_unreachable("Unsupported synchronization scope");
 832     }
 833   }
 834
 835   if (VMCnt || LGKMCnt) {
 836     unsigned WaitCntImmediate =
 837       AMDGPU::encodeWaitcnt(IV,
 838                             VMCnt ? 0 : getVmcntBitMask(IV),
 839                             getExpcntBitMask(IV),
 840                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
 841     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
 842     Changed = true;
 843   }
 844
 845   if (Pos == Position::AFTER)
 846     --MI;
 847
 848   return Changed;
 849 }
 850
 851 bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
 852                                                SIAtomicScope Scope,
 853                                                SIAtomicAddrSpace AddrSpace,
 854                                                Position Pos) const {
 855   bool Changed = false;
 856
 857   MachineBasicBlock &MBB = *MI->getParent();
 858   DebugLoc DL = MI->getDebugLoc();
 859
 860   const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
 861
 862   const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
 863                              ? AMDGPU::BUFFER_WBINVL1
 864                              : AMDGPU::BUFFER_WBINVL1_VOL;
 865
 866   if (Pos == Position::AFTER)
 867     ++MI;
 868
 869   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 870     switch (Scope) {
 871     case SIAtomicScope::SYSTEM:
 872     case SIAtomicScope::AGENT:
 873       BuildMI(MBB, MI, DL, TII->get(Flush));
 874       Changed = true;
 875       break;
 876     case SIAtomicScope::WORKGROUP:
 877     case SIAtomicScope::WAVEFRONT:
 878     case SIAtomicScope::SINGLETHREAD:
 879       // No cache to invalidate.
 880       break;
 881     default:
 882       llvm_unreachable("Unsupported synchronization scope");
 883     }
 884   }
 885
 886   /// The scratch address space does not need the global memory cache
 887   /// to be flushed as all memory operations by the same thread are
 888   /// sequentially consistent, and no other thread can access scratch
 889   /// memory.
 890
 891   /// Other address spaces do not hava a cache.
 892
 893   if (Pos == Position::AFTER)
 894     --MI;
 895
 896   return Changed;
 897 }
 898
 899 bool SIGfx10CacheControl::enableLoadCacheBypass(
 900     const MachineBasicBlock::iterator &MI,
 901     SIAtomicScope Scope,
 902     SIAtomicAddrSpace AddrSpace) const {
 903   assert(MI->mayLoad() && !MI->mayStore());
 904   bool Changed = false;
 905
 906   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 907     /// TODO Do not set glc for rmw atomic operations as they
 908     /// implicitly bypass the L0/L1 caches.
 909
 910     switch (Scope) {
 911     case SIAtomicScope::SYSTEM:
 912     case SIAtomicScope::AGENT:
 913       Changed |= enableGLCBit(MI);
 914       Changed |= enableDLCBit(MI);
 915       break;
 916     case SIAtomicScope::WORKGROUP:
 917       // In WGP mode the waves of a work-group can be executing on either CU of
 918       // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
 919       // CU mode and all waves of a work-group are on the same CU, and so the
 920       // L0 does not need to be bypassed.
 921       if (!CuMode) Changed |= enableGLCBit(MI);
 922       break;
 923     case SIAtomicScope::WAVEFRONT:
 924     case SIAtomicScope::SINGLETHREAD:
 925       // No cache to bypass.
 926       break;
 927     default:
 928       llvm_unreachable("Unsupported synchronization scope");
 929     }
 930   }
 931
 932   /// The scratch address space does not need the global memory caches
 933   /// to be bypassed as all memory operations by the same thread are
 934   /// sequentially consistent, and no other thread can access scratch
 935   /// memory.
 936
 937   /// Other address spaces do not hava a cache.
 938
 939   return Changed;
 940 }
 941
 942 bool SIGfx10CacheControl::enableNonTemporal(
 943     const MachineBasicBlock::iterator &MI) const {
 944   assert(MI->mayLoad() ^ MI->mayStore());
 945   bool Changed = false;
 946
 947   Changed |= enableSLCBit(MI);
 948   /// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
 949
 950   return Changed;
 951 }
 952
 953 bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
 954                                                 SIAtomicScope Scope,
 955                                                 SIAtomicAddrSpace AddrSpace,
 956                                                 Position Pos) const {
 957   bool Changed = false;
 958
 959   MachineBasicBlock &MBB = *MI->getParent();
 960   DebugLoc DL = MI->getDebugLoc();
 961
 962   if (Pos == Position::AFTER)
 963     ++MI;
 964
 965   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
 966     switch (Scope) {
 967     case SIAtomicScope::SYSTEM:
 968     case SIAtomicScope::AGENT:
 969       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
 970       BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
 971       Changed = true;
 972       break;
 973     case SIAtomicScope::WORKGROUP:
 974       // In WGP mode the waves of a work-group can be executing on either CU of
 975       // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
 976       // in CU mode and all waves of a work-group are on the same CU, and so the
 977       // L0 does not need to be invalidated.
 978       if (!CuMode) {
 979         BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
 980         Changed = true;
 981       }
 982       break;
 983     case SIAtomicScope::WAVEFRONT:
 984     case SIAtomicScope::SINGLETHREAD:
 985       // No cache to invalidate.
 986       break;
 987     default:
 988       llvm_unreachable("Unsupported synchronization scope");
 989     }
 990   }
 991
 992   /// The scratch address space does not need the global memory cache
 993   /// to be flushed as all memory operations by the same thread are
 994   /// sequentially consistent, and no other thread can access scratch
 995   /// memory.
 996
 997   /// Other address spaces do not hava a cache.
 998
 999   if (Pos == Position::AFTER)
1000     --MI;
1001
1002   return Changed;
1003 }
1004
1005 bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1006                                      SIAtomicScope Scope,
1007                                      SIAtomicAddrSpace AddrSpace,
1008                                      SIMemOp Op,
1009                                      bool IsCrossAddrSpaceOrdering,
1010                                      Position Pos) const {
1011   bool Changed = false;
1012
1013   MachineBasicBlock &MBB = *MI->getParent();
1014   DebugLoc DL = MI->getDebugLoc();
1015
1016   if (Pos == Position::AFTER)
1017     ++MI;
1018
1019   bool VMCnt = false;
1020   bool VSCnt = false;
1021   bool LGKMCnt = false;
1022
1023   if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1024     switch (Scope) {
1025     case SIAtomicScope::SYSTEM:
1026     case SIAtomicScope::AGENT:
1027       if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1028         VMCnt |= true;
1029       if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1030         VSCnt |= true;
1031       break;
1032     case SIAtomicScope::WORKGROUP:
1033       // In WGP mode the waves of a work-group can be executing on either CU of
1034       // the WGP. Therefore need to wait for operations to complete to ensure
1035       // they are visible to waves in the other CU as the L0 is per CU.
1036       // Otherwise in CU mode and all waves of a work-group are on the same CU
1037       // which shares the same L0.
1038       if (!CuMode) {
1039         if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
1040           VMCnt |= true;
1041         if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
1042           VSCnt |= true;
1043       }
1044       break;
1045     case SIAtomicScope::WAVEFRONT:
1046     case SIAtomicScope::SINGLETHREAD:
1047       // The L0 cache keeps all memory operations in order for
1048       // work-items in the same wavefront.
1049       break;
1050     default:
1051       llvm_unreachable("Unsupported synchronization scope");
1052     }
1053   }
1054
1055   if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1056     switch (Scope) {
1057     case SIAtomicScope::SYSTEM:
1058     case SIAtomicScope::AGENT:
1059     case SIAtomicScope::WORKGROUP:
1060       // If no cross address space ordering then an LDS waitcnt is not
1061       // needed as LDS operations for all waves are executed in a
1062       // total global ordering as observed by all waves. Required if
1063       // also synchronizing with global/GDS memory as LDS operations
1064       // could be reordered with respect to later global/GDS memory
1065       // operations of the same wave.
1066       LGKMCnt |= IsCrossAddrSpaceOrdering;
1067       break;
1068     case SIAtomicScope::WAVEFRONT:
1069     case SIAtomicScope::SINGLETHREAD:
1070       // The LDS keeps all memory operations in order for
1071       // the same wavesfront.
1072       break;
1073     default:
1074       llvm_unreachable("Unsupported synchronization scope");
1075     }
1076   }
1077
1078   if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1079     switch (Scope) {
1080     case SIAtomicScope::SYSTEM:
1081     case SIAtomicScope::AGENT:
1082       // If no cross address space ordering then an GDS waitcnt is not
1083       // needed as GDS operations for all waves are executed in a
1084       // total global ordering as observed by all waves. Required if
1085       // also synchronizing with global/LDS memory as GDS operations
1086       // could be reordered with respect to later global/LDS memory
1087       // operations of the same wave.
1088       LGKMCnt |= IsCrossAddrSpaceOrdering;
1089       break;
1090     case SIAtomicScope::WORKGROUP:
1091     case SIAtomicScope::WAVEFRONT:
1092     case SIAtomicScope::SINGLETHREAD:
1093       // The GDS keeps all memory operations in order for
1094       // the same work-group.
1095       break;
1096     default:
1097       llvm_unreachable("Unsupported synchronization scope");
1098     }
1099   }
1100
1101   if (VMCnt || LGKMCnt) {
1102     unsigned WaitCntImmediate =
1103       AMDGPU::encodeWaitcnt(IV,
1104                             VMCnt ? 0 : getVmcntBitMask(IV),
1105                             getExpcntBitMask(IV),
1106                             LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1107     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
1108     Changed = true;
1109   }
1110
1111   if (VSCnt) {
1112     BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1113       .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1114       .addImm(0);
1115     Changed = true;
1116   }
1117
1118   if (Pos == Position::AFTER)
1119     --MI;
1120
1121   return Changed;
1122 }
1123
1124 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
1125   if (AtomicPseudoMIs.empty())
1126     return false;
1127
1128   for (auto &MI : AtomicPseudoMIs)
1129     MI->eraseFromParent();
1130
1131   AtomicPseudoMIs.clear();
1132   return true;
1133 }
1134
1135 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
1136                                    MachineBasicBlock::iterator &MI) {
1137   assert(MI->mayLoad() && !MI->mayStore());
1138
1139   bool Changed = false;
1140
1141   if (MOI.isAtomic()) {
1142     if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
1143         MOI.getOrdering() == AtomicOrdering::Acquire ||
1144         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1145       Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
1146                                            MOI.getOrderingAddrSpace());
1147     }
1148
1149     if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1150       Changed |= CC->insertWait(MI, MOI.getScope(),
1151                                 MOI.getOrderingAddrSpace(),
1152                                 SIMemOp::LOAD | SIMemOp::STORE,
1153                                 MOI.getIsCrossAddressSpaceOrdering(),
1154                                 Position::BEFORE);
1155
1156     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1157         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
1158       Changed |= CC->insertWait(MI, MOI.getScope(),
1159                                 MOI.getInstrAddrSpace(),
1160                                 SIMemOp::LOAD,
1161                                 MOI.getIsCrossAddressSpaceOrdering(),
1162                                 Position::AFTER);
1163       Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
1164                                            MOI.getOrderingAddrSpace(),
1165                                            Position::AFTER);
1166     }
1167
1168     return Changed;
1169   }
1170
1171   // Atomic instructions do not have the nontemporal attribute.
1172   if (MOI.isNonTemporal()) {
1173     Changed |= CC->enableNonTemporal(MI);
1174     return Changed;
1175   }
1176
1177   return Changed;
1178 }
1179
1180 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
1181                                     MachineBasicBlock::iterator &MI) {
1182   assert(!MI->mayLoad() && MI->mayStore());
1183
1184   bool Changed = false;
1185
1186   if (MOI.isAtomic()) {
1187     if (MOI.getOrdering() == AtomicOrdering::Release ||
1188         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1189       Changed |= CC->insertWait(MI, MOI.getScope(),
1190                                 MOI.getOrderingAddrSpace(),
1191                                 SIMemOp::LOAD | SIMemOp::STORE,
1192                                 MOI.getIsCrossAddressSpaceOrdering(),
1193                                 Position::BEFORE);
1194
1195     return Changed;
1196   }
1197
1198   // Atomic instructions do not have the nontemporal attribute.
1199   if (MOI.isNonTemporal()) {
1200     Changed |= CC->enableNonTemporal(MI);
1201     return Changed;
1202   }
1203
1204   return Changed;
1205 }
1206
1207 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
1208                                           MachineBasicBlock::iterator &MI) {
1209   assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
1210
1211   AtomicPseudoMIs.push_back(MI);
1212   bool Changed = false;
1213
1214   if (MOI.isAtomic()) {
1215     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1216         MOI.getOrdering() == AtomicOrdering::Release ||
1217         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1218         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1219       /// TODO: This relies on a barrier always generating a waitcnt
1220       /// for LDS to ensure it is not reordered with the completion of
1221       /// the proceeding LDS operations. If barrier had a memory
1222       /// ordering and memory scope, then library does not need to
1223       /// generate a fence. Could add support in this file for
1224       /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
1225       /// adding waitcnt before a S_BARRIER.
1226       Changed |= CC->insertWait(MI, MOI.getScope(),
1227                                 MOI.getOrderingAddrSpace(),
1228                                 SIMemOp::LOAD | SIMemOp::STORE,
1229                                 MOI.getIsCrossAddressSpaceOrdering(),
1230                                 Position::BEFORE);
1231
1232     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1233         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1234         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
1235       Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
1236                                            MOI.getOrderingAddrSpace(),
1237                                            Position::BEFORE);
1238
1239     return Changed;
1240   }
1241
1242   return Changed;
1243 }
1244
1245 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
1246   MachineBasicBlock::iterator &MI) {
1247   assert(MI->mayLoad() && MI->mayStore());
1248
1249   bool Changed = false;
1250
1251   if (MOI.isAtomic()) {
1252     if (MOI.getOrdering() == AtomicOrdering::Release ||
1253         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1254         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1255         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
1256       Changed |= CC->insertWait(MI, MOI.getScope(),
1257                                 MOI.getOrderingAddrSpace(),
1258                                 SIMemOp::LOAD | SIMemOp::STORE,
1259                                 MOI.getIsCrossAddressSpaceOrdering(),
1260                                 Position::BEFORE);
1261
1262     if (MOI.getOrdering() == AtomicOrdering::Acquire ||
1263         MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
1264         MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
1265         MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
1266         MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
1267       Changed |= CC->insertWait(MI, MOI.getScope(),
1268                                 MOI.getOrderingAddrSpace(),
1269                                 isAtomicRet(*MI) ? SIMemOp::LOAD :
1270                                                    SIMemOp::STORE,
1271                                 MOI.getIsCrossAddressSpaceOrdering(),
1272                                 Position::AFTER);
1273       Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
1274                                            MOI.getOrderingAddrSpace(),
1275                                            Position::AFTER);
1276     }
1277
1278     return Changed;
1279   }
1280
1281   return Changed;
1282 }
1283
1284 bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
1285   bool Changed = false;
1286
1287   SIMemOpAccess MOA(MF);
1288   CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
1289
1290   for (auto &MBB : MF) {
1291     for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
1292       if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
1293         continue;
1294
1295       if (const auto &MOI = MOA.getLoadInfo(MI))
1296         Changed |= expandLoad(MOI.getValue(), MI);
1297       else if (const auto &MOI = MOA.getStoreInfo(MI))
1298         Changed |= expandStore(MOI.getValue(), MI);
1299       else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
1300         Changed |= expandAtomicFence(MOI.getValue(), MI);
1301       else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
1302         Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
1303     }
1304   }
1305
1306   Changed |= removeAtomicPseudoMIs();
1307   return Changed;
1308 }
1309
1310 INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
1311
1312 char SIMemoryLegalizer::ID = 0;
1313 char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
1314
1315 FunctionPass *llvm::createSIMemoryLegalizerPass() {
1316   return new SIMemoryLegalizer();
1317 }