//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};

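// Note (illustrative, not part of the original pass logic): because of
// LLVM_MARK_AS_BITMASK_ENUM above, address space values compose with the
// usual bitwise operators, e.g.
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool TouchesLDS = (AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE;
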
/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}

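// Usage sketch (illustrative): enableNamedBit<AMDGPU::OpName::glc>(MI) turns
// on the glc modifier of a memory instruction if that operand exists and is
// not already set; SIGfx6CacheControl below wraps this for the glc and slc
// bits.
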
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

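  // Note: the default arguments below describe the most conservative
  // assumption (sequentially consistent, system scope, all address spaces,
  // cross address space ordering). SIMemOpAccess constructs a default
  // SIMemOpInfo() when an instruction carries no memory operands.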
  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                  AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
      : Ordering(Ordering), FailureOrdering(FailureOrdering),
        Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
        InstrAddrSpace(InstrAddrSpace),
        IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
        IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      IsCrossAddressSpaceOrdering = false;
  }

public:

  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }
};

class SIMemOpAccess final {
private:
  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \returns A bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:
  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};

class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p AddrSpace.
  /// Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
      const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure any caches associated with
  /// address spaces \p AddrSpace for memory scopes up to memory scope
  /// \p Scope are invalidated. Returns true iff any instructions are
  /// inserted.
  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions of kind \p Op
  /// associated with address spaces \p AddrSpace have completed as
  /// observed by other memory instructions executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  /// ordering is between address spaces. Returns true iff any
  /// instructions are inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;
};

class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};

class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;
};

class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous

void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}

Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  /// TODO: For now assume OpenCL memory model which treats each
  /// address space as having a separate happens-before relation, and
  /// so an instruction only has ordering with respect to the address
  /// space it accesses, and if it accesses multiple address spaces it
  /// does not require ordering of operations in different address
  /// spaces.
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           false);
  /// TODO: To support the HSA memory model, additional memory scopes
  /// that do require cross address space ordering need to be added.
  return None;
}

SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}

SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}

Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
            "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}

Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

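// The ATOMIC_FENCE pseudo carries its ordering as immediate operand 0 and its
// synchronization scope ID as immediate operand 1; the decoding below relies
// on that layout.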
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
      static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering);
}

Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}

SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
}

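// Select the cache-control implementation for the subtarget: gfx6 (Southern
// Islands) uses SIGfx6CacheControl, everything newer uses SIGfx7CacheControl,
// which only differs in the cache invalidation instruction it emits.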
/* static */
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return make_unique<SIGfx6CacheControl>(ST);
  return make_unique<SIGfx7CacheControl>(ST);
}

bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}

bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}

bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;
  bool EXPCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an LDS waitcnt is not
      // needed as LDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/GDS memory as LDS operations
      // could be reordered with respect to later global/GDS memory
      // operations of the same wave.
      LGKMCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS waitcnt is not
      // needed as GDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/LDS memory as GDS operations
      // could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      EXPCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

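  // Emit at most one S_WAITCNT covering all required counters. A count of 0
  // forces the corresponding counter to drain, while getVmcntBitMask /
  // getExpcntBitMask / getLgkmcntBitMask yield the field's maximum value,
  // which leaves that counter unconstrained.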
  if (VMCnt || LGKMCnt || EXPCnt) {
    unsigned WaitCntImmediate =
        AMDGPU::encodeWaitcnt(IV,
                              VMCnt ? 0 : getVmcntBitMask(IV),
                              EXPCnt ? 0 : getExpcntBitMask(IV),
                              LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
                             ? AMDGPU::BUFFER_WBINVL1
                             : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(Flush));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}

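// Rough sketch of the net effect (illustrative, assuming gfx7 and an
// agent-scope acquire load from global memory): the load gets the glc bit so
// it bypasses the L1, an "s_waitcnt vmcnt(0)" is inserted after it, and then a
// "buffer_wbinvl1_vol" invalidates the L1 so later loads cannot observe stale
// data.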
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then library does not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnts.cpp could then stop unconditionally
      /// adding waitcnt before an S_BARRIER.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}

bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}

INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}