//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Memory legalizer - implements memory model. More information can be
/// found here:
///   http://llvm.org/docs/AMDGPUUsage.html#memory-model
//
//===----------------------------------------------------------------------===//
#include "AMDGPUMachineModuleInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/BitmaskEnum.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <list>
using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "si-memory-legalizer"
#define PASS_NAME "SI Memory Legalizer"

namespace {

LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();

/// Memory operation flags. Can be ORed together.
enum class SIMemOp {
  NONE = 0u,
  LOAD = 1u << 0,
  STORE = 1u << 1,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
};

/// Position to insert a new instruction relative to an existing
/// instruction.
enum class Position {
  BEFORE,
  AFTER
};

/// The atomic synchronization scopes supported by the AMDGPU target.
enum class SIAtomicScope {
  NONE,
  SINGLETHREAD,
  WAVEFRONT,
  WORKGROUP,
  AGENT,
  SYSTEM
};

/// The distinct address spaces supported by the AMDGPU target for
/// atomic memory operations. Can be ORed together.
enum class SIAtomicAddrSpace {
  NONE = 0u,
  GLOBAL = 1u << 0,
  LDS = 1u << 1,
  SCRATCH = 1u << 2,
  GDS = 1u << 3,
  OTHER = 1u << 4,

  /// The address spaces that can be accessed by a FLAT instruction.
  FLAT = GLOBAL | LDS | SCRATCH,

  /// The address spaces that support atomic instructions.
  ATOMIC = GLOBAL | LDS | SCRATCH | GDS,

  /// All address spaces.
  ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
/// Sets named bit \p BitName to "true" if present in instruction \p MI.
/// \returns True if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) {
  int BitIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
  if (BitIdx == -1)
    return false;

  MachineOperand &Bit = MI->getOperand(BitIdx);
  if (Bit.getImm() != 0)
    return false;

  Bit.setImm(1);
  return true;
}
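// Illustrative usage (mirrors the cache-control helpers below): the template
// argument selects which named operand to flip, e.g.
//   enableNamedBit<AMDGPU::OpName::glc>(MI);
// The helper returns false when the instruction has no such operand or the
// bit is already set, so callers can OR the result into a "Changed" flag.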
class SIMemOpInfo final {
private:

  friend class SIMemOpAccess;

  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicScope Scope = SIAtomicScope::SYSTEM;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  bool IsNonTemporal = false;

  SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
              SIAtomicScope Scope = SIAtomicScope::SYSTEM,
              SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
              SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
              bool IsCrossAddressSpaceOrdering = true,
              AtomicOrdering FailureOrdering =
                AtomicOrdering::SequentiallyConsistent,
              bool IsNonTemporal = false)
    : Ordering(Ordering), FailureOrdering(FailureOrdering),
      Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
      InstrAddrSpace(InstrAddrSpace),
      IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
      IsNonTemporal(IsNonTemporal) {
    // There is also no cross address space ordering if the ordering
    // address space is the same as the instruction address space and
    // only contains a single address space.
    if ((OrderingAddrSpace == InstrAddrSpace) &&
        isPowerOf2_32(uint32_t(InstrAddrSpace)))
      this->IsCrossAddressSpaceOrdering = false;
  }

public:
  /// \returns Atomic synchronization scope of the machine instruction used to
  /// create this SIMemOpInfo.
  SIAtomicScope getScope() const {
    return Scope;
  }

  /// \returns Ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getOrdering() const {
    return Ordering;
  }

  /// \returns Failure ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo.
  AtomicOrdering getFailureOrdering() const {
    return FailureOrdering;
  }

  /// \returns The address spaces accessed by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getInstrAddrSpace() const {
    return InstrAddrSpace;
  }

  /// \returns The address spaces that must be ordered by the machine
  /// instruction used to create this SIMemOpInfo.
  SIAtomicAddrSpace getOrderingAddrSpace() const {
    return OrderingAddrSpace;
  }

  /// \returns True iff memory ordering of operations on
  /// different address spaces is required.
  bool getIsCrossAddressSpaceOrdering() const {
    return IsCrossAddressSpaceOrdering;
  }

  /// \returns True if memory access of the machine instruction used to
  /// create this SIMemOpInfo is non-temporal, false otherwise.
  bool isNonTemporal() const {
    return IsNonTemporal;
  }

  /// \returns True if ordering constraint of the machine instruction used to
  /// create this SIMemOpInfo is unordered or higher, false otherwise.
  bool isAtomic() const {
    return Ordering != AtomicOrdering::NotAtomic;
  }
};
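// For example (illustrative): a global atomic load typically has
// InstrAddrSpace == OrderingAddrSpace == SIAtomicAddrSpace::GLOBAL, a single
// address space, so the constructor clears IsCrossAddressSpaceOrdering; a
// flat atomic covers GLOBAL | LDS | SCRATCH and therefore keeps whatever
// value was passed in.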
class SIMemOpAccess final {
private:

  AMDGPUMachineModuleInfo *MMI = nullptr;

  /// Reports unsupported message \p Msg for \p MI to LLVM context.
  void reportUnsupported(const MachineBasicBlock::iterator &MI,
                         const char *Msg) const;

  /// Inspects the target synchronization scope \p SSID and determines
  /// the SI atomic scope it corresponds to, the address spaces it
  /// covers, and whether the memory ordering applies between address
  /// spaces.
  Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
  toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrScope) const;

  /// \return Return a bit set of the address spaces accessed by \p AS.
  SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;

  /// \returns Info constructed from \p MI, which has at least one machine
  /// memory operand.
  Optional<SIMemOpInfo> constructFromMIWithMMO(
      const MachineBasicBlock::iterator &MI) const;

public:

  /// Construct class to support accessing the machine memory operands
  /// of instructions in the machine function \p MF.
  SIMemOpAccess(MachineFunction &MF);

  /// \returns Load info if \p MI is a load operation, "None" otherwise.
  Optional<SIMemOpInfo> getLoadInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Store info if \p MI is a store operation, "None" otherwise.
  Optional<SIMemOpInfo> getStoreInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic fence info if \p MI is an atomic fence operation,
  /// "None" otherwise.
  Optional<SIMemOpInfo> getAtomicFenceInfo(
      const MachineBasicBlock::iterator &MI) const;

  /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
  /// rmw operation, "None" otherwise.
  Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
      const MachineBasicBlock::iterator &MI) const;
};
class SICacheControl {
protected:

  /// Instruction info.
  const SIInstrInfo *TII = nullptr;

  IsaVersion IV;

  SICacheControl(const GCNSubtarget &ST);

public:

  /// Create a cache control for the subtarget \p ST.
  static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);

  /// Update \p MI memory load instruction to bypass any caches up to
  /// the \p Scope memory scope for address spaces \p
  /// AddrSpace. Return true iff the instruction was modified.
  virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace) const = 0;

  /// Update \p MI memory instruction to indicate it is
  /// nontemporal. Return true iff the instruction was modified.
  virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
    const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure any caches associated with
  /// address spaces \p AddrSpace for memory scopes up to memory scope
  /// \p Scope are invalidated. Returns true iff any instructions
  /// inserted.
  virtual bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                     SIAtomicScope Scope,
                                     SIAtomicAddrSpace AddrSpace,
                                     Position Pos) const = 0;

  /// Inserts any necessary instructions at position \p Pos relative
  /// to instruction \p MI to ensure memory instructions of kind \p Op
  /// associated with address spaces \p AddrSpace have completed as
  /// observed by other memory instructions executing in memory scope
  /// \p Scope. \p IsCrossAddrSpaceOrdering indicates if the memory
  /// ordering is between address spaces. Returns true iff any
  /// instructions inserted.
  virtual bool insertWait(MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace,
                          SIMemOp Op,
                          bool IsCrossAddrSpaceOrdering,
                          Position Pos) const = 0;

  /// Virtual destructor to allow derivations to be deleted.
  virtual ~SICacheControl() = default;
};
class SIGfx6CacheControl : public SICacheControl {
protected:

  /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::glc>(MI);
  }

  /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
  /// is modified, false otherwise.
  bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
    return enableNamedBit<AMDGPU::OpName::slc>(MI);
  }

public:

  SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};

  bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace) const override;

  bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;

  bool insertWait(MachineBasicBlock::iterator &MI,
                  SIAtomicScope Scope,
                  SIAtomicAddrSpace AddrSpace,
                  SIMemOp Op,
                  bool IsCrossAddrSpaceOrdering,
                  Position Pos) const override;
};
class SIGfx7CacheControl : public SIGfx6CacheControl {
public:

  SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};

  bool insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                             SIAtomicScope Scope,
                             SIAtomicAddrSpace AddrSpace,
                             Position Pos) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
private:

  /// Cache Control.
  std::unique_ptr<SICacheControl> CC = nullptr;

  /// List of atomic pseudo instructions.
  std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

  /// Return true iff instruction \p MI is an atomic instruction that
  /// returns a result.
  bool isAtomicRet(const MachineInstr &MI) const {
    return AMDGPU::getAtomicNoRetOp(MI.getOpcode()) != -1;
  }

  /// Removes all processed atomic pseudo instructions from the current
  /// function. Returns true if current function is modified, false otherwise.
  bool removeAtomicPseudoMIs();

  /// Expands load operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandLoad(const SIMemOpInfo &MOI,
                  MachineBasicBlock::iterator &MI);
  /// Expands store operation \p MI. Returns true if instructions are
  /// added/deleted or \p MI is modified, false otherwise.
  bool expandStore(const SIMemOpInfo &MOI,
                   MachineBasicBlock::iterator &MI);
  /// Expands atomic fence operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicFence(const SIMemOpInfo &MOI,
                         MachineBasicBlock::iterator &MI);
  /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
  /// instructions are added/deleted or \p MI is modified, false otherwise.
  bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                MachineBasicBlock::iterator &MI);

public:
  static char ID;

  SIMemoryLegalizer() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  StringRef getPassName() const override {
    return PASS_NAME;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // end namespace anonymous
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
                                      const char *Msg) const {
  const Function &Func = MI->getParent()->getParent()->getFunction();
  DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
  Func.getContext().diagnose(Diag);
}
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
                               SIAtomicAddrSpace InstrScope) const {
  /// TODO: For now assume OpenCL memory model which treats each
  /// address space as having a separate happens-before relation, and
  /// so an instruction only has ordering with respect to the address
  /// space it accesses, and if it accesses multiple address spaces it
  /// does not require ordering of operations in different address
  /// spaces.
  if (SSID == SyncScope::System)
    return std::make_tuple(SIAtomicScope::SYSTEM,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           true);
  if (SSID == MMI->getAgentSSID())
    return std::make_tuple(SIAtomicScope::AGENT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           true);
  if (SSID == MMI->getWorkgroupSSID())
    return std::make_tuple(SIAtomicScope::WORKGROUP,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           true);
  if (SSID == MMI->getWavefrontSSID())
    return std::make_tuple(SIAtomicScope::WAVEFRONT,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           true);
  if (SSID == SyncScope::SingleThread)
    return std::make_tuple(SIAtomicScope::SINGLETHREAD,
                           SIAtomicAddrSpace::ATOMIC & InstrScope,
                           true);
  /// TODO: To support HSA Memory Model need to add additional memory
  /// scopes that specify that they do not require cross address space
  /// ordering.
  return None;
}
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
  if (AS == AMDGPUAS::FLAT_ADDRESS)
    return SIAtomicAddrSpace::FLAT;
  if (AS == AMDGPUAS::GLOBAL_ADDRESS)
    return SIAtomicAddrSpace::GLOBAL;
  if (AS == AMDGPUAS::LOCAL_ADDRESS)
    return SIAtomicAddrSpace::LDS;
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
    return SIAtomicAddrSpace::SCRATCH;
  if (AS == AMDGPUAS::REGION_ADDRESS)
    return SIAtomicAddrSpace::GDS;

  return SIAtomicAddrSpace::OTHER;
}
SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
  MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
}
Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getNumMemOperands() > 0);

  SyncScope::ID SSID = SyncScope::SingleThread;
  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
  AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
  SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsNonTemporal = true;

  // Validator should check whether or not MMOs cover the entire set of
  // locations accessed by the memory instruction.
  for (const auto &MMO : MI->memoperands()) {
    IsNonTemporal &= MMO->isNonTemporal();
    InstrAddrSpace |=
        toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
    AtomicOrdering OpOrdering = MMO->getOrdering();
    if (OpOrdering != AtomicOrdering::NotAtomic) {
      const auto &IsSyncScopeInclusion =
          MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
      if (!IsSyncScopeInclusion) {
        reportUnsupported(MI,
          "Unsupported non-inclusive atomic synchronization scope");
        return None;
      }

      SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
      Ordering =
          isStrongerThan(Ordering, OpOrdering) ?
              Ordering : MMO->getOrdering();
      assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
             MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
      FailureOrdering =
          isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
              FailureOrdering : MMO->getFailureOrdering();
    }
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  if (Ordering != AtomicOrdering::NotAtomic) {
    auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
    if (!ScopeOrNone) {
      reportUnsupported(MI, "Unsupported atomic synchronization scope");
      return None;
    }
    std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
        ScopeOrNone.getValue();
    if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
        ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
      reportUnsupported(MI, "Unsupported atomic address space");
      return None;
    }
  }
  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
                     IsCrossAddressSpaceOrdering, FailureOrdering,
                     IsNonTemporal);
}
Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && !MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}
Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(!MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
    return None;

  AtomicOrdering Ordering =
    static_cast<AtomicOrdering>(MI->getOperand(0).getImm());

  SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
  auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
  if (!ScopeOrNone) {
    reportUnsupported(MI, "Unsupported atomic synchronization scope");
    return None;
  }

  SIAtomicScope Scope = SIAtomicScope::NONE;
  SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
  bool IsCrossAddressSpaceOrdering = false;
  std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
      ScopeOrNone.getValue();

  if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
      ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
    reportUnsupported(MI, "Unsupported atomic address space");
    return None;
  }

  return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace,
                     SIAtomicAddrSpace::ATOMIC,
                     IsCrossAddressSpaceOrdering);
}
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);

  if (!(MI->mayLoad() && MI->mayStore()))
    return None;

  // Be conservative if there are no memory operands.
  if (MI->getNumMemOperands() == 0)
    return SIMemOpInfo();

  return constructFromMIWithMMO(MI);
}
SICacheControl::SICacheControl(const GCNSubtarget &ST) {
  TII = ST.getInstrInfo();
  IV = getIsaVersion(ST.getCPU());
}

std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
  GCNSubtarget::Generation Generation = ST.getGeneration();
  if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return make_unique<SIGfx6CacheControl>(ST);
  return make_unique<SIGfx7CacheControl>(ST);
}
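// Roughly: SOUTHERN_ISLANDS (gfx6) targets get the base cache control, while
// later generations get SIGfx7CacheControl, which (as defined below) prefers
// the volatile L1 invalidate (BUFFER_WBINVL1_VOL) outside of PAL/Mesa.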
bool SIGfx6CacheControl::enableLoadCacheBypass(
    const MachineBasicBlock::iterator &MI,
    SIAtomicScope Scope,
    SIAtomicAddrSpace AddrSpace) const {
  assert(MI->mayLoad() && !MI->mayStore());
  bool Changed = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    /// TODO: Do not set glc for rmw atomic operations as they
    /// implicitly bypass the L1 cache.

    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      Changed |= enableGLCBit(MI);
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to bypass.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory caches
  /// to be bypassed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  return Changed;
}
bool SIGfx6CacheControl::enableNonTemporal(
    const MachineBasicBlock::iterator &MI) const {
  assert(MI->mayLoad() ^ MI->mayStore());
  bool Changed = false;

  /// TODO: Do not enableGLCBit if rmw atomic.
  Changed |= enableGLCBit(MI);
  Changed |= enableSLCBit(MI);

  return Changed;
}
bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                    SIAtomicScope Scope,
                                    SIAtomicAddrSpace AddrSpace,
                                    SIMemOp Op,
                                    bool IsCrossAddrSpaceOrdering,
                                    Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  if (Pos == Position::AFTER)
    ++MI;

  bool VMCnt = false;
  bool LGKMCnt = false;
  bool EXPCnt = false;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      VMCnt = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The L1 cache keeps all memory operations in order for
      // wavefronts in the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
    case SIAtomicScope::WORKGROUP:
      // If no cross address space ordering then an LDS waitcnt is not
      // needed as LDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/GDS memory as LDS operations
      // could be reordered with respect to later global/GDS memory
      // operations of the same wave.
      LGKMCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The LDS keeps all memory operations in order for
      // the same wavefront.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      // If no cross address space ordering then a GDS waitcnt is not
      // needed as GDS operations for all waves are executed in a
      // total global ordering as observed by all waves. Required if
      // also synchronizing with global/LDS memory as GDS operations
      // could be reordered with respect to later global/LDS memory
      // operations of the same wave.
      EXPCnt = IsCrossAddrSpaceOrdering;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // The GDS keeps all memory operations in order for
      // the same work-group.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  if (VMCnt || LGKMCnt || EXPCnt) {
    unsigned WaitCntImmediate =
      AMDGPU::encodeWaitcnt(IV,
                            VMCnt ? 0 : getVmcntBitMask(IV),
                            EXPCnt ? 0 : getExpcntBitMask(IV),
                            LGKMCnt ? 0 : getLgkmcntBitMask(IV));
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
    Changed = true;
  }

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
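// Note on the encoding above (illustrative): a counter forced to 0 makes the
// S_WAITCNT block until that counter has drained, while
// get{Vm,Exp,Lgkm}cntBitMask(IV) yields the field's maximum value, i.e. "do
// not wait on this counter". For example, waiting only on global (vector)
// memory at agent scope produces "s_waitcnt vmcnt(0)".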
bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
                                               SIAtomicScope Scope,
                                               SIAtomicAddrSpace AddrSpace,
                                               Position Pos) const {
  bool Changed = false;

  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();

  const unsigned Flush = STM.isAmdPalOS() || STM.isMesa3DOS()
                             ? AMDGPU::BUFFER_WBINVL1
                             : AMDGPU::BUFFER_WBINVL1_VOL;

  if (Pos == Position::AFTER)
    ++MI;

  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
    switch (Scope) {
    case SIAtomicScope::SYSTEM:
    case SIAtomicScope::AGENT:
      BuildMI(MBB, MI, DL, TII->get(Flush));
      Changed = true;
      break;
    case SIAtomicScope::WORKGROUP:
    case SIAtomicScope::WAVEFRONT:
    case SIAtomicScope::SINGLETHREAD:
      // No cache to invalidate.
      break;
    default:
      llvm_unreachable("Unsupported synchronization scope");
    }
  }

  /// The scratch address space does not need the global memory cache
  /// to be flushed as all memory operations by the same thread are
  /// sequentially consistent, and no other thread can access scratch
  /// memory.

  /// Other address spaces do not have a cache.

  if (Pos == Position::AFTER)
    --MI;

  return Changed;
}
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
  if (AtomicPseudoMIs.empty())
    return false;

  for (auto &MI : AtomicPseudoMIs)
    MI->eraseFromParent();

  AtomicPseudoMIs.clear();
  return true;
}
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
                                   MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && !MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
        MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace());
    }

    if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getInstrAddrSpace(),
                                SIMemOp::LOAD,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}
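// Illustrative outcome (not an exact listing): on gfx7, a sequentially
// consistent agent-scope global atomic load becomes roughly
//   s_waitcnt vmcnt(0)      ; complete earlier global accesses
//   <load with glc set>     ; bypass L1 up to agent scope
//   s_waitcnt vmcnt(0)      ; make the load complete before later accesses
//   buffer_wbinvl1_vol      ; invalidate L1 so later loads see fresh data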
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
                                    MachineBasicBlock::iterator &MI) {
  assert(!MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    return Changed;
  }

  // Atomic instructions do not have the nontemporal attribute.
  if (MOI.isNonTemporal()) {
    Changed |= CC->enableNonTemporal(MI);
    return Changed;
  }

  return Changed;
}
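// Illustrative outcome: a release or sequentially consistent global store at
// agent scope is preceded by "s_waitcnt vmcnt(0)" so earlier global accesses
// complete before the store is performed; monotonic and weaker atomic stores
// need no extra instructions here.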
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
                                          MachineBasicBlock::iterator &MI) {
  assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);

  AtomicPseudoMIs.push_back(MI);
  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      /// TODO: This relies on a barrier always generating a waitcnt
      /// for LDS to ensure it is not reordered with the completion of
      /// the preceding LDS operations. If barrier had a memory
      /// ordering and memory scope, then the library would not need to
      /// generate a fence. Could add support in this file for
      /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
      /// adding waitcnt before an S_BARRIER.
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::BEFORE);

    return Changed;
  }

  return Changed;
}
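// Illustrative outcome: an acquire/release/acq_rel/seq_cst fence becomes an
// s_waitcnt covering the counters implied by its ordering address spaces, and
// the acquire-containing orderings additionally get a cache invalidate; the
// original ATOMIC_FENCE pseudo is queued above and later deleted in
// removeAtomicPseudoMIs().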
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
                                                 MachineBasicBlock::iterator &MI) {
  assert(MI->mayLoad() && MI->mayStore());

  bool Changed = false;

  if (MOI.isAtomic()) {
    if (MOI.getOrdering() == AtomicOrdering::Release ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                SIMemOp::LOAD | SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::BEFORE);

    if (MOI.getOrdering() == AtomicOrdering::Acquire ||
        MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
        MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
        MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
        MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
      Changed |= CC->insertWait(MI, MOI.getScope(),
                                MOI.getOrderingAddrSpace(),
                                isAtomicRet(*MI) ? SIMemOp::LOAD :
                                                   SIMemOp::STORE,
                                MOI.getIsCrossAddressSpaceOrdering(),
                                Position::AFTER);
      Changed |= CC->insertCacheInvalidate(MI, MOI.getScope(),
                                           MOI.getOrderingAddrSpace(),
                                           Position::AFTER);
    }

    return Changed;
  }

  return Changed;
}
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;

  SIMemOpAccess MOA(MF);
  CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());

  for (auto &MBB : MF) {
    for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
      if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
        continue;

      if (const auto &MOI = MOA.getLoadInfo(MI))
        Changed |= expandLoad(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getStoreInfo(MI))
        Changed |= expandStore(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
        Changed |= expandAtomicFence(MOI.getValue(), MI);
      else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
        Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
    }
  }

  Changed |= removeAtomicPseudoMIs();
  return Changed;
}
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)

char SIMemoryLegalizer::ID = 0;
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;

FunctionPass *llvm::createSIMemoryLegalizerPass() {
  return new SIMemoryLegalizer();
}