//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
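//
// (In the merged ds_read2, offset0 and offset1 are expressed in units of the
// element size -- 4 bytes for b32 -- which is why the byte offsets 16 and 32
// above become offset0:4 and offset1:8.)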
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned Width0;
    unsigned Width1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool DLC0;
    bool DLC1;
    bool UseST64;
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain = false;

  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  unsigned getOpcodeWidth(const MachineInstr &MI) const;
  InstClassEnum getInstClass(unsigned Opc) const;
  unsigned getRegs(unsigned Opc) const;

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset) const;
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}
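
// Decide whether the offsets of the two candidate instructions in \p CI can
// be encoded in a single merged instruction, and rewrite CI.Offset0/CI.Offset1
// (plus CI.BaseOff/CI.UseST64 for DS) with the encoded values. For example,
// two ds_read_b32 at byte offsets 8 and 12 become element offsets 2 and 3
// (EltSize == 4), which fit the 8-bit offset0/offset1 fields.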
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}
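
// Returns true if the combined width (in dwords) of the two candidates can be
// produced by a single merged opcode on this subtarget; e.g. a 3-dword result
// is only allowed when dwordx3 loads/stores are available.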
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) const {
  const unsigned Opc = MI.getOpcode();

  if (TII->isMUBUF(MI)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) const {
  if (TII->isMUBUF(Opc)) {
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);

    // If we couldn't identify the opcode, bail out.
    if (baseOpcode == -1) {
      return UNKNOWN;
    }

    switch (baseOpcode) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }

  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) const {
  if (TII->isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned Regs = getRegs(Opc);

  unsigned AddrOpName[5] = {0};
  int AddrIdx[5];
  const MachineOperand *AddrReg[5];
  unsigned NumAddresses = 0;

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (Register::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);

    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions are met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    potentially be merged with.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx =
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      CI.Paired = MBBI;

      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
        CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
        CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
      }

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instruction in InstsToMove
    // down past this instruction.
    // check if we can move I across MBBI and if we can move all I's users
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
      break;
  }

  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
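
// Merge the two DS reads described by \p CI into a single ds_read2 (or
// ds_read2st64), copy the two halves of the merged result back into the
// original destination registers, erase the old instructions, and return the
// iterator that followed CI.I.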
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
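
// Merge the two DS writes described by \p CI into a single ds_write2 (or
// ds_write2st64). This mirrors mergeRead2Pair, except that the two data
// operands are forwarded directly instead of splitting a destination register.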
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.DLC0)      // dlc
      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .addImm(CI.DLC0)      // dlc
      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}
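
// Returns the pair of subregister indices at which the two original values
// live inside the merged register: the first index corresponds to CI.I and
// the second to CI.Paired, ordered by their offsets.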
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Offset0 > CI.Offset1) {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
      case 3:
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
      }
    }
  } else {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
      case 2:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
      case 3:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
      case 2:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
      }
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .addImm(CI.DLC0)                          // dlc
      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) const {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base-registers) with the highest 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  //
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
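  //
  // Note: the "13-bit distance" above refers to what the target's global
  // addressing mode accepts as an immediate offset (checked below via
  // isLegalGlobalAddressingMode), i.e. roughly +/-4 KiB on subtargets with a
  // 13-bit signed instruction offset.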

  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with same base) from store addresses or
    // any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "    After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    const unsigned Opc = MI.getOpcode();

    CombineInfo CI;
    CI.I = I;
    CI.InstClass = getInstClass(Opc);

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      CI.EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case S_BUFFER_LOAD_IMM:
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
      } else {
        ++I;
      }
      continue;
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  }

  return Modified;
}