1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass tries to fuse DS instructions with close by immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It looks for a base in nearby instructions that leaves
24 // a 13-bit constant offset, and then promotes that offset to the immediate:
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
43 // Future improvements:
45 // - This currently relies on the scheduler to place loads and stores next to
46 // each other, and then only merges adjacent pairs of instructions. It would
47 // be good to be more flexible with interleaved instructions, and possibly run
48 // before scheduling. It currently misses stores of constants because the
49 // load of the constant into the data register is placed between the stores,
50 // although this is arguably a scheduling problem.
52 // - Live interval recomputing seems inefficient. This currently only matches
53 // one pair, and recomputes live intervals and moves on to the next pair. It
54 // would be better to compute a list of all merges that need to occur.
56 // - With a list of instructions to process, we can also merge more. If a
57 // cluster of loads has offsets that are too large for the 8-bit offset
58 // fields but close enough together that their differences fit in 8 bits,
59 // we can add to the base pointer and use the new reduced offsets.
61 //===----------------------------------------------------------------------===//
64 #include "AMDGPUSubtarget.h"
65 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
66 #include "SIInstrInfo.h"
67 #include "SIRegisterInfo.h"
68 #include "Utils/AMDGPUBaseInfo.h"
69 #include "llvm/ADT/ArrayRef.h"
70 #include "llvm/ADT/SmallVector.h"
71 #include "llvm/ADT/StringRef.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/CodeGen/MachineBasicBlock.h"
74 #include "llvm/CodeGen/MachineFunction.h"
75 #include "llvm/CodeGen/MachineFunctionPass.h"
76 #include "llvm/CodeGen/MachineInstr.h"
77 #include "llvm/CodeGen/MachineInstrBuilder.h"
78 #include "llvm/CodeGen/MachineOperand.h"
79 #include "llvm/CodeGen/MachineRegisterInfo.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/MathExtras.h"
84 #include "llvm/Support/raw_ostream.h"
93 #define DEBUG_TYPE "si-load-store-opt"
101 BUFFER_LOAD_OFFEN
= AMDGPU::BUFFER_LOAD_DWORD_OFFEN
,
102 BUFFER_LOAD_OFFSET
= AMDGPU::BUFFER_LOAD_DWORD_OFFSET
,
103 BUFFER_STORE_OFFEN
= AMDGPU::BUFFER_STORE_DWORD_OFFEN
,
104 BUFFER_STORE_OFFSET
= AMDGPU::BUFFER_STORE_DWORD_OFFSET
,
105 BUFFER_LOAD_OFFEN_exact
= AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact
,
106 BUFFER_LOAD_OFFSET_exact
= AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact
,
107 BUFFER_STORE_OFFEN_exact
= AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact
,
108 BUFFER_STORE_OFFSET_exact
= AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact
,
119 class SILoadStoreOptimizer
: public MachineFunctionPass
{
121 MachineBasicBlock::iterator I
;
122 MachineBasicBlock::iterator Paired
;
129 InstClassEnum InstClass
;
135 SmallVector
<MachineInstr
*, 8> InstsToMove
;
138 struct BaseRegisters
{
142 unsigned LoSubReg
= 0;
143 unsigned HiSubReg
= 0;
151 using MemInfoMap
= DenseMap
<MachineInstr
*, MemAddress
>;
154 const GCNSubtarget
*STM
= nullptr;
155 const SIInstrInfo
*TII
= nullptr;
156 const SIRegisterInfo
*TRI
= nullptr;
157 MachineRegisterInfo
*MRI
= nullptr;
158 AliasAnalysis
*AA
= nullptr;
161 static bool offsetsCanBeCombined(CombineInfo
&CI
);
162 static bool widthsFit(const GCNSubtarget
&STM
, const CombineInfo
&CI
);
163 static unsigned getNewOpcode(const CombineInfo
&CI
);
164 static std::pair
<unsigned, unsigned> getSubRegIdxs(const CombineInfo
&CI
);
165 const TargetRegisterClass
*getTargetRegisterClass(const CombineInfo
&CI
);
166 unsigned getOpcodeWidth(const MachineInstr
&MI
);
167 InstClassEnum
getInstClass(unsigned Opc
);
168 unsigned getRegs(unsigned Opc
);
170 bool findMatchingInst(CombineInfo
&CI
);
172 unsigned read2Opcode(unsigned EltSize
) const;
173 unsigned read2ST64Opcode(unsigned EltSize
) const;
174 MachineBasicBlock::iterator
mergeRead2Pair(CombineInfo
&CI
);
176 unsigned write2Opcode(unsigned EltSize
) const;
177 unsigned write2ST64Opcode(unsigned EltSize
) const;
178 MachineBasicBlock::iterator
mergeWrite2Pair(CombineInfo
&CI
);
179 MachineBasicBlock::iterator
mergeSBufferLoadImmPair(CombineInfo
&CI
);
180 MachineBasicBlock::iterator
mergeBufferLoadPair(CombineInfo
&CI
);
181 MachineBasicBlock::iterator
mergeBufferStorePair(CombineInfo
&CI
);
183 void updateBaseAndOffset(MachineInstr
&I
, unsigned NewBase
,
185 unsigned computeBase(MachineInstr
&MI
, const MemAddress
&Addr
);
186 MachineOperand
createRegOrImm(int32_t Val
, MachineInstr
&MI
);
187 Optional
<int32_t> extractConstOffset(const MachineOperand
&Op
);
188 void processBaseWithConstOffset(const MachineOperand
&Base
, MemAddress
&Addr
);
189 /// Promotes constant offset to the immediate by adjusting the base. It
190 /// tries to use a base from the nearby instructions that allows it to have
191 /// a 13bit constant offset which gets promoted to the immediate.
192 bool promoteConstantOffsetToImm(MachineInstr
&CI
,
194 SmallPtrSet
<MachineInstr
*, 4> &Promoted
);
199 SILoadStoreOptimizer() : MachineFunctionPass(ID
) {
200 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
203 bool optimizeBlock(MachineBasicBlock
&MBB
);
205 bool runOnMachineFunction(MachineFunction
&MF
) override
;
207 StringRef
getPassName() const override
{ return "SI Load Store Optimizer"; }
209 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
210 AU
.setPreservesCFG();
211 AU
.addRequired
<AAResultsWrapperPass
>();
213 MachineFunctionPass::getAnalysisUsage(AU
);
217 } // end anonymous namespace.
219 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer
, DEBUG_TYPE
,
220 "SI Load Store Optimizer", false, false)
221 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass
)
222 INITIALIZE_PASS_END(SILoadStoreOptimizer
, DEBUG_TYPE
, "SI Load Store Optimizer",
225 char SILoadStoreOptimizer::ID
= 0;
227 char &llvm::SILoadStoreOptimizerID
= SILoadStoreOptimizer::ID
;
229 FunctionPass
*llvm::createSILoadStoreOptimizerPass() {
230 return new SILoadStoreOptimizer();
/// Relocates every instruction in \p InstsToMove relative to \p I.
///
/// Looks up the basic block containing \p I and then, for each instruction in
/// \p InstsToMove, detaches it from its current parent.
/// NOTE(review): the remainder of the loop body is not visible in this
/// excerpt; presumably each detached instruction is re-inserted into I's
/// block after I (hence the name and the MBB lookup) — confirm.
233 static void moveInstsAfter(MachineBasicBlock::iterator I
,
234 ArrayRef
<MachineInstr
*> InstsToMove
) {
235 MachineBasicBlock
*MBB
= I
->getParent();
237 for (MachineInstr
*MI
: InstsToMove
) {
// Unlink MI from whatever block currently holds it (it is not deleted).
238 MI
->removeFromParent();
/// Walks the operands of \p MI and records their registers in the two sets.
///
/// \param MI          Instruction whose operands are scanned.
/// \param RegDefs     Receives operand registers via RegDefs.insert().
/// \param PhysRegUses Receives physical registers that an operand reads.
///
/// NOTE(review): the condition guarding the RegDefs insertion is not visible
/// in this excerpt; given the set name and the `else if (readsReg ...)` arm it
/// is presumably "operand is a register def" — confirm.
243 static void addDefsUsesToList(const MachineInstr
&MI
,
244 DenseSet
<unsigned> &RegDefs
,
245 DenseSet
<unsigned> &PhysRegUses
) {
246 for (const MachineOperand
&Op
: MI
.operands()) {
249 RegDefs
.insert(Op
.getReg());
// Only reads of *physical* registers are tracked on the use side.
250 else if (Op
.readsReg() &&
251 TargetRegisterInfo::isPhysicalRegister(Op
.getReg()))
252 PhysRegUses
.insert(Op
.getReg());
/// Returns true when the memory accesses \p A and \p B may be reordered.
///
/// Reordering is allowed when neither instruction may store (read-after-read),
/// or when TII->areMemAccessesTriviallyDisjoint() reports that the two
/// accesses do not overlap. Any pair involving a store that is not proven
/// disjoint yields false.
257 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A
,
258 MachineBasicBlock::iterator B
,
259 const SIInstrInfo
*TII
,
261 // RAW or WAR - cannot reorder
262 // WAW - cannot reorder
263 // RAR - safe to reorder
264 return !(A
->mayStore() || B
->mayStore()) ||
265 TII
->areMemAccessesTriviallyDisjoint(*A
, *B
, AA
);
268 // Add MI and its defs to the lists if MI reads one of the defs that are
269 // already in the list. Returns true in that case.
270 static bool addToListsIfDependent(MachineInstr
&MI
, DenseSet
<unsigned> &RegDefs
,
271 DenseSet
<unsigned> &PhysRegUses
,
272 SmallVectorImpl
<MachineInstr
*> &Insts
) {
273 for (MachineOperand
&Use
: MI
.operands()) {
274 // If one of the defs is read, then there is a use of Def between I and the
275 // instruction that I will potentially be merged with. We will need to move
276 // this instruction after the merged instructions.
278 // Similarly, if there is a def which is read by an instruction that is to
279 // be moved for merging, then we need to move the def-instruction as well.
280 // This can only happen for physical registers such as M0; virtual
281 // registers are in SSA form.
283 ((Use
.readsReg() && RegDefs
.count(Use
.getReg())) ||
284 (Use
.isDef() && TargetRegisterInfo::isPhysicalRegister(Use
.getReg()) &&
285 PhysRegUses
.count(Use
.getReg())))) {
286 Insts
.push_back(&MI
);
287 addDefsUsesToList(MI
, RegDefs
, PhysRegUses
);
/// Checks the instructions in \p InstsToMove against the memory operation
/// \p MemOp using memAccessesCanBeReordered().
///
/// \p MemOp must be a load or store (asserted). Each entry of \p InstsToMove
/// is tested for mayLoadOrStore(), and \p MemOp is tested against it with
/// memAccessesCanBeReordered().
/// NOTE(review): the statements controlled by the two `if`s and the final
/// return are not visible in this excerpt; presumably non-memory instructions
/// are skipped and the function returns false on the first pair that cannot
/// be reordered, true otherwise — confirm.
295 static bool canMoveInstsAcrossMemOp(MachineInstr
&MemOp
,
296 ArrayRef
<MachineInstr
*> InstsToMove
,
297 const SIInstrInfo
*TII
, AliasAnalysis
*AA
) {
298 assert(MemOp
.mayLoadOrStore());
300 for (MachineInstr
*InstToMove
: InstsToMove
) {
301 if (!InstToMove
->mayLoadOrStore())
303 if (!memAccessesCanBeReordered(MemOp
, *InstToMove
, TII
, AA
))
309 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo
&CI
) {
310 // XXX - Would the same offset be OK? Is there any reason this would happen?
312 if (CI
.Offset0
== CI
.Offset1
)
315 // This won't be valid if the offset isn't aligned.
316 if ((CI
.Offset0
% CI
.EltSize
!= 0) || (CI
.Offset1
% CI
.EltSize
!= 0))
319 unsigned EltOffset0
= CI
.Offset0
/ CI
.EltSize
;
320 unsigned EltOffset1
= CI
.Offset1
/ CI
.EltSize
;
324 // Handle SMEM and VMEM instructions.
325 if ((CI
.InstClass
!= DS_READ
) && (CI
.InstClass
!= DS_WRITE
)) {
326 return (EltOffset0
+ CI
.Width0
== EltOffset1
||
327 EltOffset1
+ CI
.Width1
== EltOffset0
) &&
328 CI
.GLC0
== CI
.GLC1
&&
329 (CI
.InstClass
== S_BUFFER_LOAD_IMM
|| CI
.SLC0
== CI
.SLC1
);
332 // If the offset in elements doesn't fit in 8-bits, we might be able to use
333 // the stride 64 versions.
334 if ((EltOffset0
% 64 == 0) && (EltOffset1
% 64) == 0 &&
335 isUInt
<8>(EltOffset0
/ 64) && isUInt
<8>(EltOffset1
/ 64)) {
336 CI
.Offset0
= EltOffset0
/ 64;
337 CI
.Offset1
= EltOffset1
/ 64;
342 // Check if the new offsets fit in the reduced 8-bit range.
343 if (isUInt
<8>(EltOffset0
) && isUInt
<8>(EltOffset1
)) {
344 CI
.Offset0
= EltOffset0
;
345 CI
.Offset1
= EltOffset1
;
349 // Try to shift base address to decrease offsets.
350 unsigned OffsetDiff
= std::abs((int)EltOffset1
- (int)EltOffset0
);
351 CI
.BaseOff
= std::min(CI
.Offset0
, CI
.Offset1
);
353 if ((OffsetDiff
% 64 == 0) && isUInt
<8>(OffsetDiff
/ 64)) {
354 CI
.Offset0
= (EltOffset0
- CI
.BaseOff
/ CI
.EltSize
) / 64;
355 CI
.Offset1
= (EltOffset1
- CI
.BaseOff
/ CI
.EltSize
) / 64;
360 if (isUInt
<8>(OffsetDiff
)) {
361 CI
.Offset0
= EltOffset0
- CI
.BaseOff
/ CI
.EltSize
;
362 CI
.Offset1
= EltOffset1
- CI
.BaseOff
/ CI
.EltSize
;
369 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget
&STM
,
370 const CombineInfo
&CI
) {
371 const unsigned Width
= (CI
.Width0
+ CI
.Width1
);
372 switch (CI
.InstClass
) {
374 return (Width
<= 4) && (STM
.hasDwordx3LoadStores() || (Width
!= 3));
375 case S_BUFFER_LOAD_IMM
:
386 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr
&MI
) {
387 const unsigned Opc
= MI
.getOpcode();
389 if (TII
->isMUBUF(MI
)) {
390 return AMDGPU::getMUBUFDwords(Opc
);
396 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM
:
398 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
:
400 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
:
405 InstClassEnum
SILoadStoreOptimizer::getInstClass(unsigned Opc
) {
406 if (TII
->isMUBUF(Opc
)) {
407 const int baseOpcode
= AMDGPU::getMUBUFBaseOpcode(Opc
);
409 // If we couldn't identify the opcode, bail out.
410 if (baseOpcode
== -1) {
414 switch (baseOpcode
) {
417 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN
:
418 return BUFFER_LOAD_OFFEN
;
419 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET
:
420 return BUFFER_LOAD_OFFSET
;
421 case AMDGPU::BUFFER_STORE_DWORD_OFFEN
:
422 return BUFFER_STORE_OFFEN
;
423 case AMDGPU::BUFFER_STORE_DWORD_OFFSET
:
424 return BUFFER_STORE_OFFSET
;
425 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact
:
426 return BUFFER_LOAD_OFFEN_exact
;
427 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact
:
428 return BUFFER_LOAD_OFFSET_exact
;
429 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact
:
430 return BUFFER_STORE_OFFEN_exact
;
431 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact
:
432 return BUFFER_STORE_OFFSET_exact
;
439 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM
:
440 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
:
441 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
:
442 return S_BUFFER_LOAD_IMM
;
443 case AMDGPU::DS_READ_B32
:
444 case AMDGPU::DS_READ_B64
:
445 case AMDGPU::DS_READ_B32_gfx9
:
446 case AMDGPU::DS_READ_B64_gfx9
:
448 case AMDGPU::DS_WRITE_B32
:
449 case AMDGPU::DS_WRITE_B64
:
450 case AMDGPU::DS_WRITE_B32_gfx9
:
451 case AMDGPU::DS_WRITE_B64_gfx9
:
456 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc
) {
457 if (TII
->isMUBUF(Opc
)) {
460 if (AMDGPU::getMUBUFHasVAddr(Opc
)) {
464 if (AMDGPU::getMUBUFHasSrsrc(Opc
)) {
468 if (AMDGPU::getMUBUFHasSoffset(Opc
)) {
478 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM
:
479 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
:
480 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
:
482 case AMDGPU::DS_READ_B32
:
483 case AMDGPU::DS_READ_B64
:
484 case AMDGPU::DS_READ_B32_gfx9
:
485 case AMDGPU::DS_READ_B64_gfx9
:
486 case AMDGPU::DS_WRITE_B32
:
487 case AMDGPU::DS_WRITE_B64
:
488 case AMDGPU::DS_WRITE_B32_gfx9
:
489 case AMDGPU::DS_WRITE_B64_gfx9
:
494 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo
&CI
) {
495 MachineBasicBlock
*MBB
= CI
.I
->getParent();
496 MachineBasicBlock::iterator E
= MBB
->end();
497 MachineBasicBlock::iterator MBBI
= CI
.I
;
499 const unsigned Opc
= CI
.I
->getOpcode();
500 const InstClassEnum InstClass
= getInstClass(Opc
);
502 if (InstClass
== UNKNOWN
) {
506 const unsigned Regs
= getRegs(Opc
);
508 unsigned AddrOpName
[5] = {0};
510 const MachineOperand
*AddrReg
[5];
511 unsigned NumAddresses
= 0;
514 AddrOpName
[NumAddresses
++] = AMDGPU::OpName::addr
;
518 AddrOpName
[NumAddresses
++] = AMDGPU::OpName::sbase
;
522 AddrOpName
[NumAddresses
++] = AMDGPU::OpName::srsrc
;
525 if (Regs
& SOFFSET
) {
526 AddrOpName
[NumAddresses
++] = AMDGPU::OpName::soffset
;
530 AddrOpName
[NumAddresses
++] = AMDGPU::OpName::vaddr
;
533 for (unsigned i
= 0; i
< NumAddresses
; i
++) {
534 AddrIdx
[i
] = AMDGPU::getNamedOperandIdx(CI
.I
->getOpcode(), AddrOpName
[i
]);
535 AddrReg
[i
] = &CI
.I
->getOperand(AddrIdx
[i
]);
537 // We only ever merge operations with the same base address register, so
538 // don't bother scanning forward if there are no other uses.
539 if (AddrReg
[i
]->isReg() &&
540 (TargetRegisterInfo::isPhysicalRegister(AddrReg
[i
]->getReg()) ||
541 MRI
->hasOneNonDBGUse(AddrReg
[i
]->getReg())))
547 DenseSet
<unsigned> RegDefsToMove
;
548 DenseSet
<unsigned> PhysRegUsesToMove
;
549 addDefsUsesToList(*CI
.I
, RegDefsToMove
, PhysRegUsesToMove
);
551 for (; MBBI
!= E
; ++MBBI
) {
552 const bool IsDS
= (InstClass
== DS_READ
) || (InstClass
== DS_WRITE
);
554 if ((getInstClass(MBBI
->getOpcode()) != InstClass
) ||
555 (IsDS
&& (MBBI
->getOpcode() != Opc
))) {
556 // This is not a matching DS instruction, but we can keep looking as
557 // long as one of these conditions are met:
558 // 1. It is safe to move I down past MBBI.
559 // 2. It is safe to move MBBI down past the instruction that I will be merged with.
562 if (MBBI
->hasUnmodeledSideEffects()) {
563 // We can't re-order this instruction with respect to other memory
564 // operations, so we fail both conditions mentioned above.
568 if (MBBI
->mayLoadOrStore() &&
569 (!memAccessesCanBeReordered(*CI
.I
, *MBBI
, TII
, AA
) ||
570 !canMoveInstsAcrossMemOp(*MBBI
, CI
.InstsToMove
, TII
, AA
))) {
571 // We fail condition #1, but we may still be able to satisfy condition
572 // #2. Add this instruction to the move list and then we will check
573 // if condition #2 holds once we have selected the matching instruction.
574 CI
.InstsToMove
.push_back(&*MBBI
);
575 addDefsUsesToList(*MBBI
, RegDefsToMove
, PhysRegUsesToMove
);
579 // When we match I with another DS instruction, we will be moving I down
580 // to the location of the matched instruction, so any uses of I will need
581 // to be moved down as well.
582 addToListsIfDependent(*MBBI
, RegDefsToMove
, PhysRegUsesToMove
,
587 // Don't merge volatiles.
588 if (MBBI
->hasOrderedMemoryRef())
591 // Handle a case like
592 // DS_WRITE_B32 addr, v, idx0
593 // w = DS_READ_B32 addr, idx0
594 // DS_WRITE_B32 addr, f(w), idx1
595 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
596 // merging of the two writes.
597 if (addToListsIfDependent(*MBBI
, RegDefsToMove
, PhysRegUsesToMove
,
602 for (unsigned i
= 0; i
< NumAddresses
; i
++) {
603 const MachineOperand
&AddrRegNext
= MBBI
->getOperand(AddrIdx
[i
]);
605 if (AddrReg
[i
]->isImm() || AddrRegNext
.isImm()) {
606 if (AddrReg
[i
]->isImm() != AddrRegNext
.isImm() ||
607 AddrReg
[i
]->getImm() != AddrRegNext
.getImm()) {
614 // Check same base pointer. Be careful of subregisters, which can occur
615 // with vectors of pointers.
616 if (AddrReg
[i
]->getReg() != AddrRegNext
.getReg() ||
617 AddrReg
[i
]->getSubReg() != AddrRegNext
.getSubReg()) {
625 AMDGPU::getNamedOperandIdx(CI
.I
->getOpcode(), AMDGPU::OpName::offset
);
626 CI
.Offset0
= CI
.I
->getOperand(OffsetIdx
).getImm();
627 CI
.Width0
= getOpcodeWidth(*CI
.I
);
628 CI
.Offset1
= MBBI
->getOperand(OffsetIdx
).getImm();
629 CI
.Width1
= getOpcodeWidth(*MBBI
);
632 if ((CI
.InstClass
== DS_READ
) || (CI
.InstClass
== DS_WRITE
)) {
633 CI
.Offset0
&= 0xffff;
634 CI
.Offset1
&= 0xffff;
636 CI
.GLC0
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::glc
)->getImm();
637 CI
.GLC1
= TII
->getNamedOperand(*MBBI
, AMDGPU::OpName::glc
)->getImm();
638 if (CI
.InstClass
!= S_BUFFER_LOAD_IMM
) {
639 CI
.SLC0
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::slc
)->getImm();
640 CI
.SLC1
= TII
->getNamedOperand(*MBBI
, AMDGPU::OpName::slc
)->getImm();
644 // Check that both offsets fit in the reduced range.
645 // We also need to go through the list of instructions that we plan to move
646 // and make sure they are all safe to move down past the merged instruction.
648 if (widthsFit(*STM
, CI
) && offsetsCanBeCombined(CI
))
649 if (canMoveInstsAcrossMemOp(*MBBI
, CI
.InstsToMove
, TII
, AA
))
653 // We've found a load/store that we couldn't merge for some reason.
654 // We could potentially keep looking, but we'd need to make sure that
655 // it was safe to move I and also all the instruction in InstsToMove
656 // down past this instruction.
657 // Check whether we can move I across MBBI and whether we can move all of I's users.
658 if (!memAccessesCanBeReordered(*CI
.I
, *MBBI
, TII
, AA
) ||
659 !canMoveInstsAcrossMemOp(*MBBI
, CI
.InstsToMove
, TII
, AA
))
665 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize
) const {
666 if (STM
->ldsRequiresM0Init())
667 return (EltSize
== 4) ? AMDGPU::DS_READ2_B32
: AMDGPU::DS_READ2_B64
;
668 return (EltSize
== 4) ? AMDGPU::DS_READ2_B32_gfx9
: AMDGPU::DS_READ2_B64_gfx9
;
671 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize
) const {
672 if (STM
->ldsRequiresM0Init())
673 return (EltSize
== 4) ? AMDGPU::DS_READ2ST64_B32
: AMDGPU::DS_READ2ST64_B64
;
675 return (EltSize
== 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
676 : AMDGPU::DS_READ2ST64_B64_gfx9
;
679 MachineBasicBlock::iterator
680 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo
&CI
) {
681 MachineBasicBlock
*MBB
= CI
.I
->getParent();
683 // Be careful, since the addresses could be subregisters themselves in weird
684 // cases, like vectors of pointers.
685 const auto *AddrReg
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::addr
);
687 const auto *Dest0
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::vdst
);
688 const auto *Dest1
= TII
->getNamedOperand(*CI
.Paired
, AMDGPU::OpName::vdst
);
690 unsigned NewOffset0
= CI
.Offset0
;
691 unsigned NewOffset1
= CI
.Offset1
;
693 CI
.UseST64
? read2ST64Opcode(CI
.EltSize
) : read2Opcode(CI
.EltSize
);
695 unsigned SubRegIdx0
= (CI
.EltSize
== 4) ? AMDGPU::sub0
: AMDGPU::sub0_sub1
;
696 unsigned SubRegIdx1
= (CI
.EltSize
== 4) ? AMDGPU::sub1
: AMDGPU::sub2_sub3
;
698 if (NewOffset0
> NewOffset1
) {
699 // Canonicalize the merged instruction so the smaller offset comes first.
700 std::swap(NewOffset0
, NewOffset1
);
701 std::swap(SubRegIdx0
, SubRegIdx1
);
704 assert((isUInt
<8>(NewOffset0
) && isUInt
<8>(NewOffset1
)) &&
705 (NewOffset0
!= NewOffset1
) && "Computed offset doesn't fit");
707 const MCInstrDesc
&Read2Desc
= TII
->get(Opc
);
709 const TargetRegisterClass
*SuperRC
=
710 (CI
.EltSize
== 4) ? &AMDGPU::VReg_64RegClass
: &AMDGPU::VReg_128RegClass
;
711 unsigned DestReg
= MRI
->createVirtualRegister(SuperRC
);
713 DebugLoc DL
= CI
.I
->getDebugLoc();
715 unsigned BaseReg
= AddrReg
->getReg();
716 unsigned BaseSubReg
= AddrReg
->getSubReg();
717 unsigned BaseRegFlags
= 0;
719 unsigned ImmReg
= MRI
->createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
720 BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(AMDGPU::S_MOV_B32
), ImmReg
)
723 BaseReg
= MRI
->createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
724 BaseRegFlags
= RegState::Kill
;
726 TII
->getAddNoCarry(*MBB
, CI
.Paired
, DL
, BaseReg
)
728 .addReg(AddrReg
->getReg(), 0, BaseSubReg
);
732 MachineInstrBuilder Read2
=
733 BuildMI(*MBB
, CI
.Paired
, DL
, Read2Desc
, DestReg
)
734 .addReg(BaseReg
, BaseRegFlags
, BaseSubReg
) // addr
735 .addImm(NewOffset0
) // offset0
736 .addImm(NewOffset1
) // offset1
738 .cloneMergedMemRefs({&*CI
.I
, &*CI
.Paired
});
742 const MCInstrDesc
&CopyDesc
= TII
->get(TargetOpcode::COPY
);
744 // Copy to the old destination registers.
745 BuildMI(*MBB
, CI
.Paired
, DL
, CopyDesc
)
746 .add(*Dest0
) // Copy to same destination including flags and sub reg.
747 .addReg(DestReg
, 0, SubRegIdx0
);
748 MachineInstr
*Copy1
= BuildMI(*MBB
, CI
.Paired
, DL
, CopyDesc
)
750 .addReg(DestReg
, RegState::Kill
, SubRegIdx1
);
752 moveInstsAfter(Copy1
, CI
.InstsToMove
);
754 MachineBasicBlock::iterator Next
= std::next(CI
.I
);
755 CI
.I
->eraseFromParent();
756 CI
.Paired
->eraseFromParent();
758 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2
<< '\n');
762 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize
) const {
763 if (STM
->ldsRequiresM0Init())
764 return (EltSize
== 4) ? AMDGPU::DS_WRITE2_B32
: AMDGPU::DS_WRITE2_B64
;
765 return (EltSize
== 4) ? AMDGPU::DS_WRITE2_B32_gfx9
766 : AMDGPU::DS_WRITE2_B64_gfx9
;
769 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize
) const {
770 if (STM
->ldsRequiresM0Init())
771 return (EltSize
== 4) ? AMDGPU::DS_WRITE2ST64_B32
772 : AMDGPU::DS_WRITE2ST64_B64
;
774 return (EltSize
== 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
775 : AMDGPU::DS_WRITE2ST64_B64_gfx9
;
778 MachineBasicBlock::iterator
779 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo
&CI
) {
780 MachineBasicBlock
*MBB
= CI
.I
->getParent();
782 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
783 // sure we preserve the subregister index and any register flags set on them.
784 const MachineOperand
*AddrReg
=
785 TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::addr
);
786 const MachineOperand
*Data0
=
787 TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::data0
);
788 const MachineOperand
*Data1
=
789 TII
->getNamedOperand(*CI
.Paired
, AMDGPU::OpName::data0
);
791 unsigned NewOffset0
= CI
.Offset0
;
792 unsigned NewOffset1
= CI
.Offset1
;
794 CI
.UseST64
? write2ST64Opcode(CI
.EltSize
) : write2Opcode(CI
.EltSize
);
796 if (NewOffset0
> NewOffset1
) {
797 // Canonicalize the merged instruction so the smaller offset comes first.
798 std::swap(NewOffset0
, NewOffset1
);
799 std::swap(Data0
, Data1
);
802 assert((isUInt
<8>(NewOffset0
) && isUInt
<8>(NewOffset1
)) &&
803 (NewOffset0
!= NewOffset1
) && "Computed offset doesn't fit");
805 const MCInstrDesc
&Write2Desc
= TII
->get(Opc
);
806 DebugLoc DL
= CI
.I
->getDebugLoc();
808 unsigned BaseReg
= AddrReg
->getReg();
809 unsigned BaseSubReg
= AddrReg
->getSubReg();
810 unsigned BaseRegFlags
= 0;
812 unsigned ImmReg
= MRI
->createVirtualRegister(&AMDGPU::SGPR_32RegClass
);
813 BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(AMDGPU::S_MOV_B32
), ImmReg
)
816 BaseReg
= MRI
->createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
817 BaseRegFlags
= RegState::Kill
;
819 TII
->getAddNoCarry(*MBB
, CI
.Paired
, DL
, BaseReg
)
821 .addReg(AddrReg
->getReg(), 0, BaseSubReg
);
825 MachineInstrBuilder Write2
=
826 BuildMI(*MBB
, CI
.Paired
, DL
, Write2Desc
)
827 .addReg(BaseReg
, BaseRegFlags
, BaseSubReg
) // addr
828 .add(*Data0
) // data0
829 .add(*Data1
) // data1
830 .addImm(NewOffset0
) // offset0
831 .addImm(NewOffset1
) // offset1
833 .cloneMergedMemRefs({&*CI
.I
, &*CI
.Paired
});
835 moveInstsAfter(Write2
, CI
.InstsToMove
);
837 MachineBasicBlock::iterator Next
= std::next(CI
.I
);
838 CI
.I
->eraseFromParent();
839 CI
.Paired
->eraseFromParent();
841 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2
<< '\n');
845 MachineBasicBlock::iterator
846 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo
&CI
) {
847 MachineBasicBlock
*MBB
= CI
.I
->getParent();
848 DebugLoc DL
= CI
.I
->getDebugLoc();
849 const unsigned Opcode
= getNewOpcode(CI
);
851 const TargetRegisterClass
*SuperRC
= getTargetRegisterClass(CI
);
853 unsigned DestReg
= MRI
->createVirtualRegister(SuperRC
);
854 unsigned MergedOffset
= std::min(CI
.Offset0
, CI
.Offset1
);
856 BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(Opcode
), DestReg
)
857 .add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::sbase
))
858 .addImm(MergedOffset
) // offset
859 .addImm(CI
.GLC0
) // glc
860 .cloneMergedMemRefs({&*CI
.I
, &*CI
.Paired
});
862 std::pair
<unsigned, unsigned> SubRegIdx
= getSubRegIdxs(CI
);
863 const unsigned SubRegIdx0
= std::get
<0>(SubRegIdx
);
864 const unsigned SubRegIdx1
= std::get
<1>(SubRegIdx
);
866 // Copy to the old destination registers.
867 const MCInstrDesc
&CopyDesc
= TII
->get(TargetOpcode::COPY
);
868 const auto *Dest0
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::sdst
);
869 const auto *Dest1
= TII
->getNamedOperand(*CI
.Paired
, AMDGPU::OpName::sdst
);
871 BuildMI(*MBB
, CI
.Paired
, DL
, CopyDesc
)
872 .add(*Dest0
) // Copy to same destination including flags and sub reg.
873 .addReg(DestReg
, 0, SubRegIdx0
);
874 MachineInstr
*Copy1
= BuildMI(*MBB
, CI
.Paired
, DL
, CopyDesc
)
876 .addReg(DestReg
, RegState::Kill
, SubRegIdx1
);
878 moveInstsAfter(Copy1
, CI
.InstsToMove
);
880 MachineBasicBlock::iterator Next
= std::next(CI
.I
);
881 CI
.I
->eraseFromParent();
882 CI
.Paired
->eraseFromParent();
886 MachineBasicBlock::iterator
887 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo
&CI
) {
888 MachineBasicBlock
*MBB
= CI
.I
->getParent();
889 DebugLoc DL
= CI
.I
->getDebugLoc();
891 const unsigned Opcode
= getNewOpcode(CI
);
893 const TargetRegisterClass
*SuperRC
= getTargetRegisterClass(CI
);
895 // Copy to the new source register.
896 unsigned DestReg
= MRI
->createVirtualRegister(SuperRC
);
897 unsigned MergedOffset
= std::min(CI
.Offset0
, CI
.Offset1
);
899 auto MIB
= BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(Opcode
), DestReg
);
901 const unsigned Regs
= getRegs(Opcode
);
904 MIB
.add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::vaddr
));
906 MIB
.add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::srsrc
))
907 .add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::soffset
))
908 .addImm(MergedOffset
) // offset
909 .addImm(CI
.GLC0
) // glc
910 .addImm(CI
.SLC0
) // slc
912 .cloneMergedMemRefs({&*CI
.I
, &*CI
.Paired
});
914 std::pair
<unsigned, unsigned> SubRegIdx
= getSubRegIdxs(CI
);
915 const unsigned SubRegIdx0
= std::get
<0>(SubRegIdx
);
916 const unsigned SubRegIdx1
= std::get
<1>(SubRegIdx
);
918 // Copy to the old destination registers.
919 const MCInstrDesc
&CopyDesc
= TII
->get(TargetOpcode::COPY
);
920 const auto *Dest0
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::vdata
);
921 const auto *Dest1
= TII
->getNamedOperand(*CI
.Paired
, AMDGPU::OpName::vdata
);
923 BuildMI(*MBB
, CI
.Paired
, DL
, CopyDesc
)
924 .add(*Dest0
) // Copy to same destination including flags and sub reg.
925 .addReg(DestReg
, 0, SubRegIdx0
);
926 MachineInstr
*Copy1
= BuildMI(*MBB
, CI
.Paired
, DL
, CopyDesc
)
928 .addReg(DestReg
, RegState::Kill
, SubRegIdx1
);
930 moveInstsAfter(Copy1
, CI
.InstsToMove
);
932 MachineBasicBlock::iterator Next
= std::next(CI
.I
);
933 CI
.I
->eraseFromParent();
934 CI
.Paired
->eraseFromParent();
938 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo
&CI
) {
939 const unsigned Width
= CI
.Width0
+ CI
.Width1
;
941 switch (CI
.InstClass
) {
943 return AMDGPU::getMUBUFOpcode(CI
.InstClass
, Width
);
945 llvm_unreachable("Unknown instruction class");
946 case S_BUFFER_LOAD_IMM
:
951 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM
;
953 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM
;
958 std::pair
<unsigned, unsigned>
959 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo
&CI
) {
960 if (CI
.Offset0
> CI
.Offset1
) {
963 return std::make_pair(0, 0);
967 return std::make_pair(0, 0);
969 return std::make_pair(AMDGPU::sub1
, AMDGPU::sub0
);
971 return std::make_pair(AMDGPU::sub2
, AMDGPU::sub0_sub1
);
973 return std::make_pair(AMDGPU::sub3
, AMDGPU::sub0_sub1_sub2
);
978 return std::make_pair(0, 0);
980 return std::make_pair(AMDGPU::sub1_sub2
, AMDGPU::sub0
);
982 return std::make_pair(AMDGPU::sub2_sub3
, AMDGPU::sub0_sub1
);
987 return std::make_pair(0, 0);
989 return std::make_pair(AMDGPU::sub1_sub2_sub3
, AMDGPU::sub0
);
995 return std::make_pair(0, 0);
999 return std::make_pair(0, 0);
1001 return std::make_pair(AMDGPU::sub0
, AMDGPU::sub1
);
1003 return std::make_pair(AMDGPU::sub0
, AMDGPU::sub1_sub2
);
1005 return std::make_pair(AMDGPU::sub0
, AMDGPU::sub1_sub2_sub3
);
1008 switch (CI
.Width1
) {
1010 return std::make_pair(0, 0);
1012 return std::make_pair(AMDGPU::sub0_sub1
, AMDGPU::sub2
);
1014 return std::make_pair(AMDGPU::sub0_sub1
, AMDGPU::sub2_sub3
);
1017 switch (CI
.Width1
) {
1019 return std::make_pair(0, 0);
1021 return std::make_pair(AMDGPU::sub0_sub1_sub2
, AMDGPU::sub3
);
1027 const TargetRegisterClass
*
1028 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo
&CI
) {
1029 if (CI
.InstClass
== S_BUFFER_LOAD_IMM
) {
1030 switch (CI
.Width0
+ CI
.Width1
) {
1034 return &AMDGPU::SReg_64_XEXECRegClass
;
1036 return &AMDGPU::SReg_128RegClass
;
1038 return &AMDGPU::SReg_256RegClass
;
1040 return &AMDGPU::SReg_512RegClass
;
1043 switch (CI
.Width0
+ CI
.Width1
) {
1047 return &AMDGPU::VReg_64RegClass
;
1049 return &AMDGPU::VReg_96RegClass
;
1051 return &AMDGPU::VReg_128RegClass
;
// Replaces the two buffer stores CI.I and CI.Paired with a single store whose
// opcode comes from getNewOpcode(CI). The data to store is assembled by a
// REG_SEQUENCE into a fresh virtual register of the class chosen by
// getTargetRegisterClass(CI); both new instructions are inserted before
// CI.Paired. Address operands (vaddr, srsrc, soffset) and the glc/slc bits
// are taken from CI.I, and the immediate offset is the smaller of
// CI.Offset0 / CI.Offset1. Both original stores are erased afterwards.
// NOTE(review): the `return` is not visible in this excerpt; presumably the
// iterator `Next` computed below is returned -- confirm.
1056 MachineBasicBlock::iterator
1057 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo
&CI
) {
1058 MachineBasicBlock
*MBB
= CI
.I
->getParent();
1059 DebugLoc DL
= CI
.I
->getDebugLoc();
1061 const unsigned Opcode
= getNewOpcode(CI
);
// Sub-register indices under which each original data operand is placed
// inside the combined source register.
1063 std::pair
<unsigned, unsigned> SubRegIdx
= getSubRegIdxs(CI
);
1064 const unsigned SubRegIdx0
= std::get
<0>(SubRegIdx
);
1065 const unsigned SubRegIdx1
= std::get
<1>(SubRegIdx
);
1067 // Copy to the new source register.
1068 const TargetRegisterClass
*SuperRC
= getTargetRegisterClass(CI
);
1069 unsigned SrcReg
= MRI
->createVirtualRegister(SuperRC
);
// The data (vdata) operands of the two stores being merged.
1071 const auto *Src0
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::vdata
);
1072 const auto *Src1
= TII
->getNamedOperand(*CI
.Paired
, AMDGPU::OpName::vdata
);
// NOTE(review): only the trailing .addImm(SubRegIdx1) of this REG_SEQUENCE is
// visible; the operands that add Src0/SubRegIdx0/Src1 are presumably on the
// omitted lines.
1074 BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(AMDGPU::REG_SEQUENCE
), SrcReg
)
1078 .addImm(SubRegIdx1
);
// The merged store consumes SrcReg as its data operand and kills it.
1080 auto MIB
= BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(Opcode
))
1081 .addReg(SrcReg
, RegState::Kill
);
// NOTE(review): Regs is presumably tested on an omitted line to decide
// whether the vaddr operand below is added -- confirm.
1083 const unsigned Regs
= getRegs(Opcode
);
1086 MIB
.add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::vaddr
));
1088 MIB
.add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::srsrc
))
1089 .add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::soffset
))
1090 .addImm(std::min(CI
.Offset0
, CI
.Offset1
)) // offset
1091 .addImm(CI
.GLC0
) // glc
1092 .addImm(CI
.SLC0
) // slc
1094 .cloneMergedMemRefs({&*CI
.I
, &*CI
.Paired
});
// Relocate the instructions recorded in CI.InstsToMove to after the merged
// store.
1096 moveInstsAfter(MIB
, CI
.InstsToMove
);
// Capture the successor of CI.I before CI.I is erased.
1098 MachineBasicBlock::iterator Next
= std::next(CI
.I
);
1099 CI
.I
->eraseFromParent();
1100 CI
.Paired
->eraseFromParent();
// Produces a MachineOperand for the 32-bit value Val. If the target reports
// the (signed, 32-bit) value as an inline constant, an immediate operand is
// returned directly. Otherwise a new SReg_32 virtual register is created, an
// S_MOV_B32 defining it is inserted immediately before MI, and a register
// operand (non-def) for that register is returned.
// NOTE(review): the return-type line, the declaration of `Mov`, and the
// operand that feeds Val into the S_MOV_B32 are not visible in this excerpt.
1105 SILoadStoreOptimizer::createRegOrImm(int32_t Val
, MachineInstr
&MI
) {
1106 APInt
V(32, Val
, true);
1107 if (TII
->isInlineConstant(V
))
1108 return MachineOperand::CreateImm(Val
);
1110 unsigned Reg
= MRI
->createVirtualRegister(&AMDGPU::SReg_32RegClass
);
1112 BuildMI(*MI
.getParent(), MI
.getIterator(), MI
.getDebugLoc(),
1113 TII
->get(AMDGPU::S_MOV_B32
), Reg
)
1116 LLVM_DEBUG(dbgs() << " "; Mov
->dump());
1117 return MachineOperand::CreateReg(Reg
, false);
1120 // Compute base address using Addr and return the final register.
// All new instructions are inserted immediately before MI:
//   - OffsetLo / OffsetHi come from createRegOrImm on the low and the high
//     32 bits of Addr.Offset;
//   - V_ADD_I32_e64 writes DestSub0 plus a carry-out (CarryReg) and reads the
//     low base register;
//   - V_ADDC_U32_e64 writes DestSub1, reads the high base register, consumes
//     (kills) CarryReg, and its own carry-out is marked dead;
//   - REG_SEQUENCE assembles the 64-bit FullDestReg (VReg_64) using the
//     sub0 / sub1 indices.
// NOTE(review): the operands that feed OffsetLo/OffsetHi and
// DestSub0/DestSub1 into these instructions, and the final `return`, are not
// visible in this excerpt; presumably FullDestReg is returned -- confirm.
1121 unsigned SILoadStoreOptimizer::computeBase(MachineInstr
&MI
,
1122 const MemAddress
&Addr
) {
1123 MachineBasicBlock
*MBB
= MI
.getParent();
1124 MachineBasicBlock::iterator MBBI
= MI
.getIterator();
1125 DebugLoc DL
= MI
.getDebugLoc();
// Each base half must either be a 32-bit register or be addressed through a
// sub-register index.
1127 assert((TRI
->getRegSizeInBits(Addr
.Base
.LoReg
, *MRI
) == 32 ||
1128 Addr
.Base
.LoSubReg
) &&
1129 "Expected 32-bit Base-Register-Low!!");
1131 assert((TRI
->getRegSizeInBits(Addr
.Base
.HiReg
, *MRI
) == 32 ||
1132 Addr
.Base
.HiSubReg
) &&
1133 "Expected 32-bit Base-Register-Hi!!");
1135 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
// Split the 64-bit offset into two 32-bit operands (immediate or register).
1136 MachineOperand OffsetLo
= createRegOrImm(static_cast<int32_t>(Addr
.Offset
), MI
);
1137 MachineOperand OffsetHi
=
1138 createRegOrImm(static_cast<int32_t>(Addr
.Offset
>> 32), MI
);
// CarryReg links the low add to the high add; DeadCarryReg receives the high
// add's carry-out, which is flagged dead below.
1139 unsigned CarryReg
= MRI
->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass
);
1140 unsigned DeadCarryReg
=
1141 MRI
->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass
);
1143 unsigned DestSub0
= MRI
->createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
1144 unsigned DestSub1
= MRI
->createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
1145 MachineInstr
*LoHalf
=
1146 BuildMI(*MBB
, MBBI
, DL
, TII
->get(AMDGPU::V_ADD_I32_e64
), DestSub0
)
1147 .addReg(CarryReg
, RegState::Define
)
1148 .addReg(Addr
.Base
.LoReg
, 0, Addr
.Base
.LoSubReg
)
1151 LLVM_DEBUG(dbgs() << " "; LoHalf
->dump(););
1153 MachineInstr
*HiHalf
=
1154 BuildMI(*MBB
, MBBI
, DL
, TII
->get(AMDGPU::V_ADDC_U32_e64
), DestSub1
)
1155 .addReg(DeadCarryReg
, RegState::Define
| RegState::Dead
)
1156 .addReg(Addr
.Base
.HiReg
, 0, Addr
.Base
.HiSubReg
)
1158 .addReg(CarryReg
, RegState::Kill
);
1160 LLVM_DEBUG(dbgs() << " "; HiHalf
->dump(););
// Combine the two 32-bit halves into one 64-bit register.
1162 unsigned FullDestReg
= MRI
->createVirtualRegister(&AMDGPU::VReg_64RegClass
);
1163 MachineInstr
*FullBase
=
1164 BuildMI(*MBB
, MBBI
, DL
, TII
->get(TargetOpcode::REG_SEQUENCE
), FullDestReg
)
1166 .addImm(AMDGPU::sub0
)
1168 .addImm(AMDGPU::sub1
);
1170 LLVM_DEBUG(dbgs() << " "; FullBase
->dump(); dbgs() << "\n";);
1175 // Update base and offset with the NewBase and NewOffset in MI.
// Rewrites MI in place: the register of its `vaddr` operand becomes NewBase
// and the immediate of its `offset` operand becomes NewOffset.
// NOTE(review): the result of getNamedOperand is dereferenced unchecked, so
// MI is assumed to carry both operands. The declaration of the NewBase
// parameter is not visible in this excerpt.
1176 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr
&MI
,
1178 int32_t NewOffset
) {
1179 TII
->getNamedOperand(MI
, AMDGPU::OpName::vaddr
)->setReg(NewBase
);
1180 TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->setImm(NewOffset
);
// Tries to read a constant out of operand Op. For a register operand, the
// register's unique virtual-register definition is looked up; when that
// definition is an S_MOV_B32 whose operand 1 is an immediate, that immediate
// is returned.
// NOTE(review): the return type, the handling that precedes the register
// lookup, and the value returned on the failure path are not visible in this
// excerpt.
1184 SILoadStoreOptimizer::extractConstOffset(const MachineOperand
&Op
) {
1191 MachineInstr
*Def
= MRI
->getUniqueVRegDef(Op
.getReg());
1192 if (!Def
|| Def
->getOpcode() != AMDGPU::S_MOV_B32
||
1193 !Def
->getOperand(1).isImm())
1196 return Def
->getOperand(1).getImm();
1199 // Analyzes Base and extracts:
1200 // - 32bit base registers, subregisters
1201 // - 64bit constant offset
1202 // Expecting base computation as:
1203 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1204 // %LO:vgpr_32, %c:sreg_64_xexec =
1205 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1206 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1208 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
// The results are written into Addr (Addr.Base.* and Addr.Offset).
// NOTE(review): the Addr parameter declaration, the early exits taken when a
// pattern check fails, and several statements between the visible ones are
// not present in this excerpt; the notes below describe only what is shown.
1209 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand
&Base
,
// Base must be defined by a five-operand REG_SEQUENCE.
1214 MachineInstr
*Def
= MRI
->getUniqueVRegDef(Base
.getReg());
1215 if (!Def
|| Def
->getOpcode() != AMDGPU::REG_SEQUENCE
1216 || Def
->getNumOperands() != 5)
// Operands 1 and 3 of the REG_SEQUENCE are the low and high halves.
1219 MachineOperand BaseLo
= Def
->getOperand(1);
1220 MachineOperand BaseHi
= Def
->getOperand(3);
1221 if (!BaseLo
.isReg() || !BaseHi
.isReg())
1224 MachineInstr
*BaseLoDef
= MRI
->getUniqueVRegDef(BaseLo
.getReg());
1225 MachineInstr
*BaseHiDef
= MRI
->getUniqueVRegDef(BaseHi
.getReg());
// The halves must come from a V_ADD_I32_e64 / V_ADDC_U32_e64 pair.
1227 if (!BaseLoDef
|| BaseLoDef
->getOpcode() != AMDGPU::V_ADD_I32_e64
||
1228 !BaseHiDef
|| BaseHiDef
->getOpcode() != AMDGPU::V_ADDC_U32_e64
)
// Low half: look for the constant in src0 first, then in src1.
1231 const auto *Src0
= TII
->getNamedOperand(*BaseLoDef
, AMDGPU::OpName::src0
);
1232 const auto *Src1
= TII
->getNamedOperand(*BaseLoDef
, AMDGPU::OpName::src1
);
1234 auto Offset0P
= extractConstOffset(*Src0
);
1238 if (!(Offset0P
= extractConstOffset(*Src1
)))
// High half: re-read src0/src1 from the add-with-carry.
// NOTE(review): the condition guarding the swap below is not visible;
// presumably it normalizes the operands so Src1 is the immediate -- confirm.
1243 Src0
= TII
->getNamedOperand(*BaseHiDef
, AMDGPU::OpName::src0
);
1244 Src1
= TII
->getNamedOperand(*BaseHiDef
, AMDGPU::OpName::src1
);
1247 std::swap(Src0
, Src1
);
1252 uint64_t Offset1
= Src1
->getImm();
// Record the base registers/sub-registers and build the 64-bit offset from
// the low 32 bits of *Offset0P and Offset1 shifted into the upper 32 bits.
1255 Addr
.Base
.LoReg
= BaseLo
.getReg();
1256 Addr
.Base
.HiReg
= BaseHi
.getReg();
1257 Addr
.Base
.LoSubReg
= BaseLo
.getSubReg();
1258 Addr
.Base
.HiSubReg
= BaseHi
.getSubReg();
1259 Addr
.Offset
= (*Offset0P
& 0x00000000ffffffff) | (Offset1
<< 32);
// Tries to rewrite MI so that its constant address offset is carried in the
// instruction's immediate `offset` field, addressed from a re-computed anchor
// base. Visited caches the MemAddress decoded for each instruction so it is
// decoded at most once; AnchorList collects the instructions chosen as
// anchors, and an instruction already in AnchorList is not processed again.
// NOTE(review): the MI parameter declaration, the MAddr declaration, every
// `return`/`continue`, and a few assignments are not visible in this excerpt;
// the notes below describe only what is shown.
1262 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1264 MemInfoMap
&Visited
,
1265 SmallPtrSet
<MachineInstr
*, 4> &AnchorList
) {
1267 // TODO: Support flat and scratch.
// NOTE(review): `NULL` is compared against a pointer here; `nullptr` is the
// idiomatic spelling.
1268 if (AMDGPU::getGlobalSaddrOp(MI
.getOpcode()) < 0 ||
1269 TII
->getNamedOperand(MI
, AMDGPU::OpName::vdata
) != NULL
)
1272 // TODO: Support Store.
1276 if (AnchorList
.count(&MI
))
1279 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI
.dump());
// A non-zero immediate offset means MI was already handled.
1281 if (TII
->getNamedOperand(MI
, AMDGPU::OpName::offset
)->getImm()) {
1282 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1286 // Step1: Find the base-registers and a 64bit constant offset.
1287 MachineOperand
&Base
= *TII
->getNamedOperand(MI
, AMDGPU::OpName::vaddr
);
// Decode MI's address once and cache it; otherwise reuse the cached result.
1289 if (Visited
.find(&MI
) == Visited
.end()) {
1290 processBaseWithConstOffset(Base
, MAddr
);
1291 Visited
[&MI
] = MAddr
;
1293 MAddr
= Visited
[&MI
];
1295 if (MAddr
.Offset
== 0) {
1296 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1297 " constant offsets that can be promoted.\n";);
1301 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr
.Base
.HiReg
<< ", "
1302 << MAddr
.Base
.LoReg
<< "} Offset: " << MAddr
.Offset
<< "\n\n";);
1304 // Step2: Traverse through MI's basic block and find an anchor (that has the
1305 // same base-registers) with the highest 13bit distance from MI's offset.
1306 // E.g. (64bit loads)
1308 // addr1 = &a + 4096; load1 = load(addr1, 0)
1309 // addr2 = &a + 6144; load2 = load(addr2, 0)
1310 // addr3 = &a + 8192; load3 = load(addr3, 0)
1311 // addr4 = &a + 10240; load4 = load(addr4, 0)
1312 // addr5 = &a + 12288; load5 = load(addr5, 0)
1314 // Starting from the first load, the optimization will try to find a new base
1315 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
1316 // have 13bit distance from &a + 4096. The heuristic considers &a + 8192
1317 // as the new-base (anchor) because of the maximum distance, which can
1318 // presumably accommodate more intermediate bases.
1320 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1321 // (&a + 8192) for load1, load2, load4.
1323 // load1 = load(addr, -4096)
1324 // load2 = load(addr, -2048)
1325 // load3 = load(addr, 0)
1326 // load4 = load(addr, 2048)
1327 // addr5 = &a + 12288; load5 = load(addr5, 0)
1329 MachineInstr
*AnchorInst
= nullptr;
1330 MemAddress AnchorAddr
;
// numeric_limits<uint32_t>::min() is 0, so the first legal candidate with a
// non-zero distance becomes the initial anchor.
1331 uint32_t MaxDist
= std::numeric_limits
<uint32_t>::min();
// Instructions sharing MI's base, each paired with its decoded offset.
1332 SmallVector
<std::pair
<MachineInstr
*, int64_t>, 4> InstsWCommonBase
;
1334 MachineBasicBlock
*MBB
= MI
.getParent();
1335 MachineBasicBlock::iterator E
= MBB
->end();
1336 MachineBasicBlock::iterator MBBI
= MI
.getIterator();
1338 const SITargetLowering
*TLI
=
1339 static_cast<const SITargetLowering
*>(STM
->getTargetLowering());
// Scan from MI's position to the end of its block.
// NOTE(review): the statement that advances MBBI past MI before this loop, if
// any, is not visible in this excerpt.
1341 for ( ; MBBI
!= E
; ++MBBI
) {
1342 MachineInstr
&MINext
= *MBBI
;
1343 // TODO: Support finding an anchor(with same base) from store addresses or
1344 // any other load addresses where the opcodes are different.
1345 if (MINext
.getOpcode() != MI
.getOpcode() ||
1346 TII
->getNamedOperand(MINext
, AMDGPU::OpName::offset
)->getImm())
1349 const MachineOperand
&BaseNext
=
1350 *TII
->getNamedOperand(MINext
, AMDGPU::OpName::vaddr
);
1351 MemAddress MAddrNext
;
1352 if (Visited
.find(&MINext
) == Visited
.end()) {
1353 processBaseWithConstOffset(BaseNext
, MAddrNext
);
1354 Visited
[&MINext
] = MAddrNext
;
1356 MAddrNext
= Visited
[&MINext
];
// Only candidates whose four base components all match MI's are considered.
1358 if (MAddrNext
.Base
.LoReg
!= MAddr
.Base
.LoReg
||
1359 MAddrNext
.Base
.HiReg
!= MAddr
.Base
.HiReg
||
1360 MAddrNext
.Base
.LoSubReg
!= MAddr
.Base
.LoSubReg
||
1361 MAddrNext
.Base
.HiSubReg
!= MAddr
.Base
.HiSubReg
)
1364 InstsWCommonBase
.push_back(std::make_pair(&MINext
, MAddrNext
.Offset
));
// NOTE(review): Dist is int64_t but is compared and stored through uint32_t
// below, so a magnitude above 32 bits would be truncated. The assignment of
// AM.BaseOffs is not visible here; presumably it is set to Dist so the
// legality check rejects such distances first -- confirm.
1366 int64_t Dist
= MAddr
.Offset
- MAddrNext
.Offset
;
1367 TargetLoweringBase::AddrMode AM
;
1368 AM
.HasBaseReg
= true;
1370 if (TLI
->isLegalGlobalAddressingMode(AM
) &&
1371 (uint32_t)std::abs(Dist
) > MaxDist
) {
1372 MaxDist
= std::abs(Dist
);
1374 AnchorAddr
= MAddrNext
;
1375 AnchorInst
= &MINext
;
// NOTE(review): AnchorInst is dereferenced below; the guard that ensures it
// is non-null is presumably on an omitted line -- confirm.
1380 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1381 AnchorInst
->dump());
1382 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1383 << AnchorAddr
.Offset
<< "\n\n");
1385 // Instead of moving up, just re-compute anchor-instruction's base address.
1386 unsigned Base
= computeBase(MI
, AnchorAddr
);
// MI now addresses relative to the anchor base.
1388 updateBaseAndOffset(MI
, Base
, MAddr
.Offset
- AnchorAddr
.Offset
);
1389 LLVM_DEBUG(dbgs() << " After promotion: "; MI
.dump(););
// Re-base every collected instruction whose offset relative to the anchor is
// a legal global addressing-mode offset.
1391 for (auto P
: InstsWCommonBase
) {
1392 TargetLoweringBase::AddrMode AM
;
1393 AM
.HasBaseReg
= true;
1394 AM
.BaseOffs
= P
.second
- AnchorAddr
.Offset
;
1396 if (TLI
->isLegalGlobalAddressingMode(AM
)) {
1397 LLVM_DEBUG(dbgs() << " Promote Offset(" << P
.second
;
1398 dbgs() << ")"; P
.first
->dump());
1399 updateBaseAndOffset(*P
.first
, Base
, P
.second
- AnchorAddr
.Offset
);
1400 LLVM_DEBUG(dbgs() << " After promotion: "; P
.first
->dump());
// Remember the anchor so it is skipped by the AnchorList check on entry.
1403 AnchorList
.insert(AnchorInst
);
1410 // Scan through looking for adjacent LDS operations with constant offsets from
1411 // the same base register. We rely on the scheduler to do the hard work of
1412 // clustering nearby loads, and assume these are all adjacent.
// Each instruction is first offered to promoteConstantOffsetToImm. It is then
// classified with getInstClass; when findMatchingInst locates a partner, the
// matching merge*Pair helper is called and its result becomes the next
// iterator position. After S_BUFFER_LOAD_IMM and buffer load/store merges,
// OptimizeAgain is set when the merged width (CI.Width0 + CI.Width1) is still
// below 16 or 4 respectively.
// NOTE(review): the declarations of Visited and CI, several case labels, the
// EltSize assignments, and the `continue`/`break`/`return` statements are not
// visible in this excerpt.
1413 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock
&MBB
) {
1414 bool Modified
= false;
1418 // Contains the list of instructions for which constant offsets are being
1419 // promoted to the IMM.
1420 SmallPtrSet
<MachineInstr
*, 4> AnchorList
;
// The loop header does not advance I; the iterator is updated inside the
// body (e.g. from the merge helpers' results).
1422 for (MachineBasicBlock::iterator I
= MBB
.begin(), E
= MBB
.end(); I
!= E
;) {
1423 MachineInstr
&MI
= *I
;
1425 if (promoteConstantOffsetToImm(MI
, Visited
, AnchorList
))
1428 // Don't combine if volatile.
1429 if (MI
.hasOrderedMemoryRef()) {
1434 const unsigned Opc
= MI
.getOpcode();
1438 CI
.InstClass
= getInstClass(Opc
);
// Dispatch on instruction class. For the DS read/write cases, the 64-bit
// opcodes select 8 in the visible conditional (presumably the element size
// in bytes -- confirm).
1440 switch (CI
.InstClass
) {
1445 (Opc
== AMDGPU::DS_READ_B64
|| Opc
== AMDGPU::DS_READ_B64_gfx9
) ? 8
1447 if (findMatchingInst(CI
)) {
1449 I
= mergeRead2Pair(CI
);
1456 (Opc
== AMDGPU::DS_WRITE_B64
|| Opc
== AMDGPU::DS_WRITE_B64_gfx9
) ? 8
1458 if (findMatchingInst(CI
)) {
1460 I
= mergeWrite2Pair(CI
);
1465 case S_BUFFER_LOAD_IMM
:
1466 CI
.EltSize
= AMDGPU::getSMRDEncodedOffset(*STM
, 4);
1467 if (findMatchingInst(CI
)) {
1469 I
= mergeSBufferLoadImmPair(CI
);
1470 OptimizeAgain
|= (CI
.Width0
+ CI
.Width1
) < 16;
1475 case BUFFER_LOAD_OFFEN
:
1476 case BUFFER_LOAD_OFFSET
:
1477 case BUFFER_LOAD_OFFEN_exact
:
1478 case BUFFER_LOAD_OFFSET_exact
:
1480 if (findMatchingInst(CI
)) {
1482 I
= mergeBufferLoadPair(CI
);
1483 OptimizeAgain
|= (CI
.Width0
+ CI
.Width1
) < 4;
1488 case BUFFER_STORE_OFFEN
:
1489 case BUFFER_STORE_OFFSET
:
1490 case BUFFER_STORE_OFFEN_exact
:
1491 case BUFFER_STORE_OFFSET_exact
:
1493 if (findMatchingInst(CI
)) {
1495 I
= mergeBufferStorePair(CI
);
1496 OptimizeAgain
|= (CI
.Width0
+ CI
.Width1
) < 4;
// Pass entry point for one machine function. It checks skipFunction() and the
// subtarget's loadStoreOptEnabled() flag, caches the subtarget (STM),
// instruction info (TII), register info (TRI), machine register info (MRI)
// and alias analysis results (AA), and asserts that the function is in SSA
// form. Each basic block is then run through optimizeBlock repeatedly for as
// long as OptimizeAgain is set, OR-ing the results into Modified.
// NOTE(review): the early-exit `return` statements, the opening of the
// do/while loop, and the end of the function are not visible in this excerpt.
1509 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction
&MF
) {
1510 if (skipFunction(MF
.getFunction()))
1513 STM
= &MF
.getSubtarget
<GCNSubtarget
>();
1514 if (!STM
->loadStoreOptEnabled())
1517 TII
= STM
->getInstrInfo();
1518 TRI
= &TII
->getRegisterInfo();
1520 MRI
= &MF
.getRegInfo();
1521 AA
= &getAnalysis
<AAResultsWrapperPass
>().getAAResults();
1523 assert(MRI
->isSSA() && "Must be run on SSA");
1525 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1527 bool Modified
= false;
1529 for (MachineBasicBlock
&MBB
: MF
) {
// Reset the flag before each attempt; optimizeBlock sets it when a merge
// result is still narrow enough to be merged again.
1531 OptimizeAgain
= false;
1532 Modified
|= optimizeBlock(MBB
);
1533 } while (OptimizeAgain
);