//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offsets to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// ==>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdlib>
#include <limits>
#include <list>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"
namespace {

enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
  SSAMP = 0x20,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned Width0;
    unsigned Width1;
    unsigned BaseOff;
    unsigned DMask0;
    unsigned DMask1;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool DLC0;
    bool DLC1;
    bool UseST64;
    SmallVector<MachineInstr *, 8> InstsToMove;
    int AddrIdx[5];
    const MachineOperand *AddrReg[5];
    unsigned NumAddresses;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];

        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // kinds of operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (Register::isPhysicalRegister(AddrOp->getReg()))
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
               const GCNSubtarget &STM);
    void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset) const;
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const;
  bool collectMergeableInsts(MachineBasicBlock &MBB,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const;

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  void removeCombinedInst(std::list<CombineInfo> &MergeList,
                          const MachineInstr &MI);
  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
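
// Returns the number of dwords the instruction reads or writes: the element
// count for MUBUF opcodes, the dmask population count for MIMG, or the dword
// count encoded in the S_BUFFER_LOAD_DWORD*_IMM opcode name.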
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return countPopulation(DMaskImm);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      return Info->BaseOpcode;
    }
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  }
}
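
// Returns a bitmask of the address-forming operands (addr, sbase, srsrc,
// soffset, vaddr, ssamp) that the given opcode carries. CombineInfo::setMI
// uses this to collect the operands that hasSameBaseAddress() compares.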
static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  if (TII.isMIMG(Opc)) {
    unsigned result = VADDR | SRSRC;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      result |= SSAMP;
    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SIInstrInfo &TII,
                                              const GCNSubtarget &STM) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, TII);

  if (InstClass == UNKNOWN)
    return;

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
    EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset0 = I->getOperand(OffsetIdx).getImm();
  }

  Width0 = getOpcodeWidth(*I, TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset0 &= 0xffff;
  } else if (InstClass != MIMG) {
    GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
    if (InstClass != S_BUFFER_LOAD_IMM) {
      SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
    }
    DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
  }

  unsigned AddrOpName[5] = {0};
  NumAddresses = 0;
  const unsigned Regs = getRegs(I->getOpcode(), TII);

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  if (Regs & SSAMP) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &I->getOperand(AddrIdx[i]);
  }
}

void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
                                                  const SIInstrInfo &TII) {
  Paired = MI;
  assert(InstClass == getInstClass(Paired->getOpcode(), TII));

  if (InstClass == MIMG) {
    DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
  } else {
    int OffsetIdx =
        AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
    Offset1 = Paired->getOperand(OffsetIdx).getImm();
  }

  Width1 = getOpcodeWidth(*Paired, TII);
  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset1 &= 0xffff;
  } else if (InstClass != MIMG) {
    GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
    if (InstClass != S_BUFFER_LOAD_IMM) {
      SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
    }
    DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
  }
}

} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
                                AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
                                AMDGPU::OpName::da, AMDGPU::OpName::r128};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
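  // The two dmasks are only combinable if they select disjoint channels.
  // E.g. dmask0 = 0b0011 and dmask1 = 0b1100 can be merged, whereas
  // dmask0 = 0b0011 and dmask1 = 0b0110 both select channel 1 and cannot:
  // the smaller mask must lie entirely below the lowest set bit of the
  // larger mask, which is what the check below enforces.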
  unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
  unsigned MinMask = std::min(CI.DMask0, CI.DMask1);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
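  // E.g. with EltSize == 4, byte offsets 0 and 25600 give element offsets 0
  // and 6400. 6400 does not fit in 8 bits, but both element offsets are
  // multiples of 64 and 0/64 = 0 and 6400/64 = 100 do fit, so the pair can
  // still be merged by switching to the ST64 form with offsets 0 and 100.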
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions are met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged with.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = CI.hasSameBaseAddress(*MBBI);

    if (Match) {
      CI.setPaired(MBBI, *TII);

      // Check both offsets (or masks for MIMG) can be combined and fit in the
      // reduced range.
      bool canBeCombined = CI.InstClass == MIMG
                               ? dmasksCanBeCombined(CI, *TII)
                               : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);

      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
        return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
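
// mergeRead2Pair rewrites the two DS reads in CI as a single
// ds_read2/ds_read2st64 into a fresh 64- or 128-bit register and then copies
// the sub-registers back into the original destinations. When a base offset
// was factored out (CI.BaseOff != 0), the shared address is first rebased
// with an S_MOV_B32 plus a no-carry VGPR add.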
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
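
// mergeWrite2Pair rewrites the two DS writes in CI as a single
// ds_write2/ds_write2st64 carrying both data operands, rebasing the shared
// address first when CI.BaseOff is set, and then moves any dependent
// instructions below the new write.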
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask0 | CI.DMask1;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MachineInstr *New =
      MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MachineInstr *New =
      BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
          .addImm(MergedOffset) // offset
          .addImm(CI.GLC0)      // glc
          .addImm(CI.DLC0)      // dlc
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode, *TII);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.GLC0)      // glc
          .addImm(CI.SLC0)      // slc
          .addImm(0)            // tfe
          .addImm(CI.DLC0)      // dlc
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return New;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  case MIMG:
    assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width));
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
    return std::make_pair(0, 0);

  bool ReverseOrder;
  if (CI.InstClass == MIMG) {
    assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
           "No overlaps");
    ReverseOrder = CI.DMask0 > CI.DMask1;
  } else
    ReverseOrder = CI.Offset0 > CI.Offset1;

  static const unsigned Idxs[4][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
      {AMDGPU::sub3, 0, 0, 0},
  };
.Width0
>= 1 && CI
.Width0
<= 3);
1253 assert(CI
.Width1
>= 1 && CI
.Width1
<= 3);
1256 Idx1
= Idxs
[0][CI
.Width1
- 1];
1257 Idx0
= Idxs
[CI
.Width1
][CI
.Width0
- 1];
1259 Idx0
= Idxs
[0][CI
.Width0
- 1];
1260 Idx1
= Idxs
[CI
.Width0
][CI
.Width1
- 1];
1263 return std::make_pair(Idx0
, Idx1
);
1266 const TargetRegisterClass
*
1267 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo
&CI
) {
1268 if (CI
.InstClass
== S_BUFFER_LOAD_IMM
) {
1269 switch (CI
.Width0
+ CI
.Width1
) {
1273 return &AMDGPU::SReg_64_XEXECRegClass
;
1275 return &AMDGPU::SGPR_128RegClass
;
1277 return &AMDGPU::SReg_256RegClass
;
1279 return &AMDGPU::SReg_512RegClass
;
1282 switch (CI
.Width0
+ CI
.Width1
) {
1286 return &AMDGPU::VReg_64RegClass
;
1288 return &AMDGPU::VReg_96RegClass
;
1290 return &AMDGPU::VReg_128RegClass
;
1295 MachineBasicBlock::iterator
1296 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo
&CI
) {
1297 MachineBasicBlock
*MBB
= CI
.I
->getParent();
1298 DebugLoc DL
= CI
.I
->getDebugLoc();
1300 const unsigned Opcode
= getNewOpcode(CI
);
1302 std::pair
<unsigned, unsigned> SubRegIdx
= getSubRegIdxs(CI
);
1303 const unsigned SubRegIdx0
= std::get
<0>(SubRegIdx
);
1304 const unsigned SubRegIdx1
= std::get
<1>(SubRegIdx
);
1306 // Copy to the new source register.
1307 const TargetRegisterClass
*SuperRC
= getTargetRegisterClass(CI
);
1308 Register SrcReg
= MRI
->createVirtualRegister(SuperRC
);
1310 const auto *Src0
= TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::vdata
);
1311 const auto *Src1
= TII
->getNamedOperand(*CI
.Paired
, AMDGPU::OpName::vdata
);
1313 BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(AMDGPU::REG_SEQUENCE
), SrcReg
)
1317 .addImm(SubRegIdx1
);
1319 auto MIB
= BuildMI(*MBB
, CI
.Paired
, DL
, TII
->get(Opcode
))
1320 .addReg(SrcReg
, RegState::Kill
);
1322 const unsigned Regs
= getRegs(Opcode
, *TII
);
1325 MIB
.add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::vaddr
));
1328 // It shouldn't be possible to get this far if the two instructions
1329 // don't have a single memoperand, because MachineInstr::mayAlias()
1330 // will return true if this is the case.
1331 assert(CI
.I
->hasOneMemOperand() && CI
.Paired
->hasOneMemOperand());
1333 const MachineMemOperand
*MMOa
= *CI
.I
->memoperands_begin();
1334 const MachineMemOperand
*MMOb
= *CI
.Paired
->memoperands_begin();
1337 MIB
.add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::srsrc
))
1338 .add(*TII
->getNamedOperand(*CI
.I
, AMDGPU::OpName::soffset
))
1339 .addImm(std::min(CI
.Offset0
, CI
.Offset1
)) // offset
1340 .addImm(CI
.GLC0
) // glc
1341 .addImm(CI
.SLC0
) // slc
1343 .addImm(CI
.DLC0
) // dlc
1345 .addMemOperand(combineKnownAdjacentMMOs(*MBB
->getParent(), MMOa
, MMOb
));
1347 moveInstsAfter(MIB
, CI
.InstsToMove
);
1349 CI
.I
->eraseFromParent();
1350 CI
.Paired
->eraseFromParent();
1355 SILoadStoreOptimizer::createRegOrImm(int32_t Val
, MachineInstr
&MI
) const {
1356 APInt
V(32, Val
, true);
1357 if (TII
->isInlineConstant(V
))
1358 return MachineOperand::CreateImm(Val
);
1360 Register Reg
= MRI
->createVirtualRegister(&AMDGPU::SReg_32RegClass
);
1362 BuildMI(*MI
.getParent(), MI
.getIterator(), MI
.getDebugLoc(),
1363 TII
->get(AMDGPU::S_MOV_B32
), Reg
)
1366 LLVM_DEBUG(dbgs() << " "; Mov
->dump());
1367 return MachineOperand::CreateReg(Reg
, false);
1370 // Compute base address using Addr and return the final register.
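// The rebased address is Addr.Base + Addr.Offset, materialized as a 64-bit
// add split into 32-bit halves, roughly:
//   lo      = V_ADD_I32_e64  Addr.Base.LoReg, lo32(Addr.Offset)   (defines carry)
//   hi      = V_ADDC_U32_e64 Addr.Base.HiReg, hi32(Addr.Offset), carry
//   newbase = REG_SEQUENCE   lo, sub0, hi, sub1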
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) const {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (that has the
  // same base-registers) with the highest 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new base (anchor) because of the maximum distance, which can
  // presumably accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with same base) from store addresses or
    // any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "    After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
        AddrList.front().InstClass == CI.InstClass) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  bool Modified = false;
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  // Sort potential mergeable instructions into lists. One list per base address.
  for (MachineInstr &MI : MBB.instrs()) {
    // We run this before checking if an address is mergeable, because it can produce
    // better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    addInstToMergeableList(CI, MergeableInsts);
  }
  return Modified;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
    if (MergeList.size() < 2)
      continue;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so clear the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      MergeList.clear();
      continue;
    }

    // We made changes, but also determined that there were no more optimization
    // opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain)
      MergeList.clear();

    OptimizeAgain |= OptimizeListAgain;
    Modified = true;
  }
  return Modified;
}

void
SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
                                         const MachineInstr &MI) {
  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
    if (&*CI->I == &MI) {
      MergeList.erase(CI);
      return;
    }
  }
}
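
// Walks one same-base-address list, pairing each entry with a matching
// instruction and rewriting the pair with the class-specific merge routine.
// OptimizeListAgain is set while the merged width is still below the limit
// for the instruction class, so the caller keeps re-running the list until
// the accesses are as wide as they can get.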
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  bool Modified = false;
  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
    CombineInfo &CI = *I;

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
        CI.setMI(NewMI, *TII, *STM);
      }
      break;
    case DS_WRITE:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
        CI.setMI(NewMI, *TII, *STM);
      }
      break;
    case S_BUFFER_LOAD_IMM:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
      }
      break;
    case BUFFER_LOAD:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
      }
      break;
    case BUFFER_STORE:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
      }
      break;
    case MIMG:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeImagePair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
      }
      break;
    }
    // Clear the InstsToMove after we have finished searching so we don't have
    // stale values left over if we search for this CI again in another pass
    // iteration.
    CI.InstsToMove.clear();
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    std::list<std::list<CombineInfo> > MergeableInsts;
    // First pass: Collect list of all instructions we know how to merge.
    Modified |= collectMergeableInsts(MBB, MergeableInsts);
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MergeableInsts);
    } while (OptimizeAgain);
  }

  return Modified;
}