//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13bit constant offset and then promotes the 13bit offset
// to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// ==>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"
namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    InstClassEnum InstClass;
    SmallVector<MachineInstr *, 8> InstsToMove;
    int AddrIdx[5];
    const MachineOperand *AddrReg[5];
    unsigned NumAddresses;
    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }
    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (Register::isPhysicalRegister(AddrOp->getReg()))
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }
    void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
               const GCNSubtarget &STM);
    void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;
  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset) const;
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                       std::list<std::list<CombineInfo> > &MergeableInsts) const;
  bool collectMergeableInsts(MachineBasicBlock &MBB,
                       std::list<std::list<CombineInfo> > &MergeableInsts) const;
public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  void removeCombinedInst(std::list<CombineInfo> &MergeList,
                          const MachineInstr &MI);
  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}
/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}
/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  }
}
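
// For example, BUFFER_LOAD_DWORD_OFFEN and BUFFER_LOAD_DWORD_OFFSET both map
// to the BUFFER_LOAD class, but they keep distinct subclasses (their MUBUF
// base opcodes), so an OFFEN load is never paired with an OFFSET load. Two
// BUFFER_LOAD_DWORD_OFFEN instructions share class and subclass and are
// candidates for merging.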
static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    unsigned Result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      Result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      Result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      Result |= SOFFSET;
    }

    return Result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SIInstrInfo &TII,
                                              const GCNSubtarget &STM) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, TII);

  if (InstClass == UNKNOWN)
    return;

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
    EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
  Offset0 = I->getOperand(OffsetIdx).getImm();
  Width0 = getOpcodeWidth(*I, TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset0 &= 0xffff;
  } else {
    GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
    if (InstClass != S_BUFFER_LOAD_IMM) {
      SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
    }
    DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
  }

  unsigned AddrOpName[5] = {0};
  NumAddresses = 0;
  const unsigned Regs = getRegs(I->getOpcode(), TII);

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &I->getOperand(AddrIdx[i]);
  }
}
void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
                                                  const SIInstrInfo &TII) {
  Paired = MI;
  assert(InstClass == getInstClass(Paired->getOpcode(), TII));
  int OffsetIdx =
      AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
  Offset1 = Paired->getOperand(OffsetIdx).getImm();
  Width1 = getOpcodeWidth(*Paired, TII);
  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset1 &= 0xffff;
  } else {
    GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
    if (InstClass != S_BUFFER_LOAD_IMM) {
      SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
    }
    DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
  }
}

} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}
static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}
static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}
// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}
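
// A few worked examples for the DS paths above, with EltSize == 4
// (ds_read_b32 / ds_write_b32):
//  - Byte offsets 40 and 44 give element offsets 10 and 11, which fit in
//    8 bits, so a plain read2/write2 with offset0:10 offset1:11 is used.
//  - Byte offsets 1024 and 1280 give element offsets 256 and 320; both are
//    multiples of 64 with quotients 4 and 5, so the ST64 form is used with
//    offset0:4 offset1:5.
//  - Byte offsets 2048 and 2056 give element offsets 512 and 514; neither
//    direct form fits, but the difference (2) does, so BaseOff is set to
//    2048 and the merged instruction uses offset0:0 offset1:2 relative to
//    the adjusted base.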
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }
  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = CI.hasSameBaseAddress(*MBBI);

    if (Match) {
      CI.setPaired(MBBI, *TII);

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
      break;
  }
  return false;
}
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
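
// As a concrete example, mergeRead2Pair turns the pair from the file header
// comment (ds_read_b32 at offset:16 and offset:32, EltSize == 4) into roughly
// the following, with illustrative virtual register names:
//   %merged:vreg_64 = DS_READ2_B32 %addr, 4, 8, 0
//   %old_dst0 = COPY %merged.sub0
//   %old_dst1 = COPY %merged.sub1
// and then erases the two original loads.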
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MachineInstr *New =
      BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
          .addImm(MergedOffset) // offset
          .addImm(CI.GLC0)      // glc
          .addImm(CI.DLC0)      // dlc
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode, *TII);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.GLC0)      // glc
          .addImm(CI.SLC0)      // slc
          .addImm(0)            // tfe
          .addImm(CI.DLC0)      // dlc
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return New;
}
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
    return std::make_pair(0, 0);

  bool ReverseOrder = CI.Offset0 > CI.Offset1;

  static const unsigned Idxs[4][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2,
       AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
      {AMDGPU::sub3, 0, 0, 0},
  };
  unsigned Idx0;
  unsigned Idx1;

  assert(CI.Width0 >= 1 && CI.Width0 <= 3);
  assert(CI.Width1 >= 1 && CI.Width1 <= 3);

  if (ReverseOrder) {
    Idx1 = Idxs[0][CI.Width1 - 1];
    Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
  } else {
    Idx0 = Idxs[0][CI.Width0 - 1];
    Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
  }

  return std::make_pair(Idx0, Idx1);
}
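
// For example, with CI.Width0 == 1, CI.Width1 == 2 and CI.Offset0 < CI.Offset1
// this returns (sub0, sub1_sub2): the first instruction's result is copied out
// of sub0 of the merged register and the second one's out of sub1_sub2. With
// the offsets reversed it returns (sub2, sub0_sub1) instead, since the paired
// instruction's data then occupies the low dwords.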
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode, *TII);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
          .addImm(CI.GLC0)                          // glc
          .addImm(CI.SLC0)                          // slc
          .addImm(0)                                // tfe
          .addImm(CI.DLC0)                          // dlc
          .addImm(0)                                // swz
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, CI.InstsToMove);

  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return New;
}
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}
// Compute base address using Addr and return the final register.
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}
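
// computeBase materializes a 64-bit add of Addr.Offset to Addr.Base, roughly
// (illustrative virtual register names):
//   %lo:vgpr_32, %carry = V_ADD_I32_e64 %Base.LoReg, OffsetLo, 0
//   %hi:vgpr_32 = V_ADDC_U32_e64 %Base.HiReg, OffsetHi, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The returned register is what callers then install as the new vaddr.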
// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) const {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}
// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor(that has the
  // same base-registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
  // have 13bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new-base(anchor) because of the maximum distance which can
  // accommodate more intermediate bases presumably.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  // E.g.
  //   addr = &a + 8192
  //   load1 = load(addr,       -4096)
  //   load2 = load(addr,       -2048)
  //   load3 = load(addr,       0)
  //   load4 = load(addr,       2048)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor(with same base) from store addresses or
    // any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "    After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
        AddrList.front().InstClass == CI.InstClass) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}
bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  bool Modified = false;
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  // Sort potential mergeable instructions into lists. One list per base address.
  for (MachineInstr &MI : MBB.instrs()) {
    // We run this before checking if an address is mergeable, because it can produce
    // better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    addInstToMergeableList(CI, MergeableInsts);
  }
  return Modified;
}
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
    if (MergeList.size() < 2)
      continue;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so clear the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      MergeList.clear();
      continue;
    }

    // We made changes, but also determined that there were no more optimization
    // opportunities, so we don't need to reprocess the list
    if (!OptimizeListAgain)
      MergeList.clear();

    OptimizeAgain |= OptimizeListAgain;
    Modified = true;
  }
  return Modified;
}
void
SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
                                         const MachineInstr &MI) {

  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
    if (&*CI->I == &MI) {
      MergeList.erase(CI);
      return;
    }
  }
}
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  bool Modified = false;
  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
    CombineInfo &CI = *I;

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
        CI.setMI(NewMI, *TII, *STM);
      }
      break;
    case DS_WRITE:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
        CI.setMI(NewMI, *TII, *STM);
      }
      break;
    case S_BUFFER_LOAD_IMM:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
      }
      break;
    case BUFFER_LOAD:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
      }
      break;
    case BUFFER_STORE:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
      }
      break;
    }
    // Clear the InstsToMove after we have finished searching so we don't have
    // stale values left over if we search for this CI again in another pass
    // over the block.
    CI.InstsToMove.clear();
  }

  return Modified;
}
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    std::list<std::list<CombineInfo> > MergeableInsts;
    // First pass: Collect list of all instructions we know how to merge.
    Modified |= collectMergeableInsts(MBB, MergeableInsts);
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MergeableInsts);
    } while (OptimizeAgain);
  }

  return Modified;
}