[X86][BMI] Pull out schedule classes from bmi_andn<> and bmi_bls<>
[llvm-core.git] / lib / Target / AMDGPU / SILoadStoreOptimizer.cpp
blob20db1c37f3542bc5acb70d8734c1ddeaf6bb01e9
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with close by immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13bit constant offset and then promotes the 13bit offset
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
43 // Future improvements:
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads have offsets that are too large to fit in the 8-bit
55 // offsets, but are close enough to fit in the 8 bits, we can add to the base
56 // pointer and use the new reduced offsets.
58 //===----------------------------------------------------------------------===//
60 #include "AMDGPU.h"
61 #include "AMDGPUSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "SIInstrInfo.h"
64 #include "SIRegisterInfo.h"
65 #include "Utils/AMDGPUBaseInfo.h"
66 #include "llvm/ADT/ArrayRef.h"
67 #include "llvm/ADT/SmallVector.h"
68 #include "llvm/ADT/StringRef.h"
69 #include "llvm/Analysis/AliasAnalysis.h"
70 #include "llvm/CodeGen/MachineBasicBlock.h"
71 #include "llvm/CodeGen/MachineFunction.h"
72 #include "llvm/CodeGen/MachineFunctionPass.h"
73 #include "llvm/CodeGen/MachineInstr.h"
74 #include "llvm/CodeGen/MachineInstrBuilder.h"
75 #include "llvm/CodeGen/MachineOperand.h"
76 #include "llvm/CodeGen/MachineRegisterInfo.h"
77 #include "llvm/IR/DebugLoc.h"
78 #include "llvm/Pass.h"
79 #include "llvm/Support/Debug.h"
80 #include "llvm/Support/MathExtras.h"
81 #include "llvm/Support/raw_ostream.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <iterator>
86 #include <utility>
88 using namespace llvm;
90 #define DEBUG_TYPE "si-load-store-opt"
92 namespace {
93 enum InstClassEnum {
94 UNKNOWN,
95 DS_READ,
96 DS_WRITE,
97 S_BUFFER_LOAD_IMM,
98 BUFFER_LOAD,
99 BUFFER_STORE,
100 MIMG,
103 enum RegisterEnum {
104 SBASE = 0x1,
105 SRSRC = 0x2,
106 SOFFSET = 0x4,
107 VADDR = 0x8,
108 ADDR = 0x10,
109 SSAMP = 0x20,
112 class SILoadStoreOptimizer : public MachineFunctionPass {
113 struct CombineInfo {
114 MachineBasicBlock::iterator I;
115 MachineBasicBlock::iterator Paired;
116 unsigned EltSize;
117 unsigned Offset0;
118 unsigned Offset1;
119 unsigned Width0;
120 unsigned Width1;
121 unsigned BaseOff;
122 unsigned DMask0;
123 unsigned DMask1;
124 InstClassEnum InstClass;
125 bool GLC0;
126 bool GLC1;
127 bool SLC0;
128 bool SLC1;
129 bool DLC0;
130 bool DLC1;
131 bool UseST64;
132 SmallVector<MachineInstr *, 8> InstsToMove;
133 int AddrIdx[5];
134 const MachineOperand *AddrReg[5];
135 unsigned NumAddresses;
137 bool hasSameBaseAddress(const MachineInstr &MI) {
138 for (unsigned i = 0; i < NumAddresses; i++) {
139 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
141 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
142 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
143 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
144 return false;
146 continue;
149 // Check same base pointer. Be careful of subregisters, which can occur
150 // with vectors of pointers.
151 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
152 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
153 return false;
156 return true;
159 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
160 for (unsigned i = 0; i < NumAddresses; ++i) {
161 const MachineOperand *AddrOp = AddrReg[i];
162 // Immediates are always OK.
163 if (AddrOp->isImm())
164 continue;
166 // Don't try to merge addresses that aren't either immediates or registers.
167 // TODO: Should be possible to merge FrameIndexes and maybe some other
168 // non-register
169 if (!AddrOp->isReg())
170 return false;
172 // TODO: We should be able to merge physical reg addreses.
173 if (Register::isPhysicalRegister(AddrOp->getReg()))
174 return false;
176 // If an address has only one use then there will be on other
177 // instructions with the same address, so we can't merge this one.
178 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
179 return false;
181 return true;
184 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
185 const GCNSubtarget &STM);
186 void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
189 struct BaseRegisters {
190 unsigned LoReg = 0;
191 unsigned HiReg = 0;
193 unsigned LoSubReg = 0;
194 unsigned HiSubReg = 0;
197 struct MemAddress {
198 BaseRegisters Base;
199 int64_t Offset = 0;
202 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
204 private:
205 const GCNSubtarget *STM = nullptr;
206 const SIInstrInfo *TII = nullptr;
207 const SIRegisterInfo *TRI = nullptr;
208 MachineRegisterInfo *MRI = nullptr;
209 AliasAnalysis *AA = nullptr;
210 bool OptimizeAgain;
212 static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
213 static bool offsetsCanBeCombined(CombineInfo &CI);
214 static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
215 static unsigned getNewOpcode(const CombineInfo &CI);
216 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
217 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
219 bool findMatchingInst(CombineInfo &CI);
221 unsigned read2Opcode(unsigned EltSize) const;
222 unsigned read2ST64Opcode(unsigned EltSize) const;
223 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
225 unsigned write2Opcode(unsigned EltSize) const;
226 unsigned write2ST64Opcode(unsigned EltSize) const;
227 MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
228 MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
229 MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
230 MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
231 MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
233 void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
234 int32_t NewOffset) const;
235 unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
236 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
237 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
238 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
239 /// Promotes constant offset to the immediate by adjusting the base. It
240 /// tries to use a base from the nearby instructions that allows it to have
241 /// a 13bit constant offset which gets promoted to the immediate.
242 bool promoteConstantOffsetToImm(MachineInstr &CI,
243 MemInfoMap &Visited,
244 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
245 void addInstToMergeableList(const CombineInfo &CI,
246 std::list<std::list<CombineInfo> > &MergeableInsts) const;
247 bool collectMergeableInsts(MachineBasicBlock &MBB,
248 std::list<std::list<CombineInfo> > &MergeableInsts) const;
250 public:
251 static char ID;
253 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
254 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
257 void removeCombinedInst(std::list<CombineInfo> &MergeList,
258 const MachineInstr &MI);
259 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
260 bool &OptimizeListAgain);
261 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
263 bool runOnMachineFunction(MachineFunction &MF) override;
265 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
267 void getAnalysisUsage(AnalysisUsage &AU) const override {
268 AU.setPreservesCFG();
269 AU.addRequired<AAResultsWrapperPass>();
271 MachineFunctionPass::getAnalysisUsage(AU);
275 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
276 const unsigned Opc = MI.getOpcode();
278 if (TII.isMUBUF(Opc)) {
279 // FIXME: Handle d16 correctly
280 return AMDGPU::getMUBUFElements(Opc);
282 if (TII.isMIMG(MI)) {
283 uint64_t DMaskImm =
284 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
285 return countPopulation(DMaskImm);
288 switch (Opc) {
289 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
290 return 1;
291 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
292 return 2;
293 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
294 return 4;
295 default:
296 return 0;
300 /// Maps instruction opcode to enum InstClassEnum.
301 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
302 switch (Opc) {
303 default:
304 if (TII.isMUBUF(Opc)) {
305 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
306 default:
307 return UNKNOWN;
308 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
309 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
310 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
311 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
312 return BUFFER_LOAD;
313 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
314 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
315 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
316 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
317 return BUFFER_STORE;
320 if (TII.isMIMG(Opc)) {
321 // Ignore instructions encoded without vaddr.
322 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
323 return UNKNOWN;
324 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
325 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))
326 return UNKNOWN;
327 return MIMG;
329 return UNKNOWN;
330 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
331 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
332 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
333 return S_BUFFER_LOAD_IMM;
334 case AMDGPU::DS_READ_B32:
335 case AMDGPU::DS_READ_B32_gfx9:
336 case AMDGPU::DS_READ_B64:
337 case AMDGPU::DS_READ_B64_gfx9:
338 return DS_READ;
339 case AMDGPU::DS_WRITE_B32:
340 case AMDGPU::DS_WRITE_B32_gfx9:
341 case AMDGPU::DS_WRITE_B64:
342 case AMDGPU::DS_WRITE_B64_gfx9:
343 return DS_WRITE;
347 /// Determines instruction subclass from opcode. Only instructions
348 /// of the same subclass can be merged together.
349 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
350 switch (Opc) {
351 default:
352 if (TII.isMUBUF(Opc))
353 return AMDGPU::getMUBUFBaseOpcode(Opc);
354 if (TII.isMIMG(Opc)) {
355 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
356 assert(Info);
357 return Info->BaseOpcode;
359 return -1;
360 case AMDGPU::DS_READ_B32:
361 case AMDGPU::DS_READ_B32_gfx9:
362 case AMDGPU::DS_READ_B64:
363 case AMDGPU::DS_READ_B64_gfx9:
364 case AMDGPU::DS_WRITE_B32:
365 case AMDGPU::DS_WRITE_B32_gfx9:
366 case AMDGPU::DS_WRITE_B64:
367 case AMDGPU::DS_WRITE_B64_gfx9:
368 return Opc;
369 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
370 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
371 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
372 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
376 static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
377 if (TII.isMUBUF(Opc)) {
378 unsigned result = 0;
380 if (AMDGPU::getMUBUFHasVAddr(Opc)) {
381 result |= VADDR;
384 if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
385 result |= SRSRC;
388 if (AMDGPU::getMUBUFHasSoffset(Opc)) {
389 result |= SOFFSET;
392 return result;
395 if (TII.isMIMG(Opc)) {
396 unsigned result = VADDR | SRSRC;
397 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
398 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
399 result |= SSAMP;
400 return result;
403 switch (Opc) {
404 default:
405 return 0;
406 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
407 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
408 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
409 return SBASE;
410 case AMDGPU::DS_READ_B32:
411 case AMDGPU::DS_READ_B64:
412 case AMDGPU::DS_READ_B32_gfx9:
413 case AMDGPU::DS_READ_B64_gfx9:
414 case AMDGPU::DS_WRITE_B32:
415 case AMDGPU::DS_WRITE_B64:
416 case AMDGPU::DS_WRITE_B32_gfx9:
417 case AMDGPU::DS_WRITE_B64_gfx9:
418 return ADDR;
423 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
424 const SIInstrInfo &TII,
425 const GCNSubtarget &STM) {
426 I = MI;
427 unsigned Opc = MI->getOpcode();
428 InstClass = getInstClass(Opc, TII);
430 if (InstClass == UNKNOWN)
431 return;
433 switch (InstClass) {
434 case DS_READ:
435 EltSize =
436 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
437 : 4;
438 break;
439 case DS_WRITE:
440 EltSize =
441 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
442 : 4;
443 break;
444 case S_BUFFER_LOAD_IMM:
445 EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
446 break;
447 default:
448 EltSize = 4;
449 break;
452 if (InstClass == MIMG) {
453 DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
454 } else {
455 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
456 Offset0 = I->getOperand(OffsetIdx).getImm();
459 Width0 = getOpcodeWidth(*I, TII);
461 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
462 Offset0 &= 0xffff;
463 } else if (InstClass != MIMG) {
464 GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
465 if (InstClass != S_BUFFER_LOAD_IMM) {
466 SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
468 DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
471 unsigned AddrOpName[5] = {0};
472 NumAddresses = 0;
473 const unsigned Regs = getRegs(I->getOpcode(), TII);
475 if (Regs & ADDR) {
476 AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
479 if (Regs & SBASE) {
480 AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
483 if (Regs & SRSRC) {
484 AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
487 if (Regs & SOFFSET) {
488 AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
491 if (Regs & VADDR) {
492 AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
495 if (Regs & SSAMP) {
496 AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
499 for (unsigned i = 0; i < NumAddresses; i++) {
500 AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
501 AddrReg[i] = &I->getOperand(AddrIdx[i]);
504 InstsToMove.clear();
507 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
508 const SIInstrInfo &TII) {
509 Paired = MI;
510 assert(InstClass == getInstClass(Paired->getOpcode(), TII));
512 if (InstClass == MIMG) {
513 DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
514 } else {
515 int OffsetIdx =
516 AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
517 Offset1 = Paired->getOperand(OffsetIdx).getImm();
520 Width1 = getOpcodeWidth(*Paired, TII);
521 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
522 Offset1 &= 0xffff;
523 } else if (InstClass != MIMG) {
524 GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
525 if (InstClass != S_BUFFER_LOAD_IMM) {
526 SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
528 DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
533 } // end anonymous namespace.
535 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
536 "SI Load Store Optimizer", false, false)
537 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
538 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
539 false, false)
541 char SILoadStoreOptimizer::ID = 0;
543 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
545 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
546 return new SILoadStoreOptimizer();
549 static void moveInstsAfter(MachineBasicBlock::iterator I,
550 ArrayRef<MachineInstr *> InstsToMove) {
551 MachineBasicBlock *MBB = I->getParent();
552 ++I;
553 for (MachineInstr *MI : InstsToMove) {
554 MI->removeFromParent();
555 MBB->insert(I, MI);
559 static void addDefsUsesToList(const MachineInstr &MI,
560 DenseSet<unsigned> &RegDefs,
561 DenseSet<unsigned> &PhysRegUses) {
562 for (const MachineOperand &Op : MI.operands()) {
563 if (Op.isReg()) {
564 if (Op.isDef())
565 RegDefs.insert(Op.getReg());
566 else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
567 PhysRegUses.insert(Op.getReg());
572 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
573 MachineBasicBlock::iterator B,
574 AliasAnalysis *AA) {
575 // RAW or WAR - cannot reorder
576 // WAW - cannot reorder
577 // RAR - safe to reorder
578 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
581 // Add MI and its defs to the lists if MI reads one of the defs that are
582 // already in the list. Returns true in that case.
583 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
584 DenseSet<unsigned> &PhysRegUses,
585 SmallVectorImpl<MachineInstr *> &Insts) {
586 for (MachineOperand &Use : MI.operands()) {
587 // If one of the defs is read, then there is a use of Def between I and the
588 // instruction that I will potentially be merged with. We will need to move
589 // this instruction after the merged instructions.
591 // Similarly, if there is a def which is read by an instruction that is to
592 // be moved for merging, then we need to move the def-instruction as well.
593 // This can only happen for physical registers such as M0; virtual
594 // registers are in SSA form.
595 if (Use.isReg() &&
596 ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
597 (Use.isDef() && RegDefs.count(Use.getReg())) ||
598 (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
599 PhysRegUses.count(Use.getReg())))) {
600 Insts.push_back(&MI);
601 addDefsUsesToList(MI, RegDefs, PhysRegUses);
602 return true;
606 return false;
609 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
610 ArrayRef<MachineInstr *> InstsToMove,
611 AliasAnalysis *AA) {
612 assert(MemOp.mayLoadOrStore());
614 for (MachineInstr *InstToMove : InstsToMove) {
615 if (!InstToMove->mayLoadOrStore())
616 continue;
617 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
618 return false;
620 return true;
623 // This function assumes that \p A and \p B have are identical except for
624 // size and offset, and they referecne adjacent memory.
625 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
626 const MachineMemOperand *A,
627 const MachineMemOperand *B) {
628 unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
629 unsigned Size = A->getSize() + B->getSize();
630 // This function adds the offset parameter to the existing offset for A,
631 // so we pass 0 here as the offset and then manually set it to the correct
632 // value after the call.
633 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
634 MMO->setOffset(MinOffset);
635 return MMO;
638 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) {
639 assert(CI.InstClass == MIMG);
641 // Ignore instructions with tfe/lwe set.
642 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
643 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
645 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
646 return false;
648 // Check other optional immediate operands for equality.
649 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
650 AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
651 AMDGPU::OpName::da, AMDGPU::OpName::r128};
653 for (auto op : OperandsToMatch) {
654 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
655 if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx)
656 return false;
657 if (Idx != -1 &&
658 CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
659 return false;
662 // Check DMask for overlaps.
663 unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
664 unsigned MinMask = std::min(CI.DMask0, CI.DMask1);
666 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
667 if ((1u << AllowedBitsForMin) <= MinMask)
668 return false;
670 return true;
673 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
674 assert(CI.InstClass != MIMG);
676 // XXX - Would the same offset be OK? Is there any reason this would happen or
677 // be useful?
678 if (CI.Offset0 == CI.Offset1)
679 return false;
681 // This won't be valid if the offset isn't aligned.
682 if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
683 return false;
685 unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
686 unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
687 CI.UseST64 = false;
688 CI.BaseOff = 0;
690 // Handle SMEM and VMEM instructions.
691 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
692 return (EltOffset0 + CI.Width0 == EltOffset1 ||
693 EltOffset1 + CI.Width1 == EltOffset0) &&
694 CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
695 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
698 // If the offset in elements doesn't fit in 8-bits, we might be able to use
699 // the stride 64 versions.
700 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
701 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
702 CI.Offset0 = EltOffset0 / 64;
703 CI.Offset1 = EltOffset1 / 64;
704 CI.UseST64 = true;
705 return true;
708 // Check if the new offsets fit in the reduced 8-bit range.
709 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
710 CI.Offset0 = EltOffset0;
711 CI.Offset1 = EltOffset1;
712 return true;
715 // Try to shift base address to decrease offsets.
716 unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
717 CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
719 if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
720 CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
721 CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
722 CI.UseST64 = true;
723 return true;
726 if (isUInt<8>(OffsetDiff)) {
727 CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
728 CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
729 return true;
732 return false;
735 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
736 const CombineInfo &CI) {
737 const unsigned Width = (CI.Width0 + CI.Width1);
738 switch (CI.InstClass) {
739 default:
740 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
741 case S_BUFFER_LOAD_IMM:
742 switch (Width) {
743 default:
744 return false;
745 case 2:
746 case 4:
747 return true;
752 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
753 MachineBasicBlock *MBB = CI.I->getParent();
754 MachineBasicBlock::iterator E = MBB->end();
755 MachineBasicBlock::iterator MBBI = CI.I;
757 const unsigned Opc = CI.I->getOpcode();
758 const InstClassEnum InstClass = getInstClass(Opc, *TII);
760 if (InstClass == UNKNOWN) {
761 return false;
763 const unsigned InstSubclass = getInstSubclass(Opc, *TII);
765 // Do not merge VMEM buffer instructions with "swizzled" bit set.
766 int Swizzled =
767 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
768 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
769 return false;
771 ++MBBI;
773 DenseSet<unsigned> RegDefsToMove;
774 DenseSet<unsigned> PhysRegUsesToMove;
775 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
777 for (; MBBI != E; ++MBBI) {
779 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
780 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
781 // This is not a matching instruction, but we can keep looking as
782 // long as one of these conditions are met:
783 // 1. It is safe to move I down past MBBI.
784 // 2. It is safe to move MBBI down past the instruction that I will
785 // be merged into.
787 if (MBBI->hasUnmodeledSideEffects()) {
788 // We can't re-order this instruction with respect to other memory
789 // operations, so we fail both conditions mentioned above.
790 return false;
793 if (MBBI->mayLoadOrStore() &&
794 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
795 !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
796 // We fail condition #1, but we may still be able to satisfy condition
797 // #2. Add this instruction to the move list and then we will check
798 // if condition #2 holds once we have selected the matching instruction.
799 CI.InstsToMove.push_back(&*MBBI);
800 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
801 continue;
804 // When we match I with another DS instruction we will be moving I down
805 // to the location of the matched instruction any uses of I will need to
806 // be moved down as well.
807 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
808 CI.InstsToMove);
809 continue;
812 // Don't merge volatiles.
813 if (MBBI->hasOrderedMemoryRef())
814 return false;
816 // Handle a case like
817 // DS_WRITE_B32 addr, v, idx0
818 // w = DS_READ_B32 addr, idx0
819 // DS_WRITE_B32 addr, f(w), idx1
820 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
821 // merging of the two writes.
822 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
823 CI.InstsToMove))
824 continue;
826 bool Match = CI.hasSameBaseAddress(*MBBI);
828 if (Match) {
829 CI.setPaired(MBBI, *TII);
831 // Check both offsets (or masks for MIMG) can be combined and fit in the
832 // reduced range.
833 bool canBeCombined =
834 CI.InstClass == MIMG
835 ? dmasksCanBeCombined(CI, *TII)
836 : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
838 // We also need to go through the list of instructions that we plan to
839 // move and make sure they are all safe to move down past the merged
840 // instruction.
841 if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
842 return true;
845 // We've found a load/store that we couldn't merge for some reason.
846 // We could potentially keep looking, but we'd need to make sure that
847 // it was safe to move I and also all the instruction in InstsToMove
848 // down past this instruction.
849 // check if we can move I across MBBI and if we can move all I's users
850 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
851 !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
852 break;
854 return false;
857 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
858 if (STM->ldsRequiresM0Init())
859 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
860 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
863 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
864 if (STM->ldsRequiresM0Init())
865 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
867 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
868 : AMDGPU::DS_READ2ST64_B64_gfx9;
871 MachineBasicBlock::iterator
872 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
873 MachineBasicBlock *MBB = CI.I->getParent();
875 // Be careful, since the addresses could be subregisters themselves in weird
876 // cases, like vectors of pointers.
877 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
879 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
880 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
882 unsigned NewOffset0 = CI.Offset0;
883 unsigned NewOffset1 = CI.Offset1;
884 unsigned Opc =
885 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
887 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
888 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
890 if (NewOffset0 > NewOffset1) {
891 // Canonicalize the merged instruction so the smaller offset comes first.
892 std::swap(NewOffset0, NewOffset1);
893 std::swap(SubRegIdx0, SubRegIdx1);
896 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
897 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
899 const MCInstrDesc &Read2Desc = TII->get(Opc);
901 const TargetRegisterClass *SuperRC =
902 (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
903 Register DestReg = MRI->createVirtualRegister(SuperRC);
905 DebugLoc DL = CI.I->getDebugLoc();
907 Register BaseReg = AddrReg->getReg();
908 unsigned BaseSubReg = AddrReg->getSubReg();
909 unsigned BaseRegFlags = 0;
910 if (CI.BaseOff) {
911 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
912 BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
913 .addImm(CI.BaseOff);
915 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
916 BaseRegFlags = RegState::Kill;
918 TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
919 .addReg(ImmReg)
920 .addReg(AddrReg->getReg(), 0, BaseSubReg)
921 .addImm(0); // clamp bit
922 BaseSubReg = 0;
925 MachineInstrBuilder Read2 =
926 BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
927 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
928 .addImm(NewOffset0) // offset0
929 .addImm(NewOffset1) // offset1
930 .addImm(0) // gds
931 .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
933 (void)Read2;
935 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
937 // Copy to the old destination registers.
938 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
939 .add(*Dest0) // Copy to same destination including flags and sub reg.
940 .addReg(DestReg, 0, SubRegIdx0);
941 MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
942 .add(*Dest1)
943 .addReg(DestReg, RegState::Kill, SubRegIdx1);
945 moveInstsAfter(Copy1, CI.InstsToMove);
947 CI.I->eraseFromParent();
948 CI.Paired->eraseFromParent();
950 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
951 return Read2;
954 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
955 if (STM->ldsRequiresM0Init())
956 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
957 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
958 : AMDGPU::DS_WRITE2_B64_gfx9;
961 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
962 if (STM->ldsRequiresM0Init())
963 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
964 : AMDGPU::DS_WRITE2ST64_B64;
966 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
967 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
970 MachineBasicBlock::iterator
971 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
972 MachineBasicBlock *MBB = CI.I->getParent();
974 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
975 // sure we preserve the subregister index and any register flags set on them.
976 const MachineOperand *AddrReg =
977 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
978 const MachineOperand *Data0 =
979 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
980 const MachineOperand *Data1 =
981 TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
983 unsigned NewOffset0 = CI.Offset0;
984 unsigned NewOffset1 = CI.Offset1;
985 unsigned Opc =
986 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
988 if (NewOffset0 > NewOffset1) {
989 // Canonicalize the merged instruction so the smaller offset comes first.
990 std::swap(NewOffset0, NewOffset1);
991 std::swap(Data0, Data1);
994 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
995 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
997 const MCInstrDesc &Write2Desc = TII->get(Opc);
998 DebugLoc DL = CI.I->getDebugLoc();
1000 Register BaseReg = AddrReg->getReg();
1001 unsigned BaseSubReg = AddrReg->getSubReg();
1002 unsigned BaseRegFlags = 0;
1003 if (CI.BaseOff) {
1004 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1005 BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1006 .addImm(CI.BaseOff);
1008 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1009 BaseRegFlags = RegState::Kill;
1011 TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
1012 .addReg(ImmReg)
1013 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1014 .addImm(0); // clamp bit
1015 BaseSubReg = 0;
1018 MachineInstrBuilder Write2 =
1019 BuildMI(*MBB, CI.Paired, DL, Write2Desc)
1020 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1021 .add(*Data0) // data0
1022 .add(*Data1) // data1
1023 .addImm(NewOffset0) // offset0
1024 .addImm(NewOffset1) // offset1
1025 .addImm(0) // gds
1026 .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1028 moveInstsAfter(Write2, CI.InstsToMove);
1030 CI.I->eraseFromParent();
1031 CI.Paired->eraseFromParent();
1033 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1034 return Write2;
1037 MachineBasicBlock::iterator
1038 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
1039 MachineBasicBlock *MBB = CI.I->getParent();
1040 DebugLoc DL = CI.I->getDebugLoc();
1041 const unsigned Opcode = getNewOpcode(CI);
1043 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1045 Register DestReg = MRI->createVirtualRegister(SuperRC);
1046 unsigned MergedDMask = CI.DMask0 | CI.DMask1;
1047 unsigned DMaskIdx =
1048 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1050 auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1051 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1052 if (I == DMaskIdx)
1053 MIB.addImm(MergedDMask);
1054 else
1055 MIB.add((*CI.I).getOperand(I));
1058 // It shouldn't be possible to get this far if the two instructions
1059 // don't have a single memoperand, because MachineInstr::mayAlias()
1060 // will return true if this is the case.
1061 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1063 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1064 const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1066 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1068 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1069 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1070 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1072 // Copy to the old destination registers.
1073 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1074 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1075 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1077 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1078 .add(*Dest0) // Copy to same destination including flags and sub reg.
1079 .addReg(DestReg, 0, SubRegIdx0);
1080 MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1081 .add(*Dest1)
1082 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1084 moveInstsAfter(Copy1, CI.InstsToMove);
1086 CI.I->eraseFromParent();
1087 CI.Paired->eraseFromParent();
1088 return New;
1091 MachineBasicBlock::iterator
1092 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
1093 MachineBasicBlock *MBB = CI.I->getParent();
1094 DebugLoc DL = CI.I->getDebugLoc();
1095 const unsigned Opcode = getNewOpcode(CI);
1097 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1099 Register DestReg = MRI->createVirtualRegister(SuperRC);
1100 unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1102 // It shouldn't be possible to get this far if the two instructions
1103 // don't have a single memoperand, because MachineInstr::mayAlias()
1104 // will return true if this is the case.
1105 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1107 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1108 const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1110 MachineInstr *New =
1111 BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
1112 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1113 .addImm(MergedOffset) // offset
1114 .addImm(CI.GLC0) // glc
1115 .addImm(CI.DLC0) // dlc
1116 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1118 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1119 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1120 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1122 // Copy to the old destination registers.
1123 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1124 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1125 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
1127 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1128 .add(*Dest0) // Copy to same destination including flags and sub reg.
1129 .addReg(DestReg, 0, SubRegIdx0);
1130 MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1131 .add(*Dest1)
1132 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1134 moveInstsAfter(Copy1, CI.InstsToMove);
1136 CI.I->eraseFromParent();
1137 CI.Paired->eraseFromParent();
1138 return New;
1141 MachineBasicBlock::iterator
1142 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
1143 MachineBasicBlock *MBB = CI.I->getParent();
1144 DebugLoc DL = CI.I->getDebugLoc();
1146 const unsigned Opcode = getNewOpcode(CI);
1148 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1150 // Copy to the new source register.
1151 Register DestReg = MRI->createVirtualRegister(SuperRC);
1152 unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1154 auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1156 const unsigned Regs = getRegs(Opcode, *TII);
1158 if (Regs & VADDR)
1159 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1161 // It shouldn't be possible to get this far if the two instructions
1162 // don't have a single memoperand, because MachineInstr::mayAlias()
1163 // will return true if this is the case.
1164 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1166 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1167 const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1169 MachineInstr *New =
1170 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1171 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1172 .addImm(MergedOffset) // offset
1173 .addImm(CI.GLC0) // glc
1174 .addImm(CI.SLC0) // slc
1175 .addImm(0) // tfe
1176 .addImm(CI.DLC0) // dlc
1177 .addImm(0) // swz
1178 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1180 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1181 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1182 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1184 // Copy to the old destination registers.
1185 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1186 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1187 const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1189 BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1190 .add(*Dest0) // Copy to same destination including flags and sub reg.
1191 .addReg(DestReg, 0, SubRegIdx0);
1192 MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1193 .add(*Dest1)
1194 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1196 moveInstsAfter(Copy1, CI.InstsToMove);
1198 CI.I->eraseFromParent();
1199 CI.Paired->eraseFromParent();
1200 return New;
1203 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
1204 const unsigned Width = CI.Width0 + CI.Width1;
1206 switch (CI.InstClass) {
1207 default:
1208 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1209 // FIXME: Handle d16 correctly
1210 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1211 Width);
1212 case UNKNOWN:
1213 llvm_unreachable("Unknown instruction class");
1214 case S_BUFFER_LOAD_IMM:
1215 switch (Width) {
1216 default:
1217 return 0;
1218 case 2:
1219 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1220 case 4:
1221 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1223 case MIMG:
1224 assert("No overlaps" && (countPopulation(CI.DMask0 | CI.DMask1) == Width));
1225 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1229 std::pair<unsigned, unsigned>
1230 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
1232 if (CI.Width0 == 0 || CI.Width0 == 0 || CI.Width0 + CI.Width1 > 4)
1233 return std::make_pair(0, 0);
1235 bool ReverseOrder;
1236 if (CI.InstClass == MIMG) {
1237 assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
1238 "No overlaps");
1239 ReverseOrder = CI.DMask0 > CI.DMask1;
1240 } else
1241 ReverseOrder = CI.Offset0 > CI.Offset1;
1243 static const unsigned Idxs[4][4] = {
1244 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1245 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1246 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1247 {AMDGPU::sub3, 0, 0, 0},
1249 unsigned Idx0;
1250 unsigned Idx1;
1252 assert(CI.Width0 >= 1 && CI.Width0 <= 3);
1253 assert(CI.Width1 >= 1 && CI.Width1 <= 3);
1255 if (ReverseOrder) {
1256 Idx1 = Idxs[0][CI.Width1 - 1];
1257 Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
1258 } else {
1259 Idx0 = Idxs[0][CI.Width0 - 1];
1260 Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
1263 return std::make_pair(Idx0, Idx1);
1266 const TargetRegisterClass *
1267 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1268 if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1269 switch (CI.Width0 + CI.Width1) {
1270 default:
1271 return nullptr;
1272 case 2:
1273 return &AMDGPU::SReg_64_XEXECRegClass;
1274 case 4:
1275 return &AMDGPU::SGPR_128RegClass;
1276 case 8:
1277 return &AMDGPU::SReg_256RegClass;
1278 case 16:
1279 return &AMDGPU::SReg_512RegClass;
1281 } else {
1282 switch (CI.Width0 + CI.Width1) {
1283 default:
1284 return nullptr;
1285 case 2:
1286 return &AMDGPU::VReg_64RegClass;
1287 case 3:
1288 return &AMDGPU::VReg_96RegClass;
1289 case 4:
1290 return &AMDGPU::VReg_128RegClass;
1295 MachineBasicBlock::iterator
1296 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1297 MachineBasicBlock *MBB = CI.I->getParent();
1298 DebugLoc DL = CI.I->getDebugLoc();
1300 const unsigned Opcode = getNewOpcode(CI);
1302 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1303 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1304 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1306 // Copy to the new source register.
1307 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1308 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1310 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1311 const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1313 BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1314 .add(*Src0)
1315 .addImm(SubRegIdx0)
1316 .add(*Src1)
1317 .addImm(SubRegIdx1);
1319 auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1320 .addReg(SrcReg, RegState::Kill);
1322 const unsigned Regs = getRegs(Opcode, *TII);
1324 if (Regs & VADDR)
1325 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1328 // It shouldn't be possible to get this far if the two instructions
1329 // don't have a single memoperand, because MachineInstr::mayAlias()
1330 // will return true if this is the case.
1331 assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1333 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1334 const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1336 MachineInstr *New =
1337 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1338 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1339 .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1340 .addImm(CI.GLC0) // glc
1341 .addImm(CI.SLC0) // slc
1342 .addImm(0) // tfe
1343 .addImm(CI.DLC0) // dlc
1344 .addImm(0) // swz
1345 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1347 moveInstsAfter(MIB, CI.InstsToMove);
1349 CI.I->eraseFromParent();
1350 CI.Paired->eraseFromParent();
1351 return New;
1354 MachineOperand
1355 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1356 APInt V(32, Val, true);
1357 if (TII->isInlineConstant(V))
1358 return MachineOperand::CreateImm(Val);
1360 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1361 MachineInstr *Mov =
1362 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1363 TII->get(AMDGPU::S_MOV_B32), Reg)
1364 .addImm(Val);
1365 (void)Mov;
1366 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1367 return MachineOperand::CreateReg(Reg, false);
1370 // Compute base address using Addr and return the final register.
1371 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1372 const MemAddress &Addr) const {
1373 MachineBasicBlock *MBB = MI.getParent();
1374 MachineBasicBlock::iterator MBBI = MI.getIterator();
1375 DebugLoc DL = MI.getDebugLoc();
1377 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1378 Addr.Base.LoSubReg) &&
1379 "Expected 32-bit Base-Register-Low!!");
1381 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1382 Addr.Base.HiSubReg) &&
1383 "Expected 32-bit Base-Register-Hi!!");
1385 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1386 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1387 MachineOperand OffsetHi =
1388 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1390 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1391 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1392 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1394 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1395 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1396 MachineInstr *LoHalf =
1397 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1398 .addReg(CarryReg, RegState::Define)
1399 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1400 .add(OffsetLo)
1401 .addImm(0); // clamp bit
1402 (void)LoHalf;
1403 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1405 MachineInstr *HiHalf =
1406 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1407 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1408 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1409 .add(OffsetHi)
1410 .addReg(CarryReg, RegState::Kill)
1411 .addImm(0); // clamp bit
1412 (void)HiHalf;
1413 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1415 Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1416 MachineInstr *FullBase =
1417 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1418 .addReg(DestSub0)
1419 .addImm(AMDGPU::sub0)
1420 .addReg(DestSub1)
1421 .addImm(AMDGPU::sub1);
1422 (void)FullBase;
1423 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1425 return FullDestReg;
1428 // Update base and offset with the NewBase and NewOffset in MI.
1429 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1430 unsigned NewBase,
1431 int32_t NewOffset) const {
1432 TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1433 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1436 Optional<int32_t>
1437 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1438 if (Op.isImm())
1439 return Op.getImm();
1441 if (!Op.isReg())
1442 return None;
1444 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1445 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1446 !Def->getOperand(1).isImm())
1447 return None;
1449 return Def->getOperand(1).getImm();
1452 // Analyze Base and extracts:
1453 // - 32bit base registers, subregisters
1454 // - 64bit constant offset
1455 // Expecting base computation as:
1456 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1457 // %LO:vgpr_32, %c:sreg_64_xexec =
1458 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1459 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1460 // %Base:vreg_64 =
1461 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1462 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1463 MemAddress &Addr) const {
1464 if (!Base.isReg())
1465 return;
1467 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1468 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1469 || Def->getNumOperands() != 5)
1470 return;
1472 MachineOperand BaseLo = Def->getOperand(1);
1473 MachineOperand BaseHi = Def->getOperand(3);
1474 if (!BaseLo.isReg() || !BaseHi.isReg())
1475 return;
1477 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1478 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1480 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1481 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1482 return;
1484 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1485 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1487 auto Offset0P = extractConstOffset(*Src0);
1488 if (Offset0P)
1489 BaseLo = *Src1;
1490 else {
1491 if (!(Offset0P = extractConstOffset(*Src1)))
1492 return;
1493 BaseLo = *Src0;
1496 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1497 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1499 if (Src0->isImm())
1500 std::swap(Src0, Src1);
1502 if (!Src1->isImm())
1503 return;
1505 uint64_t Offset1 = Src1->getImm();
1506 BaseHi = *Src0;
1508 Addr.Base.LoReg = BaseLo.getReg();
1509 Addr.Base.HiReg = BaseHi.getReg();
1510 Addr.Base.LoSubReg = BaseLo.getSubReg();
1511 Addr.Base.HiSubReg = BaseHi.getSubReg();
1512 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1515 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1516 MachineInstr &MI,
1517 MemInfoMap &Visited,
1518 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1520 if (!(MI.mayLoad() ^ MI.mayStore()))
1521 return false;
1523 // TODO: Support flat and scratch.
1524 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1525 return false;
1527 if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1528 return false;
1530 if (AnchorList.count(&MI))
1531 return false;
1533 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1535 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1536 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1537 return false;
1540 // Step1: Find the base-registers and a 64bit constant offset.
1541 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1542 MemAddress MAddr;
1543 if (Visited.find(&MI) == Visited.end()) {
1544 processBaseWithConstOffset(Base, MAddr);
1545 Visited[&MI] = MAddr;
1546 } else
1547 MAddr = Visited[&MI];
1549 if (MAddr.Offset == 0) {
1550 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1551 " constant offsets that can be promoted.\n";);
1552 return false;
1555 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1556 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1558 // Step2: Traverse through MI's basic block and find an anchor(that has the
1559 // same base-registers) with the highest 13bit distance from MI's offset.
1560 // E.g. (64bit loads)
1561 // bb:
1562 // addr1 = &a + 4096; load1 = load(addr1, 0)
1563 // addr2 = &a + 6144; load2 = load(addr2, 0)
1564 // addr3 = &a + 8192; load3 = load(addr3, 0)
1565 // addr4 = &a + 10240; load4 = load(addr4, 0)
1566 // addr5 = &a + 12288; load5 = load(addr5, 0)
1568 // Starting from the first load, the optimization will try to find a new base
1569 // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
1570 // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
1571 // as the new-base(anchor) because of the maximum distance which can
1572 // accomodate more intermediate bases presumeably.
1574 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1575 // (&a + 8192) for load1, load2, load4.
1576 // addr = &a + 8192
1577 // load1 = load(addr, -4096)
1578 // load2 = load(addr, -2048)
1579 // load3 = load(addr, 0)
1580 // load4 = load(addr, 2048)
1581 // addr5 = &a + 12288; load5 = load(addr5, 0)
1583 MachineInstr *AnchorInst = nullptr;
1584 MemAddress AnchorAddr;
1585 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1586 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1588 MachineBasicBlock *MBB = MI.getParent();
1589 MachineBasicBlock::iterator E = MBB->end();
1590 MachineBasicBlock::iterator MBBI = MI.getIterator();
1591 ++MBBI;
1592 const SITargetLowering *TLI =
1593 static_cast<const SITargetLowering *>(STM->getTargetLowering());
1595 for ( ; MBBI != E; ++MBBI) {
1596 MachineInstr &MINext = *MBBI;
1597 // TODO: Support finding an anchor(with same base) from store addresses or
1598 // any other load addresses where the opcodes are different.
1599 if (MINext.getOpcode() != MI.getOpcode() ||
1600 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1601 continue;
1603 const MachineOperand &BaseNext =
1604 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1605 MemAddress MAddrNext;
1606 if (Visited.find(&MINext) == Visited.end()) {
1607 processBaseWithConstOffset(BaseNext, MAddrNext);
1608 Visited[&MINext] = MAddrNext;
1609 } else
1610 MAddrNext = Visited[&MINext];
1612 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1613 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1614 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1615 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1616 continue;
1618 InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1620 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1621 TargetLoweringBase::AddrMode AM;
1622 AM.HasBaseReg = true;
1623 AM.BaseOffs = Dist;
1624 if (TLI->isLegalGlobalAddressingMode(AM) &&
1625 (uint32_t)std::abs(Dist) > MaxDist) {
1626 MaxDist = std::abs(Dist);
1628 AnchorAddr = MAddrNext;
1629 AnchorInst = &MINext;
1633 if (AnchorInst) {
1634 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1635 AnchorInst->dump());
1636 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1637 << AnchorAddr.Offset << "\n\n");
1639 // Instead of moving up, just re-compute anchor-instruction's base address.
1640 unsigned Base = computeBase(MI, AnchorAddr);
1642 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1643 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1645 for (auto P : InstsWCommonBase) {
1646 TargetLoweringBase::AddrMode AM;
1647 AM.HasBaseReg = true;
1648 AM.BaseOffs = P.second - AnchorAddr.Offset;
1650 if (TLI->isLegalGlobalAddressingMode(AM)) {
1651 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1652 dbgs() << ")"; P.first->dump());
1653 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1654 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1657 AnchorList.insert(AnchorInst);
1658 return true;
1661 return false;
1664 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1665 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1666 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1667 if (AddrList.front().hasSameBaseAddress(*CI.I) &&
1668 AddrList.front().InstClass == CI.InstClass) {
1669 AddrList.emplace_back(CI);
1670 return;
1674 // Base address not found, so add a new list.
1675 MergeableInsts.emplace_back(1, CI);
1678 bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
1679 std::list<std::list<CombineInfo> > &MergeableInsts) const {
1680 bool Modified = false;
1681 // Contain the list
1682 MemInfoMap Visited;
1683 // Contains the list of instructions for which constant offsets are being
1684 // promoted to the IMM.
1685 SmallPtrSet<MachineInstr *, 4> AnchorList;
1687 // Sort potential mergeable instructions into lists. One list per base address.
1688 for (MachineInstr &MI : MBB.instrs()) {
1689 // We run this before checking if an address is mergeable, because it can produce
1690 // better code even if the instructions aren't mergeable.
1691 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1692 Modified = true;
1694 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1695 if (InstClass == UNKNOWN)
1696 continue;
1698 // Don't combine if volatile.
1699 if (MI.hasOrderedMemoryRef())
1700 continue;
1702 CombineInfo CI;
1703 CI.setMI(MI, *TII, *STM);
1705 if (!CI.hasMergeableAddress(*MRI))
1706 continue;
1708 addInstToMergeableList(CI, MergeableInsts);
1710 return Modified;
1713 // Scan through looking for adjacent LDS operations with constant offsets from
1714 // the same base register. We rely on the scheduler to do the hard work of
1715 // clustering nearby loads, and assume these are all adjacent.
1716 bool SILoadStoreOptimizer::optimizeBlock(
1717 std::list<std::list<CombineInfo> > &MergeableInsts) {
1718 bool Modified = false;
1720 for (std::list<CombineInfo> &MergeList : MergeableInsts) {
1721 if (MergeList.size() < 2)
1722 continue;
1724 bool OptimizeListAgain = false;
1725 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
1726 // We weren't able to make any changes, so clear the list so we don't
1727 // process the same instructions the next time we try to optimize this
1728 // block.
1729 MergeList.clear();
1730 continue;
1733 // We made changes, but also determined that there were no more optimization
1734 // opportunities, so we don't need to reprocess the list
1735 if (!OptimizeListAgain)
1736 MergeList.clear();
1738 OptimizeAgain |= OptimizeListAgain;
1739 Modified = true;
1741 return Modified;
1744 void
1745 SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
1746 const MachineInstr &MI) {
1748 for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
1749 if (&*CI->I == &MI) {
1750 MergeList.erase(CI);
1751 return;
1756 bool
1757 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
1758 std::list<CombineInfo> &MergeList,
1759 bool &OptimizeListAgain) {
1760 bool Modified = false;
1761 for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
1762 CombineInfo &CI = *I;
1764 switch (CI.InstClass) {
1765 default:
1766 break;
1767 case DS_READ:
1768 if (findMatchingInst(CI)) {
1769 Modified = true;
1770 removeCombinedInst(MergeList, *CI.Paired);
1771 MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
1772 CI.setMI(NewMI, *TII, *STM);
1774 break;
1775 case DS_WRITE:
1776 if (findMatchingInst(CI)) {
1777 Modified = true;
1778 removeCombinedInst(MergeList, *CI.Paired);
1779 MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
1780 CI.setMI(NewMI, *TII, *STM);
1782 break;
1783 case S_BUFFER_LOAD_IMM:
1784 if (findMatchingInst(CI)) {
1785 Modified = true;
1786 removeCombinedInst(MergeList, *CI.Paired);
1787 MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
1788 CI.setMI(NewMI, *TII, *STM);
1789 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
1791 break;
1792 case BUFFER_LOAD:
1793 if (findMatchingInst(CI)) {
1794 Modified = true;
1795 removeCombinedInst(MergeList, *CI.Paired);
1796 MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
1797 CI.setMI(NewMI, *TII, *STM);
1798 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1800 break;
1801 case BUFFER_STORE:
1802 if (findMatchingInst(CI)) {
1803 Modified = true;
1804 removeCombinedInst(MergeList, *CI.Paired);
1805 MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
1806 CI.setMI(NewMI, *TII, *STM);
1807 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1809 break;
1810 case MIMG:
1811 if (findMatchingInst(CI)) {
1812 Modified = true;
1813 removeCombinedInst(MergeList, *CI.Paired);
1814 MachineBasicBlock::iterator NewMI = mergeImagePair(CI);
1815 CI.setMI(NewMI, *TII, *STM);
1816 OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1818 break;
1820 // Clear the InstsToMove after we have finished searching so we don't have
1821 // stale values left over if we search for this CI again in another pass
1822 // over the block.
1823 CI.InstsToMove.clear();
1826 return Modified;
1829 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1830 if (skipFunction(MF.getFunction()))
1831 return false;
1833 STM = &MF.getSubtarget<GCNSubtarget>();
1834 if (!STM->loadStoreOptEnabled())
1835 return false;
1837 TII = STM->getInstrInfo();
1838 TRI = &TII->getRegisterInfo();
1840 MRI = &MF.getRegInfo();
1841 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1843 assert(MRI->isSSA() && "Must be run on SSA");
1845 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1847 bool Modified = false;
1850 for (MachineBasicBlock &MBB : MF) {
1851 std::list<std::list<CombineInfo> > MergeableInsts;
1852 // First pass: Collect list of all instructions we know how to merge.
1853 Modified |= collectMergeableInsts(MBB, MergeableInsts);
1854 do {
1855 OptimizeAgain = false;
1856 Modified |= optimizeBlock(MergeableInsts);
1857 } while (OptimizeAgain);
1860 return Modified;