//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_I32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_I32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//
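
// Note (added, illustrative): this pass is typically exercised on MIR with
// something like `llc -march=amdgcn -run-pass=si-peephole-sdwa`, using the
// DEBUG_TYPE defined below as the pass name; consult the in-tree MIR tests
// for the authoritative invocation.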

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <unordered_map>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

class SDWAOperand;
class SDWADstOperand;

class SIPeepholeSDWA : public MachineFunctionPass {
public:
  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;

private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  Optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream &OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch(Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD";  break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch(Un) {
  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream& OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream& OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
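
// Note (added, illustrative): NEG is XOR-ed rather than OR-ed above so that a
// matched negation composes with a negation already present on the operand —
// folding a neg pattern into a source that already carries SISrcMods::NEG
// leaves the two cancelled, which an OR could not express.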

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA src operand the potential instruction is the one that uses the
  // register defined by the parent instruction
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Find the operand in the instruction that matches the source operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply the src
      // operand to src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
  // For an SDWA dst operand the potential instruction is the one that defines
  // the register that this operand uses
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with the use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  auto MBB = MI.getParent();
  MBB->remove(&MI);
  MBB->insert(getParentInst(), &MI);

  // Add implicit use of preserved register
  MachineInstrBuilder MIB(*MBB->getParent(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}

Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return None;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return None;

      return Copied.getImm();
    }
  }

  return None;
}
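
// Usage sketch (added, illustrative): given "%1 = S_MOV_B32 255" followed by
// an operand using %1, foldToImm on that operand yields 255; if the defining
// instruction is anything TII->isFoldableCopy() rejects, the result is None.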

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (Register::isPhysicalRegister(Src1->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
          Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B32_e64);
    }
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Register::isPhysicalRegister(Src1->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64) {
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    } else {
      return std::make_unique<SDWASrcOperand>(
          Src1, Dst, BYTE_1, false, false,
          Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
          Opcode != AMDGPU::V_LSHRREV_B16_e64);
    }
    break;
  }

  case AMDGPU::V_BFE_I32:
  case AMDGPU::V_BFE_U32: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Register::isPhysicalRegister(Src0->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (Register::isPhysicalRegister(ValSrc->getReg()) ||
        Register::isPhysicalRegister(Dst->getReg()))
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction
    using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
      [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
        if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
          return CheckRetType(None);

        MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
        if (!Op1Def)
          return CheckRetType(None);

        MachineInstr *Op1Inst = Op1Def->getParent();
        if (!TII->isSDWA(*Op1Inst))
          return CheckRetType(None);

        MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
        if (!Op2Def)
          return CheckRetType(None);

        return CheckRetType(std::make_pair(Op1Def, Op2Def));
      };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInst is actually bitwise compatible with SDWAInst, i.e.
    // their destination patterns don't overlap. A compatible instruction can
    // be either a regular instruction with compatible bitness or an SDWA
    // instruction with a correct dst_sel:
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                     / no
    // WORD_0   | no                     / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions  / BYTE_0/1, WORD_0
    // BYTE_0   | no                     / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                  / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit               / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit            / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at minimum 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
// V_ADD_I32_e64 into V_ADD_I32_e32. This allows isConvertibleToSDWA
// to perform its transformation of V_ADD_I32_e32 into V_ADD_I32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_I32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_I32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32 = V_ADDC_U32_e32
//       0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_I32_e64 || Opc == AMDGPU::V_SUB_I32_e64) &&
         "Currently only handles V_ADD_I32_e64 or V_SUB_I32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in the range of (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }
  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
  NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
  MI.eraseFromParent();
  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
  NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
  MISucc.eraseFromParent();
}

bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
                                         const GCNSubtarget &ST) const {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if the target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  return true;
}
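
// Note (added, illustrative): e.g. a plain V_ADD_F16_e32 passes these checks —
// it maps to an SDWA opcode and has a vdst but no sdst — while
// V_CNDMASK_B32_e32 is rejected above even though an SDWA form exists, because
// its implicit VCC use is not handled here.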

bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 &&
         AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
         AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (that was already destroyed). So if SDWAOperand is also a
    // potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands (one SGPR is allowed on GFX9). Copy its scalar operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
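
// Illustrative sketch (added): on a target without hasSDWAScalar(), an SGPR or
// immediate source of a converted SDWA instruction would be rewritten roughly
// as
//   %vgpr = V_MOV_B32_e32 <sgpr-or-imm>
//   <sdwa-inst> ..., %vgpr, ...
// i.e. the scalar value is first materialized in a fresh VGPR.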

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_I32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_I32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}