//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass tries to apply several peephole SDWA patterns.
///
/// E.g. original:
///   V_LSHRREV_B32_e32 %0, 16, %1
///   V_ADD_CO_U32_e32 %2, %0, %3
///   V_LSHLREV_B32_e32 %4, 16, %2
///
/// Replace:
///   V_ADD_CO_U32_sdwa %4, %1, %3
///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "si-peephole-sdwa"

STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
STATISTIC(NumSDWAInstructionsPeepholed,
          "Number of instructions converted to SDWA.");

namespace {

bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
                         const SIInstrInfo *TII);

class SDWAOperand;
class SDWADstOperand;

using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
using SDWAOperandsMap = MapVector<MachineInstr *, SDWAOperandsVector>;

class SIPeepholeSDWA : public MachineFunctionPass {
private:
  MachineRegisterInfo *MRI;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
  SDWAOperandsMap PotentialMatches;
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

public:
  static char ID;

  SIPeepholeSDWA() : MachineFunctionPass(ID) {
    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class SDWAOperand {
private:
  MachineOperand *Target;   // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

public:
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream &OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
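
// Illustration (added note, not from the original source): matching the header
// example "V_LSHRREV_B32_e32 %0, 16, %1" as a source operand yields
// Target = %1 (the value the SDWA-converted user will actually read) and
// Replaced = %0 (the shift result the user currently reads).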

using namespace AMDGPU::SDWA;

class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel;
  bool Abs;
  bool Neg;
  bool Sext;

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp),
        SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;
  DstUnused DstUn;

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve;

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS) const override;
#endif
};

} // end anonymous namespace

INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)

char SIPeepholeSDWA::ID = 0;

char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;

FunctionPass *llvm::createSIPeepholeSDWAPass() {
  return new SIPeepholeSDWA();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream &operator<<(raw_ostream &OS, SdwaSel Sel) {
  switch (Sel) {
  case BYTE_0: OS << "BYTE_0"; break;
  case BYTE_1: OS << "BYTE_1"; break;
  case BYTE_2: OS << "BYTE_2"; break;
  case BYTE_3: OS << "BYTE_3"; break;
  case WORD_0: OS << "WORD_0"; break;
  case WORD_1: OS << "WORD_1"; break;
  case DWORD:  OS << "DWORD";  break;
  }
  return OS;
}

static raw_ostream &operator<<(raw_ostream &OS, const DstUnused &Un) {
  switch (Un) {
  case UNUSED_PAD:      OS << "UNUSED_PAD";      break;
  case UNUSED_SEXT:     OS << "UNUSED_SEXT";     break;
  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
  }
  return OS;
}

LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream &OS) const {
  OS << "SDWA src: " << *getTargetOperand()
     << " src_sel:" << getSrcSel()
     << " abs:" << getAbs() << " neg:" << getNeg()
     << " sext:" << getSext() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstOperand::print(raw_ostream &OS) const {
  OS << "SDWA dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " dst_unused:" << getDstUnused() << '\n';
}

LLVM_DUMP_METHOD
void SDWADstPreserveOperand::print(raw_ostream &OS) const {
  OS << "SDWA preserve dst: " << *getTargetOperand()
     << " dst_sel:" << getDstSel()
     << " preserve:" << *getPreservedOperand() << '\n';
}

#endif

static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
  assert(To.isReg() && From.isReg());
  To.setReg(From.getReg());
  To.setSubReg(From.getSubReg());
  To.setIsUndef(From.isUndef());
  if (To.isUse()) {
    To.setIsKill(From.isKill());
  } else {
    To.setIsDead(From.isDead());
  }
}

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.isReg() &&
         RHS.isReg() &&
         LHS.getReg() == RHS.getReg() &&
         LHS.getSubReg() == RHS.getSubReg();
}

static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg() || !Reg->isDef())
    return nullptr;

  MachineOperand *ResMO = nullptr;
  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
    // If there exists a use of a subreg of Reg then return nullptr
    if (!isSameReg(UseMO, *Reg))
      return nullptr;

    // Check that there is only one instruction that uses Reg
    if (!ResMO) {
      ResMO = &UseMO;
    } else if (ResMO->getParent() != UseMO.getParent()) {
      return nullptr;
    }
  }

  return ResMO;
}
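
// Illustration (added note, not from the original source): given
//   %0 = V_MOV_B32_e32 ...
//   %1 = V_ADD_CO_U32_e32 %0, %2
// with no other readers of %0, findSingleRegUse returns the use of %0 in the
// V_ADD; a use from a second instruction, or a use of a subregister of %0,
// makes it return nullptr instead.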

static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
                                        const MachineRegisterInfo *MRI) {
  if (!Reg->isReg())
    return nullptr;

  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
  if (!DefInstr)
    return nullptr;

  for (auto &DefMO : DefInstr->defs()) {
    if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
      return &DefMO;
  }

  // Ignore implicit defs.
  return nullptr;
}

uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
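
// Explanatory note (added, not from the original source): above, ABS is OR'ed
// in because folding |x| into an operand that already has ABS set changes
// nothing, while NEG is XOR'ed in because folding a negation on top of an
// operand that is already negated must cancel out (-(-x) == x).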

MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For an SDWA src operand, the potential instruction is the one that uses
  // the register defined by the parent instruction
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  return PotentialMO->getParent();
}
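
// E.g. for the header example, the SDWASrcOperand matched from the
// V_LSHRREV_B32 names the V_ADD_CO_U32 as its potential instruction, since
// that is the single user of the shift result.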

bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    if (Abs || Neg || Sext)
      return false;
    break;
  default:
    break;
  }

  // Find the operand in the instruction that matches the source operand and
  // replace it with the target operand. Set the corresponding src_sel.
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
  }
  assert(Src && Src->isReg());

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      !isSameReg(*Src, *getReplacedOperand())) {
    // In case of v_mac_f16/32_sdwa this pass can try to apply the src operand
    // to src2. This is not allowed.
    return false;
  }

  assert(isSameReg(*Src, *getReplacedOperand()) &&
         (IsPreserveSrc || (SrcSel && SrcMods)));
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    SrcSel->setImm(getSrcSel());
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}

MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  // For an SDWA dst operand, the potential instruction is the one that defines
  // the register that this operand uses
  MachineRegisterInfo *MRI = getMRI();
  MachineInstr *ParentMI = getParentInst();

  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
  if (!PotentialMO)
    return nullptr;

  // Check that ParentMI is the only instruction that uses the replaced register
  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
    if (&UseInst != ParentMI)
      return nullptr;
  }

  return PotentialMO->getParent();
}
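
// E.g. for the header example, the SDWADstOperand matched from the
// V_LSHLREV_B32 names the V_ADD_CO_U32 as its potential instruction, since
// that is the single instruction defining the register the shift consumes.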

bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allows dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);
  DstSel->setImm(getDstSel());
  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  assert(DstUnused);
  DstUnused->setImm(getDstUnused());

  // Remove the original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}

bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter a problem with a use of a killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add implicit use of the preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             RegState::ImplicitKill,
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}
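
// Illustration (added note, not from the original source) for the
// UNUSED_PRESERVE pattern matched in matchSDWAOperand below: the
// v_add_f16_sdwa is moved down to the position of the v_or_b32, receives an
// implicit-kill use of the v_add_f16_e32 result tied to its vdst, and
// SDWADstOperand::convertToSDWA then redirects its dst to the v_or_b32 result
// and erases the v_or_b32.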

std::optional<int64_t>
SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
  if (Op.isImm()) {
    return Op.getImm();
  }

  // If this is not an immediate then it can be a copy of an immediate value,
  // e.g.:
  // %1 = S_MOV_B32 255;
  if (Op.isReg()) {
    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
      if (!isSameReg(Op, Def))
        continue;

      const MachineInstr *DefInst = Def.getParent();
      if (!TII->isFoldableCopy(*DefInst))
        return std::nullopt;

      const MachineOperand &Copied = DefInst->getOperand(1);
      if (!Copied.isImm())
        return std::nullopt;

      return Copied.getImm();
    }
  }

  return std::nullopt;
}

std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B32_e64);
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B16_e64);
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD_1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of the operands of v_or_b32 is an SDWA instruction

    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
          if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
            return CheckRetType(std::nullopt);

          MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
          if (!Op1Def)
            return CheckRetType(std::nullopt);

          MachineInstr *Op1Inst = Op1Def->getParent();
          if (!TII->isSDWA(*Op1Inst))
            return CheckRetType(std::nullopt);

          MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
          if (!Op2Def)
            return CheckRetType(std::nullopt);

          return CheckRetType(std::pair(Op1Def, Op2Def));
        };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInstr is actually bitwise compatible with SDWAInst,
    // i.e. their destination patterns don't overlap. A compatible instruction
    // can be either a regular instruction with compatible bitness or an SDWA
    // instruction with a correct dst_sel:
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                     / no
    // WORD_0   | no                     / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions  / BYTE_0/1, WORD_0
    // BYTE_0   | no                     / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                  / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit               / BYTE_0/1/3, WORD_0
    // BYTE_3   | 8/16/24-bit            / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24 bits
    // out of the full register size, and all registers are at least 32 bits
    // wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);
  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
static raw_ostream &operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
#endif

void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
  for (MachineInstr &MI : MBB) {
    if (auto Operand = matchSDWAOperand(MI)) {
      LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
      SDWAOperands[&MI] = std::move(Operand);
      ++NumSDWAPatternsFound;
    }
  }
}

// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
// V_ADD_CO_U32_sdwa.
//
// We are transforming from a VOP3 into a VOP2 form of the instruction.
//   %19:vgpr_32 = V_AND_B32_e32 255,
//       killed %16:vgpr_32, implicit $exec
//   %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
//       %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
//
// becomes
//   %47:vgpr_32 = V_ADD_CO_U32_sdwa
//       0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
//       implicit-def $vcc, implicit $exec
//   %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
//       %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();

  // Make sure the carry in/out are subsequently unused.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc].
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Since the carry output of MI is now VCC, update its use in MISucc.

  MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
}

namespace {
bool isConvertibleToSDWA(MachineInstr &MI,
                         const GCNSubtarget &ST,
                         const SIInstrInfo *TII) {
  // Check if this is already an SDWA instruction
  unsigned Opc = MI.getOpcode();
  if (TII->isSDWA(Opc))
    return true;

  // Check if this instruction has an opcode that supports SDWA
  if (AMDGPU::getSDWAOp(Opc) == -1)
    Opc = AMDGPU::getVOPe32(Opc);

  if (AMDGPU::getSDWAOp(Opc) == -1)
    return false;

  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
    return false;

  if (TII->isVOPC(Opc)) {
    if (!ST.hasSDWASdst()) {
      const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
      if (SDst && (SDst->getReg() != AMDGPU::VCC &&
                   SDst->getReg() != AMDGPU::VCC_LO))
        return false;
    }

    if (!ST.hasSDWAOutModsVOPC() &&
        (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
         TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
      return false;

  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
             !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return false;
  }

  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
                           Opc == AMDGPU::V_FMAC_F32_e32 ||
                           Opc == AMDGPU::V_MAC_F16_e32 ||
                           Opc == AMDGPU::V_MAC_F32_e32))
    return false;

  // Check if target supports this SDWA opcode
  if (TII->pseudoToMCOpcode(Opc) == -1)
    return false;

  // FIXME: has SDWA but requires handling of the implicit VCC use
  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
    return false;

  if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
    if (!Src0->isReg() && !Src0->isImm())
      return false;
  }

  if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
    if (!Src1->isReg() && !Src1->isImm())
      return false;
  }

  return true;
}
} // end anonymous namespace
&MI
,
1020 const SDWAOperandsVector
&SDWAOperands
) {
1022 LLVM_DEBUG(dbgs() << "Convert instruction:" << MI
);
1026 unsigned Opcode
= MI
.getOpcode();
1027 if (TII
->isSDWA(Opcode
)) {
1028 SDWAOpcode
= Opcode
;
1030 SDWAOpcode
= AMDGPU::getSDWAOp(Opcode
);
1031 if (SDWAOpcode
== -1)
1032 SDWAOpcode
= AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode
));
1034 assert(SDWAOpcode
!= -1);
1036 const MCInstrDesc
&SDWADesc
= TII
->get(SDWAOpcode
);
1038 // Create SDWA version of instruction MI and initialize its operands
1039 MachineInstrBuilder SDWAInst
=
1040 BuildMI(*MI
.getParent(), MI
, MI
.getDebugLoc(), SDWADesc
)
1041 .setMIFlags(MI
.getFlags());
1043 // Copy dst, if it is present in original then should also be present in SDWA
1044 MachineOperand
*Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
1046 assert(AMDGPU::hasNamedOperand(SDWAOpcode
, AMDGPU::OpName::vdst
));
1048 } else if ((Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::sdst
))) {
1049 assert(Dst
&& AMDGPU::hasNamedOperand(SDWAOpcode
, AMDGPU::OpName::sdst
));
1052 assert(AMDGPU::hasNamedOperand(SDWAOpcode
, AMDGPU::OpName::sdst
));
1053 SDWAInst
.addReg(TRI
->getVCC(), RegState::Define
);

  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }
== AMDGPU::V_FMAC_F16_sdwa
||
1080 SDWAOpcode
== AMDGPU::V_FMAC_F32_sdwa
||
1081 SDWAOpcode
== AMDGPU::V_MAC_F16_sdwa
||
1082 SDWAOpcode
== AMDGPU::V_MAC_F32_sdwa
) {
1083 // v_mac_f16/32 has additional src2 operand tied to vdst
1084 MachineOperand
*Src2
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src2
);
1086 SDWAInst
.add(*Src2
);
1089 // Copy clamp if present, initialize otherwise
1090 assert(AMDGPU::hasNamedOperand(SDWAOpcode
, AMDGPU::OpName::clamp
));
1091 MachineOperand
*Clamp
= TII
->getNamedOperand(MI
, AMDGPU::OpName::clamp
);
1093 SDWAInst
.add(*Clamp
);
1098 // Copy omod if present, initialize otherwise if needed
1099 if (AMDGPU::hasNamedOperand(SDWAOpcode
, AMDGPU::OpName::omod
)) {
1100 MachineOperand
*OMod
= TII
->getNamedOperand(MI
, AMDGPU::OpName::omod
);
1102 SDWAInst
.add(*OMod
);

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel)) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused)) {
    MachineOperand *DstUnused =
        TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in its SDWA
    // form, with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx =
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold the 2nd instruction
    // into the 3rd (v_add_u32_sdwa) and then try to fold the 1st instruction
    // into the 2nd (which was already destroyed). So if SDWAOperand is also a
    // potential MI then do not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
    for (MachineOperand &MO : SDWAInst->uses()) {
      if (!MO.isReg())
        continue;
      MRI->clearKillFlags(MO.getReg());
    }
  } else {
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}

// If an instruction was converted to SDWA it should not have immediates or
// SGPR operands (one SGPR operand is allowed on GFX9). Copy its scalar
// operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
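
// Illustration (added note, not from the original source; %tmp stands in for
// the created virtual register): a converted SDWA instruction that still
// reads an immediate or an extra SGPR gets
//   %tmp:vgpr_32 = V_MOV_B32_e32 <imm or sgpr>
// inserted before it, and the offending operand is rewritten to read %tmp.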

bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (PotentialMI &&
            (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
             PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}