1 //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 /// \file This pass tries to apply several peephole SDWA patterns.
13 /// V_LSHRREV_B32_e32 %0, 16, %1
14 /// V_ADD_I32_e32 %2, %0, %3
15 /// V_LSHLREV_B32_e32 %4, 16, %2
18 /// V_ADD_I32_sdwa %4, %1, %3
19 /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
21 //===----------------------------------------------------------------------===//
24 #include "AMDGPUSubtarget.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIRegisterInfo.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "Utils/AMDGPUBaseInfo.h"
30 #include "llvm/ADT/None.h"
31 #include "llvm/ADT/Optional.h"
32 #include "llvm/ADT/STLExtras.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/CodeGen/MachineBasicBlock.h"
36 #include "llvm/CodeGen/MachineFunction.h"
37 #include "llvm/CodeGen/MachineFunctionPass.h"
38 #include "llvm/CodeGen/MachineInstr.h"
39 #include "llvm/CodeGen/MachineInstrBuilder.h"
40 #include "llvm/CodeGen/MachineOperand.h"
41 #include "llvm/CodeGen/MachineRegisterInfo.h"
42 #include "llvm/CodeGen/TargetRegisterInfo.h"
43 #include "llvm/Config/llvm-config.h"
44 #include "llvm/MC/LaneBitmask.h"
45 #include "llvm/MC/MCInstrDesc.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/Debug.h"
48 #include "llvm/Support/raw_ostream.h"
53 #include <unordered_map>
57 #define DEBUG_TYPE "si-peephole-sdwa"
// Pass counters: NumSDWAPatternsFound is bumped once per matched SDWA operand
// pattern, NumSDWAInstructionsPeepholed once per instruction converted to SDWA.
59 STATISTIC(NumSDWAPatternsFound
, "Number of SDWA patterns found.");
60 STATISTIC(NumSDWAInstructionsPeepholed
,
61 "Number of instruction converted to SDWA.");
// Machine function pass that matches SDWA operand patterns in each basic
// block and folds them into the instructions they apply to.
// NOTE(review): this listing omits some original lines of the class (e.g.
// access specifiers and closing braces); comments describe only what is shown.
68 class SIPeepholeSDWA
: public MachineFunctionPass
{
// List of (non-owning) SDWA operands that can be folded into one instruction.
70 using SDWAOperandsVector
= SmallVector
<SDWAOperand
*, 4>;
// Cached per-function handles; assigned at the start of runOnMachineFunction.
73 MachineRegisterInfo
*MRI
;
74 const SIRegisterInfo
*TRI
;
75 const SIInstrInfo
*TII
;
// Matched pattern instruction -> the SDWA operand it represents (owning).
77 std::unordered_map
<MachineInstr
*, std::unique_ptr
<SDWAOperand
>> SDWAOperands
;
// Candidate instruction -> SDWA operands that could be folded into it.
78 std::unordered_map
<MachineInstr
*, SDWAOperandsVector
> PotentialMatches
;
// Freshly built SDWA instructions that still need legalizeScalarOperands.
79 SmallVector
<MachineInstr
*, 8> ConvertedInstructions
;
// Returns the immediate value of Op, looking through a foldable copy of an
// immediate when Op is a register.
81 Optional
<int64_t> foldToImm(const MachineOperand
&Op
) const;
86 SIPeepholeSDWA() : MachineFunctionPass(ID
) {
87 initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
90 bool runOnMachineFunction(MachineFunction
&MF
) override
;
91 void matchSDWAOperands(MachineBasicBlock
&MBB
);
92 std::unique_ptr
<SDWAOperand
> matchSDWAOperand(MachineInstr
&MI
);
93 bool isConvertibleToSDWA(const MachineInstr
&MI
, const GCNSubtarget
&ST
) const;
94 bool convertToSDWA(MachineInstr
&MI
, const SDWAOperandsVector
&SDWAOperands
);
95 void legalizeScalarOperands(MachineInstr
&MI
, const GCNSubtarget
&ST
) const;
97 StringRef
getPassName() const override
{ return "SI Peephole SDWA"; }
// The pass declares that it preserves the CFG.
99 void getAnalysisUsage(AnalysisUsage
&AU
) const override
{
100 AU
.setPreservesCFG();
101 MachineFunctionPass::getAnalysisUsage(AU
);
// Members of the abstract SDWAOperand base: a pair of register operands
// (Target, Replaced) plus the virtual hooks every concrete SDWA operand
// implements. NOTE(review): the class header line is not part of this listing.
107 MachineOperand
*Target
; // Operand that would be used in converted instruction
108 MachineOperand
*Replaced
; // Operand that would be replaced by Target
// Both operands must be register operands (asserted).
111 SDWAOperand(MachineOperand
*TargetOp
, MachineOperand
*ReplacedOp
)
112 : Target(TargetOp
), Replaced(ReplacedOp
) {
113 assert(Target
->isReg());
114 assert(Replaced
->isReg());
117 virtual ~SDWAOperand() = default;
// Returns the instruction this operand could be folded into.
119 virtual MachineInstr
*potentialToConvert(const SIInstrInfo
*TII
) = 0;
// Applies this operand to the SDWA instruction MI.
120 virtual bool convertToSDWA(MachineInstr
&MI
, const SIInstrInfo
*TII
) = 0;
122 MachineOperand
*getTargetOperand() const { return Target
; }
123 MachineOperand
*getReplacedOperand() const { return Replaced
; }
// The instruction that owns the Target operand.
124 MachineInstr
*getParentInst() const { return Target
->getParent(); }
// Register info of the function containing the parent instruction.
126 MachineRegisterInfo
*getMRI() const {
127 return &getParentInst()->getParent()->getParent()->getRegInfo();
// Debug printing is only compiled in asserts builds or with LLVM_ENABLE_DUMP.
130 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
131 virtual void print(raw_ostream
& OS
) const = 0;
132 void dump() const { print(dbgs()); }
136 using namespace AMDGPU::SDWA
;
// SDWA operand describing a source: carries the src_sel selection and the
// abs/neg/sext modifier flags to apply to the consuming instruction.
// NOTE(review): the listing omits the member declarations and the last
// constructor parameter line (Sext_); see the initializer list below.
138 class SDWASrcOperand
: public SDWAOperand
{
146 SDWASrcOperand(MachineOperand
*TargetOp
, MachineOperand
*ReplacedOp
,
147 SdwaSel SrcSel_
= DWORD
, bool Abs_
= false, bool Neg_
= false,
149 : SDWAOperand(TargetOp
, ReplacedOp
),
150 SrcSel(SrcSel_
), Abs(Abs_
), Neg(Neg_
), Sext(Sext_
) {}
152 MachineInstr
*potentialToConvert(const SIInstrInfo
*TII
) override
;
153 bool convertToSDWA(MachineInstr
&MI
, const SIInstrInfo
*TII
) override
;
155 SdwaSel
getSrcSel() const { return SrcSel
; }
156 bool getAbs() const { return Abs
; }
157 bool getNeg() const { return Neg
; }
158 bool getSext() const { return Sext
; }
// Computes the src modifiers immediate to store for SrcOp.
160 uint64_t getSrcMods(const SIInstrInfo
*TII
,
161 const MachineOperand
*SrcOp
) const;
163 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
164 void print(raw_ostream
& OS
) const override
;
// SDWA operand describing a destination: carries the dst_sel selection and
// the dst_unused policy (defaults: DWORD and UNUSED_PAD).
168 class SDWADstOperand
: public SDWAOperand
{
175 SDWADstOperand(MachineOperand
*TargetOp
, MachineOperand
*ReplacedOp
,
176 SdwaSel DstSel_
= DWORD
, DstUnused DstUn_
= UNUSED_PAD
)
177 : SDWAOperand(TargetOp
, ReplacedOp
), DstSel(DstSel_
), DstUn(DstUn_
) {}
179 MachineInstr
*potentialToConvert(const SIInstrInfo
*TII
) override
;
180 bool convertToSDWA(MachineInstr
&MI
, const SIInstrInfo
*TII
) override
;
182 SdwaSel
getDstSel() const { return DstSel
; }
183 DstUnused
getDstUnused() const { return DstUn
; }
185 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
186 void print(raw_ostream
& OS
) const override
;
// Destination operand whose dst_unused policy is always UNUSED_PRESERVE; it
// additionally remembers the operand whose bits must be preserved.
190 class SDWADstPreserveOperand
: public SDWADstOperand
{
192 MachineOperand
*Preserve
;
195 SDWADstPreserveOperand(MachineOperand
*TargetOp
, MachineOperand
*ReplacedOp
,
196 MachineOperand
*PreserveOp
, SdwaSel DstSel_
= DWORD
)
197 : SDWADstOperand(TargetOp
, ReplacedOp
, DstSel_
, UNUSED_PRESERVE
),
198 Preserve(PreserveOp
) {}
200 bool convertToSDWA(MachineInstr
&MI
, const SIInstrInfo
*TII
) override
;
202 MachineOperand
*getPreservedOperand() const { return Preserve
; }
204 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
205 void print(raw_ostream
& OS
) const override
;
209 } // end anonymous namespace
// Pass registration, the pass ID storage, and the externally visible
// reference to that ID (llvm::SIPeepholeSDWAID).
211 INITIALIZE_PASS(SIPeepholeSDWA
, DEBUG_TYPE
, "SI Peephole SDWA", false, false)
213 char SIPeepholeSDWA::ID
= 0;
215 char &llvm::SIPeepholeSDWAID
= SIPeepholeSDWA::ID
;
// Factory: returns a newly allocated SIPeepholeSDWA pass instance.
217 FunctionPass
*llvm::createSIPeepholeSDWAPass() {
218 return new SIPeepholeSDWA();
// Debug-only stream helper: prints the symbolic name of an SdwaSel value.
222 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
223 static raw_ostream
& operator<<(raw_ostream
&OS
, SdwaSel Sel
) {
225 case BYTE_0
: OS
<< "BYTE_0"; break;
226 case BYTE_1
: OS
<< "BYTE_1"; break;
227 case BYTE_2
: OS
<< "BYTE_2"; break;
228 case BYTE_3
: OS
<< "BYTE_3"; break;
229 case WORD_0
: OS
<< "WORD_0"; break;
230 case WORD_1
: OS
<< "WORD_1"; break;
231 case DWORD
: OS
<< "DWORD"; break;
// Debug-only stream helper: prints the symbolic name of a DstUnused value.
236 static raw_ostream
& operator<<(raw_ostream
&OS
, const DstUnused
&Un
) {
238 case UNUSED_PAD
: OS
<< "UNUSED_PAD"; break;
239 case UNUSED_SEXT
: OS
<< "UNUSED_SEXT"; break;
240 case UNUSED_PRESERVE
: OS
<< "UNUSED_PRESERVE"; break;
// Debug-only stream helper for an SDWAOperand.
// NOTE(review): the body is not part of this listing; presumably it forwards
// to Operand.print(OS) — confirm against the full source.
245 static raw_ostream
& operator<<(raw_ostream
&OS
, const SDWAOperand
&Operand
) {
// Prints the target operand followed by src_sel and the abs/neg/sext flags.
251 void SDWASrcOperand::print(raw_ostream
& OS
) const {
252 OS
<< "SDWA src: " << *getTargetOperand()
253 << " src_sel:" << getSrcSel()
254 << " abs:" << getAbs() << " neg:" << getNeg()
255 << " sext:" << getSext() << '\n';
// Prints the target operand followed by dst_sel and dst_unused.
259 void SDWADstOperand::print(raw_ostream
& OS
) const {
260 OS
<< "SDWA dst: " << *getTargetOperand()
261 << " dst_sel:" << getDstSel()
262 << " dst_unused:" << getDstUnused() << '\n';
// Prints the target operand followed by dst_sel and the preserved operand.
266 void SDWADstPreserveOperand::print(raw_ostream
& OS
) const {
267 OS
<< "SDWA preserve dst: " << *getTargetOperand()
268 << " dst_sel:" << getDstSel()
269 << " preserve:" << *getPreservedOperand() << '\n';
// Copies the register, subregister and undef flag of From into To, and also
// transfers the kill / dead flags. Both operands must be registers (asserted).
// NOTE(review): the listing omits the lines around setIsKill/setIsDead;
// presumably they select kill for uses and dead for defs — confirm.
274 static void copyRegOperand(MachineOperand
&To
, const MachineOperand
&From
) {
275 assert(To
.isReg() && From
.isReg());
276 To
.setReg(From
.getReg());
277 To
.setSubReg(From
.getSubReg());
278 To
.setIsUndef(From
.isUndef());
280 To
.setIsKill(From
.isKill());
282 To
.setIsDead(From
.isDead());
// Returns true when LHS is a register operand naming the same register and
// the same subregister as RHS.
// NOTE(review): one condition line between the isReg() check and the register
// comparison is missing from this listing.
286 static bool isSameReg(const MachineOperand
&LHS
, const MachineOperand
&RHS
) {
287 return LHS
.isReg() &&
289 LHS
.getReg() == RHS
.getReg() &&
290 LHS
.getSubReg() == RHS
.getSubReg();
// Walks the non-debug uses of the register defined by Reg, looking for the
// single instruction that uses it. Reg must be a register def; per the
// comments below, a use of a different subregister or a second using
// instruction disqualifies the search.
// NOTE(review): the return statements are missing from this listing.
293 static MachineOperand
*findSingleRegUse(const MachineOperand
*Reg
,
294 const MachineRegisterInfo
*MRI
) {
295 if (!Reg
->isReg() || !Reg
->isDef())
298 MachineOperand
*ResMO
= nullptr;
299 for (MachineOperand
&UseMO
: MRI
->use_nodbg_operands(Reg
->getReg())) {
300 // If there exists a use of a subreg of Reg then return nullptr
301 if (!isSameReg(UseMO
, *Reg
))
304 // Check that there is only one instruction that uses Reg
307 } else if (ResMO
->getParent() != UseMO
.getParent()) {
// Looks up the unique defining instruction of Reg's virtual register (via
// getUniqueVRegDef) and scans that instruction's defs() for the operand that
// defines the same register.
// NOTE(review): guard and return lines are missing from this listing.
315 static MachineOperand
*findSingleRegDef(const MachineOperand
*Reg
,
316 const MachineRegisterInfo
*MRI
) {
320 MachineInstr
*DefInstr
= MRI
->getUniqueVRegDef(Reg
->getReg());
324 for (auto &DefMO
: DefInstr
->defs()) {
325 if (DefMO
.isReg() && DefMO
.getReg() == Reg
->getReg())
329 // Ignore implicit defs.
// Computes the source-modifier immediate for SrcOp. When SrcOp is src0 (or
// src1) of its parent instruction, the existing src0_modifiers (or
// src1_modifiers) immediate is used as the starting value; then ABS is OR-ed
// in if Abs is set, NEG is XOR-ed in if Neg is set (so an existing NEG is
// cancelled), and SEXT is OR-ed in.
// NOTE(review): the declaration of Mods and the conditions guarding the
// assert and the SEXT update are missing from this listing.
333 uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo
*TII
,
334 const MachineOperand
*SrcOp
) const {
336 const auto *MI
= SrcOp
->getParent();
337 if (TII
->getNamedOperand(*MI
, AMDGPU::OpName::src0
) == SrcOp
) {
338 if (auto *Mod
= TII
->getNamedOperand(*MI
, AMDGPU::OpName::src0_modifiers
)) {
339 Mods
= Mod
->getImm();
341 } else if (TII
->getNamedOperand(*MI
, AMDGPU::OpName::src1
) == SrcOp
) {
342 if (auto *Mod
= TII
->getNamedOperand(*MI
, AMDGPU::OpName::src1_modifiers
)) {
343 Mods
= Mod
->getImm();
348 "Float and integer src modifiers can't be set simulteniously");
349 Mods
|= Abs
? SISrcMods::ABS
: 0;
350 Mods
^= Neg
? SISrcMods::NEG
: 0;
352 Mods
|= SISrcMods::SEXT
;
// Returns the instruction that contains the single use of the replaced
// operand's register, i.e. the instruction this source operand could be
// folded into.
// NOTE(review): the null check on PotentialMO is missing from this listing.
358 MachineInstr
*SDWASrcOperand::potentialToConvert(const SIInstrInfo
*TII
) {
359 // For an SDWA src operand the potential instruction is the one that uses
360 // the register defined by the parent instruction
361 MachineOperand
*PotentialMO
= findSingleRegUse(getReplacedOperand(), getMRI());
365 return PotentialMO
->getParent();
// Folds this source operand into the SDWA instruction MI: locates the src0 or
// src1 operand (or, for UNUSED_PRESERVE, the operand tied to vdst) that
// matches the replaced register, overwrites it with the target operand and,
// unless the tied preserve slot was used, sets the matching src*_sel and
// src*_modifiers immediates. The kill flag on the target operand is cleared
// afterwards.
// NOTE(review): several condition and return lines are missing from this
// listing; the comments describe only the visible statements.
368 bool SDWASrcOperand::convertToSDWA(MachineInstr
&MI
, const SIInstrInfo
*TII
) {
369 // Find operand in instruction that matches source operand and replace it with
370 // target operand. Set corresponding src_sel
371 bool IsPreserveSrc
= false;
372 MachineOperand
*Src
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
373 MachineOperand
*SrcSel
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0_sel
);
374 MachineOperand
*SrcMods
=
375 TII
->getNamedOperand(MI
, AMDGPU::OpName::src0_modifiers
);
376 assert(Src
&& (Src
->isReg() || Src
->isImm()));
377 if (!isSameReg(*Src
, *getReplacedOperand())) {
378 // If this is not src0 then it could be src1
379 Src
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
380 SrcSel
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1_sel
);
381 SrcMods
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1_modifiers
);
384 !isSameReg(*Src
, *getReplacedOperand())) {
385 // It's possible this Src is a tied operand for
386 // UNUSED_PRESERVE, in which case we can either
387 // abandon the peephole attempt, or if legal we can
388 // copy the target operand into the tied slot
389 // if the preserve operation will effectively cause the same
390 // result by overwriting the rest of the dst.
391 MachineOperand
*Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
392 MachineOperand
*DstUnused
=
393 TII
->getNamedOperand(MI
, AMDGPU::OpName::dst_unused
);
396 DstUnused
->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE
) {
397 // This will work if the tied src is accessing WORD_0, and the dst is
398 // writing WORD_1. Modifiers don't matter because all the bits that
399 // would be impacted are being overwritten by the dst.
400 // Any other case will not work.
401 SdwaSel DstSel
= static_cast<SdwaSel
>(
402 TII
->getNamedImmOperand(MI
, AMDGPU::OpName::dst_sel
));
403 if (DstSel
== AMDGPU::SDWA::SdwaSel::WORD_1
&&
404 getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0
) {
405 IsPreserveSrc
= true;
406 auto DstIdx
= AMDGPU::getNamedOperandIdx(MI
.getOpcode(),
407 AMDGPU::OpName::vdst
);
408 auto TiedIdx
= MI
.findTiedOperandIdx(DstIdx
);
409 Src
= &MI
.getOperand(TiedIdx
);
413 // Not legal to convert this src
418 assert(Src
&& Src
->isReg());
420 if ((MI
.getOpcode() == AMDGPU::V_MAC_F16_sdwa
||
421 MI
.getOpcode() == AMDGPU::V_MAC_F32_sdwa
) &&
422 !isSameReg(*Src
, *getReplacedOperand())) {
423 // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
424 // src2. This is not allowed.
428 assert(isSameReg(*Src
, *getReplacedOperand()) &&
429 (IsPreserveSrc
|| (SrcSel
&& SrcMods
)));
431 copyRegOperand(*Src
, *getTargetOperand());
432 if (!IsPreserveSrc
) {
433 SrcSel
->setImm(getSrcSel());
434 SrcMods
->setImm(getSrcMods(TII
, Src
));
436 getTargetOperand()->setIsKill(false);
// Returns the instruction that defines the replaced operand's register,
// provided the parent instruction of this operand is its only non-debug user.
// NOTE(review): the early-return lines are missing from this listing.
440 MachineInstr
*SDWADstOperand::potentialToConvert(const SIInstrInfo
*TII
) {
441 // For SDWA dst operand potential instruction is one that defines register
442 // that this operand uses
443 MachineRegisterInfo
*MRI
= getMRI();
444 MachineInstr
*ParentMI
= getParentInst();
446 MachineOperand
*PotentialMO
= findSingleRegDef(getReplacedOperand(), MRI
);
450 // Check that ParentMI is the only instruction that uses replaced register
451 for (MachineInstr
&UseInst
: MRI
->use_nodbg_instructions(PotentialMO
->getReg())) {
452 if (&UseInst
!= ParentMI
)
456 return PotentialMO
->getParent();
// Folds this destination operand into the SDWA instruction MI: rewrites vdst
// to the target operand, stores dst_sel and dst_unused, and erases the parent
// instruction of this operand (the pattern instruction being folded away).
// v_mac_f16/32_sdwa is special-cased when dst_sel is not DWORD.
// NOTE(review): assert and return lines are missing from this listing.
459 bool SDWADstOperand::convertToSDWA(MachineInstr
&MI
, const SIInstrInfo
*TII
) {
460 // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
462 if ((MI
.getOpcode() == AMDGPU::V_MAC_F16_sdwa
||
463 MI
.getOpcode() == AMDGPU::V_MAC_F32_sdwa
) &&
464 getDstSel() != AMDGPU::SDWA::DWORD
) {
465 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
469 MachineOperand
*Operand
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
472 isSameReg(*Operand
, *getReplacedOperand()));
473 copyRegOperand(*Operand
, *getTargetOperand());
474 MachineOperand
*DstSel
= TII
->getNamedOperand(MI
, AMDGPU::OpName::dst_sel
);
476 DstSel
->setImm(getDstSel());
477 MachineOperand
*DstUnused
= TII
->getNamedOperand(MI
, AMDGPU::OpName::dst_unused
);
479 DstUnused
->setImm(getDstUnused());
481 // Remove original instruction because it would conflict with our new
482 // instruction by register definition
483 getParentInst()->eraseFromParent();
// Folds a preserve-destination operand into MI: clears kill flags on the
// registers MI uses, re-inserts MI in front of this operand's parent
// instruction (the v_or_b32), appends the preserved register as an implicit
// killed use tied to vdst, and then finishes through
// SDWADstOperand::convertToSDWA.
// NOTE(review): a few lines (operand filtering in the loop, removal of MI
// from its old position) are missing from this listing.
487 bool SDWADstPreserveOperand::convertToSDWA(MachineInstr
&MI
,
488 const SIInstrInfo
*TII
) {
489 // MI should be moved right before v_or_b32.
490 // For this we should clear all kill flags on uses of MI src-operands or else
491 // we can encounter problem with use of killed operand.
492 for (MachineOperand
&MO
: MI
.uses()) {
495 getMRI()->clearKillFlags(MO
.getReg());
498 // Move MI before v_or_b32
499 auto MBB
= MI
.getParent();
501 MBB
->insert(getParentInst(), &MI
);
503 // Add Implicit use of preserved register
504 MachineInstrBuilder
MIB(*MBB
->getParent(), MI
);
505 MIB
.addReg(getPreservedOperand()->getReg(),
506 RegState::ImplicitKill
,
507 getPreservedOperand()->getSubReg());
509 // Tie dst to implicit use
510 MI
.tieOperands(AMDGPU::getNamedOperandIdx(MI
.getOpcode(), AMDGPU::OpName::vdst
),
511 MI
.getNumOperands() - 1);
513 // Convert MI as any other SDWADstOperand and remove v_or_b32
514 return SDWADstOperand::convertToSDWA(MI
, TII
);
// Tries to resolve Op to an immediate value. For a register operand it walks
// the register's defs, considers only defs of the same register/subregister
// that are foldable copies, and returns the immediate held by operand 1 of
// such a copy.
// NOTE(review): the direct-immediate case and the fall-through returns are
// missing from this listing.
517 Optional
<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand
&Op
) const {
522 // If this is not immediate then it can be copy of immediate value, e.g.:
523 // %1 = S_MOV_B32 255;
525 for (const MachineOperand
&Def
: MRI
->def_operands(Op
.getReg())) {
526 if (!isSameReg(Op
, Def
))
529 const MachineInstr
*DefInst
= Def
.getParent();
530 if (!TII
->isFoldableCopy(*DefInst
))
533 const MachineOperand
&Copied
= DefInst
->getOperand(1);
537 return Copied
.getImm();
// Pattern matcher: inspects MI and, when it is one of the recognised
// shift / bit-field-extract / and / or instructions with suitable constant
// operands, returns the SDWA operand that represents it:
//   * 32-bit shifts by 16/24 -> src (lshr/ashr) or dst (lshl), WORD_1/BYTE_3
//   * 16-bit shifts by 8     -> src or dst, BYTE_1
//   * v_bfe_i32/u32          -> src with src_sel chosen from offset/width
//   * v_and_b32 with 0xffff/0xff -> src, WORD_0/BYTE_0
//   * v_or_b32 of two SDWA results with disjoint dst_sel -> preserve dst
// Operands naming physical registers are checked for and not matched.
// The final visible statement returns a null pointer when nothing matched.
// NOTE(review): many early-return / break lines are missing from this listing.
544 std::unique_ptr
<SDWAOperand
>
545 SIPeepholeSDWA::matchSDWAOperand(MachineInstr
&MI
) {
546 unsigned Opcode
= MI
.getOpcode();
548 case AMDGPU::V_LSHRREV_B32_e32
:
549 case AMDGPU::V_ASHRREV_I32_e32
:
550 case AMDGPU::V_LSHLREV_B32_e32
:
551 case AMDGPU::V_LSHRREV_B32_e64
:
552 case AMDGPU::V_ASHRREV_I32_e64
:
553 case AMDGPU::V_LSHLREV_B32_e64
: {
554 // from: v_lshrrev_b32_e32 v1, 16/24, v0
555 // to SDWA src:v0 src_sel:WORD_1/BYTE_3
557 // from: v_ashrrev_i32_e32 v1, 16/24, v0
558 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
560 // from: v_lshlrev_b32_e32 v1, 16/24, v0
561 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
562 MachineOperand
*Src0
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
563 auto Imm
= foldToImm(*Src0
);
567 if (*Imm
!= 16 && *Imm
!= 24)
570 MachineOperand
*Src1
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
571 MachineOperand
*Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
572 if (TRI
->isPhysicalRegister(Src1
->getReg()) ||
573 TRI
->isPhysicalRegister(Dst
->getReg()))
576 if (Opcode
== AMDGPU::V_LSHLREV_B32_e32
||
577 Opcode
== AMDGPU::V_LSHLREV_B32_e64
) {
578 return make_unique
<SDWADstOperand
>(
579 Dst
, Src1
, *Imm
== 16 ? WORD_1
: BYTE_3
, UNUSED_PAD
);
// The last argument (Sext) is true for the arithmetic right shifts only.
581 return make_unique
<SDWASrcOperand
>(
582 Src1
, Dst
, *Imm
== 16 ? WORD_1
: BYTE_3
, false, false,
583 Opcode
!= AMDGPU::V_LSHRREV_B32_e32
&&
584 Opcode
!= AMDGPU::V_LSHRREV_B32_e64
);
589 case AMDGPU::V_LSHRREV_B16_e32
:
590 case AMDGPU::V_ASHRREV_I16_e32
:
591 case AMDGPU::V_LSHLREV_B16_e32
:
592 case AMDGPU::V_LSHRREV_B16_e64
:
593 case AMDGPU::V_ASHRREV_I16_e64
:
594 case AMDGPU::V_LSHLREV_B16_e64
: {
595 // from: v_lshrrev_b16_e32 v1, 8, v0
596 // to SDWA src:v0 src_sel:BYTE_1
598 // from: v_ashrrev_i16_e32 v1, 8, v0
599 // to SDWA src:v0 src_sel:BYTE_1 sext:1
601 // from: v_lshlrev_b16_e32 v1, 8, v0
602 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
603 MachineOperand
*Src0
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
604 auto Imm
= foldToImm(*Src0
);
605 if (!Imm
|| *Imm
!= 8)
608 MachineOperand
*Src1
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
609 MachineOperand
*Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
611 if (TRI
->isPhysicalRegister(Src1
->getReg()) ||
612 TRI
->isPhysicalRegister(Dst
->getReg()))
615 if (Opcode
== AMDGPU::V_LSHLREV_B16_e32
||
616 Opcode
== AMDGPU::V_LSHLREV_B16_e64
) {
617 return make_unique
<SDWADstOperand
>(Dst
, Src1
, BYTE_1
, UNUSED_PAD
);
619 return make_unique
<SDWASrcOperand
>(
620 Src1
, Dst
, BYTE_1
, false, false,
621 Opcode
!= AMDGPU::V_LSHRREV_B16_e32
&&
622 Opcode
!= AMDGPU::V_LSHRREV_B16_e64
);
627 case AMDGPU::V_BFE_I32
:
628 case AMDGPU::V_BFE_U32
: {
630 // from: v_bfe_u32 v1, v0, 8, 8
631 // to SDWA src:v0 src_sel:BYTE_1
633 // offset | width | src_sel
634 // ------------------------
643 MachineOperand
*Src1
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
644 auto Offset
= foldToImm(*Src1
);
648 MachineOperand
*Src2
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src2
);
649 auto Width
= foldToImm(*Src2
);
653 SdwaSel SrcSel
= DWORD
;
655 if (*Offset
== 0 && *Width
== 8)
657 else if (*Offset
== 0 && *Width
== 16)
659 else if (*Offset
== 0 && *Width
== 32)
661 else if (*Offset
== 8 && *Width
== 8)
663 else if (*Offset
== 16 && *Width
== 8)
665 else if (*Offset
== 16 && *Width
== 16)
667 else if (*Offset
== 24 && *Width
== 8)
672 MachineOperand
*Src0
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
673 MachineOperand
*Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
675 if (TRI
->isPhysicalRegister(Src0
->getReg()) ||
676 TRI
->isPhysicalRegister(Dst
->getReg()))
// Sext is requested for the signed extract (any opcode other than V_BFE_U32).
679 return make_unique
<SDWASrcOperand
>(
680 Src0
, Dst
, SrcSel
, false, false, Opcode
!= AMDGPU::V_BFE_U32
);
683 case AMDGPU::V_AND_B32_e32
:
684 case AMDGPU::V_AND_B32_e64
: {
686 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
687 // to SDWA src:v0 src_sel:WORD_0/BYTE_0
689 MachineOperand
*Src0
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
690 MachineOperand
*Src1
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
// The mask may be either operand: try src0 first, then src1.
692 auto Imm
= foldToImm(*Src0
);
695 Imm
= foldToImm(*Src1
);
699 if (!Imm
|| (*Imm
!= 0x0000ffff && *Imm
!= 0x000000ff))
702 MachineOperand
*Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
704 if (TRI
->isPhysicalRegister(ValSrc
->getReg()) ||
705 TRI
->isPhysicalRegister(Dst
->getReg()))
708 return make_unique
<SDWASrcOperand
>(
709 ValSrc
, Dst
, *Imm
== 0x0000ffff ? WORD_0
: BYTE_0
);
712 case AMDGPU::V_OR_B32_e32
:
713 case AMDGPU::V_OR_B32_e64
: {
714 // Patterns for dst_unused:UNUSED_PRESERVE.
716 // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
717 // src1_sel:WORD_1 src2_sel:WORD1
718 // v_add_f16_e32 v3, v1, v2
719 // v_or_b32_e32 v4, v0, v3
720 // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
722 // Check if one of operands of v_or_b32 is SDWA instruction
723 using CheckRetType
= Optional
<std::pair
<MachineOperand
*, MachineOperand
*>>;
// Helper: yields the defs of (Op1, Op2) when both are registers with a
// single def and Op1 is defined by an SDWA instruction; None otherwise.
724 auto CheckOROperandsForSDWA
=
725 [&](const MachineOperand
*Op1
, const MachineOperand
*Op2
) -> CheckRetType
{
726 if (!Op1
|| !Op1
->isReg() || !Op2
|| !Op2
->isReg())
727 return CheckRetType(None
);
729 MachineOperand
*Op1Def
= findSingleRegDef(Op1
, MRI
);
731 return CheckRetType(None
);
733 MachineInstr
*Op1Inst
= Op1Def
->getParent();
734 if (!TII
->isSDWA(*Op1Inst
))
735 return CheckRetType(None
);
737 MachineOperand
*Op2Def
= findSingleRegDef(Op2
, MRI
);
739 return CheckRetType(None
);
741 return CheckRetType(std::make_pair(Op1Def
, Op2Def
));
// Try (src0, src1) as (SDWA, other) first, then the swapped order.
744 MachineOperand
*OrSDWA
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
745 MachineOperand
*OrOther
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
746 assert(OrSDWA
&& OrOther
);
747 auto Res
= CheckOROperandsForSDWA(OrSDWA
, OrOther
);
749 OrSDWA
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
750 OrOther
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
751 assert(OrSDWA
&& OrOther
);
752 Res
= CheckOROperandsForSDWA(OrSDWA
, OrOther
);
757 MachineOperand
*OrSDWADef
= Res
->first
;
758 MachineOperand
*OrOtherDef
= Res
->second
;
759 assert(OrSDWADef
&& OrOtherDef
);
761 MachineInstr
*SDWAInst
= OrSDWADef
->getParent();
762 MachineInstr
*OtherInst
= OrOtherDef
->getParent();
764 // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
765 // destination patterns don't overlap. Compatible instruction can be either
766 // regular instruction with compatible bitness or SDWA instruction with
768 // SDWAInst | OtherInst bitness / OtherInst dst_sel
769 // -----------------------------------------------------
771 // WORD_0 | no / BYTE_2/3, WORD_1
772 // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
773 // BYTE_0 | no / BYTE_1/2/3, WORD_1
774 // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
775 // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0
776 // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
777 // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
778 // but v_add_f32 is not.
780 // TODO: add support for non-SDWA instructions as OtherInst.
781 // For now this only works with SDWA instructions. For regular instructions
782 // there is no way to determine if the instruction writes only 8/16/24-bit
783 // out of full register size and all registers are at min 32-bit wide.
784 if (!TII
->isSDWA(*OtherInst
))
787 SdwaSel DstSel
= static_cast<SdwaSel
>(
788 TII
->getNamedImmOperand(*SDWAInst
, AMDGPU::OpName::dst_sel
));;
789 SdwaSel OtherDstSel
= static_cast<SdwaSel
>(
790 TII
->getNamedImmOperand(*OtherInst
, AMDGPU::OpName::dst_sel
));
// DstSelAgree is true when the two dst_sel regions do not overlap.
792 bool DstSelAgree
= false;
794 case WORD_0
: DstSelAgree
= ((OtherDstSel
== BYTE_2
) ||
795 (OtherDstSel
== BYTE_3
) ||
796 (OtherDstSel
== WORD_1
));
798 case WORD_1
: DstSelAgree
= ((OtherDstSel
== BYTE_0
) ||
799 (OtherDstSel
== BYTE_1
) ||
800 (OtherDstSel
== WORD_0
));
802 case BYTE_0
: DstSelAgree
= ((OtherDstSel
== BYTE_1
) ||
803 (OtherDstSel
== BYTE_2
) ||
804 (OtherDstSel
== BYTE_3
) ||
805 (OtherDstSel
== WORD_1
));
807 case BYTE_1
: DstSelAgree
= ((OtherDstSel
== BYTE_0
) ||
808 (OtherDstSel
== BYTE_2
) ||
809 (OtherDstSel
== BYTE_3
) ||
810 (OtherDstSel
== WORD_1
));
812 case BYTE_2
: DstSelAgree
= ((OtherDstSel
== BYTE_0
) ||
813 (OtherDstSel
== BYTE_1
) ||
814 (OtherDstSel
== BYTE_3
) ||
815 (OtherDstSel
== WORD_0
));
817 case BYTE_3
: DstSelAgree
= ((OtherDstSel
== BYTE_0
) ||
818 (OtherDstSel
== BYTE_1
) ||
819 (OtherDstSel
== BYTE_2
) ||
820 (OtherDstSel
== WORD_0
));
822 default: DstSelAgree
= false;
828 // Also OtherInst dst_unused should be UNUSED_PAD
829 DstUnused OtherDstUnused
= static_cast<DstUnused
>(
830 TII
->getNamedImmOperand(*OtherInst
, AMDGPU::OpName::dst_unused
));
831 if (OtherDstUnused
!= DstUnused::UNUSED_PAD
)
834 // Create DstPreserveOperand
835 MachineOperand
*OrDst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
836 assert(OrDst
&& OrDst
->isReg());
838 return make_unique
<SDWADstPreserveOperand
>(
839 OrDst
, OrSDWADef
, OrOtherDef
, DstSel
);
844 return std::unique_ptr
<SDWAOperand
>(nullptr);
// Runs matchSDWAOperand over every instruction in MBB; each successful match
// is stored in SDWAOperands keyed by the instruction and counted in
// NumSDWAPatternsFound.
847 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock
&MBB
) {
848 for (MachineInstr
&MI
: MBB
) {
849 if (auto Operand
= matchSDWAOperand(MI
)) {
850 LLVM_DEBUG(dbgs() << "Match: " << MI
<< "To: " << *Operand
<< '\n');
851 SDWAOperands
[&MI
] = std::move(Operand
);
852 ++NumSDWAPatternsFound
;
// Decides whether MI can be turned into an SDWA instruction on subtarget ST.
// The visible checks cover: already-SDWA opcodes, existence of an SDWA opcode
// (directly or via the e32 form), omod support, VOPC sdst / output-modifier
// support, presence of sdst or absence of vdst for non-VOPC instructions,
// v_mac support, and v_cndmask_b32 (see FIXME).
// NOTE(review): the return statements are missing from this listing, so the
// result of each individual check is not shown here.
857 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr
&MI
,
858 const GCNSubtarget
&ST
) const {
859 // Check if this is already an SDWA instruction
860 unsigned Opc
= MI
.getOpcode();
861 if (TII
->isSDWA(Opc
))
864 // Check if this instruction has opcode that supports SDWA
865 if (AMDGPU::getSDWAOp(Opc
) == -1)
866 Opc
= AMDGPU::getVOPe32(Opc
);
868 if (AMDGPU::getSDWAOp(Opc
) == -1)
871 if (!ST
.hasSDWAOmod() && TII
->hasModifiersSet(MI
, AMDGPU::OpName::omod
))
874 if (TII
->isVOPC(Opc
)) {
875 if (!ST
.hasSDWASdst()) {
876 const MachineOperand
*SDst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::sdst
);
877 if (SDst
&& SDst
->getReg() != AMDGPU::VCC
)
881 if (!ST
.hasSDWAOutModsVOPC() &&
882 (TII
->hasModifiersSet(MI
, AMDGPU::OpName::clamp
) ||
883 TII
->hasModifiersSet(MI
, AMDGPU::OpName::omod
)))
886 } else if (TII
->getNamedOperand(MI
, AMDGPU::OpName::sdst
) ||
887 !TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
)) {
891 if (!ST
.hasSDWAMac() && (Opc
== AMDGPU::V_MAC_F16_e32
||
892 Opc
== AMDGPU::V_MAC_F32_e32
))
895 // FIXME: has SDWA but require handling of implicit VCC use
896 if (Opc
== AMDGPU::V_CNDMASK_B32_e32
)
// Builds the SDWA form of MI in front of it, copying or default-initialising
// every SDWA operand (dst, src0/src1 and their modifiers, src2 for v_mac,
// clamp, omod, dst_sel, dst_unused, src0_sel, src1_sel), re-ties a preserved
// register if MI already used UNUSED_PRESERVE, and then applies each entry of
// SDWAOperands to the new instruction. On success the new instruction is
// queued in ConvertedInstructions and MI is erased; the new instruction is
// erased instead on the failure path.
// NOTE(review): numerous condition / else / return lines are missing from this
// listing; the comments describe only the visible statements.
902 bool SIPeepholeSDWA::convertToSDWA(MachineInstr
&MI
,
903 const SDWAOperandsVector
&SDWAOperands
) {
905 LLVM_DEBUG(dbgs() << "Convert instruction:" << MI
);
909 unsigned Opcode
= MI
.getOpcode();
910 if (TII
->isSDWA(Opcode
)) {
913 SDWAOpcode
= AMDGPU::getSDWAOp(Opcode
);
914 if (SDWAOpcode
== -1)
915 SDWAOpcode
= AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode
));
917 assert(SDWAOpcode
!= -1);
919 const MCInstrDesc
&SDWADesc
= TII
->get(SDWAOpcode
);
921 // Create SDWA version of instruction MI and initialize its operands
922 MachineInstrBuilder SDWAInst
=
923 BuildMI(*MI
.getParent(), MI
, MI
.getDebugLoc(), SDWADesc
);
925 // Copy dst, if it is present in original then should also be present in SDWA
926 MachineOperand
*Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::vdst
);
928 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::vdst
) != -1);
930 } else if ((Dst
= TII
->getNamedOperand(MI
, AMDGPU::OpName::sdst
))) {
932 AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::sdst
) != -1);
935 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::sdst
) != -1);
936 SDWAInst
.addReg(AMDGPU::VCC
, RegState::Define
);
939 // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
940 // src0_modifiers (except for v_nop_sdwa, but it can't get here)
941 MachineOperand
*Src0
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0
);
944 AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::src0
) != -1 &&
945 AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::src0_modifiers
) != -1);
946 if (auto *Mod
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0_modifiers
))
947 SDWAInst
.addImm(Mod
->getImm());
952 // Copy src1 if present, initialize src1_modifiers.
953 MachineOperand
*Src1
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1
);
956 AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::src1
) != -1 &&
957 AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::src1_modifiers
) != -1);
958 if (auto *Mod
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1_modifiers
))
959 SDWAInst
.addImm(Mod
->getImm());
965 if (SDWAOpcode
== AMDGPU::V_MAC_F16_sdwa
||
966 SDWAOpcode
== AMDGPU::V_MAC_F32_sdwa
) {
967 // v_mac_f16/32 has additional src2 operand tied to vdst
968 MachineOperand
*Src2
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src2
);
973 // Copy clamp if present, initialize otherwise
974 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::clamp
) != -1);
975 MachineOperand
*Clamp
= TII
->getNamedOperand(MI
, AMDGPU::OpName::clamp
);
977 SDWAInst
.add(*Clamp
);
982 // Copy omod if present, initialize otherwise if needed
983 if (AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::omod
) != -1) {
984 MachineOperand
*OMod
= TII
->getNamedOperand(MI
, AMDGPU::OpName::omod
);
992 // Copy dst_sel if present, initialize otherwise if needed
993 if (AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::dst_sel
) != -1) {
994 MachineOperand
*DstSel
= TII
->getNamedOperand(MI
, AMDGPU::OpName::dst_sel
);
996 SDWAInst
.add(*DstSel
);
998 SDWAInst
.addImm(AMDGPU::SDWA::SdwaSel::DWORD
);
1002 // Copy dst_unused if present, initialize otherwise if needed
1003 if (AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::dst_unused
) != -1) {
1004 MachineOperand
*DstUnused
= TII
->getNamedOperand(MI
, AMDGPU::OpName::dst_unused
);
1006 SDWAInst
.add(*DstUnused
);
1008 SDWAInst
.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD
);
1012 // Copy src0_sel if present, initialize otherwise
1013 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::src0_sel
) != -1);
1014 MachineOperand
*Src0Sel
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src0_sel
);
1016 SDWAInst
.add(*Src0Sel
);
1018 SDWAInst
.addImm(AMDGPU::SDWA::SdwaSel::DWORD
);
1021 // Copy src1_sel if present, initialize otherwise if needed
1023 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::src1_sel
) != -1);
1024 MachineOperand
*Src1Sel
= TII
->getNamedOperand(MI
, AMDGPU::OpName::src1_sel
);
1026 SDWAInst
.add(*Src1Sel
);
1028 SDWAInst
.addImm(AMDGPU::SDWA::SdwaSel::DWORD
);
1032 // Check for a preserved register that needs to be copied.
1033 auto DstUnused
= TII
->getNamedOperand(MI
, AMDGPU::OpName::dst_unused
);
1035 DstUnused
->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE
) {
1036 // We expect, if we are here, that the instruction was already in its SDWA form,
1037 // with a tied operand.
1038 assert(Dst
&& Dst
->isTied());
1039 assert(Opcode
== static_cast<unsigned int>(SDWAOpcode
));
1040 // We also expect a vdst, since sdst can't preserve.
1041 auto PreserveDstIdx
= AMDGPU::getNamedOperandIdx(SDWAOpcode
, AMDGPU::OpName::vdst
);
1042 assert(PreserveDstIdx
!= -1);
1044 auto TiedIdx
= MI
.findTiedOperandIdx(PreserveDstIdx
);
1045 auto Tied
= MI
.getOperand(TiedIdx
);
1048 SDWAInst
->tieOperands(PreserveDstIdx
, SDWAInst
->getNumOperands() - 1);
1051 // Apply all sdwa operand patterns.
1052 bool Converted
= false;
1053 for (auto &Operand
: SDWAOperands
) {
1054 LLVM_DEBUG(dbgs() << *SDWAInst
<< "\nOperand: " << *Operand
);
1055 // There should be no intersection between SDWA operands and potential MIs
1057 // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1058 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1059 // v_add_u32 v3, v4, v2
1061 // In that example it is possible that we would fold 2nd instruction into 3rd
1062 // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
1063 // already destroyed). So if SDWAOperand is also a potential MI then do not
1065 if (PotentialMatches
.count(Operand
->getParentInst()) == 0)
1066 Converted
|= Operand
->convertToSDWA(*SDWAInst
, TII
);
1069 ConvertedInstructions
.push_back(SDWAInst
);
1071 SDWAInst
->eraseFromParent();
1075 LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst
<< '\n');
1076 ++NumSDWAInstructionsPeepholed
;
1078 MI
.eraseFromParent();
1082 // If an instruction was converted to SDWA it should not have immediates or SGPR
1083 // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
// For each explicit use of MI that is an immediate or a non-VGPR register and
// whose operand slot expects a VGPR class, a V_MOV_B32_e32 into a fresh
// VGPR_32 virtual register is inserted before MI and the operand is rewritten
// to that register. When the subtarget supports SDWA scalar operands, one
// SGPR operand is handled separately (ConstantBusCount == 0 check).
// NOTE(review): the continue statements and the ConstantBusCount update are
// missing from this listing.
1084 void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr
&MI
,
1085 const GCNSubtarget
&ST
) const {
1086 const MCInstrDesc
&Desc
= TII
->get(MI
.getOpcode());
1087 unsigned ConstantBusCount
= 0;
1088 for (MachineOperand
&Op
: MI
.explicit_uses()) {
1089 if (!Op
.isImm() && !(Op
.isReg() && !TRI
->isVGPR(*MRI
, Op
.getReg())))
1092 unsigned I
= MI
.getOperandNo(&Op
);
1093 if (Desc
.OpInfo
[I
].RegClass
== -1 ||
1094 !TRI
->hasVGPRs(TRI
->getRegClass(Desc
.OpInfo
[I
].RegClass
)))
1097 if (ST
.hasSDWAScalar() && ConstantBusCount
== 0 && Op
.isReg() &&
1098 TRI
->isSGPRReg(*MRI
, Op
.getReg())) {
1103 unsigned VGPR
= MRI
->createVirtualRegister(&AMDGPU::VGPR_32RegClass
);
1104 auto Copy
= BuildMI(*MI
.getParent(), MI
.getIterator(), MI
.getDebugLoc(),
1105 TII
->get(AMDGPU::V_MOV_B32_e32
), VGPR
);
1107 Copy
.addImm(Op
.getImm());
1108 else if (Op
.isReg())
1109 Copy
.addReg(Op
.getReg(), Op
.isKill() ? RegState::Kill
: 0,
1111 Op
.ChangeToRegister(VGPR
, false);
// Pass entry point. Does nothing useful unless the subtarget has SDWA and the
// function is not skipped. For each basic block it: matches SDWA operands,
// groups them by the instruction they could be folded into (only if that
// instruction is convertible), converts those instructions, clears the
// per-iteration maps, and finally legalizes scalar operands of every newly
// converted instruction.
// NOTE(review): the enclosing do/while loop and the return statements are
// missing from this listing.
1115 bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction
&MF
) {
1116 const GCNSubtarget
&ST
= MF
.getSubtarget
<GCNSubtarget
>();
1118 if (!ST
.hasSDWA() || skipFunction(MF
.getFunction()))
1121 MRI
= &MF
.getRegInfo();
1122 TRI
= ST
.getRegisterInfo();
1123 TII
= ST
.getInstrInfo();
1125 // Find all SDWA operands in MF.
1127 for (MachineBasicBlock
&MBB
: MF
) {
1128 bool Changed
= false;
1130 matchSDWAOperands(MBB
);
1132 for (const auto &OperandPair
: SDWAOperands
) {
1133 const auto &Operand
= OperandPair
.second
;
1134 MachineInstr
*PotentialMI
= Operand
->potentialToConvert(TII
);
1135 if (PotentialMI
&& isConvertibleToSDWA(*PotentialMI
, ST
)) {
1136 PotentialMatches
[PotentialMI
].push_back(Operand
.get());
1140 for (auto &PotentialPair
: PotentialMatches
) {
1141 MachineInstr
&PotentialMI
= *PotentialPair
.first
;
1142 convertToSDWA(PotentialMI
, PotentialPair
.second
);
1145 PotentialMatches
.clear();
1146 SDWAOperands
.clear();
1148 Changed
= !ConvertedInstructions
.empty();
1152 while (!ConvertedInstructions
.empty())
1153 legalizeScalarOperands(*ConvertedInstructions
.pop_back_val(), ST
);