//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;

namespace {

enum DPP_CTRL {
  DPP_ROW_SR1 = 0x111,
  DPP_ROW_SR2 = 0x112,
  DPP_ROW_SR3 = 0x113,
  DPP_ROW_SR4 = 0x114,
  DPP_ROW_SR8 = 0x118,
  DPP_WF_SR1 = 0x138,
  DPP_ROW_BCAST15 = 0x142,
  DPP_ROW_BCAST31 = 0x143
};

struct ReplacementInfo {
  Instruction *I;
  Instruction::BinaryOps Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  bool HasDPP;
  bool IsPixelShader;

  void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                      unsigned ValIdx, bool ValDivergent) const;

  void setConvergent(CallInst *const CI) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  HasDPP = ST.hasDPP();
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  // Visit the function to collect the atomic operations we can optimize.
  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  Instruction::BinaryOps Op;

  switch (I.getOperation()) {
  default:
    return;
  case AtomicRMWInst::Add:
    Op = Instruction::Add;
    break;
  case AtomicRMWInst::Sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergent(I.getOperand(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  Instruction::BinaryOps Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = Instruction::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = Instruction::Sub;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergent(I.getOperand(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           Instruction::BinaryOps Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_icmp, {B.getInt32Ty()},
                        {B.getInt32(1), B.getInt32(0), B.getInt32(33)});
  setConvergent(Ballot);
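
  // Note: llvm.amdgcn.icmp with predicate 33 (ICMP_NE) compares 1 != 0 in
  // every active lane and packs the per-lane results into a 64-bit mask, so
  // the call above is effectively a ballot of the currently active lanes.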

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
  CallInst *const PartialMbcnt = B.CreateIntrinsic(
      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
                                            {ExtractHi, PartialMbcnt});

  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);
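
  // mbcnt.lo counts the set bits of the low half of the mask that sit below
  // our lane index, and mbcnt.hi does the same for the high half, so chaining
  // the two gives each lane the number of active lanes with a lower index.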

  Value *LaneOffset = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    Value *const Identity = B.getIntN(TyBitWidth, 0);

    // First we need to set all inactive invocations to 0, so that they can
    // correctly contribute to the final result.
    CallInst *const SetInactive =
        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
    setConvergent(SetInactive);

    CallInst *const FirstDPP =
        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
                          {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
    setConvergent(FirstDPP);
    NewV = FirstDPP;
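
    // Shifting the whole wavefront right by one lane (wave_shr:1) seeds the
    // scan: each lane now holds its left neighbour's value and lane 0 holds
    // the identity, which is what makes the scan below an exclusive one.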

    const unsigned Iters = 7;
    const unsigned DPPCtrl[Iters] = {
        DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
        DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
    const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
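
    // The row shifts build partial sums within each row of 16 lanes; the two
    // row broadcasts then carry the running total across row boundaries
    // (lane 15 of a row into the next row, lane 31 into the upper half of the
    // wave), and the row/bank masks limit each step to the lanes whose prefix
    // still needs that contribution.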

    // This loop performs an exclusive scan across the wavefront, with all lanes
    // active (by using the WWM intrinsic).
    for (unsigned Idx = 0; Idx < Iters; Idx++) {
      Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;
      CallInst *const DPP = B.CreateIntrinsic(
          Intrinsic::amdgcn_update_dpp, Ty,
          {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
           B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
      setConvergent(DPP);

      NewV = B.CreateBinOp(Op, NewV, DPP);
    }

    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
    NewV = B.CreateBinOp(Op, NewV, SetInactive);
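
    // At this point NewV held each lane's exclusive prefix; wrapping it in
    // llvm.amdgcn.wwm gives us this lane's offset into the final result, and
    // adding the lane's own (set_inactive'd) value back in turns the
    // exclusive scan into the inclusive one we hand to the atomic.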

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be the new value we provide
    // to the atomic operation.
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
      setConvergent(ReadLaneLo);
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
      setConvergent(ReadLaneHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
                                                   {}, {NewV, B.getInt32(63)});
      setConvergent(ReadLane);
      NewV = ReadLane;
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }
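
    // Note that lane 63 is hard-coded in the readlanes above (and the ballot
    // is 64 bits wide) because this pass assumes a 64-lane wavefront.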

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
    // Get the total number of active lanes we have by using popcount.
    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
    Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

    // Calculate the new value we will be contributing to the atomic operation
    // for the entire wavefront.
    NewV = B.CreateMul(V, CtpopCast);
    LaneOffset = B.CreateMul(V, MbcntCast);
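
    // When every lane contributes the same uniform value V, the wave's
    // combined contribution is just V * popcount(ballot), and this lane's
    // offset is V * (number of active lanes below it), so no scan is needed.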
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single_lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  // Create a PHI node to get our new atomic result into the exit block.
  PHINode *const PHI = B.CreatePHI(Ty, 2);
  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
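
  // Only the lowest active lane took the single_lane path, so only that lane
  // carries a meaningful PHI value; the undef incoming from entry is fine
  // because the readfirstlane broadcast below only ever reads the lowest
  // active lane.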

  // We need to broadcast the value from the lowest active lane (the first
  // lane) to all other lanes in the wavefront. We use an intrinsic for this,
  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
  Value *BroadcastI = nullptr;

  if (TyBitWidth == 64) {
    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
    Value *const ExtractHi =
        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
    CallInst *const ReadFirstLaneLo =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
    setConvergent(ReadFirstLaneLo);
    CallInst *const ReadFirstLaneHi =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
    setConvergent(ReadFirstLaneHi);
    Value *const PartialInsert = B.CreateInsertElement(
        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
    Value *const Insert =
        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
    BroadcastI = B.CreateBitCast(Insert, Ty);
  } else if (TyBitWidth == 32) {
    CallInst *const ReadFirstLane =
        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    setConvergent(ReadFirstLane);
    BroadcastI = ReadFirstLane;
  } else {
    llvm_unreachable("Unhandled atomic bit width");
  }

  // Now that we have the result of our single atomic operation, we need to
  // get our individual lane's slice of the result. We combine the lane offset
  // we previously calculated with the atomic result value broadcast from the
  // first lane to recover this lane's portion of the atomic result.
  Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);

  if (IsPixelShader) {
    // Need a final PHI to reconverge above the helper lane branch mask.
    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
    PHI->addIncoming(Result, I.getParent());
    I.replaceAllUsesWith(PHI);
  } else {
    // Replace the original atomic instruction with the new one.
    I.replaceAllUsesWith(Result);
  }

  // And delete the original.
  I.eraseFromParent();
}
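
// Helper that marks the cross-lane intrinsic calls we emit as convergent, so
// later passes cannot sink or hoist them across control flow that would
// change the set of active lanes they observe.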

void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}