//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
/// The atomic optimizer uses the following strategies to compute the scan and
/// reduced value over a group of active lanes:
/// 1. DPP -
///    This is the most efficient implementation for scan. DPP uses Whole Wave
///    Mode (WWM).
/// 2. Iterative -
///    An alternative implementation that iterates over all active lanes of the
///    wavefront using llvm.cttz and performs the scan using the readlane and
///    writelane intrinsics.
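///
/// As an illustrative sketch (not taken from any particular test case): if
/// every active lane of a wave executes `atomicrmw add ptr %p, i32 1`, the
/// pass can instead ballot the active lanes, have the single lowest active
/// lane issue one `atomicrmw add` of the lane count, and then let each lane
/// reconstruct its own old value from the broadcast result plus the number of
/// active lanes below it (via mbcnt).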
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;
using namespace llvm::AMDGPU;

namespace {

struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};

class AMDGPUAtomicOptimizer : public FunctionPass {
public:
  static char ID;
  ScanOptions ScanImpl;

  AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
      : FunctionPass(ID), ScanImpl(ScanImpl) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.addRequired<TargetPassConfig>();
  }
};

class AMDGPUAtomicOptimizerImpl
    : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const UniformityInfo *UA;
  const DataLayout *DL;
  DomTreeUpdater &DTU;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  ScanOptions ScanImpl;

  Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                        Value *const Identity) const;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  std::pair<Value *, Value *>
  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                       Value *const Identity, Value *V, Instruction &I,
                       BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;

  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  AMDGPUAtomicOptimizerImpl() = delete;

  AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
                            DomTreeUpdater &DTU, const GCNSubtarget *ST,
                            bool IsPixelShader, ScanOptions ScanImpl)
      : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
        ScanImpl(ScanImpl) {}

  bool run(Function &F);

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace

char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  const UniformityInfo *UA =
      &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  const DataLayout *DL = &F.getDataLayout();

  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
                     DomTreeUpdater::UpdateStrategy::Lazy);

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
      .run(F);
}

PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                 FunctionAnalysisManager &AM) {
  const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
  const DataLayout *DL = &F.getDataLayout();

  DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
                     DomTreeUpdater::UpdateStrategy::Lazy);
  const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);

  bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  bool IsChanged =
      AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
          .run(F);
  if (!IsChanged) {
    return PreservedAnalyses::all();
  }

  PreservedAnalyses PA;
  PA.preserve<DominatorTreeAnalysis>();
  return PA;
}

bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
  // Scan option None disables the pass.
  if (ScanImpl == ScanOptions::None) {
    return false;
  }

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}

static bool isLegalCrossLaneType(Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::IntegerTyID: {
    unsigned Size = Ty->getIntegerBitWidth();
    return (Size == 32 || Size == 64);
  }
  default:
    return false;
  }
}

void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
    break;
  }

  // Only 32 and 64 bit floating point atomic ops are supported.
  if (AtomicRMWInst::isFPOperation(Op) &&
      !(I.getType()->isFloatTy() || I.getType()->isDoubleTy())) {
    return;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
    return;
  }

  bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget (for the DPP strategy), and the
  // atomic operation is 32 or 64 bits.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
      return;

    if (!isLegalCrossLaneType(I.getType()))
      return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (UA->isDivergentUse(I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}

// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::FAdd:
    return B.CreateFAdd(LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::FSub:
    return B.CreateFSub(LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  case AtomicRMWInst::FMax:
    return B.CreateMaxNum(LHS, RHS);
  case AtomicRMWInst::FMin:
    return B.CreateMinNum(LHS, RHS);
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}

// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
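//
// As a rough sketch of the shape of the result (values chosen purely for
// illustration): the four ROW_XMASK steps form a butterfly within each row of
// 16 lanes, so a row holding 1, 2, ..., 16 ends with every lane of that row
// holding the row total (136 for an add). permlanex16 then combines the two
// rows of each 32-lane half, and on wave64 the final permlane64 or readlane
// step combines the two halves, so every lane observes the full wavefront
// reduction.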
Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                 AtomicRMWInst::BinOp Op,
                                                 Value *V,
                                                 Value *const Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  // Reduce within each row of 16 lanes.
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  // Reduce within each pair of rows (i.e. 32 lanes).
  assert(ST->hasPermLaneX16());
  Value *Permlanex16Call = B.CreateIntrinsic(
      V->getType(), Intrinsic::amdgcn_permlanex16,
      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
  if (ST->isWave32()) {
    return V;
  }

  if (ST->hasPermLane64()) {
    // Reduce across the upper and lower 32 lanes.
    Value *Permlane64Call =
        B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
  }

  // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
  // combine them with a scalar operation.
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
  Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
  Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}

// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
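//
// For intuition (illustrative example only): an inclusive add scan over lane
// values 1, 2, 3, 4, ... leaves 1, 3, 6, 10, ... in the corresponding lanes,
// so the last active lane ends up holding the full reduction.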
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                            AtomicRMWInst::BinOp Op, Value *V,
                                            Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);

  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }
  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    assert(ST->hasPermLaneX16());
    Value *PermX = B.CreateIntrinsic(
        V->getType(), Intrinsic::amdgcn_permlanex16,
        {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});

    Value *UpdateDPPCall = B.CreateCall(
        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);

    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateIntrinsic(
          V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});

      Value *UpdateDPPCall = B.CreateCall(
          UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});

      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
    }
  }
  return V;
}

// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
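//
// Continuing the illustrative add-scan example: the inclusive scan
// 1, 3, 6, 10 becomes the exclusive scan identity, 1, 3, 6 once every lane's
// value is shifted into the next higher lane.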
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                  Value *Identity) const {
  Type *AtomicTy = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    Function *ReadLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
    Function *WriteLane =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);

    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}

// Use the builder to create an exclusive scan and compute the final reduced
// value using an iterative approach. This provides an alternative to the DPP
// implementation, which uses WWM for its scan computations. This helper
// iterates over the active lanes to read, compute and update the value using
// the readlane and writelane intrinsics.
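//
// Rough walk-through (illustrative numbers only): with an active-lane mask of
// 0b1011 and lane values v0, v1, v3, the loop visits lane 0, then lane 1, then
// lane 3 (cttz of the remaining mask each time). Each iteration writes the
// running accumulator into the visited lane as its exclusive-scan value and
// then folds that lane's value in, so the loop exits with the accumulator
// holding v0+v1+v3 (for an add) and each visited lane holding the combined
// value of the active lanes before it.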
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
    Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
  auto *Ty = I.getType();
  auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
  auto *EntryBB = I.getParent();
  auto NeedResult = !I.use_empty();

  auto *Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // Start inserting instructions for the ComputeLoop block.
  B.SetInsertPoint(ComputeLoop);
  // Phi nodes for Accumulator, Scan results destination, and Active Lanes
  auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
  Accumulator->addIncoming(Identity, EntryBB);
  PHINode *OldValuePhi = nullptr;
  if (NeedResult) {
    OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
    OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
  }
  auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
  ActiveBits->addIncoming(Ballot, EntryBB);

  // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
  auto *FF1 =
      B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});

  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());

  // Get the value required for the atomic operation.
  Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                       {V, LaneIdxInt});

  // Perform writelane if intermediate scan results are required later in the
  // kernel computations.
  Value *OldValue = nullptr;
  if (NeedResult) {
    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
                                 {Accumulator, LaneIdxInt, OldValuePhi});
    OldValuePhi->addIncoming(OldValue, ComputeLoop);
  }

  // Accumulate the results.
  auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
  Accumulator->addIncoming(NewAccumulator, ComputeLoop);

  // Set the current active lane's bit to zero so that on the next iteration
  // llvm.cttz returns the next active lane.
  auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);

  auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
  auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
  ActiveBits->addIncoming(NewActiveBits, ComputeLoop);

  // Branch out of the loop when all lanes are processed.
  auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
  B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);

  B.SetInsertPoint(ComputeEnd);

  return {OldValue, NewAccumulator};
}

static Constant *getIdentityValueForAtomicOp(Type *const Ty,
                                             AtomicRMWInst::BinOp Op) {
  LLVMContext &C = Ty->getContext();
  const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return ConstantInt::get(C, APInt::getMinValue(BitWidth));
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
  case AtomicRMWInst::Max:
    return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
  case AtomicRMWInst::Min:
    return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
  case AtomicRMWInst::FAdd:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
  case AtomicRMWInst::FSub:
    return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    // FIXME: atomicrmw fmax/fmin behave like llvm.maxnum/minnum so NaN is the
    // closest thing they have to an identity, but it still does not preserve
    // the difference between quiet and signaling NaNs or NaNs with different
    // payloads.
    return ConstantFP::get(C, APFloat::getNaN(Ty->getFltSemantics()));
  }
}

static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
  const ConstantInt *CI = dyn_cast<ConstantInt>(LHS);
  return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}

void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
                                               AtomicRMWInst::BinOp Op,
                                               unsigned ValIdx,
                                               bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  if (AtomicRMWInst::isFPOperation(Op)) {
    B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
  }

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane
  // is live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  Type *Int32Ty = B.getInt32Ty();
  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
  [[maybe_unused]] const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);

  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot =
      B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
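  //
  // For example (illustrative only), with an active-lane ballot of
  // 0b...0001011, lanes 0, 1 and 3 are active; mbcnt then yields 0 for lane 0,
  // 1 for lane 1, and 2 for lane 3, i.e. each lane's rank among the active
  // lanes.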
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
    Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }

  Function *F = I.getFunction();
  LLVMContext &C = F->getContext();

  // For atomic sub, perform the scan with the add operation and allow one lane
  // to subtract the reduced value later.
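  //
  // For instance (numbers purely illustrative), three active lanes subtracting
  // 1, 2 and 3 are scanned as an add to produce the total 6; the single
  // elected lane then performs one atomic sub of 6, which is equivalent to the
  // three original subtractions.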
  AtomicRMWInst::BinOp ScanOp = Op;
  if (Op == AtomicRMWInst::Sub) {
    ScanOp = AtomicRMWInst::Add;
  } else if (Op == AtomicRMWInst::FSub) {
    ScanOp = AtomicRMWInst::FAdd;
  }
  Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);

  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  const bool NeedResult = !I.use_empty();

  BasicBlock *ComputeLoop = nullptr;
  BasicBlock *ComputeEnd = nullptr;
  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    if (ScanImpl == ScanOptions::DPP) {
      // First we need to set all inactive invocations to the identity value,
      // so that they can correctly contribute to the final result.
      NewV =
          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
      if (!NeedResult && ST->hasPermLaneX16()) {
        // On GFX10 the permlanex16 instruction helps us build a reduction
        // without too many readlanes and writelanes, which are generally bad
        // for performance.
        NewV = buildReduction(B, ScanOp, NewV, Identity);
      } else {
        NewV = buildScan(B, ScanOp, NewV, Identity);
        if (NeedResult)
          ExclScan = buildShiftRight(B, NewV, Identity);
        // Read the value from the last lane, which has accumulated the values
        // of each active lane in the wavefront. This will be our new value
        // which we will provide to the atomic operation.
        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
        NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                 {NewV, LastLaneIdx});
      }
      // Finally mark the readlanes in the WWM section.
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
    } else if (ScanImpl == ScanOptions::Iterative) {
      // Alternative implementation for the scan.
      ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
      ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
                                                      ComputeLoop, ComputeEnd);
    } else {
      llvm_unreachable("Atomic Optimizer is disabled for None strategy");
    }
  } else {
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
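      //
      // E.g. (illustrative): if the ballot has five active lanes all adding a
      // uniform V, the single atomic performs one add of 5 * V instead of five
      // separate adds.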
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, Ctpop);
      break;
    }
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub: {
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
      NewV = B.CreateFMul(V, CtpopFP);
      break;
    }
    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor:
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
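      //
      // This works because XOR-ing the same value an even number of times
      // cancels out: with N active lanes contributing a uniform V, the
      // combined contribution is V when N is odd and 0 when N is even.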
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = buildMul(B, V, B.CreateAnd(Ctpop, 1));
      break;
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));

  // Store I's original basic block before we split the block.
  BasicBlock *const OriginalBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);

  // At this point we have split I's block to allow one lane in the wavefront
  // to update the precomputed reduced value. We have also completed the
  // codegen for the new control flow, i.e. the iterative loop which performs
  // the reduction and scan using ComputeLoop and ComputeEnd.
  // For the new control flow, we need to move the branch instruction, i.e. the
  // terminator created during SplitBlockAndInsertIfThen, from I's block to the
  // ComputeEnd block. We also need to set up the predecessor of the next block
  // for when the single lane is done updating the final reduced value.
  BasicBlock *Predecessor = nullptr;
  if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
    // Move the terminator from I's block to the ComputeEnd block.
    //
    // OriginalBB is known to have a branch as terminator because
    // SplitBlockAndInsertIfThen will have inserted one.
    BranchInst *Terminator = cast<BranchInst>(OriginalBB->getTerminator());
    B.SetInsertPoint(ComputeEnd);
    Terminator->removeFromParent();
    B.Insert(Terminator);

    // Branch to the ComputeLoop block unconditionally from I's block for the
    // iterative approach.
    B.SetInsertPoint(OriginalBB);
    B.CreateBr(ComputeLoop);

    // Update the dominator tree for the new control flow.
    SmallVector<DominatorTree::UpdateType, 6> DomTreeUpdates(
        {{DominatorTree::Insert, OriginalBB, ComputeLoop},
         {DominatorTree::Insert, ComputeLoop, ComputeEnd}});

    // We're moving the terminator from EntryBB to ComputeEnd, make sure we move
    // the DT edges as well.
    for (auto *Succ : Terminator->successors()) {
      DomTreeUpdates.push_back({DominatorTree::Insert, ComputeEnd, Succ});
      DomTreeUpdates.push_back({DominatorTree::Delete, OriginalBB, Succ});
    }

    DTU.applyUpdates(DomTreeUpdates);

    Predecessor = ComputeEnd;
  } else {
    Predecessor = OriginalBB;
  }
  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into the single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);

  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value from the lowest active lane (the first
    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
    Value *BroadcastI = nullptr;
    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
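    //
    // Sketch for the add case (illustrative values): if the broadcast old
    // value is 100 and this lane's exclusive-scan offset is 7, the lane's own
    // "old value" for the original atomic is reconstructed as 100 + 7.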
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      if (ScanImpl == ScanOptions::DPP) {
        LaneOffset =
            B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
      } else if (ScanImpl == ScanOptions::Iterative) {
        LaneOffset = ExclScan;
      } else {
        llvm_unreachable("Atomic Optimizer is disabled for None strategy");
      }
    } else {
      Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
                                      : B.CreateIntCast(Mbcnt, Ty, false);

      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = buildMul(B, V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
      case AtomicRMWInst::FMin:
      case AtomicRMWInst::FMax:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
        break;
      case AtomicRMWInst::FAdd:
      case AtomicRMWInst::FSub: {
        LaneOffset = B.CreateFMul(V, Mbcnt);
        break;
      }
      }
    }
    Value *Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
    if (isAtomicFloatingPointTy) {
      // For fadd/fsub the first active lane of LaneOffset should be the
      // identity (-0.0 for fadd or +0.0 for fsub) but the value we calculated
      // is V * +0.0 which might have the wrong sign or might be nan (if V is
      // nan).
      //
      // For all floating point ops if the in-memory value was a nan then the
      // binop we just built might have quieted it or changed its payload.
      //
      // Correct all these problems by using BroadcastI as the result in the
      // first active lane.
      Result = B.CreateSelect(Cond, BroadcastI, Result);
    }

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}

INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
  return new AMDGPUAtomicOptimizer(ScanStrategy);
}