//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"
using namespace llvm;
using namespace llvm::AMDGPU;

namespace {
struct ReplacementInfo {
  Instruction *I;
  AtomicRMWInst::BinOp Op;
  unsigned ValIdx;
  bool ValDivergent;
};
class AMDGPUAtomicOptimizer : public FunctionPass,
                              public InstVisitor<AMDGPUAtomicOptimizer> {
private:
  SmallVector<ReplacementInfo, 8> ToReplace;
  const LegacyDivergenceAnalysis *DA;
  const DataLayout *DL;
  DominatorTree *DT;
  const GCNSubtarget *ST;
  bool IsPixelShader;
  Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                   Value *const Identity) const;
  Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
  void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                      bool ValDivergent) const;

public:
  static char ID;

  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.addRequired<TargetPassConfig>();
  }

  void visitAtomicRMWInst(AtomicRMWInst &I);
  void visitIntrinsicInst(IntrinsicInst &I);
};

} // namespace
char AMDGPUAtomicOptimizer::ID = 0;

char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;
bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
  if (skipFunction(F)) {
    return false;
  }

  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  DL = &F.getParent()->getDataLayout();
  DominatorTreeWrapperPass *const DTW =
      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  DT = DTW ? &DTW->getDomTree() : nullptr;
  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

  visit(F);

  const bool Changed = !ToReplace.empty();

  for (ReplacementInfo &Info : ToReplace) {
    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
  }

  ToReplace.clear();

  return Changed;
}
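
// InstVisitor callback: record atomicrmw instructions that this pass knows how
// to rewrite, so optimizeAtomic can process them after the visit completes.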
void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
  // Early exit for unhandled address space atomic instructions.
  switch (I.getPointerAddressSpace()) {
  default:
    return;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::LOCAL_ADDRESS:
    break;
  }

  AtomicRMWInst::BinOp Op = I.getOperation();

  switch (Op) {
  default:
    return;
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    break;
  }

  const unsigned PtrIdx = 0;
  const unsigned ValIdx = 1;

  // If the pointer operand is divergent, then each lane is doing an atomic
  // operation on a different address, and we cannot optimize that.
  if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
    return;
  }

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
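
// InstVisitor callback: record AMDGPU buffer atomic intrinsics whose operation
// maps onto one of the atomicrmw binary ops handled above.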
void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
  AtomicRMWInst::BinOp Op;

  switch (I.getIntrinsicID()) {
  default:
    return;
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
    Op = AtomicRMWInst::Add;
    break;
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
    Op = AtomicRMWInst::Sub;
    break;
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
    Op = AtomicRMWInst::And;
    break;
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
    Op = AtomicRMWInst::Or;
    break;
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
    Op = AtomicRMWInst::Xor;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
    Op = AtomicRMWInst::Min;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
    Op = AtomicRMWInst::UMin;
    break;
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
    Op = AtomicRMWInst::Max;
    break;
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
    Op = AtomicRMWInst::UMax;
    break;
  }

  const unsigned ValIdx = 0;

  const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

  // If the value operand is divergent, each lane is contributing a different
  // value to the atomic calculation. We can only optimize divergent values if
  // we have DPP available on our subtarget, and the atomic operation is 32
  // bits.
  if (ValDivergent &&
      (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
    return;
  }

  // If any of the other arguments to the intrinsic are divergent, we can't
  // optimize the operation.
  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
    if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
      return;
    }
  }

  // If we get here, we can optimize the atomic using a single wavefront-wide
  // atomic operation to do the calculation for the entire wavefront, so
  // remember the instruction so we can come back to it.
  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

  ToReplace.push_back(Info);
}
// Use the builder to create the non-atomic counterpart of the specified
// atomicrmw binary op.
static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                  Value *LHS, Value *RHS) {
  CmpInst::Predicate Pred;

  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
    return B.CreateBinOp(Instruction::Add, LHS, RHS);
  case AtomicRMWInst::Sub:
    return B.CreateBinOp(Instruction::Sub, LHS, RHS);
  case AtomicRMWInst::And:
    return B.CreateBinOp(Instruction::And, LHS, RHS);
  case AtomicRMWInst::Or:
    return B.CreateBinOp(Instruction::Or, LHS, RHS);
  case AtomicRMWInst::Xor:
    return B.CreateBinOp(Instruction::Xor, LHS, RHS);

  case AtomicRMWInst::Max:
    Pred = CmpInst::ICMP_SGT;
    break;
  case AtomicRMWInst::Min:
    Pred = CmpInst::ICMP_SLT;
    break;
  case AtomicRMWInst::UMax:
    Pred = CmpInst::ICMP_UGT;
    break;
  case AtomicRMWInst::UMin:
    Pred = CmpInst::ICMP_ULT;
    break;
  }
  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
  return B.CreateSelect(Cond, LHS, RHS);
}
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                                        Value *V, Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
  Function *PermLaneX16 =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
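
  // Combine each lane with the lanes 1, 2, 4 and 8 below it using row_shr DPP,
  // which yields an inclusive scan within each row of 16 lanes in log steps.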
  for (unsigned Idx = 0; Idx < 4; Idx++) {
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
  }

  if (ST->hasDPPBroadcasts()) {
    // GFX9 has DPP row broadcast operations.
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
                      B.getInt32(0xf), B.getFalse()}));
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
                      B.getInt32(0xf), B.getFalse()}));
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.

    // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
    // 48..63).
    Value *const PermX =
        B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
                                   B.getFalse(), B.getFalse()});
    V = buildNonAtomicBinOp(
        B, Op, V,
        B.CreateCall(UpdateDPP,
                     {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
                      B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
    if (!ST->isWave32()) {
      // Combine lane 31 into lanes 32..63.
      Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
      V = buildNonAtomicBinOp(
          B, Op, V,
          B.CreateCall(UpdateDPP,
                       {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                        B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
    }
  }

  return V;
}
// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                                              Value *const Identity) const {
  Type *const Ty = V->getType();
  Module *M = B.GetInsertBlock()->getModule();
  Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
  Function *ReadLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
  Function *WriteLane =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

  if (ST->hasDPPWavefrontShifts()) {
    // GFX9 has DPP wavefront shift operations.
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                      B.getInt32(0xf), B.getFalse()});
  } else {
    // On GFX10 all DPP operations are confined to a single row. To get cross-
    // row operations we have to use permlane or readlane.
    Value *const Old = V;
    V = B.CreateCall(UpdateDPP,
                     {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

    // Copy the old lane 15 to the new lane 16.
    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
                                 B.getInt32(16), V});

    if (!ST->isWave32()) {
      // Copy the old lane 31 to the new lane 32.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

      // Copy the old lane 47 to the new lane 48.
      V = B.CreateCall(
          WriteLane,
          {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
    }
  }

  return V;
}
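
// Returns the identity (neutral) element for the given atomic op at the given
// bit width: 0 for add/sub/or/xor/umax, all-ones for and/umin, the most
// negative value for signed max and the most positive value for signed min.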
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                         unsigned BitWidth) {
  switch (Op) {
  default:
    llvm_unreachable("Unhandled atomic op");
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::UMax:
    return APInt::getMinValue(BitWidth);
  case AtomicRMWInst::And:
  case AtomicRMWInst::UMin:
    return APInt::getMaxValue(BitWidth);
  case AtomicRMWInst::Max:
    return APInt::getSignedMinValue(BitWidth);
  case AtomicRMWInst::Min:
    return APInt::getSignedMaxValue(BitWidth);
  }
}
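
// Rewrite the atomic so that only a single lane issues it: combine the values
// of all active lanes, have the lowest active lane perform one wavefront-wide
// atomic with the combined value, then reconstruct every lane's own result
// from the broadcast old memory value and that lane's offset into the scan.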
void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
                                           AtomicRMWInst::BinOp Op,
                                           unsigned ValIdx,
                                           bool ValDivergent) const {
  // Start building just before the instruction.
  IRBuilder<> B(&I);

  // If we are in a pixel shader, because of how we have to mask out helper
  // lane invocations, we need to record the entry and exit BB's.
  BasicBlock *PixelEntryBB = nullptr;
  BasicBlock *PixelExitBB = nullptr;

  // If we're optimizing an atomic within a pixel shader, we need to wrap the
  // entire atomic operation in a helper-lane check. We do not want any helper
  // lanes that are around only for the purposes of derivatives to take part
  // in any cross-lane communication, and we use a branch on whether the lane is
  // live to do this.
  if (IsPixelShader) {
    // Record I's original position as the entry block.
    PixelEntryBB = I.getParent();

    Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
    Instruction *const NonHelperTerminator =
        SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

    // Record I's new position as the exit block.
    PixelExitBB = I.getParent();

    I.moveBefore(NonHelperTerminator);
    B.SetInsertPoint(&I);
  }

  Type *const Ty = I.getType();
  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
  // This is the value in the atomic operation we need to combine in order to
  // reduce the number of atomic operations.
  Value *const V = I.getOperand(ValIdx);

  // We need to know how many lanes are active within the wavefront, and we do
  // this by doing a ballot of active lanes.
  Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
  CallInst *const Ballot = B.CreateIntrinsic(
      Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
      {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
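  // The comparison 1 != 0 is true in every lane, so the icmp intrinsic simply
  // returns the execution mask of the wavefront as a WaveTy-sized integer.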

  // We need to know how many lanes are active within the wavefront that are
  // below us. If we counted each lane linearly starting from 0, a lane is
  // below us only if its associated index was less than ours. We do this by
  // using the mbcnt intrinsic.
  Value *Mbcnt;
  if (ST->isWave32()) {
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {Ballot, B.getInt32(0)});
  } else {
    Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
    Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
    Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
    Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
                              {ExtractLo, B.getInt32(0)});
    Mbcnt =
        B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
  }
  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
  Value *ExclScan = nullptr;
  Value *NewV = nullptr;

  // If we have a divergent value in each lane, we need to combine the value
  // using DPP.
  if (ValDivergent) {
    // First we need to set all inactive invocations to the identity value, so
    // that they can correctly contribute to the final result.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
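
    // Note that a subtraction is scanned as an addition: the scan only
    // accumulates the per-lane magnitudes, while the subtraction itself is
    // performed by the atomic and by the final combine below.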
    const AtomicRMWInst::BinOp ScanOp =
        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
    NewV = buildScan(B, ScanOp, NewV, Identity);
    ExclScan = buildShiftRight(B, NewV, Identity);

    // Read the value from the last lane, which has accumulated the values of
    // each active lane in the wavefront. This will be our new value which we
    // will provide to the atomic operation.
    Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
      CallInst *const ReadLaneLo = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
      CallInst *const ReadLaneHi = B.CreateIntrinsic(
          Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
      NewV = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
                               {NewV, LastLaneIdx});
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Finally mark the readlanes in the WWM section.
    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
  } else {
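    // The value is uniform across the wavefront, so the combined contribution
    // of all active lanes can be computed directly from the lane count in the
    // ballot, without any cross-lane scan.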
    switch (Op) {
    default:
      llvm_unreachable("Unhandled atomic op");

    case AtomicRMWInst::Add:
    case AtomicRMWInst::Sub: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, Ctpop);
      break;
    }

    case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
      break;

    case AtomicRMWInst::Xor: {
      // The new value we will be contributing to the atomic operation is the
      // old value times the parity of the number of active lanes.
      Value *const Ctpop = B.CreateIntCast(
          B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false);
      NewV = B.CreateMul(V, B.CreateAnd(Ctpop, 1));
      break;
    }
    }
  }

  // We only want a single lane to enter our new control flow, and we do this
  // by checking if there are any active lanes below us. Only one lane will
  // have 0 active lanes below us, so that will be the only one to progress.
  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));

  // Store I's original basic block before we split the block.
  BasicBlock *const EntryBB = I.getParent();

  // We need to introduce some new control flow to force a single lane to be
  // active. We do this by splitting I's basic block at I, and introducing the
  // new block such that:
  // entry --> single_lane -\
  //       \------------------> exit
  Instruction *const SingleLaneTerminator =
      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

  // Move the IR builder into single_lane next.
  B.SetInsertPoint(SingleLaneTerminator);

  // Clone the original atomic operation into single lane, replacing the
  // original value with our newly created one.
  Instruction *const NewI = I.clone();
  B.Insert(NewI);
  NewI->setOperand(ValIdx, NewV);

  // Move the IR builder into exit next, and start inserting just before the
  // original instruction.
  B.SetInsertPoint(&I);
  const bool NeedResult = !I.use_empty();
  if (NeedResult) {
    // Create a PHI node to get our new atomic result into the exit block.
    PHINode *const PHI = B.CreatePHI(Ty, 2);
    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

    // We need to broadcast the value returned by the atomic in the lowest
    // active lane (the first lane) to all other lanes in the wavefront. We use
    // an intrinsic for this, but have to handle 64-bit broadcasts with two
    // calls to this intrinsic.
    Value *BroadcastI = nullptr;

    if (TyBitWidth == 64) {
      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
      Value *const ExtractHi =
          B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
      CallInst *const ReadFirstLaneLo =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
      CallInst *const ReadFirstLaneHi =
          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
      Value *const PartialInsert = B.CreateInsertElement(
          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
      Value *const Insert =
          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
      BroadcastI = B.CreateBitCast(Insert, Ty);
    } else if (TyBitWidth == 32) {
      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
    } else {
      llvm_unreachable("Unhandled atomic bit width");
    }

    // Now that we have the result of our single atomic operation, we need to
    // get our individual lane's slice into the result. We use the lane offset
    // we previously calculated combined with the atomic result value we got
    // from the first lane, to get our lane's index into the atomic result.
    Value *LaneOffset = nullptr;
    if (ValDivergent) {
      LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
    } else {
      switch (Op) {
      default:
        llvm_unreachable("Unhandled atomic op");
      case AtomicRMWInst::Add:
      case AtomicRMWInst::Sub:
        LaneOffset = B.CreateMul(V, Mbcnt);
        break;
      case AtomicRMWInst::And:
      case AtomicRMWInst::Or:
      case AtomicRMWInst::Max:
      case AtomicRMWInst::Min:
      case AtomicRMWInst::UMax:
      case AtomicRMWInst::UMin:
        LaneOffset = B.CreateSelect(Cond, Identity, V);
        break;
      case AtomicRMWInst::Xor:
        LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
        break;
      }
    }
    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

    if (IsPixelShader) {
      // Need a final PHI to reconverge to above the helper lane branch mask.
      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

      PHINode *const PHI = B.CreatePHI(Ty, 2);
      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
      PHI->addIncoming(Result, I.getParent());
      I.replaceAllUsesWith(PHI);
    } else {
      // Replace the original atomic instruction with the new one.
      I.replaceAllUsesWith(Result);
    }
  }

  // And delete the original.
  I.eraseFromParent();
}
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                      "AMDGPU atomic optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                    "AMDGPU atomic optimizations", false, false)

FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
  return new AMDGPUAtomicOptimizer();
}