1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
12 //===----------------------------------------------------------------------===//
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/Attr.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/OpenMPClause.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 #include "llvm/Support/MathExtras.h"
26 using namespace clang
;
27 using namespace CodeGen
;
28 using namespace llvm::omp
;
31 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32 class NVPTXActionTy final
: public PrePostActionTy
{
33 llvm::FunctionCallee EnterCallee
= nullptr;
34 ArrayRef
<llvm::Value
*> EnterArgs
;
35 llvm::FunctionCallee ExitCallee
= nullptr;
36 ArrayRef
<llvm::Value
*> ExitArgs
;
37 bool Conditional
= false;
38 llvm::BasicBlock
*ContBlock
= nullptr;
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee
,
42 ArrayRef
<llvm::Value
*> EnterArgs
,
43 llvm::FunctionCallee ExitCallee
,
44 ArrayRef
<llvm::Value
*> ExitArgs
, bool Conditional
= false)
45 : EnterCallee(EnterCallee
), EnterArgs(EnterArgs
), ExitCallee(ExitCallee
),
46 ExitArgs(ExitArgs
), Conditional(Conditional
) {}
47 void Enter(CodeGenFunction
&CGF
) override
{
48 llvm::Value
*EnterRes
= CGF
.EmitRuntimeCall(EnterCallee
, EnterArgs
);
50 llvm::Value
*CallBool
= CGF
.Builder
.CreateIsNotNull(EnterRes
);
51 auto *ThenBlock
= CGF
.createBasicBlock("omp_if.then");
52 ContBlock
= CGF
.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF
.Builder
.CreateCondBr(CallBool
, ThenBlock
, ContBlock
);
55 CGF
.EmitBlock(ThenBlock
);
58 void Done(CodeGenFunction
&CGF
) {
59 // Emit the rest of blocks/branches
60 CGF
.EmitBranch(ContBlock
);
61 CGF
.EmitBlock(ContBlock
, true);
63 void Exit(CodeGenFunction
&CGF
) override
{
64 CGF
.EmitRuntimeCall(ExitCallee
, ExitArgs
);
68 /// A class to track the execution mode when codegening directives within
69 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70 /// to the target region and used by containing directives such as 'parallel'
71 /// to emit optimized code.
72 class ExecutionRuntimeModesRAII
{
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode
=
75 CGOpenMPRuntimeGPU::EM_Unknown
;
76 CGOpenMPRuntimeGPU::ExecutionMode
&ExecMode
;
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode
&ExecMode
,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode
)
81 : ExecMode(ExecMode
) {
82 SavedExecMode
= ExecMode
;
85 ~ExecutionRuntimeModesRAII() { ExecMode
= SavedExecMode
; }
88 /// GPU Configuration: This information can be derived from cuda registers,
89 /// however, providing compile time constants helps generate more efficient
90 /// code. For all practical purposes this is fine because the configuration
91 /// is the same for all known NVPTX architectures.
92 enum MachineConfiguration
: unsigned {
93 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
94 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
96 /// Global memory alignment for performance.
97 GlobalMemoryAlignment
= 128,
100 static const ValueDecl
*getPrivateItem(const Expr
*RefExpr
) {
101 RefExpr
= RefExpr
->IgnoreParens();
102 if (const auto *ASE
= dyn_cast
<ArraySubscriptExpr
>(RefExpr
)) {
103 const Expr
*Base
= ASE
->getBase()->IgnoreParenImpCasts();
104 while (const auto *TempASE
= dyn_cast
<ArraySubscriptExpr
>(Base
))
105 Base
= TempASE
->getBase()->IgnoreParenImpCasts();
107 } else if (auto *OASE
= dyn_cast
<OMPArraySectionExpr
>(RefExpr
)) {
108 const Expr
*Base
= OASE
->getBase()->IgnoreParenImpCasts();
109 while (const auto *TempOASE
= dyn_cast
<OMPArraySectionExpr
>(Base
))
110 Base
= TempOASE
->getBase()->IgnoreParenImpCasts();
111 while (const auto *TempASE
= dyn_cast
<ArraySubscriptExpr
>(Base
))
112 Base
= TempASE
->getBase()->IgnoreParenImpCasts();
115 RefExpr
= RefExpr
->IgnoreParenImpCasts();
116 if (const auto *DE
= dyn_cast
<DeclRefExpr
>(RefExpr
))
117 return cast
<ValueDecl
>(DE
->getDecl()->getCanonicalDecl());
118 const auto *ME
= cast
<MemberExpr
>(RefExpr
);
119 return cast
<ValueDecl
>(ME
->getMemberDecl()->getCanonicalDecl());
123 static RecordDecl
*buildRecordForGlobalizedVars(
124 ASTContext
&C
, ArrayRef
<const ValueDecl
*> EscapedDecls
,
125 ArrayRef
<const ValueDecl
*> EscapedDeclsForTeams
,
126 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
127 &MappedDeclsFields
, int BufSize
) {
128 using VarsDataTy
= std::pair
<CharUnits
/*Align*/, const ValueDecl
*>;
129 if (EscapedDecls
.empty() && EscapedDeclsForTeams
.empty())
131 SmallVector
<VarsDataTy
, 4> GlobalizedVars
;
132 for (const ValueDecl
*D
: EscapedDecls
)
133 GlobalizedVars
.emplace_back(
134 CharUnits::fromQuantity(std::max(
135 C
.getDeclAlign(D
).getQuantity(),
136 static_cast<CharUnits::QuantityType
>(GlobalMemoryAlignment
))),
138 for (const ValueDecl
*D
: EscapedDeclsForTeams
)
139 GlobalizedVars
.emplace_back(C
.getDeclAlign(D
), D
);
140 llvm::stable_sort(GlobalizedVars
, [](VarsDataTy L
, VarsDataTy R
) {
141 return L
.first
> R
.first
;
144 // Build struct _globalized_locals_ty {
145 // /* globalized vars */[WarSize] align (max(decl_align,
146 // GlobalMemoryAlignment))
147 // /* globalized vars */ for EscapedDeclsForTeams
149 RecordDecl
*GlobalizedRD
= C
.buildImplicitRecord("_globalized_locals_ty");
150 GlobalizedRD
->startDefinition();
151 llvm::SmallPtrSet
<const ValueDecl
*, 16> SingleEscaped(
152 EscapedDeclsForTeams
.begin(), EscapedDeclsForTeams
.end());
153 for (const auto &Pair
: GlobalizedVars
) {
154 const ValueDecl
*VD
= Pair
.second
;
155 QualType Type
= VD
->getType();
156 if (Type
->isLValueReferenceType())
157 Type
= C
.getPointerType(Type
.getNonReferenceType());
159 Type
= Type
.getNonReferenceType();
160 SourceLocation Loc
= VD
->getLocation();
162 if (SingleEscaped
.count(VD
)) {
163 Field
= FieldDecl::Create(
164 C
, GlobalizedRD
, Loc
, Loc
, VD
->getIdentifier(), Type
,
165 C
.getTrivialTypeSourceInfo(Type
, SourceLocation()),
166 /*BW=*/nullptr, /*Mutable=*/false,
167 /*InitStyle=*/ICIS_NoInit
);
168 Field
->setAccess(AS_public
);
169 if (VD
->hasAttrs()) {
170 for (specific_attr_iterator
<AlignedAttr
> I(VD
->getAttrs().begin()),
171 E(VD
->getAttrs().end());
176 llvm::APInt
ArraySize(32, BufSize
);
177 Type
= C
.getConstantArrayType(Type
, ArraySize
, nullptr,
178 ArraySizeModifier::Normal
, 0);
179 Field
= FieldDecl::Create(
180 C
, GlobalizedRD
, Loc
, Loc
, VD
->getIdentifier(), Type
,
181 C
.getTrivialTypeSourceInfo(Type
, SourceLocation()),
182 /*BW=*/nullptr, /*Mutable=*/false,
183 /*InitStyle=*/ICIS_NoInit
);
184 Field
->setAccess(AS_public
);
185 llvm::APInt
Align(32, std::max(C
.getDeclAlign(VD
).getQuantity(),
186 static_cast<CharUnits::QuantityType
>(
187 GlobalMemoryAlignment
)));
188 Field
->addAttr(AlignedAttr::CreateImplicit(
189 C
, /*IsAlignmentExpr=*/true,
190 IntegerLiteral::Create(C
, Align
,
191 C
.getIntTypeForBitwidth(32, /*Signed=*/0),
193 {}, AlignedAttr::GNU_aligned
));
195 GlobalizedRD
->addDecl(Field
);
196 MappedDeclsFields
.try_emplace(VD
, Field
);
198 GlobalizedRD
->completeDefinition();
202 /// Get the list of variables that can escape their declaration context.
203 class CheckVarsEscapingDeclContext final
204 : public ConstStmtVisitor
<CheckVarsEscapingDeclContext
> {
205 CodeGenFunction
&CGF
;
206 llvm::SetVector
<const ValueDecl
*> EscapedDecls
;
207 llvm::SetVector
<const ValueDecl
*> EscapedVariableLengthDecls
;
208 llvm::SetVector
<const ValueDecl
*> DelayedVariableLengthDecls
;
209 llvm::SmallPtrSet
<const Decl
*, 4> EscapedParameters
;
210 RecordDecl
*GlobalizedRD
= nullptr;
211 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*> MappedDeclsFields
;
212 bool AllEscaped
= false;
213 bool IsForCombinedParallelRegion
= false;
215 void markAsEscaped(const ValueDecl
*VD
) {
216 // Do not globalize declare target variables.
217 if (!isa
<VarDecl
>(VD
) ||
218 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD
))
220 VD
= cast
<ValueDecl
>(VD
->getCanonicalDecl());
221 // Use user-specified allocation.
222 if (VD
->hasAttrs() && VD
->hasAttr
<OMPAllocateDeclAttr
>())
224 // Variables captured by value must be globalized.
225 bool IsCaptured
= false;
226 if (auto *CSI
= CGF
.CapturedStmtInfo
) {
227 if (const FieldDecl
*FD
= CSI
->lookup(cast
<VarDecl
>(VD
))) {
228 // Check if need to capture the variable that was already captured by
229 // value in the outer region.
231 if (!IsForCombinedParallelRegion
) {
234 const auto *Attr
= FD
->getAttr
<OMPCaptureKindAttr
>();
237 if (((Attr
->getCaptureKind() != OMPC_map
) &&
238 !isOpenMPPrivate(Attr
->getCaptureKind())) ||
239 ((Attr
->getCaptureKind() == OMPC_map
) &&
240 !FD
->getType()->isAnyPointerType()))
243 if (!FD
->getType()->isReferenceType()) {
244 assert(!VD
->getType()->isVariablyModifiedType() &&
245 "Parameter captured by value with variably modified type");
246 EscapedParameters
.insert(VD
);
247 } else if (!IsForCombinedParallelRegion
) {
252 if ((!CGF
.CapturedStmtInfo
||
253 (IsForCombinedParallelRegion
&& CGF
.CapturedStmtInfo
)) &&
254 VD
->getType()->isReferenceType())
255 // Do not globalize variables with reference type.
257 if (VD
->getType()->isVariablyModifiedType()) {
258 // If not captured at the target region level then mark the escaped
259 // variable as delayed.
261 EscapedVariableLengthDecls
.insert(VD
);
263 DelayedVariableLengthDecls
.insert(VD
);
265 EscapedDecls
.insert(VD
);
268 void VisitValueDecl(const ValueDecl
*VD
) {
269 if (VD
->getType()->isLValueReferenceType())
271 if (const auto *VarD
= dyn_cast
<VarDecl
>(VD
)) {
272 if (!isa
<ParmVarDecl
>(VarD
) && VarD
->hasInit()) {
273 const bool SavedAllEscaped
= AllEscaped
;
274 AllEscaped
= VD
->getType()->isLValueReferenceType();
275 Visit(VarD
->getInit());
276 AllEscaped
= SavedAllEscaped
;
280 void VisitOpenMPCapturedStmt(const CapturedStmt
*S
,
281 ArrayRef
<OMPClause
*> Clauses
,
282 bool IsCombinedParallelRegion
) {
285 for (const CapturedStmt::Capture
&C
: S
->captures()) {
286 if (C
.capturesVariable() && !C
.capturesVariableByCopy()) {
287 const ValueDecl
*VD
= C
.getCapturedVar();
288 bool SavedIsForCombinedParallelRegion
= IsForCombinedParallelRegion
;
289 if (IsCombinedParallelRegion
) {
290 // Check if the variable is privatized in the combined construct and
291 // those private copies must be shared in the inner parallel
293 IsForCombinedParallelRegion
= false;
294 for (const OMPClause
*C
: Clauses
) {
295 if (!isOpenMPPrivate(C
->getClauseKind()) ||
296 C
->getClauseKind() == OMPC_reduction
||
297 C
->getClauseKind() == OMPC_linear
||
298 C
->getClauseKind() == OMPC_private
)
300 ArrayRef
<const Expr
*> Vars
;
301 if (const auto *PC
= dyn_cast
<OMPFirstprivateClause
>(C
))
302 Vars
= PC
->getVarRefs();
303 else if (const auto *PC
= dyn_cast
<OMPLastprivateClause
>(C
))
304 Vars
= PC
->getVarRefs();
306 llvm_unreachable("Unexpected clause.");
307 for (const auto *E
: Vars
) {
309 cast
<DeclRefExpr
>(E
)->getDecl()->getCanonicalDecl();
310 if (D
== VD
->getCanonicalDecl()) {
311 IsForCombinedParallelRegion
= true;
315 if (IsForCombinedParallelRegion
)
320 if (isa
<OMPCapturedExprDecl
>(VD
))
322 IsForCombinedParallelRegion
= SavedIsForCombinedParallelRegion
;
327 void buildRecordForGlobalizedVars(bool IsInTTDRegion
) {
328 assert(!GlobalizedRD
&&
329 "Record for globalized variables is built already.");
330 ArrayRef
<const ValueDecl
*> EscapedDeclsForParallel
, EscapedDeclsForTeams
;
331 unsigned WarpSize
= CGF
.getTarget().getGridValue().GV_Warp_Size
;
333 EscapedDeclsForTeams
= EscapedDecls
.getArrayRef();
335 EscapedDeclsForParallel
= EscapedDecls
.getArrayRef();
336 GlobalizedRD
= ::buildRecordForGlobalizedVars(
337 CGF
.getContext(), EscapedDeclsForParallel
, EscapedDeclsForTeams
,
338 MappedDeclsFields
, WarpSize
);
342 CheckVarsEscapingDeclContext(CodeGenFunction
&CGF
,
343 ArrayRef
<const ValueDecl
*> TeamsReductions
)
344 : CGF(CGF
), EscapedDecls(TeamsReductions
.begin(), TeamsReductions
.end()) {
346 virtual ~CheckVarsEscapingDeclContext() = default;
347 void VisitDeclStmt(const DeclStmt
*S
) {
350 for (const Decl
*D
: S
->decls())
351 if (const auto *VD
= dyn_cast_or_null
<ValueDecl
>(D
))
354 void VisitOMPExecutableDirective(const OMPExecutableDirective
*D
) {
357 if (!D
->hasAssociatedStmt())
360 dyn_cast_or_null
<CapturedStmt
>(D
->getAssociatedStmt())) {
361 // Do not analyze directives that do not actually require capturing,
362 // like `omp for` or `omp simd` directives.
363 llvm::SmallVector
<OpenMPDirectiveKind
, 4> CaptureRegions
;
364 getOpenMPCaptureRegions(CaptureRegions
, D
->getDirectiveKind());
365 if (CaptureRegions
.size() == 1 && CaptureRegions
.back() == OMPD_unknown
) {
366 VisitStmt(S
->getCapturedStmt());
369 VisitOpenMPCapturedStmt(
371 CaptureRegions
.back() == OMPD_parallel
&&
372 isOpenMPDistributeDirective(D
->getDirectiveKind()));
375 void VisitCapturedStmt(const CapturedStmt
*S
) {
378 for (const CapturedStmt::Capture
&C
: S
->captures()) {
379 if (C
.capturesVariable() && !C
.capturesVariableByCopy()) {
380 const ValueDecl
*VD
= C
.getCapturedVar();
382 if (isa
<OMPCapturedExprDecl
>(VD
))
387 void VisitLambdaExpr(const LambdaExpr
*E
) {
390 for (const LambdaCapture
&C
: E
->captures()) {
391 if (C
.capturesVariable()) {
392 if (C
.getCaptureKind() == LCK_ByRef
) {
393 const ValueDecl
*VD
= C
.getCapturedVar();
395 if (E
->isInitCapture(&C
) || isa
<OMPCapturedExprDecl
>(VD
))
401 void VisitBlockExpr(const BlockExpr
*E
) {
404 for (const BlockDecl::Capture
&C
: E
->getBlockDecl()->captures()) {
406 const VarDecl
*VD
= C
.getVariable();
408 if (isa
<OMPCapturedExprDecl
>(VD
) || VD
->isInitCapture())
413 void VisitCallExpr(const CallExpr
*E
) {
416 for (const Expr
*Arg
: E
->arguments()) {
419 if (Arg
->isLValue()) {
420 const bool SavedAllEscaped
= AllEscaped
;
423 AllEscaped
= SavedAllEscaped
;
428 Visit(E
->getCallee());
430 void VisitDeclRefExpr(const DeclRefExpr
*E
) {
433 const ValueDecl
*VD
= E
->getDecl();
436 if (isa
<OMPCapturedExprDecl
>(VD
))
438 else if (VD
->isInitCapture())
441 void VisitUnaryOperator(const UnaryOperator
*E
) {
444 if (E
->getOpcode() == UO_AddrOf
) {
445 const bool SavedAllEscaped
= AllEscaped
;
447 Visit(E
->getSubExpr());
448 AllEscaped
= SavedAllEscaped
;
450 Visit(E
->getSubExpr());
453 void VisitImplicitCastExpr(const ImplicitCastExpr
*E
) {
456 if (E
->getCastKind() == CK_ArrayToPointerDecay
) {
457 const bool SavedAllEscaped
= AllEscaped
;
459 Visit(E
->getSubExpr());
460 AllEscaped
= SavedAllEscaped
;
462 Visit(E
->getSubExpr());
465 void VisitExpr(const Expr
*E
) {
468 bool SavedAllEscaped
= AllEscaped
;
471 for (const Stmt
*Child
: E
->children())
474 AllEscaped
= SavedAllEscaped
;
476 void VisitStmt(const Stmt
*S
) {
479 for (const Stmt
*Child
: S
->children())
484 /// Returns the record that handles all the escaped local variables and used
485 /// instead of their original storage.
486 const RecordDecl
*getGlobalizedRecord(bool IsInTTDRegion
) {
488 buildRecordForGlobalizedVars(IsInTTDRegion
);
492 /// Returns the field in the globalized record for the escaped variable.
493 const FieldDecl
*getFieldForGlobalizedVar(const ValueDecl
*VD
) const {
494 assert(GlobalizedRD
&&
495 "Record for globalized variables must be generated already.");
496 return MappedDeclsFields
.lookup(VD
);
499 /// Returns the list of the escaped local variables/parameters.
500 ArrayRef
<const ValueDecl
*> getEscapedDecls() const {
501 return EscapedDecls
.getArrayRef();
504 /// Checks if the escaped local variable is actually a parameter passed by
506 const llvm::SmallPtrSetImpl
<const Decl
*> &getEscapedParameters() const {
507 return EscapedParameters
;
510 /// Returns the list of the escaped variables with the variably modified
512 ArrayRef
<const ValueDecl
*> getEscapedVariableLengthDecls() const {
513 return EscapedVariableLengthDecls
.getArrayRef();
516 /// Returns the list of the delayed variables with the variably modified
518 ArrayRef
<const ValueDecl
*> getDelayedVariableLengthDecls() const {
519 return DelayedVariableLengthDecls
.getArrayRef();
522 } // anonymous namespace
524 /// Get the id of the warp in the block.
525 /// We assume that the warp size is 32, which is always the case
526 /// on the NVPTX device, to generate more efficient code.
527 static llvm::Value
*getNVPTXWarpID(CodeGenFunction
&CGF
) {
528 CGBuilderTy
&Bld
= CGF
.Builder
;
529 unsigned LaneIDBits
=
530 llvm::Log2_32(CGF
.getTarget().getGridValue().GV_Warp_Size
);
531 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
532 return Bld
.CreateAShr(RT
.getGPUThreadID(CGF
), LaneIDBits
, "nvptx_warp_id");
535 /// Get the id of the current lane in the Warp.
536 /// We assume that the warp size is 32, which is always the case
537 /// on the NVPTX device, to generate more efficient code.
538 static llvm::Value
*getNVPTXLaneID(CodeGenFunction
&CGF
) {
539 CGBuilderTy
&Bld
= CGF
.Builder
;
540 unsigned LaneIDBits
=
541 llvm::Log2_32(CGF
.getTarget().getGridValue().GV_Warp_Size
);
542 assert(LaneIDBits
< 32 && "Invalid LaneIDBits size in NVPTX device.");
543 unsigned LaneIDMask
= ~0u >> (32u - LaneIDBits
);
544 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
545 return Bld
.CreateAnd(RT
.getGPUThreadID(CGF
), Bld
.getInt32(LaneIDMask
),
549 CGOpenMPRuntimeGPU::ExecutionMode
550 CGOpenMPRuntimeGPU::getExecutionMode() const {
551 return CurrentExecutionMode
;
554 CGOpenMPRuntimeGPU::DataSharingMode
555 CGOpenMPRuntimeGPU::getDataSharingMode() const {
556 return CurrentDataSharingMode
;
559 /// Check for inner (nested) SPMD construct, if any
560 static bool hasNestedSPMDDirective(ASTContext
&Ctx
,
561 const OMPExecutableDirective
&D
) {
562 const auto *CS
= D
.getInnermostCapturedStmt();
564 CS
->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
565 const Stmt
*ChildStmt
= CGOpenMPRuntime::getSingleCompoundChild(Ctx
, Body
);
567 if (const auto *NestedDir
=
568 dyn_cast_or_null
<OMPExecutableDirective
>(ChildStmt
)) {
569 OpenMPDirectiveKind DKind
= NestedDir
->getDirectiveKind();
570 switch (D
.getDirectiveKind()) {
572 if (isOpenMPParallelDirective(DKind
))
574 if (DKind
== OMPD_teams
) {
575 Body
= NestedDir
->getInnermostCapturedStmt()->IgnoreContainers(
576 /*IgnoreCaptured=*/true);
579 ChildStmt
= CGOpenMPRuntime::getSingleCompoundChild(Ctx
, Body
);
580 if (const auto *NND
=
581 dyn_cast_or_null
<OMPExecutableDirective
>(ChildStmt
)) {
582 DKind
= NND
->getDirectiveKind();
583 if (isOpenMPParallelDirective(DKind
))
588 case OMPD_target_teams
:
589 return isOpenMPParallelDirective(DKind
);
590 case OMPD_target_simd
:
591 case OMPD_target_parallel
:
592 case OMPD_target_parallel_for
:
593 case OMPD_target_parallel_for_simd
:
594 case OMPD_target_teams_distribute
:
595 case OMPD_target_teams_distribute_simd
:
596 case OMPD_target_teams_distribute_parallel_for
:
597 case OMPD_target_teams_distribute_parallel_for_simd
:
600 case OMPD_parallel_for
:
601 case OMPD_parallel_master
:
602 case OMPD_parallel_sections
:
604 case OMPD_parallel_for_simd
:
606 case OMPD_cancellation_point
:
608 case OMPD_threadprivate
:
626 case OMPD_target_data
:
627 case OMPD_target_exit_data
:
628 case OMPD_target_enter_data
:
629 case OMPD_distribute
:
630 case OMPD_distribute_simd
:
631 case OMPD_distribute_parallel_for
:
632 case OMPD_distribute_parallel_for_simd
:
633 case OMPD_teams_distribute
:
634 case OMPD_teams_distribute_simd
:
635 case OMPD_teams_distribute_parallel_for
:
636 case OMPD_teams_distribute_parallel_for_simd
:
637 case OMPD_target_update
:
638 case OMPD_declare_simd
:
639 case OMPD_declare_variant
:
640 case OMPD_begin_declare_variant
:
641 case OMPD_end_declare_variant
:
642 case OMPD_declare_target
:
643 case OMPD_end_declare_target
:
644 case OMPD_declare_reduction
:
645 case OMPD_declare_mapper
:
647 case OMPD_taskloop_simd
:
648 case OMPD_master_taskloop
:
649 case OMPD_master_taskloop_simd
:
650 case OMPD_parallel_master_taskloop
:
651 case OMPD_parallel_master_taskloop_simd
:
655 llvm_unreachable("Unexpected directive.");
662 static bool supportsSPMDExecutionMode(ASTContext
&Ctx
,
663 const OMPExecutableDirective
&D
) {
664 OpenMPDirectiveKind DirectiveKind
= D
.getDirectiveKind();
665 switch (DirectiveKind
) {
667 case OMPD_target_teams
:
668 return hasNestedSPMDDirective(Ctx
, D
);
669 case OMPD_target_teams_loop
:
670 case OMPD_target_parallel_loop
:
671 case OMPD_target_parallel
:
672 case OMPD_target_parallel_for
:
673 case OMPD_target_parallel_for_simd
:
674 case OMPD_target_teams_distribute_parallel_for
:
675 case OMPD_target_teams_distribute_parallel_for_simd
:
676 case OMPD_target_simd
:
677 case OMPD_target_teams_distribute_simd
:
679 case OMPD_target_teams_distribute
:
683 case OMPD_parallel_for
:
684 case OMPD_parallel_master
:
685 case OMPD_parallel_sections
:
687 case OMPD_parallel_for_simd
:
689 case OMPD_cancellation_point
:
691 case OMPD_threadprivate
:
709 case OMPD_target_data
:
710 case OMPD_target_exit_data
:
711 case OMPD_target_enter_data
:
712 case OMPD_distribute
:
713 case OMPD_distribute_simd
:
714 case OMPD_distribute_parallel_for
:
715 case OMPD_distribute_parallel_for_simd
:
716 case OMPD_teams_distribute
:
717 case OMPD_teams_distribute_simd
:
718 case OMPD_teams_distribute_parallel_for
:
719 case OMPD_teams_distribute_parallel_for_simd
:
720 case OMPD_target_update
:
721 case OMPD_declare_simd
:
722 case OMPD_declare_variant
:
723 case OMPD_begin_declare_variant
:
724 case OMPD_end_declare_variant
:
725 case OMPD_declare_target
:
726 case OMPD_end_declare_target
:
727 case OMPD_declare_reduction
:
728 case OMPD_declare_mapper
:
730 case OMPD_taskloop_simd
:
731 case OMPD_master_taskloop
:
732 case OMPD_master_taskloop_simd
:
733 case OMPD_parallel_master_taskloop
:
734 case OMPD_parallel_master_taskloop_simd
:
741 "Unknown programming model for OpenMP directive on NVPTX target.");
744 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective
&D
,
745 StringRef ParentName
,
746 llvm::Function
*&OutlinedFn
,
747 llvm::Constant
*&OutlinedFnID
,
749 const RegionCodeGenTy
&CodeGen
) {
750 ExecutionRuntimeModesRAII
ModeRAII(CurrentExecutionMode
, EM_NonSPMD
);
751 EntryFunctionState EST
;
752 WrapperFunctionsMap
.clear();
754 [[maybe_unused
]] bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
755 assert(!IsBareKernel
&& "bare kernel should not be at generic mode");
757 // Emit target region as a standalone region.
758 class NVPTXPrePostActionTy
: public PrePostActionTy
{
759 CGOpenMPRuntimeGPU::EntryFunctionState
&EST
;
760 const OMPExecutableDirective
&D
;
763 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState
&EST
,
764 const OMPExecutableDirective
&D
)
766 void Enter(CodeGenFunction
&CGF
) override
{
767 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
768 RT
.emitKernelInit(D
, CGF
, EST
, /* IsSPMD */ false);
769 // Skip target region initialization.
770 RT
.setLocThreadIdInsertPt(CGF
, /*AtCurrentPoint=*/true);
772 void Exit(CodeGenFunction
&CGF
) override
{
773 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
774 RT
.clearLocThreadIdInsertPt(CGF
);
775 RT
.emitKernelDeinit(CGF
, EST
, /* IsSPMD */ false);
778 CodeGen
.setAction(Action
);
779 IsInTTDRegion
= true;
780 emitTargetOutlinedFunctionHelper(D
, ParentName
, OutlinedFn
, OutlinedFnID
,
781 IsOffloadEntry
, CodeGen
);
782 IsInTTDRegion
= false;
785 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective
&D
,
786 CodeGenFunction
&CGF
,
787 EntryFunctionState
&EST
, bool IsSPMD
) {
788 int32_t MinThreadsVal
= 1, MaxThreadsVal
= -1, MinTeamsVal
= 1,
790 computeMinAndMaxThreadsAndTeams(D
, CGF
, MinThreadsVal
, MaxThreadsVal
,
791 MinTeamsVal
, MaxTeamsVal
);
793 CGBuilderTy
&Bld
= CGF
.Builder
;
794 Bld
.restoreIP(OMPBuilder
.createTargetInit(
795 Bld
, IsSPMD
, MinThreadsVal
, MaxThreadsVal
, MinTeamsVal
, MaxTeamsVal
));
797 emitGenericVarsProlog(CGF
, EST
.Loc
);
800 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction
&CGF
,
801 EntryFunctionState
&EST
,
804 emitGenericVarsEpilog(CGF
);
806 // This is temporary until we remove the fixed sized buffer.
807 ASTContext
&C
= CGM
.getContext();
808 RecordDecl
*StaticRD
= C
.buildImplicitRecord(
809 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union
);
810 StaticRD
->startDefinition();
811 for (const RecordDecl
*TeamReductionRec
: TeamsReductions
) {
812 QualType RecTy
= C
.getRecordType(TeamReductionRec
);
813 auto *Field
= FieldDecl::Create(
814 C
, StaticRD
, SourceLocation(), SourceLocation(), nullptr, RecTy
,
815 C
.getTrivialTypeSourceInfo(RecTy
, SourceLocation()),
816 /*BW=*/nullptr, /*Mutable=*/false,
817 /*InitStyle=*/ICIS_NoInit
);
818 Field
->setAccess(AS_public
);
819 StaticRD
->addDecl(Field
);
821 StaticRD
->completeDefinition();
822 QualType StaticTy
= C
.getRecordType(StaticRD
);
823 llvm::Type
*LLVMReductionsBufferTy
=
824 CGM
.getTypes().ConvertTypeForMem(StaticTy
);
825 const auto &DL
= CGM
.getModule().getDataLayout();
826 uint64_t BufferSize
=
827 DL
.getTypeAllocSize(LLVMReductionsBufferTy
).getFixedValue();
828 CGBuilderTy
&Bld
= CGF
.Builder
;
829 OMPBuilder
.createTargetDeinit(Bld
, BufferSize
);
832 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective
&D
,
833 StringRef ParentName
,
834 llvm::Function
*&OutlinedFn
,
835 llvm::Constant
*&OutlinedFnID
,
837 const RegionCodeGenTy
&CodeGen
) {
838 ExecutionRuntimeModesRAII
ModeRAII(CurrentExecutionMode
, EM_SPMD
);
839 EntryFunctionState EST
;
841 bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
843 // Emit target region as a standalone region.
844 class NVPTXPrePostActionTy
: public PrePostActionTy
{
845 CGOpenMPRuntimeGPU
&RT
;
846 CGOpenMPRuntimeGPU::EntryFunctionState
&EST
;
848 DataSharingMode Mode
;
849 const OMPExecutableDirective
&D
;
852 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU
&RT
,
853 CGOpenMPRuntimeGPU::EntryFunctionState
&EST
,
854 bool IsBareKernel
, const OMPExecutableDirective
&D
)
855 : RT(RT
), EST(EST
), IsBareKernel(IsBareKernel
),
856 Mode(RT
.CurrentDataSharingMode
), D(D
) {}
857 void Enter(CodeGenFunction
&CGF
) override
{
859 RT
.CurrentDataSharingMode
= DataSharingMode::DS_CUDA
;
862 RT
.emitKernelInit(D
, CGF
, EST
, /* IsSPMD */ true);
863 // Skip target region initialization.
864 RT
.setLocThreadIdInsertPt(CGF
, /*AtCurrentPoint=*/true);
866 void Exit(CodeGenFunction
&CGF
) override
{
868 RT
.CurrentDataSharingMode
= Mode
;
871 RT
.clearLocThreadIdInsertPt(CGF
);
872 RT
.emitKernelDeinit(CGF
, EST
, /* IsSPMD */ true);
874 } Action(*this, EST
, IsBareKernel
, D
);
875 CodeGen
.setAction(Action
);
876 IsInTTDRegion
= true;
877 emitTargetOutlinedFunctionHelper(D
, ParentName
, OutlinedFn
, OutlinedFnID
,
878 IsOffloadEntry
, CodeGen
);
879 IsInTTDRegion
= false;
882 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
883 const OMPExecutableDirective
&D
, StringRef ParentName
,
884 llvm::Function
*&OutlinedFn
, llvm::Constant
*&OutlinedFnID
,
885 bool IsOffloadEntry
, const RegionCodeGenTy
&CodeGen
) {
886 if (!IsOffloadEntry
) // Nothing to do.
889 assert(!ParentName
.empty() && "Invalid target region parent name!");
891 bool Mode
= supportsSPMDExecutionMode(CGM
.getContext(), D
);
892 bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
893 if (Mode
|| IsBareKernel
)
894 emitSPMDKernel(D
, ParentName
, OutlinedFn
, OutlinedFnID
, IsOffloadEntry
,
897 emitNonSPMDKernel(D
, ParentName
, OutlinedFn
, OutlinedFnID
, IsOffloadEntry
,
901 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule
&CGM
)
902 : CGOpenMPRuntime(CGM
) {
903 llvm::OpenMPIRBuilderConfig
Config(
904 CGM
.getLangOpts().OpenMPIsTargetDevice
, isGPU(),
905 CGM
.getLangOpts().OpenMPOffloadMandatory
,
906 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
907 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
908 OMPBuilder
.setConfig(Config
);
910 if (!CGM
.getLangOpts().OpenMPIsTargetDevice
)
911 llvm_unreachable("OpenMP can only handle device code.");
913 if (CGM
.getLangOpts().OpenMPCUDAMode
)
914 CurrentDataSharingMode
= CGOpenMPRuntimeGPU::DS_CUDA
;
916 llvm::OpenMPIRBuilder
&OMPBuilder
= getOMPBuilder();
917 if (CGM
.getLangOpts().NoGPULib
|| CGM
.getLangOpts().OMPHostIRFile
.empty())
920 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPTargetDebug
,
921 "__omp_rtl_debug_kind");
922 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPTeamSubscription
,
923 "__omp_rtl_assume_teams_oversubscription");
924 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPThreadSubscription
,
925 "__omp_rtl_assume_threads_oversubscription");
926 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPNoThreadState
,
927 "__omp_rtl_assume_no_thread_state");
928 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPNoNestedParallelism
,
929 "__omp_rtl_assume_no_nested_parallelism");
932 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction
&CGF
,
933 ProcBindKind ProcBind
,
934 SourceLocation Loc
) {
938 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction
&CGF
,
939 llvm::Value
*NumThreads
,
940 SourceLocation Loc
) {
944 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction
&CGF
,
945 const Expr
*NumTeams
,
946 const Expr
*ThreadLimit
,
947 SourceLocation Loc
) {}
949 llvm::Function
*CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
950 CodeGenFunction
&CGF
, const OMPExecutableDirective
&D
,
951 const VarDecl
*ThreadIDVar
, OpenMPDirectiveKind InnermostKind
,
952 const RegionCodeGenTy
&CodeGen
) {
953 // Emit target region as a standalone region.
954 bool PrevIsInTTDRegion
= IsInTTDRegion
;
955 IsInTTDRegion
= false;
957 cast
<llvm::Function
>(CGOpenMPRuntime::emitParallelOutlinedFunction(
958 CGF
, D
, ThreadIDVar
, InnermostKind
, CodeGen
));
959 IsInTTDRegion
= PrevIsInTTDRegion
;
960 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD
) {
961 llvm::Function
*WrapperFun
=
962 createParallelDataSharingWrapper(OutlinedFun
, D
);
963 WrapperFunctionsMap
[OutlinedFun
] = WrapperFun
;
969 /// Get list of lastprivate variables from the teams distribute ... or
970 /// teams {distribute ...} directives.
972 getDistributeLastprivateVars(ASTContext
&Ctx
, const OMPExecutableDirective
&D
,
973 llvm::SmallVectorImpl
<const ValueDecl
*> &Vars
) {
974 assert(isOpenMPTeamsDirective(D
.getDirectiveKind()) &&
975 "expected teams directive.");
976 const OMPExecutableDirective
*Dir
= &D
;
977 if (!isOpenMPDistributeDirective(D
.getDirectiveKind())) {
978 if (const Stmt
*S
= CGOpenMPRuntime::getSingleCompoundChild(
980 D
.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
981 /*IgnoreCaptured=*/true))) {
982 Dir
= dyn_cast_or_null
<OMPExecutableDirective
>(S
);
983 if (Dir
&& !isOpenMPDistributeDirective(Dir
->getDirectiveKind()))
989 for (const auto *C
: Dir
->getClausesOfKind
<OMPLastprivateClause
>()) {
990 for (const Expr
*E
: C
->getVarRefs())
991 Vars
.push_back(getPrivateItem(E
));
995 /// Get list of reduction variables from the teams ... directives.
997 getTeamsReductionVars(ASTContext
&Ctx
, const OMPExecutableDirective
&D
,
998 llvm::SmallVectorImpl
<const ValueDecl
*> &Vars
) {
999 assert(isOpenMPTeamsDirective(D
.getDirectiveKind()) &&
1000 "expected teams directive.");
1001 for (const auto *C
: D
.getClausesOfKind
<OMPReductionClause
>()) {
1002 for (const Expr
*E
: C
->privates())
1003 Vars
.push_back(getPrivateItem(E
));
1007 llvm::Function
*CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1008 CodeGenFunction
&CGF
, const OMPExecutableDirective
&D
,
1009 const VarDecl
*ThreadIDVar
, OpenMPDirectiveKind InnermostKind
,
1010 const RegionCodeGenTy
&CodeGen
) {
1011 SourceLocation Loc
= D
.getBeginLoc();
1013 const RecordDecl
*GlobalizedRD
= nullptr;
1014 llvm::SmallVector
<const ValueDecl
*, 4> LastPrivatesReductions
;
1015 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*> MappedDeclsFields
;
1016 unsigned WarpSize
= CGM
.getTarget().getGridValue().GV_Warp_Size
;
1017 // Globalize team reductions variable unconditionally in all modes.
1018 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD
)
1019 getTeamsReductionVars(CGM
.getContext(), D
, LastPrivatesReductions
);
1020 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD
) {
1021 getDistributeLastprivateVars(CGM
.getContext(), D
, LastPrivatesReductions
);
1022 if (!LastPrivatesReductions
.empty()) {
1023 GlobalizedRD
= ::buildRecordForGlobalizedVars(
1024 CGM
.getContext(), std::nullopt
, LastPrivatesReductions
,
1025 MappedDeclsFields
, WarpSize
);
1027 } else if (!LastPrivatesReductions
.empty()) {
1028 assert(!TeamAndReductions
.first
&&
1029 "Previous team declaration is not expected.");
1030 TeamAndReductions
.first
= D
.getCapturedStmt(OMPD_teams
)->getCapturedDecl();
1031 std::swap(TeamAndReductions
.second
, LastPrivatesReductions
);
1034 // Emit target region as a standalone region.
1035 class NVPTXPrePostActionTy
: public PrePostActionTy
{
1036 SourceLocation
&Loc
;
1037 const RecordDecl
*GlobalizedRD
;
1038 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
1042 NVPTXPrePostActionTy(
1043 SourceLocation
&Loc
, const RecordDecl
*GlobalizedRD
,
1044 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
1046 : Loc(Loc
), GlobalizedRD(GlobalizedRD
),
1047 MappedDeclsFields(MappedDeclsFields
) {}
1048 void Enter(CodeGenFunction
&CGF
) override
{
1050 static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
1052 auto I
= Rt
.FunctionGlobalizedDecls
.try_emplace(CGF
.CurFn
).first
;
1053 I
->getSecond().MappedParams
=
1054 std::make_unique
<CodeGenFunction::OMPMapVars
>();
1055 DeclToAddrMapTy
&Data
= I
->getSecond().LocalVarData
;
1056 for (const auto &Pair
: MappedDeclsFields
) {
1057 assert(Pair
.getFirst()->isCanonicalDecl() &&
1058 "Expected canonical declaration");
1059 Data
.insert(std::make_pair(Pair
.getFirst(), MappedVarData()));
1062 Rt
.emitGenericVarsProlog(CGF
, Loc
);
1064 void Exit(CodeGenFunction
&CGF
) override
{
1065 static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime())
1066 .emitGenericVarsEpilog(CGF
);
1068 } Action(Loc
, GlobalizedRD
, MappedDeclsFields
);
1069 CodeGen
.setAction(Action
);
1070 llvm::Function
*OutlinedFun
= CGOpenMPRuntime::emitTeamsOutlinedFunction(
1071 CGF
, D
, ThreadIDVar
, InnermostKind
, CodeGen
);
1076 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction
&CGF
,
1077 SourceLocation Loc
) {
1078 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic
)
1081 CGBuilderTy
&Bld
= CGF
.Builder
;
1083 const auto I
= FunctionGlobalizedDecls
.find(CGF
.CurFn
);
1084 if (I
== FunctionGlobalizedDecls
.end())
1087 for (auto &Rec
: I
->getSecond().LocalVarData
) {
1088 const auto *VD
= cast
<VarDecl
>(Rec
.first
);
1089 bool EscapedParam
= I
->getSecond().EscapedParameters
.count(Rec
.first
);
1090 QualType VarTy
= VD
->getType();
1092 // Get the local allocation of a firstprivate variable before sharing
1093 llvm::Value
*ParValue
;
1096 CGF
.MakeAddrLValue(CGF
.GetAddrOfLocalVar(VD
), VD
->getType());
1097 ParValue
= CGF
.EmitLoadOfScalar(ParLVal
, Loc
);
1100 // Allocate space for the variable to be globalized
1101 llvm::Value
*AllocArgs
[] = {CGF
.getTypeSize(VD
->getType())};
1102 llvm::CallBase
*VoidPtr
=
1103 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1104 CGM
.getModule(), OMPRTL___kmpc_alloc_shared
),
1105 AllocArgs
, VD
->getName());
1106 // FIXME: We should use the variables actual alignment as an argument.
1107 VoidPtr
->addRetAttr(llvm::Attribute::get(
1108 CGM
.getLLVMContext(), llvm::Attribute::Alignment
,
1109 CGM
.getContext().getTargetInfo().getNewAlign() / 8));
1111 // Cast the void pointer and get the address of the globalized variable.
1112 llvm::PointerType
*VarPtrTy
= CGF
.ConvertTypeForMem(VarTy
)->getPointerTo();
1113 llvm::Value
*CastedVoidPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
1114 VoidPtr
, VarPtrTy
, VD
->getName() + "_on_stack");
1115 LValue VarAddr
= CGF
.MakeNaturalAlignAddrLValue(CastedVoidPtr
, VarTy
);
1116 Rec
.second
.PrivateAddr
= VarAddr
.getAddress(CGF
);
1117 Rec
.second
.GlobalizedVal
= VoidPtr
;
1119 // Assign the local allocation to the newly globalized location.
1121 CGF
.EmitStoreOfScalar(ParValue
, VarAddr
);
1122 I
->getSecond().MappedParams
->setVarAddr(CGF
, VD
, VarAddr
.getAddress(CGF
));
1124 if (auto *DI
= CGF
.getDebugInfo())
1125 VoidPtr
->setDebugLoc(DI
->SourceLocToDebugLoc(VD
->getLocation()));
1128 for (const auto *ValueD
: I
->getSecond().EscapedVariableLengthDecls
) {
1129 const auto *VD
= cast
<VarDecl
>(ValueD
);
1130 std::pair
<llvm::Value
*, llvm::Value
*> AddrSizePair
=
1131 getKmpcAllocShared(CGF
, VD
);
1132 I
->getSecond().EscapedVariableLengthDeclsAddrs
.emplace_back(AddrSizePair
);
1133 LValue Base
= CGF
.MakeAddrLValue(AddrSizePair
.first
, VD
->getType(),
1134 CGM
.getContext().getDeclAlign(VD
),
1135 AlignmentSource::Decl
);
1136 I
->getSecond().MappedParams
->setVarAddr(CGF
, VD
, Base
.getAddress(CGF
));
1138 I
->getSecond().MappedParams
->apply(CGF
);
1141 bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction
&CGF
,
1142 const VarDecl
*VD
) const {
1143 const auto I
= FunctionGlobalizedDecls
.find(CGF
.CurFn
);
1144 if (I
== FunctionGlobalizedDecls
.end())
1147 // Check variable declaration is delayed:
1148 return llvm::is_contained(I
->getSecond().DelayedVariableLengthDecls
, VD
);
1151 std::pair
<llvm::Value
*, llvm::Value
*>
1152 CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction
&CGF
,
1153 const VarDecl
*VD
) {
1154 CGBuilderTy
&Bld
= CGF
.Builder
;
1156 // Compute size and alignment.
1157 llvm::Value
*Size
= CGF
.getTypeSize(VD
->getType());
1158 CharUnits Align
= CGM
.getContext().getDeclAlign(VD
);
1159 Size
= Bld
.CreateNUWAdd(
1160 Size
, llvm::ConstantInt::get(CGF
.SizeTy
, Align
.getQuantity() - 1));
1161 llvm::Value
*AlignVal
=
1162 llvm::ConstantInt::get(CGF
.SizeTy
, Align
.getQuantity());
1163 Size
= Bld
.CreateUDiv(Size
, AlignVal
);
1164 Size
= Bld
.CreateNUWMul(Size
, AlignVal
);
1166 // Allocate space for this VLA object to be globalized.
1167 llvm::Value
*AllocArgs
[] = {Size
};
1168 llvm::CallBase
*VoidPtr
=
1169 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1170 CGM
.getModule(), OMPRTL___kmpc_alloc_shared
),
1171 AllocArgs
, VD
->getName());
1172 VoidPtr
->addRetAttr(llvm::Attribute::get(
1173 CGM
.getLLVMContext(), llvm::Attribute::Alignment
, Align
.getQuantity()));
1175 return std::make_pair(VoidPtr
, Size
);
1178 void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1179 CodeGenFunction
&CGF
,
1180 const std::pair
<llvm::Value
*, llvm::Value
*> &AddrSizePair
) {
1181 // Deallocate the memory for each globalized VLA object
1182 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1183 CGM
.getModule(), OMPRTL___kmpc_free_shared
),
1184 {AddrSizePair
.first
, AddrSizePair
.second
});
1187 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction
&CGF
) {
1188 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic
)
1191 const auto I
= FunctionGlobalizedDecls
.find(CGF
.CurFn
);
1192 if (I
!= FunctionGlobalizedDecls
.end()) {
1193 // Deallocate the memory for each globalized VLA object that was
1194 // globalized in the prolog (i.e. emitGenericVarsProlog).
1195 for (const auto &AddrSizePair
:
1196 llvm::reverse(I
->getSecond().EscapedVariableLengthDeclsAddrs
)) {
1197 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1198 CGM
.getModule(), OMPRTL___kmpc_free_shared
),
1199 {AddrSizePair
.first
, AddrSizePair
.second
});
1201 // Deallocate the memory for each globalized value
1202 for (auto &Rec
: llvm::reverse(I
->getSecond().LocalVarData
)) {
1203 const auto *VD
= cast
<VarDecl
>(Rec
.first
);
1204 I
->getSecond().MappedParams
->restore(CGF
);
1206 llvm::Value
*FreeArgs
[] = {Rec
.second
.GlobalizedVal
,
1207 CGF
.getTypeSize(VD
->getType())};
1208 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1209 CGM
.getModule(), OMPRTL___kmpc_free_shared
),
1215 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction
&CGF
,
1216 const OMPExecutableDirective
&D
,
1218 llvm::Function
*OutlinedFn
,
1219 ArrayRef
<llvm::Value
*> CapturedVars
) {
1220 if (!CGF
.HaveInsertPoint())
1223 bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
1225 Address ZeroAddr
= CGF
.CreateDefaultAlignTempAlloca(CGF
.Int32Ty
,
1226 /*Name=*/".zero.addr");
1227 CGF
.Builder
.CreateStore(CGF
.Builder
.getInt32(/*C*/ 0), ZeroAddr
);
1228 llvm::SmallVector
<llvm::Value
*, 16> OutlinedFnArgs
;
1229 // We don't emit any thread id function call in bare kernel, but because the
1230 // outlined function has a pointer argument, we emit a nullptr here.
1232 OutlinedFnArgs
.push_back(llvm::ConstantPointerNull::get(CGM
.VoidPtrTy
));
1234 OutlinedFnArgs
.push_back(emitThreadIDAddress(CGF
, Loc
).getPointer());
1235 OutlinedFnArgs
.push_back(ZeroAddr
.getPointer());
1236 OutlinedFnArgs
.append(CapturedVars
.begin(), CapturedVars
.end());
1237 emitOutlinedFunctionCall(CGF
, Loc
, OutlinedFn
, OutlinedFnArgs
);
1240 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction
&CGF
,
1242 llvm::Function
*OutlinedFn
,
1243 ArrayRef
<llvm::Value
*> CapturedVars
,
1245 llvm::Value
*NumThreads
) {
1246 if (!CGF
.HaveInsertPoint())
1249 auto &&ParallelGen
= [this, Loc
, OutlinedFn
, CapturedVars
, IfCond
,
1250 NumThreads
](CodeGenFunction
&CGF
,
1251 PrePostActionTy
&Action
) {
1252 CGBuilderTy
&Bld
= CGF
.Builder
;
1253 llvm::Value
*NumThreadsVal
= NumThreads
;
1254 llvm::Function
*WFn
= WrapperFunctionsMap
[OutlinedFn
];
1255 llvm::Value
*ID
= llvm::ConstantPointerNull::get(CGM
.Int8PtrTy
);
1257 ID
= Bld
.CreateBitOrPointerCast(WFn
, CGM
.Int8PtrTy
);
1258 llvm::Value
*FnPtr
= Bld
.CreateBitOrPointerCast(OutlinedFn
, CGM
.Int8PtrTy
);
1260 // Create a private scope that will globalize the arguments
1261 // passed from the outside of the target region.
1262 // TODO: Is that needed?
1263 CodeGenFunction::OMPPrivateScope
PrivateArgScope(CGF
);
1265 Address CapturedVarsAddrs
= CGF
.CreateDefaultAlignTempAlloca(
1266 llvm::ArrayType::get(CGM
.VoidPtrTy
, CapturedVars
.size()),
1267 "captured_vars_addrs");
1268 // There's something to share.
1269 if (!CapturedVars
.empty()) {
1270 // Prepare for parallel region. Indicate the outlined function.
1271 ASTContext
&Ctx
= CGF
.getContext();
1273 for (llvm::Value
*V
: CapturedVars
) {
1274 Address Dst
= Bld
.CreateConstArrayGEP(CapturedVarsAddrs
, Idx
);
1276 if (V
->getType()->isIntegerTy())
1277 PtrV
= Bld
.CreateIntToPtr(V
, CGF
.VoidPtrTy
);
1279 PtrV
= Bld
.CreatePointerBitCastOrAddrSpaceCast(V
, CGF
.VoidPtrTy
);
1280 CGF
.EmitStoreOfScalar(PtrV
, Dst
, /*Volatile=*/false,
1281 Ctx
.getPointerType(Ctx
.VoidPtrTy
));
1286 llvm::Value
*IfCondVal
= nullptr;
1288 IfCondVal
= Bld
.CreateIntCast(CGF
.EvaluateExprAsBool(IfCond
), CGF
.Int32Ty
,
1289 /* isSigned */ false);
1291 IfCondVal
= llvm::ConstantInt::get(CGF
.Int32Ty
, 1);
1294 NumThreadsVal
= llvm::ConstantInt::get(CGF
.Int32Ty
, -1);
1296 NumThreadsVal
= Bld
.CreateZExtOrTrunc(NumThreadsVal
, CGF
.Int32Ty
),
1298 assert(IfCondVal
&& "Expected a value");
1299 llvm::Value
*RTLoc
= emitUpdateLocation(CGF
, Loc
);
1300 llvm::Value
*Args
[] = {
1302 getThreadID(CGF
, Loc
),
1305 llvm::ConstantInt::get(CGF
.Int32Ty
, -1),
1308 Bld
.CreateBitOrPointerCast(CapturedVarsAddrs
.getPointer(),
1310 llvm::ConstantInt::get(CGM
.SizeTy
, CapturedVars
.size())};
1311 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1312 CGM
.getModule(), OMPRTL___kmpc_parallel_51
),
1316 RegionCodeGenTy
RCG(ParallelGen
);
1320 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction
&CGF
) {
1321 // Always emit simple barriers!
1322 if (!CGF
.HaveInsertPoint())
1324 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1325 // This function does not use parameters, so we can emit just default values.
1326 llvm::Value
*Args
[] = {
1327 llvm::ConstantPointerNull::get(
1328 cast
<llvm::PointerType
>(getIdentTyPointerTy())),
1329 llvm::ConstantInt::get(CGF
.Int32Ty
, /*V=*/0, /*isSigned=*/true)};
1330 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1331 CGM
.getModule(), OMPRTL___kmpc_barrier_simple_spmd
),
1335 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction
&CGF
,
1337 OpenMPDirectiveKind Kind
, bool,
1339 // Always emit simple barriers!
1340 if (!CGF
.HaveInsertPoint())
1342 // Build call __kmpc_cancel_barrier(loc, thread_id);
1343 unsigned Flags
= getDefaultFlagsForBarriers(Kind
);
1344 llvm::Value
*Args
[] = {emitUpdateLocation(CGF
, Loc
, Flags
),
1345 getThreadID(CGF
, Loc
)};
1347 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1348 CGM
.getModule(), OMPRTL___kmpc_barrier
),
1352 void CGOpenMPRuntimeGPU::emitCriticalRegion(
1353 CodeGenFunction
&CGF
, StringRef CriticalName
,
1354 const RegionCodeGenTy
&CriticalOpGen
, SourceLocation Loc
,
1356 llvm::BasicBlock
*LoopBB
= CGF
.createBasicBlock("omp.critical.loop");
1357 llvm::BasicBlock
*TestBB
= CGF
.createBasicBlock("omp.critical.test");
1358 llvm::BasicBlock
*SyncBB
= CGF
.createBasicBlock("omp.critical.sync");
1359 llvm::BasicBlock
*BodyBB
= CGF
.createBasicBlock("omp.critical.body");
1360 llvm::BasicBlock
*ExitBB
= CGF
.createBasicBlock("omp.critical.exit");
1362 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
1364 // Get the mask of active threads in the warp.
1365 llvm::Value
*Mask
= CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1366 CGM
.getModule(), OMPRTL___kmpc_warp_active_thread_mask
));
1367 // Fetch team-local id of the thread.
1368 llvm::Value
*ThreadID
= RT
.getGPUThreadID(CGF
);
1370 // Get the width of the team.
1371 llvm::Value
*TeamWidth
= RT
.getGPUNumThreads(CGF
);
1373 // Initialize the counter variable for the loop.
1375 CGF
.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1376 Address Counter
= CGF
.CreateMemTemp(Int32Ty
, "critical_counter");
1377 LValue CounterLVal
= CGF
.MakeAddrLValue(Counter
, Int32Ty
);
1378 CGF
.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM
.Int32Ty
), CounterLVal
,
1381 // Block checks if loop counter exceeds upper bound.
1382 CGF
.EmitBlock(LoopBB
);
1383 llvm::Value
*CounterVal
= CGF
.EmitLoadOfScalar(CounterLVal
, Loc
);
1384 llvm::Value
*CmpLoopBound
= CGF
.Builder
.CreateICmpSLT(CounterVal
, TeamWidth
);
1385 CGF
.Builder
.CreateCondBr(CmpLoopBound
, TestBB
, ExitBB
);
1387 // Block tests which single thread should execute region, and which threads
1388 // should go straight to synchronisation point.
1389 CGF
.EmitBlock(TestBB
);
1390 CounterVal
= CGF
.EmitLoadOfScalar(CounterLVal
, Loc
);
1391 llvm::Value
*CmpThreadToCounter
=
1392 CGF
.Builder
.CreateICmpEQ(ThreadID
, CounterVal
);
1393 CGF
.Builder
.CreateCondBr(CmpThreadToCounter
, BodyBB
, SyncBB
);
1395 // Block emits the body of the critical region.
1396 CGF
.EmitBlock(BodyBB
);
1398 // Output the critical statement.
1399 CGOpenMPRuntime::emitCriticalRegion(CGF
, CriticalName
, CriticalOpGen
, Loc
,
1402 // After the body surrounded by the critical region, the single executing
1403 // thread will jump to the synchronisation point.
1404 // Block waits for all threads in current team to finish then increments the
1405 // counter variable and returns to the loop.
1406 CGF
.EmitBlock(SyncBB
);
1407 // Reconverge active threads in the warp.
1408 (void)CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1409 CGM
.getModule(), OMPRTL___kmpc_syncwarp
),
1412 llvm::Value
*IncCounterVal
=
1413 CGF
.Builder
.CreateNSWAdd(CounterVal
, CGF
.Builder
.getInt32(1));
1414 CGF
.EmitStoreOfScalar(IncCounterVal
, CounterLVal
);
1415 CGF
.EmitBranch(LoopBB
);
1417 // Block that is reached when all threads in the team complete the region.
1418 CGF
.EmitBlock(ExitBB
, /*IsFinished=*/true);
1421 /// Cast value to the specified type.
1422 static llvm::Value
*castValueToType(CodeGenFunction
&CGF
, llvm::Value
*Val
,
1423 QualType ValTy
, QualType CastTy
,
1424 SourceLocation Loc
) {
1425 assert(!CGF
.getContext().getTypeSizeInChars(CastTy
).isZero() &&
1426 "Cast type must sized.");
1427 assert(!CGF
.getContext().getTypeSizeInChars(ValTy
).isZero() &&
1428 "Val type must sized.");
1429 llvm::Type
*LLVMCastTy
= CGF
.ConvertTypeForMem(CastTy
);
1430 if (ValTy
== CastTy
)
1432 if (CGF
.getContext().getTypeSizeInChars(ValTy
) ==
1433 CGF
.getContext().getTypeSizeInChars(CastTy
))
1434 return CGF
.Builder
.CreateBitCast(Val
, LLVMCastTy
);
1435 if (CastTy
->isIntegerType() && ValTy
->isIntegerType())
1436 return CGF
.Builder
.CreateIntCast(Val
, LLVMCastTy
,
1437 CastTy
->hasSignedIntegerRepresentation());
1438 Address CastItem
= CGF
.CreateMemTemp(CastTy
);
1439 Address ValCastItem
= CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(
1440 CastItem
, Val
->getType()->getPointerTo(CastItem
.getAddressSpace()),
1442 CGF
.EmitStoreOfScalar(Val
, ValCastItem
, /*Volatile=*/false, ValTy
,
1443 LValueBaseInfo(AlignmentSource::Type
),
1445 return CGF
.EmitLoadOfScalar(CastItem
, /*Volatile=*/false, CastTy
, Loc
,
1446 LValueBaseInfo(AlignmentSource::Type
),
1450 /// This function creates calls to one of two shuffle functions to copy
1451 /// variables between lanes in a warp.
1452 static llvm::Value
*createRuntimeShuffleFunction(CodeGenFunction
&CGF
,
1455 llvm::Value
*Offset
,
1456 SourceLocation Loc
) {
1457 CodeGenModule
&CGM
= CGF
.CGM
;
1458 CGBuilderTy
&Bld
= CGF
.Builder
;
1459 CGOpenMPRuntimeGPU
&RT
=
1460 *(static_cast<CGOpenMPRuntimeGPU
*>(&CGM
.getOpenMPRuntime()));
1461 llvm::OpenMPIRBuilder
&OMPBuilder
= RT
.getOMPBuilder();
1463 CharUnits Size
= CGF
.getContext().getTypeSizeInChars(ElemType
);
1464 assert(Size
.getQuantity() <= 8 &&
1465 "Unsupported bitwidth in shuffle instruction.");
1467 RuntimeFunction ShuffleFn
= Size
.getQuantity() <= 4
1468 ? OMPRTL___kmpc_shuffle_int32
1469 : OMPRTL___kmpc_shuffle_int64
;
1471 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1472 QualType CastTy
= CGF
.getContext().getIntTypeForBitwidth(
1473 Size
.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1474 llvm::Value
*ElemCast
= castValueToType(CGF
, Elem
, ElemType
, CastTy
, Loc
);
1475 llvm::Value
*WarpSize
=
1476 Bld
.CreateIntCast(RT
.getGPUWarpSize(CGF
), CGM
.Int16Ty
, /*isSigned=*/true);
1478 llvm::Value
*ShuffledVal
= CGF
.EmitRuntimeCall(
1479 OMPBuilder
.getOrCreateRuntimeFunction(CGM
.getModule(), ShuffleFn
),
1480 {ElemCast
, Offset
, WarpSize
});
1482 return castValueToType(CGF
, ShuffledVal
, CastTy
, ElemType
, Loc
);
1485 static void shuffleAndStore(CodeGenFunction
&CGF
, Address SrcAddr
,
1486 Address DestAddr
, QualType ElemType
,
1487 llvm::Value
*Offset
, SourceLocation Loc
) {
1488 CGBuilderTy
&Bld
= CGF
.Builder
;
1490 CharUnits Size
= CGF
.getContext().getTypeSizeInChars(ElemType
);
1491 // Create the loop over the big sized data.
1492 // ptr = (void*)Elem;
1493 // ptrEnd = (void*) Elem + 1;
1495 // while (ptr + Step < ptrEnd)
1496 // shuffle((int64_t)*ptr);
1498 // while (ptr + Step < ptrEnd)
1499 // shuffle((int32_t)*ptr);
1501 Address ElemPtr
= DestAddr
;
1502 Address Ptr
= SrcAddr
;
1503 Address PtrEnd
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
1504 Bld
.CreateConstGEP(SrcAddr
, 1), CGF
.VoidPtrTy
, CGF
.Int8Ty
);
1505 for (int IntSize
= 8; IntSize
>= 1; IntSize
/= 2) {
1506 if (Size
< CharUnits::fromQuantity(IntSize
))
1508 QualType IntType
= CGF
.getContext().getIntTypeForBitwidth(
1509 CGF
.getContext().toBits(CharUnits::fromQuantity(IntSize
)),
1511 llvm::Type
*IntTy
= CGF
.ConvertTypeForMem(IntType
);
1512 Ptr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(Ptr
, IntTy
->getPointerTo(),
1514 ElemPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
1515 ElemPtr
, IntTy
->getPointerTo(), IntTy
);
1516 if (Size
.getQuantity() / IntSize
> 1) {
1517 llvm::BasicBlock
*PreCondBB
= CGF
.createBasicBlock(".shuffle.pre_cond");
1518 llvm::BasicBlock
*ThenBB
= CGF
.createBasicBlock(".shuffle.then");
1519 llvm::BasicBlock
*ExitBB
= CGF
.createBasicBlock(".shuffle.exit");
1520 llvm::BasicBlock
*CurrentBB
= Bld
.GetInsertBlock();
1521 CGF
.EmitBlock(PreCondBB
);
1522 llvm::PHINode
*PhiSrc
=
1523 Bld
.CreatePHI(Ptr
.getType(), /*NumReservedValues=*/2);
1524 PhiSrc
->addIncoming(Ptr
.getPointer(), CurrentBB
);
1525 llvm::PHINode
*PhiDest
=
1526 Bld
.CreatePHI(ElemPtr
.getType(), /*NumReservedValues=*/2);
1527 PhiDest
->addIncoming(ElemPtr
.getPointer(), CurrentBB
);
1528 Ptr
= Address(PhiSrc
, Ptr
.getElementType(), Ptr
.getAlignment());
1530 Address(PhiDest
, ElemPtr
.getElementType(), ElemPtr
.getAlignment());
1531 llvm::Value
*PtrDiff
= Bld
.CreatePtrDiff(
1532 CGF
.Int8Ty
, PtrEnd
.getPointer(),
1533 Bld
.CreatePointerBitCastOrAddrSpaceCast(Ptr
.getPointer(),
1535 Bld
.CreateCondBr(Bld
.CreateICmpSGT(PtrDiff
, Bld
.getInt64(IntSize
- 1)),
1537 CGF
.EmitBlock(ThenBB
);
1538 llvm::Value
*Res
= createRuntimeShuffleFunction(
1540 CGF
.EmitLoadOfScalar(Ptr
, /*Volatile=*/false, IntType
, Loc
,
1541 LValueBaseInfo(AlignmentSource::Type
),
1543 IntType
, Offset
, Loc
);
1544 CGF
.EmitStoreOfScalar(Res
, ElemPtr
, /*Volatile=*/false, IntType
,
1545 LValueBaseInfo(AlignmentSource::Type
),
1547 Address LocalPtr
= Bld
.CreateConstGEP(Ptr
, 1);
1548 Address LocalElemPtr
= Bld
.CreateConstGEP(ElemPtr
, 1);
1549 PhiSrc
->addIncoming(LocalPtr
.getPointer(), ThenBB
);
1550 PhiDest
->addIncoming(LocalElemPtr
.getPointer(), ThenBB
);
1551 CGF
.EmitBranch(PreCondBB
);
1552 CGF
.EmitBlock(ExitBB
);
1554 llvm::Value
*Res
= createRuntimeShuffleFunction(
1556 CGF
.EmitLoadOfScalar(Ptr
, /*Volatile=*/false, IntType
, Loc
,
1557 LValueBaseInfo(AlignmentSource::Type
),
1559 IntType
, Offset
, Loc
);
1560 CGF
.EmitStoreOfScalar(Res
, ElemPtr
, /*Volatile=*/false, IntType
,
1561 LValueBaseInfo(AlignmentSource::Type
),
1563 Ptr
= Bld
.CreateConstGEP(Ptr
, 1);
1564 ElemPtr
= Bld
.CreateConstGEP(ElemPtr
, 1);
1566 Size
= Size
% IntSize
;
1571 enum CopyAction
: unsigned {
1572 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1573 // the warp using shuffle instructions.
1575 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1577 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1579 // ScratchpadToThread: Copy from a scratchpad array in global memory
1580 // containing team-reduced data to a thread's stack.
1585 struct CopyOptionsTy
{
1586 llvm::Value
*RemoteLaneOffset
;
1587 llvm::Value
*ScratchpadIndex
;
1588 llvm::Value
*ScratchpadWidth
;
1591 /// Emit instructions to copy a Reduce list, which contains partially
1592 /// aggregated values, in the specified direction.
1593 static void emitReductionListCopy(
1594 CopyAction Action
, CodeGenFunction
&CGF
, QualType ReductionArrayTy
,
1595 ArrayRef
<const Expr
*> Privates
, Address SrcBase
, Address DestBase
,
1596 CopyOptionsTy CopyOptions
= {nullptr, nullptr, nullptr}) {
1598 CodeGenModule
&CGM
= CGF
.CGM
;
1599 ASTContext
&C
= CGM
.getContext();
1600 CGBuilderTy
&Bld
= CGF
.Builder
;
1602 llvm::Value
*RemoteLaneOffset
= CopyOptions
.RemoteLaneOffset
;
1603 llvm::Value
*ScratchpadIndex
= CopyOptions
.ScratchpadIndex
;
1604 llvm::Value
*ScratchpadWidth
= CopyOptions
.ScratchpadWidth
;
1606 // Iterates, element-by-element, through the source Reduce list and
1609 unsigned Size
= Privates
.size();
1610 for (const Expr
*Private
: Privates
) {
1611 Address SrcElementAddr
= Address::invalid();
1612 Address DestElementAddr
= Address::invalid();
1613 Address DestElementPtrAddr
= Address::invalid();
1614 // Should we shuffle in an element from a remote lane?
1615 bool ShuffleInElement
= false;
1616 // Set to true to update the pointer in the dest Reduce list to a
1617 // newly created element.
1618 bool UpdateDestListPtr
= false;
1619 // Increment the src or dest pointer to the scratchpad, for each
1621 bool IncrScratchpadSrc
= false;
1622 bool IncrScratchpadDest
= false;
1623 QualType PrivatePtrType
= C
.getPointerType(Private
->getType());
1624 llvm::Type
*PrivateLlvmPtrType
= CGF
.ConvertType(PrivatePtrType
);
1627 case RemoteLaneToThread
: {
1628 // Step 1.1: Get the address for the src element in the Reduce list.
1629 Address SrcElementPtrAddr
= Bld
.CreateConstArrayGEP(SrcBase
, Idx
);
1630 SrcElementAddr
= CGF
.EmitLoadOfPointer(
1631 SrcElementPtrAddr
.withElementType(PrivateLlvmPtrType
),
1632 PrivatePtrType
->castAs
<PointerType
>());
1634 // Step 1.2: Create a temporary to store the element in the destination
1636 DestElementPtrAddr
= Bld
.CreateConstArrayGEP(DestBase
, Idx
);
1638 CGF
.CreateMemTemp(Private
->getType(), ".omp.reduction.element");
1639 ShuffleInElement
= true;
1640 UpdateDestListPtr
= true;
1644 // Step 1.1: Get the address for the src element in the Reduce list.
1645 Address SrcElementPtrAddr
= Bld
.CreateConstArrayGEP(SrcBase
, Idx
);
1646 SrcElementAddr
= CGF
.EmitLoadOfPointer(
1647 SrcElementPtrAddr
.withElementType(PrivateLlvmPtrType
),
1648 PrivatePtrType
->castAs
<PointerType
>());
1650 // Step 1.2: Get the address for dest element. The destination
1651 // element has already been created on the thread's stack.
1652 DestElementPtrAddr
= Bld
.CreateConstArrayGEP(DestBase
, Idx
);
1653 DestElementAddr
= CGF
.EmitLoadOfPointer(
1654 DestElementPtrAddr
.withElementType(PrivateLlvmPtrType
),
1655 PrivatePtrType
->castAs
<PointerType
>());
1658 case ThreadToScratchpad
: {
1659 // Step 1.1: Get the address for the src element in the Reduce list.
1660 Address SrcElementPtrAddr
= Bld
.CreateConstArrayGEP(SrcBase
, Idx
);
1661 SrcElementAddr
= CGF
.EmitLoadOfPointer(
1662 SrcElementPtrAddr
.withElementType(PrivateLlvmPtrType
),
1663 PrivatePtrType
->castAs
<PointerType
>());
1665 // Step 1.2: Get the address for dest element:
1666 // address = base + index * ElementSizeInChars.
1667 llvm::Value
*ElementSizeInChars
= CGF
.getTypeSize(Private
->getType());
1668 llvm::Value
*CurrentOffset
=
1669 Bld
.CreateNUWMul(ElementSizeInChars
, ScratchpadIndex
);
1670 llvm::Value
*ScratchPadElemAbsolutePtrVal
=
1671 Bld
.CreateNUWAdd(DestBase
.getPointer(), CurrentOffset
);
1672 ScratchPadElemAbsolutePtrVal
=
1673 Bld
.CreateIntToPtr(ScratchPadElemAbsolutePtrVal
, CGF
.VoidPtrTy
);
1674 DestElementAddr
= Address(ScratchPadElemAbsolutePtrVal
, CGF
.Int8Ty
,
1675 C
.getTypeAlignInChars(Private
->getType()));
1676 IncrScratchpadDest
= true;
1679 case ScratchpadToThread
: {
1680 // Step 1.1: Get the address for the src element in the scratchpad.
1681 // address = base + index * ElementSizeInChars.
1682 llvm::Value
*ElementSizeInChars
= CGF
.getTypeSize(Private
->getType());
1683 llvm::Value
*CurrentOffset
=
1684 Bld
.CreateNUWMul(ElementSizeInChars
, ScratchpadIndex
);
1685 llvm::Value
*ScratchPadElemAbsolutePtrVal
=
1686 Bld
.CreateNUWAdd(SrcBase
.getPointer(), CurrentOffset
);
1687 ScratchPadElemAbsolutePtrVal
=
1688 Bld
.CreateIntToPtr(ScratchPadElemAbsolutePtrVal
, CGF
.VoidPtrTy
);
1689 SrcElementAddr
= Address(ScratchPadElemAbsolutePtrVal
, CGF
.Int8Ty
,
1690 C
.getTypeAlignInChars(Private
->getType()));
1691 IncrScratchpadSrc
= true;
1693 // Step 1.2: Create a temporary to store the element in the destination
1695 DestElementPtrAddr
= Bld
.CreateConstArrayGEP(DestBase
, Idx
);
1697 CGF
.CreateMemTemp(Private
->getType(), ".omp.reduction.element");
1698 UpdateDestListPtr
= true;
1703 // Regardless of src and dest of copy, we emit the load of src
1704 // element as this is required in all directions
1705 SrcElementAddr
= SrcElementAddr
.withElementType(
1706 CGF
.ConvertTypeForMem(Private
->getType()));
1708 DestElementAddr
.withElementType(SrcElementAddr
.getElementType());
1710 // Now that all active lanes have read the element in the
1711 // Reduce list, shuffle over the value from the remote lane.
1712 if (ShuffleInElement
) {
1713 shuffleAndStore(CGF
, SrcElementAddr
, DestElementAddr
, Private
->getType(),
1714 RemoteLaneOffset
, Private
->getExprLoc());
1716 switch (CGF
.getEvaluationKind(Private
->getType())) {
1718 llvm::Value
*Elem
= CGF
.EmitLoadOfScalar(
1719 SrcElementAddr
, /*Volatile=*/false, Private
->getType(),
1720 Private
->getExprLoc(), LValueBaseInfo(AlignmentSource::Type
),
1722 // Store the source element value to the dest element address.
1723 CGF
.EmitStoreOfScalar(
1724 Elem
, DestElementAddr
, /*Volatile=*/false, Private
->getType(),
1725 LValueBaseInfo(AlignmentSource::Type
), TBAAAccessInfo());
1729 CodeGenFunction::ComplexPairTy Elem
= CGF
.EmitLoadOfComplex(
1730 CGF
.MakeAddrLValue(SrcElementAddr
, Private
->getType()),
1731 Private
->getExprLoc());
1732 CGF
.EmitStoreOfComplex(
1733 Elem
, CGF
.MakeAddrLValue(DestElementAddr
, Private
->getType()),
1738 CGF
.EmitAggregateCopy(
1739 CGF
.MakeAddrLValue(DestElementAddr
, Private
->getType()),
1740 CGF
.MakeAddrLValue(SrcElementAddr
, Private
->getType()),
1741 Private
->getType(), AggValueSlot::DoesNotOverlap
);
1746 // Step 3.1: Modify reference in dest Reduce list as needed.
1747 // Modifying the reference in Reduce list to point to the newly
1748 // created element. The element is live in the current function
1749 // scope and that of functions it invokes (i.e., reduce_function).
1750 // RemoteReduceData[i] = (void*)&RemoteElem
1751 if (UpdateDestListPtr
) {
1752 CGF
.EmitStoreOfScalar(Bld
.CreatePointerBitCastOrAddrSpaceCast(
1753 DestElementAddr
.getPointer(), CGF
.VoidPtrTy
),
1754 DestElementPtrAddr
, /*Volatile=*/false,
1758 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1759 // address of the next element in scratchpad memory, unless we're currently
1760 // processing the last one. Memory alignment is also taken care of here.
1761 if ((IncrScratchpadDest
|| IncrScratchpadSrc
) && (Idx
+ 1 < Size
)) {
1762 // FIXME: This code doesn't make any sense, it's trying to perform
1763 // integer arithmetic on pointers.
1764 llvm::Value
*ScratchpadBasePtr
=
1765 IncrScratchpadDest
? DestBase
.getPointer() : SrcBase
.getPointer();
1766 llvm::Value
*ElementSizeInChars
= CGF
.getTypeSize(Private
->getType());
1767 ScratchpadBasePtr
= Bld
.CreateNUWAdd(
1769 Bld
.CreateNUWMul(ScratchpadWidth
, ElementSizeInChars
));
1771 // Take care of global memory alignment for performance
1772 ScratchpadBasePtr
= Bld
.CreateNUWSub(
1773 ScratchpadBasePtr
, llvm::ConstantInt::get(CGM
.SizeTy
, 1));
1774 ScratchpadBasePtr
= Bld
.CreateUDiv(
1776 llvm::ConstantInt::get(CGM
.SizeTy
, GlobalMemoryAlignment
));
1777 ScratchpadBasePtr
= Bld
.CreateNUWAdd(
1778 ScratchpadBasePtr
, llvm::ConstantInt::get(CGM
.SizeTy
, 1));
1779 ScratchpadBasePtr
= Bld
.CreateNUWMul(
1781 llvm::ConstantInt::get(CGM
.SizeTy
, GlobalMemoryAlignment
));
1783 if (IncrScratchpadDest
)
1785 Address(ScratchpadBasePtr
, CGF
.VoidPtrTy
, CGF
.getPointerAlign());
1786 else /* IncrScratchpadSrc = true */
1788 Address(ScratchpadBasePtr
, CGF
.VoidPtrTy
, CGF
.getPointerAlign());
1795 /// This function emits a helper that gathers Reduce lists from the first
1796 /// lane of every active warp to lanes in the first warp.
1798 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1799 /// shared smem[warp_size];
1800 /// For all data entries D in reduce_data:
1802 /// If (I am the first lane in each warp)
1803 /// Copy my local D to smem[warp_id]
1805 /// if (I am the first warp)
1806 /// Copy smem[thread_id] to my local D
1807 static llvm::Value
*emitInterWarpCopyFunction(CodeGenModule
&CGM
,
1808 ArrayRef
<const Expr
*> Privates
,
1809 QualType ReductionArrayTy
,
1810 SourceLocation Loc
) {
1811 ASTContext
&C
= CGM
.getContext();
1812 llvm::Module
&M
= CGM
.getModule();
1814 // ReduceList: thread local Reduce list.
1815 // At the stage of the computation when this function is called, partially
1816 // aggregated values reside in the first lane of every active warp.
1817 ImplicitParamDecl
ReduceListArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
1818 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
1819 // NumWarps: number of warps active in the parallel region. This could
1820 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1821 ImplicitParamDecl
NumWarpsArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
1822 C
.getIntTypeForBitwidth(32, /* Signed */ true),
1823 ImplicitParamDecl::Other
);
1824 FunctionArgList Args
;
1825 Args
.push_back(&ReduceListArg
);
1826 Args
.push_back(&NumWarpsArg
);
1828 const CGFunctionInfo
&CGFI
=
1829 CGM
.getTypes().arrangeBuiltinFunctionDeclaration(C
.VoidTy
, Args
);
1830 auto *Fn
= llvm::Function::Create(CGM
.getTypes().GetFunctionType(CGFI
),
1831 llvm::GlobalValue::InternalLinkage
,
1832 "_omp_reduction_inter_warp_copy_func", &M
);
1833 CGM
.SetInternalFunctionAttributes(GlobalDecl(), Fn
, CGFI
);
1834 Fn
->setDoesNotRecurse();
1835 CodeGenFunction
CGF(CGM
);
1836 CGF
.StartFunction(GlobalDecl(), C
.VoidTy
, Fn
, CGFI
, Args
, Loc
, Loc
);
1838 CGBuilderTy
&Bld
= CGF
.Builder
;
1840 // This array is used as a medium to transfer, one reduce element at a time,
1841 // the data from the first lane of every warp to lanes in the first warp
1842 // in order to perform the final step of a reduction in a parallel region
1843 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1844 // for reduced latency, as well as to have a distinct copy for concurrently
1845 // executing target regions. The array is declared with common linkage so
1846 // as to be shared across compilation units.
1847 StringRef TransferMediumName
=
1848 "__openmp_nvptx_data_transfer_temporary_storage";
1849 llvm::GlobalVariable
*TransferMedium
=
1850 M
.getGlobalVariable(TransferMediumName
);
1851 unsigned WarpSize
= CGF
.getTarget().getGridValue().GV_Warp_Size
;
1852 if (!TransferMedium
) {
1853 auto *Ty
= llvm::ArrayType::get(CGM
.Int32Ty
, WarpSize
);
1854 unsigned SharedAddressSpace
= C
.getTargetAddressSpace(LangAS::cuda_shared
);
1855 TransferMedium
= new llvm::GlobalVariable(
1856 M
, Ty
, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage
,
1857 llvm::UndefValue::get(Ty
), TransferMediumName
,
1858 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal
,
1859 SharedAddressSpace
);
1860 CGM
.addCompilerUsedGlobal(TransferMedium
);
1863 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
1864 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1865 llvm::Value
*ThreadID
= RT
.getGPUThreadID(CGF
);
1866 // nvptx_lane_id = nvptx_id % warpsize
1867 llvm::Value
*LaneID
= getNVPTXLaneID(CGF
);
1868 // nvptx_warp_id = nvptx_id / warpsize
1869 llvm::Value
*WarpID
= getNVPTXWarpID(CGF
);
1871 Address AddrReduceListArg
= CGF
.GetAddrOfLocalVar(&ReduceListArg
);
1872 llvm::Type
*ElemTy
= CGF
.ConvertTypeForMem(ReductionArrayTy
);
1873 Address
LocalReduceList(
1874 Bld
.CreatePointerBitCastOrAddrSpaceCast(
1875 CGF
.EmitLoadOfScalar(
1876 AddrReduceListArg
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
,
1877 LValueBaseInfo(AlignmentSource::Type
), TBAAAccessInfo()),
1878 ElemTy
->getPointerTo()),
1879 ElemTy
, CGF
.getPointerAlign());
1882 for (const Expr
*Private
: Privates
) {
1884 // Warp master copies reduce element to transfer medium in __shared__
1887 unsigned RealTySize
=
1888 C
.getTypeSizeInChars(Private
->getType())
1889 .alignTo(C
.getTypeAlignInChars(Private
->getType()))
1891 for (unsigned TySize
= 4; TySize
> 0 && RealTySize
> 0; TySize
/=2) {
1892 unsigned NumIters
= RealTySize
/ TySize
;
1895 QualType CType
= C
.getIntTypeForBitwidth(
1896 C
.toBits(CharUnits::fromQuantity(TySize
)), /*Signed=*/1);
1897 llvm::Type
*CopyType
= CGF
.ConvertTypeForMem(CType
);
1898 CharUnits Align
= CharUnits::fromQuantity(TySize
);
1899 llvm::Value
*Cnt
= nullptr;
1900 Address CntAddr
= Address::invalid();
1901 llvm::BasicBlock
*PrecondBB
= nullptr;
1902 llvm::BasicBlock
*ExitBB
= nullptr;
1904 CntAddr
= CGF
.CreateMemTemp(C
.IntTy
, ".cnt.addr");
1905 CGF
.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM
.IntTy
), CntAddr
,
1906 /*Volatile=*/false, C
.IntTy
);
1907 PrecondBB
= CGF
.createBasicBlock("precond");
1908 ExitBB
= CGF
.createBasicBlock("exit");
1909 llvm::BasicBlock
*BodyBB
= CGF
.createBasicBlock("body");
1910 // There is no need to emit line number for unconditional branch.
1911 (void)ApplyDebugLocation::CreateEmpty(CGF
);
1912 CGF
.EmitBlock(PrecondBB
);
1913 Cnt
= CGF
.EmitLoadOfScalar(CntAddr
, /*Volatile=*/false, C
.IntTy
, Loc
);
1915 Bld
.CreateICmpULT(Cnt
, llvm::ConstantInt::get(CGM
.IntTy
, NumIters
));
1916 Bld
.CreateCondBr(Cmp
, BodyBB
, ExitBB
);
1917 CGF
.EmitBlock(BodyBB
);
1920 CGM
.getOpenMPRuntime().emitBarrierCall(CGF
, Loc
, OMPD_unknown
,
1921 /*EmitChecks=*/false,
1922 /*ForceSimpleCall=*/true);
1923 llvm::BasicBlock
*ThenBB
= CGF
.createBasicBlock("then");
1924 llvm::BasicBlock
*ElseBB
= CGF
.createBasicBlock("else");
1925 llvm::BasicBlock
*MergeBB
= CGF
.createBasicBlock("ifcont");
1927 // if (lane_id == 0)
1928 llvm::Value
*IsWarpMaster
= Bld
.CreateIsNull(LaneID
, "warp_master");
1929 Bld
.CreateCondBr(IsWarpMaster
, ThenBB
, ElseBB
);
1930 CGF
.EmitBlock(ThenBB
);
1932 // Reduce element = LocalReduceList[i]
1933 Address ElemPtrPtrAddr
= Bld
.CreateConstArrayGEP(LocalReduceList
, Idx
);
1934 llvm::Value
*ElemPtrPtr
= CGF
.EmitLoadOfScalar(
1935 ElemPtrPtrAddr
, /*Volatile=*/false, C
.VoidPtrTy
, SourceLocation());
1936 // elemptr = ((CopyType*)(elemptrptr)) + I
1937 Address
ElemPtr(ElemPtrPtr
, CopyType
, Align
);
1939 ElemPtr
= Bld
.CreateGEP(ElemPtr
, Cnt
);
1941 // Get pointer to location in transfer medium.
1942 // MediumPtr = &medium[warp_id]
1943 llvm::Value
*MediumPtrVal
= Bld
.CreateInBoundsGEP(
1944 TransferMedium
->getValueType(), TransferMedium
,
1945 {llvm::Constant::getNullValue(CGM
.Int64Ty
), WarpID
});
1946 // Casting to actual data type.
1947 // MediumPtr = (CopyType*)MediumPtrAddr;
1951 CopyType
->getPointerTo(
1952 MediumPtrVal
->getType()->getPointerAddressSpace())),
1957 llvm::Value
*Elem
= CGF
.EmitLoadOfScalar(
1958 ElemPtr
, /*Volatile=*/false, CType
, Loc
,
1959 LValueBaseInfo(AlignmentSource::Type
), TBAAAccessInfo());
1960 // Store the source element value to the dest element address.
1961 CGF
.EmitStoreOfScalar(Elem
, MediumPtr
, /*Volatile=*/true, CType
,
1962 LValueBaseInfo(AlignmentSource::Type
),
1965 Bld
.CreateBr(MergeBB
);
1967 CGF
.EmitBlock(ElseBB
);
1968 Bld
.CreateBr(MergeBB
);
1970 CGF
.EmitBlock(MergeBB
);
1973 CGM
.getOpenMPRuntime().emitBarrierCall(CGF
, Loc
, OMPD_unknown
,
1974 /*EmitChecks=*/false,
1975 /*ForceSimpleCall=*/true);
1978 // Warp 0 copies reduce element from transfer medium.
1980 llvm::BasicBlock
*W0ThenBB
= CGF
.createBasicBlock("then");
1981 llvm::BasicBlock
*W0ElseBB
= CGF
.createBasicBlock("else");
1982 llvm::BasicBlock
*W0MergeBB
= CGF
.createBasicBlock("ifcont");
1984 Address AddrNumWarpsArg
= CGF
.GetAddrOfLocalVar(&NumWarpsArg
);
1985 llvm::Value
*NumWarpsVal
= CGF
.EmitLoadOfScalar(
1986 AddrNumWarpsArg
, /*Volatile=*/false, C
.IntTy
, Loc
);
1988 // Up to 32 threads in warp 0 are active.
1989 llvm::Value
*IsActiveThread
=
1990 Bld
.CreateICmpULT(ThreadID
, NumWarpsVal
, "is_active_thread");
1991 Bld
.CreateCondBr(IsActiveThread
, W0ThenBB
, W0ElseBB
);
1993 CGF
.EmitBlock(W0ThenBB
);
1995 // SrcMediumPtr = &medium[tid]
1996 llvm::Value
*SrcMediumPtrVal
= Bld
.CreateInBoundsGEP(
1997 TransferMedium
->getValueType(), TransferMedium
,
1998 {llvm::Constant::getNullValue(CGM
.Int64Ty
), ThreadID
});
1999 // SrcMediumVal = *SrcMediumPtr;
2000 Address
SrcMediumPtr(
2003 CopyType
->getPointerTo(
2004 SrcMediumPtrVal
->getType()->getPointerAddressSpace())),
2007 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2008 Address TargetElemPtrPtr
= Bld
.CreateConstArrayGEP(LocalReduceList
, Idx
);
2009 llvm::Value
*TargetElemPtrVal
= CGF
.EmitLoadOfScalar(
2010 TargetElemPtrPtr
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
);
2011 Address
TargetElemPtr(TargetElemPtrVal
, CopyType
, Align
);
2013 TargetElemPtr
= Bld
.CreateGEP(TargetElemPtr
, Cnt
);
2015 // *TargetElemPtr = SrcMediumVal;
2016 llvm::Value
*SrcMediumValue
=
2017 CGF
.EmitLoadOfScalar(SrcMediumPtr
, /*Volatile=*/true, CType
, Loc
);
2018 CGF
.EmitStoreOfScalar(SrcMediumValue
, TargetElemPtr
, /*Volatile=*/false,
2020 Bld
.CreateBr(W0MergeBB
);
2022 CGF
.EmitBlock(W0ElseBB
);
2023 Bld
.CreateBr(W0MergeBB
);
2025 CGF
.EmitBlock(W0MergeBB
);
2028 Cnt
= Bld
.CreateNSWAdd(Cnt
, llvm::ConstantInt::get(CGM
.IntTy
, /*V=*/1));
2029 CGF
.EmitStoreOfScalar(Cnt
, CntAddr
, /*Volatile=*/false, C
.IntTy
);
2030 CGF
.EmitBranch(PrecondBB
);
2031 (void)ApplyDebugLocation::CreateEmpty(CGF
);
2032 CGF
.EmitBlock(ExitBB
);
2034 RealTySize
%= TySize
;
2039 CGF
.FinishFunction();
2043 /// Emit a helper that reduces data across two OpenMP threads (lanes)
2044 /// in the same warp. It uses shuffle instructions to copy over data from
2045 /// a remote lane's stack. The reduction algorithm performed is specified
2046 /// by the fourth parameter.
2048 /// Algorithm Versions.
2049 /// Full Warp Reduce (argument value 0):
2050 /// This algorithm assumes that all 32 lanes are active and gathers
2051 /// data from these 32 lanes, producing a single resultant value.
2052 /// Contiguous Partial Warp Reduce (argument value 1):
2053 /// This algorithm assumes that only a *contiguous* subset of lanes
2054 /// are active. This happens for the last warp in a parallel region
2055 /// when the user specified num_threads is not an integer multiple of
2056 /// 32. This contiguous subset always starts with the zeroth lane.
2057 /// Partial Warp Reduce (argument value 2):
2058 /// This algorithm gathers data from any number of lanes at any position.
2059 /// All reduced values are stored in the lowest possible lane. The set
2060 /// of problems every algorithm addresses is a super set of those
2061 /// addressable by algorithms with a lower version number. Overhead
2062 /// increases as algorithm version increases.
2066 /// Reduce element refers to the individual data field with primitive
2067 /// data types to be combined and reduced across threads.
2069 /// Reduce list refers to a collection of local, thread-private
2070 /// reduce elements.
2071 /// Remote Reduce list:
2072 /// Remote Reduce list refers to a collection of remote (relative to
2073 /// the current thread) reduce elements.
2075 /// We distinguish between three states of threads that are important to
2076 /// the implementation of this function.
2078 /// Threads in a warp executing the SIMT instruction, as distinguished from
2079 /// threads that are inactive due to divergent control flow.
2081 /// The minimal set of threads that has to be alive upon entry to this
2082 /// function. The computation is correct iff active threads are alive.
2083 /// Some threads are alive but they are not active because they do not
2084 /// contribute to the computation in any useful manner. Turning them off
2085 /// may introduce control flow overheads without any tangible benefits.
2086 /// Effective threads:
2087 /// In order to comply with the argument requirements of the shuffle
2088 /// function, we must keep all lanes holding data alive. But at most
2089 /// half of them perform value aggregation; we refer to this half of
2090 /// threads as effective. The other half is simply handing off their
2095 /// In this step active threads transfer data from higher lane positions
2096 /// in the warp to lower lane positions, creating Remote Reduce list.
2097 /// Value aggregation:
2098 /// In this step, effective threads combine their thread local Reduce list
2099 /// with Remote Reduce list and store the result in the thread local
2102 /// In this step, we deal with the assumption made by algorithm 2
2103 /// (i.e. contiguity assumption). When we have an odd number of lanes
2104 /// active, say 2k+1, only k threads will be effective and therefore k
2105 /// new values will be produced. However, the Reduce list owned by the
2106 /// (2k+1)th thread is ignored in the value aggregation. Therefore
2107 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2108 /// that the contiguity assumption still holds.
2109 static llvm::Function
*emitShuffleAndReduceFunction(
2110 CodeGenModule
&CGM
, ArrayRef
<const Expr
*> Privates
,
2111 QualType ReductionArrayTy
, llvm::Function
*ReduceFn
, SourceLocation Loc
) {
2112 ASTContext
&C
= CGM
.getContext();
2114 // Thread local Reduce list used to host the values of data to be reduced.
2115 ImplicitParamDecl
ReduceListArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2116 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2117 // Current lane id; could be logical.
2118 ImplicitParamDecl
LaneIDArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr, C
.ShortTy
,
2119 ImplicitParamDecl::Other
);
2120 // Offset of the remote source lane relative to the current lane.
2121 ImplicitParamDecl
RemoteLaneOffsetArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2122 C
.ShortTy
, ImplicitParamDecl::Other
);
2123 // Algorithm version. This is expected to be known at compile time.
2124 ImplicitParamDecl
AlgoVerArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2125 C
.ShortTy
, ImplicitParamDecl::Other
);
2126 FunctionArgList Args
;
2127 Args
.push_back(&ReduceListArg
);
2128 Args
.push_back(&LaneIDArg
);
2129 Args
.push_back(&RemoteLaneOffsetArg
);
2130 Args
.push_back(&AlgoVerArg
);
2132 const CGFunctionInfo
&CGFI
=
2133 CGM
.getTypes().arrangeBuiltinFunctionDeclaration(C
.VoidTy
, Args
);
2134 auto *Fn
= llvm::Function::Create(
2135 CGM
.getTypes().GetFunctionType(CGFI
), llvm::GlobalValue::InternalLinkage
,
2136 "_omp_reduction_shuffle_and_reduce_func", &CGM
.getModule());
2137 CGM
.SetInternalFunctionAttributes(GlobalDecl(), Fn
, CGFI
);
2138 Fn
->setDoesNotRecurse();
2140 CodeGenFunction
CGF(CGM
);
2141 CGF
.StartFunction(GlobalDecl(), C
.VoidTy
, Fn
, CGFI
, Args
, Loc
, Loc
);
2143 CGBuilderTy
&Bld
= CGF
.Builder
;
2145 Address AddrReduceListArg
= CGF
.GetAddrOfLocalVar(&ReduceListArg
);
2146 llvm::Type
*ElemTy
= CGF
.ConvertTypeForMem(ReductionArrayTy
);
2147 Address
LocalReduceList(
2148 Bld
.CreatePointerBitCastOrAddrSpaceCast(
2149 CGF
.EmitLoadOfScalar(AddrReduceListArg
, /*Volatile=*/false,
2150 C
.VoidPtrTy
, SourceLocation()),
2151 ElemTy
->getPointerTo()),
2152 ElemTy
, CGF
.getPointerAlign());
2154 Address AddrLaneIDArg
= CGF
.GetAddrOfLocalVar(&LaneIDArg
);
2155 llvm::Value
*LaneIDArgVal
= CGF
.EmitLoadOfScalar(
2156 AddrLaneIDArg
, /*Volatile=*/false, C
.ShortTy
, SourceLocation());
2158 Address AddrRemoteLaneOffsetArg
= CGF
.GetAddrOfLocalVar(&RemoteLaneOffsetArg
);
2159 llvm::Value
*RemoteLaneOffsetArgVal
= CGF
.EmitLoadOfScalar(
2160 AddrRemoteLaneOffsetArg
, /*Volatile=*/false, C
.ShortTy
, SourceLocation());
2162 Address AddrAlgoVerArg
= CGF
.GetAddrOfLocalVar(&AlgoVerArg
);
2163 llvm::Value
*AlgoVerArgVal
= CGF
.EmitLoadOfScalar(
2164 AddrAlgoVerArg
, /*Volatile=*/false, C
.ShortTy
, SourceLocation());
2166 // Create a local thread-private variable to host the Reduce list
2167 // from a remote lane.
2168 Address RemoteReduceList
=
2169 CGF
.CreateMemTemp(ReductionArrayTy
, ".omp.reduction.remote_reduce_list");
2171 // This loop iterates through the list of reduce elements and copies,
2172 // element by element, from a remote lane in the warp to RemoteReduceList,
2173 // hosted on the thread's stack.
2174 emitReductionListCopy(RemoteLaneToThread
, CGF
, ReductionArrayTy
, Privates
,
2175 LocalReduceList
, RemoteReduceList
,
2176 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal
,
2177 /*ScratchpadIndex=*/nullptr,
2178 /*ScratchpadWidth=*/nullptr});
2180 // The actions to be performed on the Remote Reduce list is dependent
2181 // on the algorithm version.
2183 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2184 // LaneId % 2 == 0 && Offset > 0):
2185 // do the reduction value aggregation
2187 // The thread local variable Reduce list is mutated in place to host the
2188 // reduced data, which is the aggregated value produced from local and
2191 // Note that AlgoVer is expected to be a constant integer known at compile
2193 // When AlgoVer==0, the first conjunction evaluates to true, making
2194 // the entire predicate true during compile time.
2195 // When AlgoVer==1, the second conjunction has only the second part to be
2196 // evaluated during runtime. Other conjunctions evaluates to false
2197 // during compile time.
2198 // When AlgoVer==2, the third conjunction has only the second part to be
2199 // evaluated during runtime. Other conjunctions evaluates to false
2200 // during compile time.
2201 llvm::Value
*CondAlgo0
= Bld
.CreateIsNull(AlgoVerArgVal
);
2203 llvm::Value
*Algo1
= Bld
.CreateICmpEQ(AlgoVerArgVal
, Bld
.getInt16(1));
2204 llvm::Value
*CondAlgo1
= Bld
.CreateAnd(
2205 Algo1
, Bld
.CreateICmpULT(LaneIDArgVal
, RemoteLaneOffsetArgVal
));
2207 llvm::Value
*Algo2
= Bld
.CreateICmpEQ(AlgoVerArgVal
, Bld
.getInt16(2));
2208 llvm::Value
*CondAlgo2
= Bld
.CreateAnd(
2209 Algo2
, Bld
.CreateIsNull(Bld
.CreateAnd(LaneIDArgVal
, Bld
.getInt16(1))));
2210 CondAlgo2
= Bld
.CreateAnd(
2211 CondAlgo2
, Bld
.CreateICmpSGT(RemoteLaneOffsetArgVal
, Bld
.getInt16(0)));
2213 llvm::Value
*CondReduce
= Bld
.CreateOr(CondAlgo0
, CondAlgo1
);
2214 CondReduce
= Bld
.CreateOr(CondReduce
, CondAlgo2
);
2216 llvm::BasicBlock
*ThenBB
= CGF
.createBasicBlock("then");
2217 llvm::BasicBlock
*ElseBB
= CGF
.createBasicBlock("else");
2218 llvm::BasicBlock
*MergeBB
= CGF
.createBasicBlock("ifcont");
2219 Bld
.CreateCondBr(CondReduce
, ThenBB
, ElseBB
);
2221 CGF
.EmitBlock(ThenBB
);
2222 // reduce_function(LocalReduceList, RemoteReduceList)
2223 llvm::Value
*LocalReduceListPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2224 LocalReduceList
.getPointer(), CGF
.VoidPtrTy
);
2225 llvm::Value
*RemoteReduceListPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2226 RemoteReduceList
.getPointer(), CGF
.VoidPtrTy
);
2227 CGM
.getOpenMPRuntime().emitOutlinedFunctionCall(
2228 CGF
, Loc
, ReduceFn
, {LocalReduceListPtr
, RemoteReduceListPtr
});
2229 Bld
.CreateBr(MergeBB
);
2231 CGF
.EmitBlock(ElseBB
);
2232 Bld
.CreateBr(MergeBB
);
2234 CGF
.EmitBlock(MergeBB
);
2236 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2238 Algo1
= Bld
.CreateICmpEQ(AlgoVerArgVal
, Bld
.getInt16(1));
2239 llvm::Value
*CondCopy
= Bld
.CreateAnd(
2240 Algo1
, Bld
.CreateICmpUGE(LaneIDArgVal
, RemoteLaneOffsetArgVal
));
2242 llvm::BasicBlock
*CpyThenBB
= CGF
.createBasicBlock("then");
2243 llvm::BasicBlock
*CpyElseBB
= CGF
.createBasicBlock("else");
2244 llvm::BasicBlock
*CpyMergeBB
= CGF
.createBasicBlock("ifcont");
2245 Bld
.CreateCondBr(CondCopy
, CpyThenBB
, CpyElseBB
);
2247 CGF
.EmitBlock(CpyThenBB
);
2248 emitReductionListCopy(ThreadCopy
, CGF
, ReductionArrayTy
, Privates
,
2249 RemoteReduceList
, LocalReduceList
);
2250 Bld
.CreateBr(CpyMergeBB
);
2252 CGF
.EmitBlock(CpyElseBB
);
2253 Bld
.CreateBr(CpyMergeBB
);
2255 CGF
.EmitBlock(CpyMergeBB
);
2257 CGF
.FinishFunction();
2261 /// This function emits a helper that copies all the reduction variables from
2262 /// the team into the provided global buffer for the reduction variables.
2264 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2265 /// For all data entries D in reduce_data:
2266 /// Copy local D to buffer.D[Idx]
2267 static llvm::Value
*emitListToGlobalCopyFunction(
2268 CodeGenModule
&CGM
, ArrayRef
<const Expr
*> Privates
,
2269 QualType ReductionArrayTy
, SourceLocation Loc
,
2270 const RecordDecl
*TeamReductionRec
,
2271 const llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
2273 ASTContext
&C
= CGM
.getContext();
2275 // Buffer: global reduction buffer.
2276 ImplicitParamDecl
BufferArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2277 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2278 // Idx: index of the buffer.
2279 ImplicitParamDecl
IdxArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr, C
.IntTy
,
2280 ImplicitParamDecl::Other
);
2281 // ReduceList: thread local Reduce list.
2282 ImplicitParamDecl
ReduceListArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2283 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2284 FunctionArgList Args
;
2285 Args
.push_back(&BufferArg
);
2286 Args
.push_back(&IdxArg
);
2287 Args
.push_back(&ReduceListArg
);
2289 const CGFunctionInfo
&CGFI
=
2290 CGM
.getTypes().arrangeBuiltinFunctionDeclaration(C
.VoidTy
, Args
);
2291 auto *Fn
= llvm::Function::Create(
2292 CGM
.getTypes().GetFunctionType(CGFI
), llvm::GlobalValue::InternalLinkage
,
2293 "_omp_reduction_list_to_global_copy_func", &CGM
.getModule());
2294 CGM
.SetInternalFunctionAttributes(GlobalDecl(), Fn
, CGFI
);
2295 Fn
->setDoesNotRecurse();
2296 CodeGenFunction
CGF(CGM
);
2297 CGF
.StartFunction(GlobalDecl(), C
.VoidTy
, Fn
, CGFI
, Args
, Loc
, Loc
);
2299 CGBuilderTy
&Bld
= CGF
.Builder
;
2301 Address AddrReduceListArg
= CGF
.GetAddrOfLocalVar(&ReduceListArg
);
2302 Address AddrBufferArg
= CGF
.GetAddrOfLocalVar(&BufferArg
);
2303 llvm::Type
*ElemTy
= CGF
.ConvertTypeForMem(ReductionArrayTy
);
2304 Address
LocalReduceList(
2305 Bld
.CreatePointerBitCastOrAddrSpaceCast(
2306 CGF
.EmitLoadOfScalar(AddrReduceListArg
, /*Volatile=*/false,
2308 ElemTy
->getPointerTo()),
2309 ElemTy
, CGF
.getPointerAlign());
2310 QualType StaticTy
= C
.getRecordType(TeamReductionRec
);
2311 llvm::Type
*LLVMReductionsBufferTy
=
2312 CGM
.getTypes().ConvertTypeForMem(StaticTy
);
2313 llvm::Value
*BufferArrPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2314 CGF
.EmitLoadOfScalar(AddrBufferArg
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
),
2315 LLVMReductionsBufferTy
->getPointerTo());
2316 llvm::Value
*Idxs
[] = {llvm::ConstantInt::getNullValue(CGF
.Int32Ty
),
2317 CGF
.EmitLoadOfScalar(CGF
.GetAddrOfLocalVar(&IdxArg
),
2318 /*Volatile=*/false, C
.IntTy
,
2321 for (const Expr
*Private
: Privates
) {
2322 // Reduce element = LocalReduceList[i]
2323 Address ElemPtrPtrAddr
= Bld
.CreateConstArrayGEP(LocalReduceList
, Idx
);
2324 llvm::Value
*ElemPtrPtr
= CGF
.EmitLoadOfScalar(
2325 ElemPtrPtrAddr
, /*Volatile=*/false, C
.VoidPtrTy
, SourceLocation());
2326 // elemptr = ((CopyType*)(elemptrptr)) + I
2327 ElemTy
= CGF
.ConvertTypeForMem(Private
->getType());
2328 ElemPtrPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2329 ElemPtrPtr
, ElemTy
->getPointerTo());
2331 Address(ElemPtrPtr
, ElemTy
, C
.getTypeAlignInChars(Private
->getType()));
2332 const ValueDecl
*VD
= cast
<DeclRefExpr
>(Private
)->getDecl();
2333 // Global = Buffer.VD[Idx];
2334 const FieldDecl
*FD
= VarFieldMap
.lookup(VD
);
2335 LValue GlobLVal
= CGF
.EmitLValueForField(
2336 CGF
.MakeNaturalAlignAddrLValue(BufferArrPtr
, StaticTy
), FD
);
2337 Address GlobAddr
= GlobLVal
.getAddress(CGF
);
2338 llvm::Value
*BufferPtr
= Bld
.CreateInBoundsGEP(GlobAddr
.getElementType(),
2339 GlobAddr
.getPointer(), Idxs
);
2340 GlobLVal
.setAddress(Address(BufferPtr
,
2341 CGF
.ConvertTypeForMem(Private
->getType()),
2342 GlobAddr
.getAlignment()));
2343 switch (CGF
.getEvaluationKind(Private
->getType())) {
2345 llvm::Value
*V
= CGF
.EmitLoadOfScalar(
2346 ElemPtr
, /*Volatile=*/false, Private
->getType(), Loc
,
2347 LValueBaseInfo(AlignmentSource::Type
), TBAAAccessInfo());
2348 CGF
.EmitStoreOfScalar(V
, GlobLVal
);
2352 CodeGenFunction::ComplexPairTy V
= CGF
.EmitLoadOfComplex(
2353 CGF
.MakeAddrLValue(ElemPtr
, Private
->getType()), Loc
);
2354 CGF
.EmitStoreOfComplex(V
, GlobLVal
, /*isInit=*/false);
2358 CGF
.EmitAggregateCopy(GlobLVal
,
2359 CGF
.MakeAddrLValue(ElemPtr
, Private
->getType()),
2360 Private
->getType(), AggValueSlot::DoesNotOverlap
);
2366 CGF
.FinishFunction();
2370 /// This function emits a helper that reduces all the reduction variables from
2371 /// the team into the provided global buffer for the reduction variables.
2373 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2374 /// void *GlobPtrs[];
2375 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2377 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2378 /// reduce_function(GlobPtrs, reduce_data);
2379 static llvm::Value
*emitListToGlobalReduceFunction(
2380 CodeGenModule
&CGM
, ArrayRef
<const Expr
*> Privates
,
2381 QualType ReductionArrayTy
, SourceLocation Loc
,
2382 const RecordDecl
*TeamReductionRec
,
2383 const llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
2385 llvm::Function
*ReduceFn
) {
2386 ASTContext
&C
= CGM
.getContext();
2388 // Buffer: global reduction buffer.
2389 ImplicitParamDecl
BufferArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2390 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2391 // Idx: index of the buffer.
2392 ImplicitParamDecl
IdxArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr, C
.IntTy
,
2393 ImplicitParamDecl::Other
);
2394 // ReduceList: thread local Reduce list.
2395 ImplicitParamDecl
ReduceListArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2396 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2397 FunctionArgList Args
;
2398 Args
.push_back(&BufferArg
);
2399 Args
.push_back(&IdxArg
);
2400 Args
.push_back(&ReduceListArg
);
2402 const CGFunctionInfo
&CGFI
=
2403 CGM
.getTypes().arrangeBuiltinFunctionDeclaration(C
.VoidTy
, Args
);
2404 auto *Fn
= llvm::Function::Create(
2405 CGM
.getTypes().GetFunctionType(CGFI
), llvm::GlobalValue::InternalLinkage
,
2406 "_omp_reduction_list_to_global_reduce_func", &CGM
.getModule());
2407 CGM
.SetInternalFunctionAttributes(GlobalDecl(), Fn
, CGFI
);
2408 Fn
->setDoesNotRecurse();
2409 CodeGenFunction
CGF(CGM
);
2410 CGF
.StartFunction(GlobalDecl(), C
.VoidTy
, Fn
, CGFI
, Args
, Loc
, Loc
);
2412 CGBuilderTy
&Bld
= CGF
.Builder
;
2414 Address AddrBufferArg
= CGF
.GetAddrOfLocalVar(&BufferArg
);
2415 QualType StaticTy
= C
.getRecordType(TeamReductionRec
);
2416 llvm::Type
*LLVMReductionsBufferTy
=
2417 CGM
.getTypes().ConvertTypeForMem(StaticTy
);
2418 llvm::Value
*BufferArrPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2419 CGF
.EmitLoadOfScalar(AddrBufferArg
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
),
2420 LLVMReductionsBufferTy
->getPointerTo());
2422 // 1. Build a list of reduction variables.
2423 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2424 Address ReductionList
=
2425 CGF
.CreateMemTemp(ReductionArrayTy
, ".omp.reduction.red_list");
2426 auto IPriv
= Privates
.begin();
2427 llvm::Value
*Idxs
[] = {llvm::ConstantInt::getNullValue(CGF
.Int32Ty
),
2428 CGF
.EmitLoadOfScalar(CGF
.GetAddrOfLocalVar(&IdxArg
),
2429 /*Volatile=*/false, C
.IntTy
,
2432 for (unsigned I
= 0, E
= Privates
.size(); I
< E
; ++I
, ++IPriv
, ++Idx
) {
2433 Address Elem
= CGF
.Builder
.CreateConstArrayGEP(ReductionList
, Idx
);
2434 // Global = Buffer.VD[Idx];
2435 const ValueDecl
*VD
= cast
<DeclRefExpr
>(*IPriv
)->getDecl();
2436 const FieldDecl
*FD
= VarFieldMap
.lookup(VD
);
2437 LValue GlobLVal
= CGF
.EmitLValueForField(
2438 CGF
.MakeNaturalAlignAddrLValue(BufferArrPtr
, StaticTy
), FD
);
2439 Address GlobAddr
= GlobLVal
.getAddress(CGF
);
2440 llvm::Value
*BufferPtr
= Bld
.CreateInBoundsGEP(
2441 GlobAddr
.getElementType(), GlobAddr
.getPointer(), Idxs
);
2442 CGF
.EmitStoreOfScalar(BufferPtr
, Elem
, /*Volatile=*/false, C
.VoidPtrTy
);
2443 if ((*IPriv
)->getType()->isVariablyModifiedType()) {
2444 // Store array size.
2446 Elem
= CGF
.Builder
.CreateConstArrayGEP(ReductionList
, Idx
);
2447 llvm::Value
*Size
= CGF
.Builder
.CreateIntCast(
2449 CGF
.getContext().getAsVariableArrayType((*IPriv
)->getType()))
2451 CGF
.SizeTy
, /*isSigned=*/false);
2452 CGF
.Builder
.CreateStore(CGF
.Builder
.CreateIntToPtr(Size
, CGF
.VoidPtrTy
),
2457 // Call reduce_function(GlobalReduceList, ReduceList)
2458 llvm::Value
*GlobalReduceList
= ReductionList
.getPointer();
2459 Address AddrReduceListArg
= CGF
.GetAddrOfLocalVar(&ReduceListArg
);
2460 llvm::Value
*ReducedPtr
= CGF
.EmitLoadOfScalar(
2461 AddrReduceListArg
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
);
2462 CGM
.getOpenMPRuntime().emitOutlinedFunctionCall(
2463 CGF
, Loc
, ReduceFn
, {GlobalReduceList
, ReducedPtr
});
2464 CGF
.FinishFunction();
2468 /// This function emits a helper that copies all the reduction variables from
2469 /// the team into the provided global buffer for the reduction variables.
2471 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2472 /// For all data entries D in reduce_data:
2473 /// Copy buffer.D[Idx] to local D;
2474 static llvm::Value
*emitGlobalToListCopyFunction(
2475 CodeGenModule
&CGM
, ArrayRef
<const Expr
*> Privates
,
2476 QualType ReductionArrayTy
, SourceLocation Loc
,
2477 const RecordDecl
*TeamReductionRec
,
2478 const llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
2480 ASTContext
&C
= CGM
.getContext();
2482 // Buffer: global reduction buffer.
2483 ImplicitParamDecl
BufferArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2484 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2485 // Idx: index of the buffer.
2486 ImplicitParamDecl
IdxArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr, C
.IntTy
,
2487 ImplicitParamDecl::Other
);
2488 // ReduceList: thread local Reduce list.
2489 ImplicitParamDecl
ReduceListArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2490 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2491 FunctionArgList Args
;
2492 Args
.push_back(&BufferArg
);
2493 Args
.push_back(&IdxArg
);
2494 Args
.push_back(&ReduceListArg
);
2496 const CGFunctionInfo
&CGFI
=
2497 CGM
.getTypes().arrangeBuiltinFunctionDeclaration(C
.VoidTy
, Args
);
2498 auto *Fn
= llvm::Function::Create(
2499 CGM
.getTypes().GetFunctionType(CGFI
), llvm::GlobalValue::InternalLinkage
,
2500 "_omp_reduction_global_to_list_copy_func", &CGM
.getModule());
2501 CGM
.SetInternalFunctionAttributes(GlobalDecl(), Fn
, CGFI
);
2502 Fn
->setDoesNotRecurse();
2503 CodeGenFunction
CGF(CGM
);
2504 CGF
.StartFunction(GlobalDecl(), C
.VoidTy
, Fn
, CGFI
, Args
, Loc
, Loc
);
2506 CGBuilderTy
&Bld
= CGF
.Builder
;
2508 Address AddrReduceListArg
= CGF
.GetAddrOfLocalVar(&ReduceListArg
);
2509 Address AddrBufferArg
= CGF
.GetAddrOfLocalVar(&BufferArg
);
2510 llvm::Type
*ElemTy
= CGF
.ConvertTypeForMem(ReductionArrayTy
);
2511 Address
LocalReduceList(
2512 Bld
.CreatePointerBitCastOrAddrSpaceCast(
2513 CGF
.EmitLoadOfScalar(AddrReduceListArg
, /*Volatile=*/false,
2515 ElemTy
->getPointerTo()),
2516 ElemTy
, CGF
.getPointerAlign());
2517 QualType StaticTy
= C
.getRecordType(TeamReductionRec
);
2518 llvm::Type
*LLVMReductionsBufferTy
=
2519 CGM
.getTypes().ConvertTypeForMem(StaticTy
);
2520 llvm::Value
*BufferArrPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2521 CGF
.EmitLoadOfScalar(AddrBufferArg
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
),
2522 LLVMReductionsBufferTy
->getPointerTo());
2524 llvm::Value
*Idxs
[] = {llvm::ConstantInt::getNullValue(CGF
.Int32Ty
),
2525 CGF
.EmitLoadOfScalar(CGF
.GetAddrOfLocalVar(&IdxArg
),
2526 /*Volatile=*/false, C
.IntTy
,
2529 for (const Expr
*Private
: Privates
) {
2530 // Reduce element = LocalReduceList[i]
2531 Address ElemPtrPtrAddr
= Bld
.CreateConstArrayGEP(LocalReduceList
, Idx
);
2532 llvm::Value
*ElemPtrPtr
= CGF
.EmitLoadOfScalar(
2533 ElemPtrPtrAddr
, /*Volatile=*/false, C
.VoidPtrTy
, SourceLocation());
2534 // elemptr = ((CopyType*)(elemptrptr)) + I
2535 ElemTy
= CGF
.ConvertTypeForMem(Private
->getType());
2536 ElemPtrPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2537 ElemPtrPtr
, ElemTy
->getPointerTo());
2539 Address(ElemPtrPtr
, ElemTy
, C
.getTypeAlignInChars(Private
->getType()));
2540 const ValueDecl
*VD
= cast
<DeclRefExpr
>(Private
)->getDecl();
2541 // Global = Buffer.VD[Idx];
2542 const FieldDecl
*FD
= VarFieldMap
.lookup(VD
);
2543 LValue GlobLVal
= CGF
.EmitLValueForField(
2544 CGF
.MakeNaturalAlignAddrLValue(BufferArrPtr
, StaticTy
), FD
);
2545 Address GlobAddr
= GlobLVal
.getAddress(CGF
);
2546 llvm::Value
*BufferPtr
= Bld
.CreateInBoundsGEP(GlobAddr
.getElementType(),
2547 GlobAddr
.getPointer(), Idxs
);
2548 GlobLVal
.setAddress(Address(BufferPtr
,
2549 CGF
.ConvertTypeForMem(Private
->getType()),
2550 GlobAddr
.getAlignment()));
2551 switch (CGF
.getEvaluationKind(Private
->getType())) {
2553 llvm::Value
*V
= CGF
.EmitLoadOfScalar(GlobLVal
, Loc
);
2554 CGF
.EmitStoreOfScalar(V
, ElemPtr
, /*Volatile=*/false, Private
->getType(),
2555 LValueBaseInfo(AlignmentSource::Type
),
2560 CodeGenFunction::ComplexPairTy V
= CGF
.EmitLoadOfComplex(GlobLVal
, Loc
);
2561 CGF
.EmitStoreOfComplex(V
, CGF
.MakeAddrLValue(ElemPtr
, Private
->getType()),
2566 CGF
.EmitAggregateCopy(CGF
.MakeAddrLValue(ElemPtr
, Private
->getType()),
2567 GlobLVal
, Private
->getType(),
2568 AggValueSlot::DoesNotOverlap
);
2574 CGF
.FinishFunction();
2578 /// This function emits a helper that reduces all the reduction variables from
2579 /// the team into the provided global buffer for the reduction variables.
2581 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2582 /// void *GlobPtrs[];
2583 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2585 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2586 /// reduce_function(reduce_data, GlobPtrs);
2587 static llvm::Value
*emitGlobalToListReduceFunction(
2588 CodeGenModule
&CGM
, ArrayRef
<const Expr
*> Privates
,
2589 QualType ReductionArrayTy
, SourceLocation Loc
,
2590 const RecordDecl
*TeamReductionRec
,
2591 const llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
2593 llvm::Function
*ReduceFn
) {
2594 ASTContext
&C
= CGM
.getContext();
2596 // Buffer: global reduction buffer.
2597 ImplicitParamDecl
BufferArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2598 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2599 // Idx: index of the buffer.
2600 ImplicitParamDecl
IdxArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr, C
.IntTy
,
2601 ImplicitParamDecl::Other
);
2602 // ReduceList: thread local Reduce list.
2603 ImplicitParamDecl
ReduceListArg(C
, /*DC=*/nullptr, Loc
, /*Id=*/nullptr,
2604 C
.VoidPtrTy
, ImplicitParamDecl::Other
);
2605 FunctionArgList Args
;
2606 Args
.push_back(&BufferArg
);
2607 Args
.push_back(&IdxArg
);
2608 Args
.push_back(&ReduceListArg
);
2610 const CGFunctionInfo
&CGFI
=
2611 CGM
.getTypes().arrangeBuiltinFunctionDeclaration(C
.VoidTy
, Args
);
2612 auto *Fn
= llvm::Function::Create(
2613 CGM
.getTypes().GetFunctionType(CGFI
), llvm::GlobalValue::InternalLinkage
,
2614 "_omp_reduction_global_to_list_reduce_func", &CGM
.getModule());
2615 CGM
.SetInternalFunctionAttributes(GlobalDecl(), Fn
, CGFI
);
2616 Fn
->setDoesNotRecurse();
2617 CodeGenFunction
CGF(CGM
);
2618 CGF
.StartFunction(GlobalDecl(), C
.VoidTy
, Fn
, CGFI
, Args
, Loc
, Loc
);
2620 CGBuilderTy
&Bld
= CGF
.Builder
;
2622 Address AddrBufferArg
= CGF
.GetAddrOfLocalVar(&BufferArg
);
2623 QualType StaticTy
= C
.getRecordType(TeamReductionRec
);
2624 llvm::Type
*LLVMReductionsBufferTy
=
2625 CGM
.getTypes().ConvertTypeForMem(StaticTy
);
2626 llvm::Value
*BufferArrPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
2627 CGF
.EmitLoadOfScalar(AddrBufferArg
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
),
2628 LLVMReductionsBufferTy
->getPointerTo());
2630 // 1. Build a list of reduction variables.
2631 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2632 Address ReductionList
=
2633 CGF
.CreateMemTemp(ReductionArrayTy
, ".omp.reduction.red_list");
2634 auto IPriv
= Privates
.begin();
2635 llvm::Value
*Idxs
[] = {llvm::ConstantInt::getNullValue(CGF
.Int32Ty
),
2636 CGF
.EmitLoadOfScalar(CGF
.GetAddrOfLocalVar(&IdxArg
),
2637 /*Volatile=*/false, C
.IntTy
,
2640 for (unsigned I
= 0, E
= Privates
.size(); I
< E
; ++I
, ++IPriv
, ++Idx
) {
2641 Address Elem
= CGF
.Builder
.CreateConstArrayGEP(ReductionList
, Idx
);
2642 // Global = Buffer.VD[Idx];
2643 const ValueDecl
*VD
= cast
<DeclRefExpr
>(*IPriv
)->getDecl();
2644 const FieldDecl
*FD
= VarFieldMap
.lookup(VD
);
2645 LValue GlobLVal
= CGF
.EmitLValueForField(
2646 CGF
.MakeNaturalAlignAddrLValue(BufferArrPtr
, StaticTy
), FD
);
2647 Address GlobAddr
= GlobLVal
.getAddress(CGF
);
2648 llvm::Value
*BufferPtr
= Bld
.CreateInBoundsGEP(
2649 GlobAddr
.getElementType(), GlobAddr
.getPointer(), Idxs
);
2650 CGF
.EmitStoreOfScalar(BufferPtr
, Elem
, /*Volatile=*/false, C
.VoidPtrTy
);
2651 if ((*IPriv
)->getType()->isVariablyModifiedType()) {
2652 // Store array size.
2654 Elem
= CGF
.Builder
.CreateConstArrayGEP(ReductionList
, Idx
);
2655 llvm::Value
*Size
= CGF
.Builder
.CreateIntCast(
2657 CGF
.getContext().getAsVariableArrayType((*IPriv
)->getType()))
2659 CGF
.SizeTy
, /*isSigned=*/false);
2660 CGF
.Builder
.CreateStore(CGF
.Builder
.CreateIntToPtr(Size
, CGF
.VoidPtrTy
),
2665 // Call reduce_function(ReduceList, GlobalReduceList)
2666 llvm::Value
*GlobalReduceList
= ReductionList
.getPointer();
2667 Address AddrReduceListArg
= CGF
.GetAddrOfLocalVar(&ReduceListArg
);
2668 llvm::Value
*ReducedPtr
= CGF
.EmitLoadOfScalar(
2669 AddrReduceListArg
, /*Volatile=*/false, C
.VoidPtrTy
, Loc
);
2670 CGM
.getOpenMPRuntime().emitOutlinedFunctionCall(
2671 CGF
, Loc
, ReduceFn
, {ReducedPtr
, GlobalReduceList
});
2672 CGF
.FinishFunction();
2677 /// Design of OpenMP reductions on the GPU
2679 /// Consider a typical OpenMP program with one or more reduction
2684 /// #pragma omp target teams distribute parallel for \
2685 /// reduction(+:foo) reduction(*:bar)
2686 /// for (int i = 0; i < N; i++) {
2687 /// foo += A[i]; bar *= B[i];
2690 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
2691 /// all teams. In our OpenMP implementation on the NVPTX device an
2692 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2693 /// within a team are mapped to CUDA threads within a threadblock.
2694 /// Our goal is to efficiently aggregate values across all OpenMP
2695 /// threads such that:
2697 /// - the compiler and runtime are logically concise, and
2698 /// - the reduction is performed efficiently in a hierarchical
2699 /// manner as follows: within OpenMP threads in the same warp,
2700 /// across warps in a threadblock, and finally across teams on
2701 /// the NVPTX device.
2703 /// Introduction to Decoupling
2705 /// We would like to decouple the compiler and the runtime so that the
2706 /// latter is ignorant of the reduction variables (number, data types)
2707 /// and the reduction operators. This allows a simpler interface
2708 /// and implementation while still attaining good performance.
2710 /// Pseudocode for the aforementioned OpenMP program generated by the
2711 /// compiler is as follows:
2713 /// 1. Create private copies of reduction variables on each OpenMP
2714 /// thread: 'foo_private', 'bar_private'
2715 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2716 /// to it and writes the result in 'foo_private' and 'bar_private'
2718 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
2719 /// and store the result on the team master:
2721 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2722 /// reduceData, shuffleReduceFn, interWarpCpyFn)
2725 /// struct ReduceData {
2729 /// reduceData.foo = &foo_private
2730 /// reduceData.bar = &bar_private
2732 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2733 /// auxiliary functions generated by the compiler that operate on
2734 /// variables of type 'ReduceData'. They aid the runtime perform
2735 /// algorithmic steps in a data agnostic manner.
2737 /// 'shuffleReduceFn' is a pointer to a function that reduces data
2738 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2739 /// same warp. It takes the following arguments as input:
2741 /// a. variable of type 'ReduceData' on the calling lane,
2743 /// c. an offset relative to the current lane_id to generate a
2744 /// remote_lane_id. The remote lane contains the second
2745 /// variable of type 'ReduceData' that is to be reduced.
2746 /// d. an algorithm version parameter determining which reduction
2747 /// algorithm to use.
2749 /// 'shuffleReduceFn' retrieves data from the remote lane using
2750 /// efficient GPU shuffle intrinsics and reduces, using the
2751 /// algorithm specified by the 4th parameter, the two operands
2752 /// element-wise. The result is written to the first operand.
2754 /// Different reduction algorithms are implemented in different
2755 /// runtime functions, all calling 'shuffleReduceFn' to perform
2756 /// the essential reduction step. Therefore, based on the 4th
2757 /// parameter, this function behaves slightly differently to
2758 /// cooperate with the runtime to ensure correctness under
2759 /// different circumstances.
2761 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2762 /// reduced variables across warps. It tunnels, through CUDA
2763 /// shared memory, the thread-private data of type 'ReduceData'
2764 /// from lane 0 of each warp to a lane in the first warp.
2765 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2766 /// The last team writes the global reduced value to memory.
2768 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2769 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2770 /// scratchpadCopyFn, loadAndReduceFn)
2772 /// 'scratchpadCopyFn' is a helper that stores reduced
2773 /// data from the team master to a scratchpad array in
2776 /// 'loadAndReduceFn' is a helper that loads data from
2777 /// the scratchpad array and reduces it with the input
2780 /// These compiler generated functions hide address
2781 /// calculation and alignment information from the runtime.
2783 /// The team master of the last team stores the reduced
2784 /// result to the globals in memory.
2785 /// foo += reduceData.foo; bar *= reduceData.bar
2788 /// Warp Reduction Algorithms
2790 /// On the warp level, we have three algorithms implemented in the
2791 /// OpenMP runtime depending on the number of active lanes:
2793 /// Full Warp Reduction
2795 /// The reduce algorithm within a warp where all lanes are active
2796 /// is implemented in the runtime as follows:
2798 /// full_warp_reduce(void *reduce_data,
2799 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2800 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2801 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2804 /// The algorithm completes in log(2, WARPSIZE) steps.
2806 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2807 /// not used therefore we save instructions by not retrieving lane_id
2808 /// from the corresponding special registers. The 4th parameter, which
2809 /// represents the version of the algorithm being used, is set to 0 to
2810 /// signify full warp reduction.
2812 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2814 /// #reduce_elem refers to an element in the local lane's data structure
2815 /// #remote_elem is retrieved from a remote lane
2816 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2817 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2819 /// Contiguous Partial Warp Reduction
2821 /// This reduce algorithm is used within a warp where only the first
2822 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2823 /// number of OpenMP threads in a parallel region is not a multiple of
2824 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2827 /// contiguous_partial_reduce(void *reduce_data,
2828 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2829 /// int size, int lane_id) {
2832 /// curr_size = size;
2833 /// mask = curr_size/2;
2834 /// while (offset>0) {
2835 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2836 /// curr_size = (curr_size+1)/2;
2837 /// offset = curr_size/2;
2841 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2843 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2844 /// if (lane_id < offset)
2845 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2847 /// reduce_elem = remote_elem
2849 /// This algorithm assumes that the data to be reduced are located in a
2850 /// contiguous subset of lanes starting from the first. When there is
2851 /// an odd number of active lanes, the data in the last lane is not
2852 /// aggregated with any other lane's dat but is instead copied over.
2854 /// Dispersed Partial Warp Reduction
2856 /// This algorithm is used within a warp when any discontiguous subset of
2857 /// lanes are active. It is used to implement the reduction operation
2858 /// across lanes in an OpenMP simd region or in a nested parallel region.
2861 /// dispersed_partial_reduce(void *reduce_data,
2862 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2863 /// int size, remote_id;
2864 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2866 /// remote_id = next_active_lane_id_right_after_me();
2867 /// # the above function returns 0 of no active lane
2868 /// # is present right after the current lane.
2869 /// size = number_of_active_lanes_in_this_warp();
2870 /// logical_lane_id /= 2;
2871 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2872 /// remote_id-1-threadIdx.x, 2);
2873 /// } while (logical_lane_id % 2 == 0 && size > 1);
2876 /// There is no assumption made about the initial state of the reduction.
2877 /// Any number of lanes (>=1) could be active at any position. The reduction
2878 /// result is returned in the first active lane.
2880 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2882 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2883 /// if (lane_id % 2 == 0 && offset > 0)
2884 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2886 /// reduce_elem = remote_elem
2889 /// Intra-Team Reduction
2891 /// This function, as implemented in the runtime call
2892 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2893 /// threads in a team. It first reduces within a warp using the
2894 /// aforementioned algorithms. We then proceed to gather all such
2895 /// reduced values at the first warp.
2897 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2898 /// data from each of the "warp master" (zeroth lane of each warp, where
2899 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2900 /// a mathematical sense) the problem of reduction across warp masters in
2901 /// a block to the problem of warp reduction.
2904 /// Inter-Team Reduction
2906 /// Once a team has reduced its data to a single value, it is stored in
2907 /// a global scratchpad array. Since each team has a distinct slot, this
2908 /// can be done without locking.
2910 /// The last team to write to the scratchpad array proceeds to reduce the
2911 /// scratchpad array. One or more workers in the last team use the helper
2912 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2913 /// the k'th worker reduces every k'th element.
2915 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2916 /// reduce across workers and compute a globally reduced value.
2918 void CGOpenMPRuntimeGPU::emitReduction(
2919 CodeGenFunction
&CGF
, SourceLocation Loc
, ArrayRef
<const Expr
*> Privates
,
2920 ArrayRef
<const Expr
*> LHSExprs
, ArrayRef
<const Expr
*> RHSExprs
,
2921 ArrayRef
<const Expr
*> ReductionOps
, ReductionOptionsTy Options
) {
2922 if (!CGF
.HaveInsertPoint())
2925 bool ParallelReduction
= isOpenMPParallelDirective(Options
.ReductionKind
);
2927 bool TeamsReduction
= isOpenMPTeamsDirective(Options
.ReductionKind
);
2930 if (Options
.SimpleReduction
) {
2931 assert(!TeamsReduction
&& !ParallelReduction
&&
2932 "Invalid reduction selection in emitReduction.");
2933 CGOpenMPRuntime::emitReduction(CGF
, Loc
, Privates
, LHSExprs
, RHSExprs
,
2934 ReductionOps
, Options
);
2938 assert((TeamsReduction
|| ParallelReduction
) &&
2939 "Invalid reduction selection in emitReduction.");
2941 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2942 // RedList, shuffle_reduce_func, interwarp_copy_func);
2944 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2945 llvm::Value
*RTLoc
= emitUpdateLocation(CGF
, Loc
);
2946 llvm::Value
*ThreadId
= getThreadID(CGF
, Loc
);
2949 ASTContext
&C
= CGM
.getContext();
2950 // 1. Build a list of reduction variables.
2951 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2952 auto Size
= RHSExprs
.size();
2953 for (const Expr
*E
: Privates
) {
2954 if (E
->getType()->isVariablyModifiedType())
2955 // Reserve place for array size.
2958 llvm::APInt
ArraySize(/*unsigned int numBits=*/32, Size
);
2959 QualType ReductionArrayTy
= C
.getConstantArrayType(
2960 C
.VoidPtrTy
, ArraySize
, nullptr, ArraySizeModifier::Normal
,
2961 /*IndexTypeQuals=*/0);
2962 Address ReductionList
=
2963 CGF
.CreateMemTemp(ReductionArrayTy
, ".omp.reduction.red_list");
2964 auto IPriv
= Privates
.begin();
2966 for (unsigned I
= 0, E
= RHSExprs
.size(); I
< E
; ++I
, ++IPriv
, ++Idx
) {
2967 Address Elem
= CGF
.Builder
.CreateConstArrayGEP(ReductionList
, Idx
);
2968 CGF
.Builder
.CreateStore(
2969 CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(
2970 CGF
.EmitLValue(RHSExprs
[I
]).getPointer(CGF
), CGF
.VoidPtrTy
),
2972 if ((*IPriv
)->getType()->isVariablyModifiedType()) {
2973 // Store array size.
2975 Elem
= CGF
.Builder
.CreateConstArrayGEP(ReductionList
, Idx
);
2976 llvm::Value
*Size
= CGF
.Builder
.CreateIntCast(
2978 CGF
.getContext().getAsVariableArrayType((*IPriv
)->getType()))
2980 CGF
.SizeTy
, /*isSigned=*/false);
2981 CGF
.Builder
.CreateStore(CGF
.Builder
.CreateIntToPtr(Size
, CGF
.VoidPtrTy
),
2986 llvm::Value
*RL
= CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(
2987 ReductionList
.getPointer(), CGF
.VoidPtrTy
);
2988 llvm::Function
*ReductionFn
= emitReductionFunction(
2989 CGF
.CurFn
->getName(), Loc
, CGF
.ConvertTypeForMem(ReductionArrayTy
),
2990 Privates
, LHSExprs
, RHSExprs
, ReductionOps
);
2991 llvm::Value
*ReductionArrayTySize
= CGF
.getTypeSize(ReductionArrayTy
);
2992 llvm::Function
*ShuffleAndReduceFn
= emitShuffleAndReduceFunction(
2993 CGM
, Privates
, ReductionArrayTy
, ReductionFn
, Loc
);
2994 llvm::Value
*InterWarpCopyFn
=
2995 emitInterWarpCopyFunction(CGM
, Privates
, ReductionArrayTy
, Loc
);
2997 if (ParallelReduction
) {
2998 llvm::Value
*Args
[] = {RTLoc
,
3000 CGF
.Builder
.getInt32(RHSExprs
.size()),
3001 ReductionArrayTySize
,
3006 Res
= CGF
.EmitRuntimeCall(
3007 OMPBuilder
.getOrCreateRuntimeFunction(
3008 CGM
.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2
),
3011 assert(TeamsReduction
&& "expected teams reduction.");
3012 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*> VarFieldMap
;
3013 llvm::SmallVector
<const ValueDecl
*, 4> PrivatesReductions(Privates
.size());
3015 for (const Expr
*DRE
: Privates
) {
3016 PrivatesReductions
[Cnt
] = cast
<DeclRefExpr
>(DRE
)->getDecl();
3019 const RecordDecl
*TeamReductionRec
= ::buildRecordForGlobalizedVars(
3020 CGM
.getContext(), PrivatesReductions
, std::nullopt
, VarFieldMap
,
3021 C
.getLangOpts().OpenMPCUDAReductionBufNum
);
3022 TeamsReductions
.push_back(TeamReductionRec
);
3023 auto *KernelTeamsReductionPtr
= CGF
.EmitRuntimeCall(
3024 OMPBuilder
.getOrCreateRuntimeFunction(
3025 CGM
.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer
),
3026 {}, "_openmp_teams_reductions_buffer_$_$ptr");
3027 llvm::Value
*GlobalToBufferCpyFn
= ::emitListToGlobalCopyFunction(
3028 CGM
, Privates
, ReductionArrayTy
, Loc
, TeamReductionRec
, VarFieldMap
);
3029 llvm::Value
*GlobalToBufferRedFn
= ::emitListToGlobalReduceFunction(
3030 CGM
, Privates
, ReductionArrayTy
, Loc
, TeamReductionRec
, VarFieldMap
,
3032 llvm::Value
*BufferToGlobalCpyFn
= ::emitGlobalToListCopyFunction(
3033 CGM
, Privates
, ReductionArrayTy
, Loc
, TeamReductionRec
, VarFieldMap
);
3034 llvm::Value
*BufferToGlobalRedFn
= ::emitGlobalToListReduceFunction(
3035 CGM
, Privates
, ReductionArrayTy
, Loc
, TeamReductionRec
, VarFieldMap
,
3038 llvm::Value
*Args
[] = {
3041 KernelTeamsReductionPtr
,
3042 CGF
.Builder
.getInt32(C
.getLangOpts().OpenMPCUDAReductionBufNum
),
3046 GlobalToBufferCpyFn
,
3047 GlobalToBufferRedFn
,
3048 BufferToGlobalCpyFn
,
3049 BufferToGlobalRedFn
};
3051 Res
= CGF
.EmitRuntimeCall(
3052 OMPBuilder
.getOrCreateRuntimeFunction(
3053 CGM
.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2
),
3057 // 5. Build if (res == 1)
3058 llvm::BasicBlock
*ExitBB
= CGF
.createBasicBlock(".omp.reduction.done");
3059 llvm::BasicBlock
*ThenBB
= CGF
.createBasicBlock(".omp.reduction.then");
3060 llvm::Value
*Cond
= CGF
.Builder
.CreateICmpEQ(
3061 Res
, llvm::ConstantInt::get(CGM
.Int32Ty
, /*V=*/1));
3062 CGF
.Builder
.CreateCondBr(Cond
, ThenBB
, ExitBB
);
3064 // 6. Build then branch: where we have reduced values in the master
3065 // thread in each team.
3066 // __kmpc_end_reduce{_nowait}(<gtid>);
3068 CGF
.EmitBlock(ThenBB
);
3070 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3071 auto &&CodeGen
= [Privates
, LHSExprs
, RHSExprs
, ReductionOps
,
3072 this](CodeGenFunction
&CGF
, PrePostActionTy
&Action
) {
3073 auto IPriv
= Privates
.begin();
3074 auto ILHS
= LHSExprs
.begin();
3075 auto IRHS
= RHSExprs
.begin();
3076 for (const Expr
*E
: ReductionOps
) {
3077 emitSingleReductionCombiner(CGF
, E
, *IPriv
, cast
<DeclRefExpr
>(*ILHS
),
3078 cast
<DeclRefExpr
>(*IRHS
));
3084 RegionCodeGenTy
RCG(CodeGen
);
3086 // There is no need to emit line number for unconditional branch.
3087 (void)ApplyDebugLocation::CreateEmpty(CGF
);
3088 CGF
.EmitBlock(ExitBB
, /*IsFinished=*/true);
3092 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl
*FD
,
3093 const VarDecl
*NativeParam
) const {
3094 if (!NativeParam
->getType()->isReferenceType())
3096 QualType ArgType
= NativeParam
->getType();
3097 QualifierCollector QC
;
3098 const Type
*NonQualTy
= QC
.strip(ArgType
);
3099 QualType PointeeTy
= cast
<ReferenceType
>(NonQualTy
)->getPointeeType();
3100 if (const auto *Attr
= FD
->getAttr
<OMPCaptureKindAttr
>()) {
3101 if (Attr
->getCaptureKind() == OMPC_map
) {
3102 PointeeTy
= CGM
.getContext().getAddrSpaceQualType(PointeeTy
,
3103 LangAS::opencl_global
);
3106 ArgType
= CGM
.getContext().getPointerType(PointeeTy
);
3108 enum { NVPTX_local_addr
= 5 };
3109 QC
.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr
));
3110 ArgType
= QC
.apply(CGM
.getContext(), ArgType
);
3111 if (isa
<ImplicitParamDecl
>(NativeParam
))
3112 return ImplicitParamDecl::Create(
3113 CGM
.getContext(), /*DC=*/nullptr, NativeParam
->getLocation(),
3114 NativeParam
->getIdentifier(), ArgType
, ImplicitParamDecl::Other
);
3115 return ParmVarDecl::Create(
3117 const_cast<DeclContext
*>(NativeParam
->getDeclContext()),
3118 NativeParam
->getBeginLoc(), NativeParam
->getLocation(),
3119 NativeParam
->getIdentifier(), ArgType
,
3120 /*TInfo=*/nullptr, SC_None
, /*DefArg=*/nullptr);
3124 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction
&CGF
,
3125 const VarDecl
*NativeParam
,
3126 const VarDecl
*TargetParam
) const {
3127 assert(NativeParam
!= TargetParam
&&
3128 NativeParam
->getType()->isReferenceType() &&
3129 "Native arg must not be the same as target arg.");
3130 Address LocalAddr
= CGF
.GetAddrOfLocalVar(TargetParam
);
3131 QualType NativeParamType
= NativeParam
->getType();
3132 QualifierCollector QC
;
3133 const Type
*NonQualTy
= QC
.strip(NativeParamType
);
3134 QualType NativePointeeTy
= cast
<ReferenceType
>(NonQualTy
)->getPointeeType();
3135 unsigned NativePointeeAddrSpace
=
3136 CGF
.getTypes().getTargetAddressSpace(NativePointeeTy
);
3137 QualType TargetTy
= TargetParam
->getType();
3138 llvm::Value
*TargetAddr
= CGF
.EmitLoadOfScalar(LocalAddr
, /*Volatile=*/false,
3139 TargetTy
, SourceLocation());
3140 // First cast to generic.
3141 TargetAddr
= CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(
3143 llvm::PointerType::get(CGF
.getLLVMContext(), /*AddrSpace=*/0));
3144 // Cast from generic to native address space.
3145 TargetAddr
= CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(
3147 llvm::PointerType::get(CGF
.getLLVMContext(), NativePointeeAddrSpace
));
3148 Address NativeParamAddr
= CGF
.CreateMemTemp(NativeParamType
);
3149 CGF
.EmitStoreOfScalar(TargetAddr
, NativeParamAddr
, /*Volatile=*/false,
3151 return NativeParamAddr
;
3154 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3155 CodeGenFunction
&CGF
, SourceLocation Loc
, llvm::FunctionCallee OutlinedFn
,
3156 ArrayRef
<llvm::Value
*> Args
) const {
3157 SmallVector
<llvm::Value
*, 4> TargetArgs
;
3158 TargetArgs
.reserve(Args
.size());
3159 auto *FnType
= OutlinedFn
.getFunctionType();
3160 for (unsigned I
= 0, E
= Args
.size(); I
< E
; ++I
) {
3161 if (FnType
->isVarArg() && FnType
->getNumParams() <= I
) {
3162 TargetArgs
.append(std::next(Args
.begin(), I
), Args
.end());
3165 llvm::Type
*TargetType
= FnType
->getParamType(I
);
3166 llvm::Value
*NativeArg
= Args
[I
];
3167 if (!TargetType
->isPointerTy()) {
3168 TargetArgs
.emplace_back(NativeArg
);
3171 llvm::Value
*TargetArg
= CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(
3173 llvm::PointerType::get(CGF
.getLLVMContext(), /*AddrSpace*/ 0));
3174 TargetArgs
.emplace_back(
3175 CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(TargetArg
, TargetType
));
3177 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF
, Loc
, OutlinedFn
, TargetArgs
);
3180 /// Emit function which wraps the outline parallel region
3181 /// and controls the arguments which are passed to this function.
3182 /// The wrapper ensures that the outlined function is called
3183 /// with the correct arguments when data is shared.
3184 llvm::Function
*CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3185 llvm::Function
*OutlinedParallelFn
, const OMPExecutableDirective
&D
) {
3186 ASTContext
&Ctx
= CGM
.getContext();
3187 const auto &CS
= *D
.getCapturedStmt(OMPD_parallel
);
3189 // Create a function that takes as argument the source thread.
3190 FunctionArgList WrapperArgs
;
3192 Ctx
.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3194 Ctx
.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3195 ImplicitParamDecl
ParallelLevelArg(Ctx
, /*DC=*/nullptr, D
.getBeginLoc(),
3196 /*Id=*/nullptr, Int16QTy
,
3197 ImplicitParamDecl::Other
);
3198 ImplicitParamDecl
WrapperArg(Ctx
, /*DC=*/nullptr, D
.getBeginLoc(),
3199 /*Id=*/nullptr, Int32QTy
,
3200 ImplicitParamDecl::Other
);
3201 WrapperArgs
.emplace_back(&ParallelLevelArg
);
3202 WrapperArgs
.emplace_back(&WrapperArg
);
3204 const CGFunctionInfo
&CGFI
=
3205 CGM
.getTypes().arrangeBuiltinFunctionDeclaration(Ctx
.VoidTy
, WrapperArgs
);
3207 auto *Fn
= llvm::Function::Create(
3208 CGM
.getTypes().GetFunctionType(CGFI
), llvm::GlobalValue::InternalLinkage
,
3209 Twine(OutlinedParallelFn
->getName(), "_wrapper"), &CGM
.getModule());
3211 // Ensure we do not inline the function. This is trivially true for the ones
3212 // passed to __kmpc_fork_call but the ones calles in serialized regions
3213 // could be inlined. This is not a perfect but it is closer to the invariant
3214 // we want, namely, every data environment starts with a new function.
3215 // TODO: We should pass the if condition to the runtime function and do the
3216 // handling there. Much cleaner code.
3217 Fn
->addFnAttr(llvm::Attribute::NoInline
);
3219 CGM
.SetInternalFunctionAttributes(GlobalDecl(), Fn
, CGFI
);
3220 Fn
->setLinkage(llvm::GlobalValue::InternalLinkage
);
3221 Fn
->setDoesNotRecurse();
3223 CodeGenFunction
CGF(CGM
, /*suppressNewContext=*/true);
3224 CGF
.StartFunction(GlobalDecl(), Ctx
.VoidTy
, Fn
, CGFI
, WrapperArgs
,
3225 D
.getBeginLoc(), D
.getBeginLoc());
3227 const auto *RD
= CS
.getCapturedRecordDecl();
3228 auto CurField
= RD
->field_begin();
3230 Address ZeroAddr
= CGF
.CreateDefaultAlignTempAlloca(CGF
.Int32Ty
,
3231 /*Name=*/".zero.addr");
3232 CGF
.Builder
.CreateStore(CGF
.Builder
.getInt32(/*C*/ 0), ZeroAddr
);
3233 // Get the array of arguments.
3234 SmallVector
<llvm::Value
*, 8> Args
;
3236 Args
.emplace_back(CGF
.GetAddrOfLocalVar(&WrapperArg
).getPointer());
3237 Args
.emplace_back(ZeroAddr
.getPointer());
3239 CGBuilderTy
&Bld
= CGF
.Builder
;
3240 auto CI
= CS
.capture_begin();
3242 // Use global memory for data sharing.
3243 // Handle passing of global args to workers.
3244 Address GlobalArgs
=
3245 CGF
.CreateDefaultAlignTempAlloca(CGF
.VoidPtrPtrTy
, "global_args");
3246 llvm::Value
*GlobalArgsPtr
= GlobalArgs
.getPointer();
3247 llvm::Value
*DataSharingArgs
[] = {GlobalArgsPtr
};
3248 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
3249 CGM
.getModule(), OMPRTL___kmpc_get_shared_variables
),
3252 // Retrieve the shared variables from the list of references returned
3253 // by the runtime. Pass the variables to the outlined function.
3254 Address SharedArgListAddress
= Address::invalid();
3255 if (CS
.capture_size() > 0 ||
3256 isOpenMPLoopBoundSharingDirective(D
.getDirectiveKind())) {
3257 SharedArgListAddress
= CGF
.EmitLoadOfPointer(
3258 GlobalArgs
, CGF
.getContext()
3259 .getPointerType(CGF
.getContext().VoidPtrTy
)
3260 .castAs
<PointerType
>());
3263 if (isOpenMPLoopBoundSharingDirective(D
.getDirectiveKind())) {
3264 Address Src
= Bld
.CreateConstInBoundsGEP(SharedArgListAddress
, Idx
);
3265 Address TypedAddress
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
3266 Src
, CGF
.SizeTy
->getPointerTo(), CGF
.SizeTy
);
3267 llvm::Value
*LB
= CGF
.EmitLoadOfScalar(
3270 CGF
.getContext().getPointerType(CGF
.getContext().getSizeType()),
3271 cast
<OMPLoopDirective
>(D
).getLowerBoundVariable()->getExprLoc());
3272 Args
.emplace_back(LB
);
3274 Src
= Bld
.CreateConstInBoundsGEP(SharedArgListAddress
, Idx
);
3275 TypedAddress
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
3276 Src
, CGF
.SizeTy
->getPointerTo(), CGF
.SizeTy
);
3277 llvm::Value
*UB
= CGF
.EmitLoadOfScalar(
3280 CGF
.getContext().getPointerType(CGF
.getContext().getSizeType()),
3281 cast
<OMPLoopDirective
>(D
).getUpperBoundVariable()->getExprLoc());
3282 Args
.emplace_back(UB
);
3285 if (CS
.capture_size() > 0) {
3286 ASTContext
&CGFContext
= CGF
.getContext();
3287 for (unsigned I
= 0, E
= CS
.capture_size(); I
< E
; ++I
, ++CI
, ++CurField
) {
3288 QualType ElemTy
= CurField
->getType();
3289 Address Src
= Bld
.CreateConstInBoundsGEP(SharedArgListAddress
, I
+ Idx
);
3290 Address TypedAddress
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
3291 Src
, CGF
.ConvertTypeForMem(CGFContext
.getPointerType(ElemTy
)),
3292 CGF
.ConvertTypeForMem(ElemTy
));
3293 llvm::Value
*Arg
= CGF
.EmitLoadOfScalar(TypedAddress
,
3295 CGFContext
.getPointerType(ElemTy
),
3297 if (CI
->capturesVariableByCopy() &&
3298 !CI
->getCapturedVar()->getType()->isAnyPointerType()) {
3299 Arg
= castValueToType(CGF
, Arg
, ElemTy
, CGFContext
.getUIntPtrType(),
3302 Args
.emplace_back(Arg
);
3306 emitOutlinedFunctionCall(CGF
, D
.getBeginLoc(), OutlinedParallelFn
, Args
);
3307 CGF
.FinishFunction();
3311 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction
&CGF
,
3313 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic
)
3316 assert(D
&& "Expected function or captured|block decl.");
3317 assert(FunctionGlobalizedDecls
.count(CGF
.CurFn
) == 0 &&
3318 "Function is registered already.");
3319 assert((!TeamAndReductions
.first
|| TeamAndReductions
.first
== D
) &&
3320 "Team is set but not processed.");
3321 const Stmt
*Body
= nullptr;
3322 bool NeedToDelayGlobalization
= false;
3323 if (const auto *FD
= dyn_cast
<FunctionDecl
>(D
)) {
3324 Body
= FD
->getBody();
3325 } else if (const auto *BD
= dyn_cast
<BlockDecl
>(D
)) {
3326 Body
= BD
->getBody();
3327 } else if (const auto *CD
= dyn_cast
<CapturedDecl
>(D
)) {
3328 Body
= CD
->getBody();
3329 NeedToDelayGlobalization
= CGF
.CapturedStmtInfo
->getKind() == CR_OpenMP
;
3330 if (NeedToDelayGlobalization
&&
3331 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD
)
3336 CheckVarsEscapingDeclContext
VarChecker(CGF
, TeamAndReductions
.second
);
3337 VarChecker
.Visit(Body
);
3338 const RecordDecl
*GlobalizedVarsRecord
=
3339 VarChecker
.getGlobalizedRecord(IsInTTDRegion
);
3340 TeamAndReductions
.first
= nullptr;
3341 TeamAndReductions
.second
.clear();
3342 ArrayRef
<const ValueDecl
*> EscapedVariableLengthDecls
=
3343 VarChecker
.getEscapedVariableLengthDecls();
3344 ArrayRef
<const ValueDecl
*> DelayedVariableLengthDecls
=
3345 VarChecker
.getDelayedVariableLengthDecls();
3346 if (!GlobalizedVarsRecord
&& EscapedVariableLengthDecls
.empty() &&
3347 DelayedVariableLengthDecls
.empty())
3349 auto I
= FunctionGlobalizedDecls
.try_emplace(CGF
.CurFn
).first
;
3350 I
->getSecond().MappedParams
=
3351 std::make_unique
<CodeGenFunction::OMPMapVars
>();
3352 I
->getSecond().EscapedParameters
.insert(
3353 VarChecker
.getEscapedParameters().begin(),
3354 VarChecker
.getEscapedParameters().end());
3355 I
->getSecond().EscapedVariableLengthDecls
.append(
3356 EscapedVariableLengthDecls
.begin(), EscapedVariableLengthDecls
.end());
3357 I
->getSecond().DelayedVariableLengthDecls
.append(
3358 DelayedVariableLengthDecls
.begin(), DelayedVariableLengthDecls
.end());
3359 DeclToAddrMapTy
&Data
= I
->getSecond().LocalVarData
;
3360 for (const ValueDecl
*VD
: VarChecker
.getEscapedDecls()) {
3361 assert(VD
->isCanonicalDecl() && "Expected canonical declaration");
3362 Data
.insert(std::make_pair(VD
, MappedVarData()));
3364 if (!NeedToDelayGlobalization
) {
3365 emitGenericVarsProlog(CGF
, D
->getBeginLoc());
3366 struct GlobalizationScope final
: EHScopeStack::Cleanup
{
3367 GlobalizationScope() = default;
3369 void Emit(CodeGenFunction
&CGF
, Flags flags
) override
{
3370 static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime())
3371 .emitGenericVarsEpilog(CGF
);
3374 CGF
.EHStack
.pushCleanup
<GlobalizationScope
>(NormalAndEHCleanup
);
3378 Address
CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction
&CGF
,
3379 const VarDecl
*VD
) {
3380 if (VD
&& VD
->hasAttr
<OMPAllocateDeclAttr
>()) {
3381 const auto *A
= VD
->getAttr
<OMPAllocateDeclAttr
>();
3382 auto AS
= LangAS::Default
;
3383 switch (A
->getAllocatorType()) {
3384 // Use the default allocator here as by default local vars are
3386 case OMPAllocateDeclAttr::OMPNullMemAlloc
:
3387 case OMPAllocateDeclAttr::OMPDefaultMemAlloc
:
3388 case OMPAllocateDeclAttr::OMPThreadMemAlloc
:
3389 case OMPAllocateDeclAttr::OMPHighBWMemAlloc
:
3390 case OMPAllocateDeclAttr::OMPLowLatMemAlloc
:
3391 // Follow the user decision - use default allocation.
3392 return Address::invalid();
3393 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc
:
3394 // TODO: implement aupport for user-defined allocators.
3395 return Address::invalid();
3396 case OMPAllocateDeclAttr::OMPConstMemAlloc
:
3397 AS
= LangAS::cuda_constant
;
3399 case OMPAllocateDeclAttr::OMPPTeamMemAlloc
:
3400 AS
= LangAS::cuda_shared
;
3402 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc
:
3403 case OMPAllocateDeclAttr::OMPCGroupMemAlloc
:
3406 llvm::Type
*VarTy
= CGF
.ConvertTypeForMem(VD
->getType());
3407 auto *GV
= new llvm::GlobalVariable(
3408 CGM
.getModule(), VarTy
, /*isConstant=*/false,
3409 llvm::GlobalValue::InternalLinkage
, llvm::PoisonValue::get(VarTy
),
3411 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal
,
3412 CGM
.getContext().getTargetAddressSpace(AS
));
3413 CharUnits Align
= CGM
.getContext().getDeclAlign(VD
);
3414 GV
->setAlignment(Align
.getAsAlign());
3416 CGF
.Builder
.CreatePointerBitCastOrAddrSpaceCast(
3417 GV
, VarTy
->getPointerTo(CGM
.getContext().getTargetAddressSpace(
3418 VD
->getType().getAddressSpace()))),
3422 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic
)
3423 return Address::invalid();
3425 VD
= VD
->getCanonicalDecl();
3426 auto I
= FunctionGlobalizedDecls
.find(CGF
.CurFn
);
3427 if (I
== FunctionGlobalizedDecls
.end())
3428 return Address::invalid();
3429 auto VDI
= I
->getSecond().LocalVarData
.find(VD
);
3430 if (VDI
!= I
->getSecond().LocalVarData
.end())
3431 return VDI
->second
.PrivateAddr
;
3432 if (VD
->hasAttrs()) {
3433 for (specific_attr_iterator
<OMPReferencedVarAttr
> IT(VD
->attr_begin()),
3436 auto VDI
= I
->getSecond().LocalVarData
.find(
3437 cast
<VarDecl
>(cast
<DeclRefExpr
>(IT
->getRef())->getDecl())
3438 ->getCanonicalDecl());
3439 if (VDI
!= I
->getSecond().LocalVarData
.end())
3440 return VDI
->second
.PrivateAddr
;
3444 return Address::invalid();
3447 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction
&CGF
) {
3448 FunctionGlobalizedDecls
.erase(CGF
.CurFn
);
3449 CGOpenMPRuntime::functionFinished(CGF
);
3452 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3453 CodeGenFunction
&CGF
, const OMPLoopDirective
&S
,
3454 OpenMPDistScheduleClauseKind
&ScheduleKind
,
3455 llvm::Value
*&Chunk
) const {
3456 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
3457 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD
) {
3458 ScheduleKind
= OMPC_DIST_SCHEDULE_static
;
3459 Chunk
= CGF
.EmitScalarConversion(
3460 RT
.getGPUNumThreads(CGF
),
3461 CGF
.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3462 S
.getIterationVariable()->getType(), S
.getBeginLoc());
3465 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3466 CGF
, S
, ScheduleKind
, Chunk
);
3469 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3470 CodeGenFunction
&CGF
, const OMPLoopDirective
&S
,
3471 OpenMPScheduleClauseKind
&ScheduleKind
,
3472 const Expr
*&ChunkExpr
) const {
3473 ScheduleKind
= OMPC_SCHEDULE_static
;
3474 // Chunk size is 1 in this case.
3475 llvm::APInt
ChunkSize(32, 1);
3476 ChunkExpr
= IntegerLiteral::Create(CGF
.getContext(), ChunkSize
,
3477 CGF
.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3481 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3482 CodeGenFunction
&CGF
, const OMPExecutableDirective
&D
) const {
3483 assert(isOpenMPTargetExecutionDirective(D
.getDirectiveKind()) &&
3484 " Expected target-based directive.");
3485 const CapturedStmt
*CS
= D
.getCapturedStmt(OMPD_target
);
3486 for (const CapturedStmt::Capture
&C
: CS
->captures()) {
3487 // Capture variables captured by reference in lambdas for target-based
3489 if (!C
.capturesVariable())
3491 const VarDecl
*VD
= C
.getCapturedVar();
3492 const auto *RD
= VD
->getType()
3494 .getNonReferenceType()
3495 ->getAsCXXRecordDecl();
3496 if (!RD
|| !RD
->isLambda())
3498 Address VDAddr
= CGF
.GetAddrOfLocalVar(VD
);
3500 if (VD
->getType().getCanonicalType()->isReferenceType())
3501 VDLVal
= CGF
.EmitLoadOfReferenceLValue(VDAddr
, VD
->getType());
3503 VDLVal
= CGF
.MakeAddrLValue(
3504 VDAddr
, VD
->getType().getCanonicalType().getNonReferenceType());
3505 llvm::DenseMap
<const ValueDecl
*, FieldDecl
*> Captures
;
3506 FieldDecl
*ThisCapture
= nullptr;
3507 RD
->getCaptureFields(Captures
, ThisCapture
);
3508 if (ThisCapture
&& CGF
.CapturedStmtInfo
->isCXXThisExprCaptured()) {
3510 CGF
.EmitLValueForFieldInitialization(VDLVal
, ThisCapture
);
3511 llvm::Value
*CXXThis
= CGF
.LoadCXXThis();
3512 CGF
.EmitStoreOfScalar(CXXThis
, ThisLVal
);
3514 for (const LambdaCapture
&LC
: RD
->captures()) {
3515 if (LC
.getCaptureKind() != LCK_ByRef
)
3517 const ValueDecl
*VD
= LC
.getCapturedVar();
3518 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3519 // capturing structured bindings in lambdas yet.
3520 if (!CS
->capturesVariable(cast
<VarDecl
>(VD
)))
3522 auto It
= Captures
.find(VD
);
3523 assert(It
!= Captures
.end() && "Found lambda capture without field.");
3524 LValue VarLVal
= CGF
.EmitLValueForFieldInitialization(VDLVal
, It
->second
);
3525 Address VDAddr
= CGF
.GetAddrOfLocalVar(cast
<VarDecl
>(VD
));
3526 if (VD
->getType().getCanonicalType()->isReferenceType())
3527 VDAddr
= CGF
.EmitLoadOfReferenceLValue(VDAddr
,
3528 VD
->getType().getCanonicalType())
3530 CGF
.EmitStoreOfScalar(VDAddr
.getPointer(), VarLVal
);
3535 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl
*VD
,
3537 if (!VD
|| !VD
->hasAttr
<OMPAllocateDeclAttr
>())
3539 const auto *A
= VD
->getAttr
<OMPAllocateDeclAttr
>();
3540 switch(A
->getAllocatorType()) {
3541 case OMPAllocateDeclAttr::OMPNullMemAlloc
:
3542 case OMPAllocateDeclAttr::OMPDefaultMemAlloc
:
3543 // Not supported, fallback to the default mem space.
3544 case OMPAllocateDeclAttr::OMPThreadMemAlloc
:
3545 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc
:
3546 case OMPAllocateDeclAttr::OMPCGroupMemAlloc
:
3547 case OMPAllocateDeclAttr::OMPHighBWMemAlloc
:
3548 case OMPAllocateDeclAttr::OMPLowLatMemAlloc
:
3549 AS
= LangAS::Default
;
3551 case OMPAllocateDeclAttr::OMPConstMemAlloc
:
3552 AS
= LangAS::cuda_constant
;
3554 case OMPAllocateDeclAttr::OMPPTeamMemAlloc
:
3555 AS
= LangAS::cuda_shared
;
3557 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc
:
3558 llvm_unreachable("Expected predefined allocator for the variables with the "
3564 // Get current CudaArch and ignore any unknown values
3565 static CudaArch
getCudaArch(CodeGenModule
&CGM
) {
3566 if (!CGM
.getTarget().hasFeature("ptx"))
3567 return CudaArch::UNKNOWN
;
3568 for (const auto &Feature
: CGM
.getTarget().getTargetOpts().FeatureMap
) {
3569 if (Feature
.getValue()) {
3570 CudaArch Arch
= StringToCudaArch(Feature
.getKey());
3571 if (Arch
!= CudaArch::UNKNOWN
)
3575 return CudaArch::UNKNOWN
;
3578 /// Check to see if target architecture supports unified addressing which is
3579 /// a restriction for OpenMP requires clause "unified_shared_memory".
3580 void CGOpenMPRuntimeGPU::processRequiresDirective(
3581 const OMPRequiresDecl
*D
) {
3582 for (const OMPClause
*Clause
: D
->clauselists()) {
3583 if (Clause
->getClauseKind() == OMPC_unified_shared_memory
) {
3584 CudaArch Arch
= getCudaArch(CGM
);
3586 case CudaArch::SM_20
:
3587 case CudaArch::SM_21
:
3588 case CudaArch::SM_30
:
3589 case CudaArch::SM_32
:
3590 case CudaArch::SM_35
:
3591 case CudaArch::SM_37
:
3592 case CudaArch::SM_50
:
3593 case CudaArch::SM_52
:
3594 case CudaArch::SM_53
: {
3595 SmallString
<256> Buffer
;
3596 llvm::raw_svector_ostream
Out(Buffer
);
3597 Out
<< "Target architecture " << CudaArchToString(Arch
)
3598 << " does not support unified addressing";
3599 CGM
.Error(Clause
->getBeginLoc(), Out
.str());
3602 case CudaArch::SM_60
:
3603 case CudaArch::SM_61
:
3604 case CudaArch::SM_62
:
3605 case CudaArch::SM_70
:
3606 case CudaArch::SM_72
:
3607 case CudaArch::SM_75
:
3608 case CudaArch::SM_80
:
3609 case CudaArch::SM_86
:
3610 case CudaArch::SM_87
:
3611 case CudaArch::SM_89
:
3612 case CudaArch::SM_90
:
3613 case CudaArch::GFX600
:
3614 case CudaArch::GFX601
:
3615 case CudaArch::GFX602
:
3616 case CudaArch::GFX700
:
3617 case CudaArch::GFX701
:
3618 case CudaArch::GFX702
:
3619 case CudaArch::GFX703
:
3620 case CudaArch::GFX704
:
3621 case CudaArch::GFX705
:
3622 case CudaArch::GFX801
:
3623 case CudaArch::GFX802
:
3624 case CudaArch::GFX803
:
3625 case CudaArch::GFX805
:
3626 case CudaArch::GFX810
:
3627 case CudaArch::GFX900
:
3628 case CudaArch::GFX902
:
3629 case CudaArch::GFX904
:
3630 case CudaArch::GFX906
:
3631 case CudaArch::GFX908
:
3632 case CudaArch::GFX909
:
3633 case CudaArch::GFX90a
:
3634 case CudaArch::GFX90c
:
3635 case CudaArch::GFX940
:
3636 case CudaArch::GFX941
:
3637 case CudaArch::GFX942
:
3638 case CudaArch::GFX1010
:
3639 case CudaArch::GFX1011
:
3640 case CudaArch::GFX1012
:
3641 case CudaArch::GFX1013
:
3642 case CudaArch::GFX1030
:
3643 case CudaArch::GFX1031
:
3644 case CudaArch::GFX1032
:
3645 case CudaArch::GFX1033
:
3646 case CudaArch::GFX1034
:
3647 case CudaArch::GFX1035
:
3648 case CudaArch::GFX1036
:
3649 case CudaArch::GFX1100
:
3650 case CudaArch::GFX1101
:
3651 case CudaArch::GFX1102
:
3652 case CudaArch::GFX1103
:
3653 case CudaArch::GFX1150
:
3654 case CudaArch::GFX1151
:
3655 case CudaArch::Generic
:
3656 case CudaArch::UNUSED
:
3657 case CudaArch::UNKNOWN
:
3659 case CudaArch::LAST
:
3660 llvm_unreachable("Unexpected Cuda arch.");
3664 CGOpenMPRuntime::processRequiresDirective(D
);
3667 llvm::Value
*CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction
&CGF
) {
3668 CGBuilderTy
&Bld
= CGF
.Builder
;
3669 llvm::Module
*M
= &CGF
.CGM
.getModule();
3670 const char *LocSize
= "__kmpc_get_hardware_num_threads_in_block";
3671 llvm::Function
*F
= M
->getFunction(LocSize
);
3673 F
= llvm::Function::Create(
3674 llvm::FunctionType::get(CGF
.Int32Ty
, std::nullopt
, false),
3675 llvm::GlobalVariable::ExternalLinkage
, LocSize
, &CGF
.CGM
.getModule());
3677 return Bld
.CreateCall(F
, std::nullopt
, "nvptx_num_threads");
3680 llvm::Value
*CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction
&CGF
) {
3681 ArrayRef
<llvm::Value
*> Args
{};
3682 return CGF
.EmitRuntimeCall(
3683 OMPBuilder
.getOrCreateRuntimeFunction(
3684 CGM
.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block
),
3688 llvm::Value
*CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction
&CGF
) {
3689 ArrayRef
<llvm::Value
*> Args
{};
3690 return CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
3691 CGM
.getModule(), OMPRTL___kmpc_get_warp_size
),