//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"
using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
namespace {

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};
static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
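
// Illustrative note (not part of the algorithm above, just an example): for a
// clause item such as `reduction(+ : a[i])` or `lastprivate(b[1:n])`,
// getPrivateItem() walks past the subscript/section expressions and returns
// the canonical declaration of the base variable, i.e. `a` or `b`.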
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields,
    int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);

  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarpSize] align (decl_align)
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr,
                                    ArraySizeModifier::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, Pair.first.getQuantity());
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
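
// Sketch of the record built above (illustrative only, assuming two escaped
// parallel-level variables `x` and `y` and BufSize == WarpSize): each
// parallel-level variable becomes a [BufSize] array field so every lane gets
// its own slot, while teams-level variables stay scalar fields, e.g.
//   struct _globalized_locals_ty {
//     int x[WarpSize] __attribute__((aligned(alignof(int))));
//     double y[WarpSize] __attribute__((aligned(alignof(double))));
//   };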
/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    bool IsCaptured = false;
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        IsCaptured = true;
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType()) {
      // If not captured at the target region level then mark the escaped
      // variable as delayed.
      if (IsCaptured)
        EscapedVariableLengthDecls.insert(VD);
      else
        DelayedVariableLengthDecls.insert(VD);
    } else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }

  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (VD->isInitCapture())
      VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and used
  /// instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    return MappedDeclsFields.lookup(VD);
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }

  /// Returns the list of the delayed variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
    return DelayedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace
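
// Example of the escape analysis above (illustrative only): in
//   #pragma omp target teams
//   { int a = 0;
//     #pragma omp parallel shared(a)
//     { a = 1; } }
// `a` escapes its declaration context because the inner parallel region uses
// it by reference, so it ends up in EscapedDecls and is later globalized.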
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

CGOpenMPRuntimeGPU::DataSharingMode
CGOpenMPRuntimeGPU::getDataSharingMode() const {
  return CurrentDataSharingMode;
}
/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_allocate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_depobj:
    case OMPD_scan:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel_loop:
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_target_teams_loop:
    // Whether this is true or not depends on how the directive will
    // eventually be emitted.
    if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
      return TTLD->canBeParallelFor();
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_allocate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_depobj:
  case OMPD_scan:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
  assert(!IsBareKernel && "bare kernel should not be at generic mode");

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}
void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
                                        CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
          MaxTeamsVal = -1;
  computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
                                  MinTeamsVal, MaxTeamsVal);

  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(
      Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}
void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  // This is temporary until we remove the fixed sized buffer.
  ASTContext &C = CGM.getContext();
  RecordDecl *StaticRD = C.buildImplicitRecord(
      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
  StaticRD->startDefinition();
  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
    QualType RecTy = C.getRecordType(TeamReductionRec);
    auto *Field = FieldDecl::Create(
        C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
        C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
        /*BW=*/nullptr, /*Mutable=*/false,
        /*InitStyle=*/ICIS_NoInit);
    Field->setAccess(AS_public);
    StaticRD->addDecl(Field);
  }
  StaticRD->completeDefinition();
  QualType StaticTy = C.getRecordType(StaticRD);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  const auto &DL = CGM.getModule().getDataLayout();
  uint64_t ReductionDataSize =
      TeamsReductions.empty()
          ? 0
          : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
                                C.getLangOpts().OpenMPCUDAReductionBufNum);
  TeamsReductions.clear();
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
  EntryFunctionState EST;

  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    bool IsBareKernel;
    DataSharingMode Mode;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         bool IsBareKernel, const OMPExecutableDirective &D)
        : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
          Mode(RT.CurrentDataSharingMode), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      if (IsBareKernel) {
        RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
        return;
      }
      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      if (IsBareKernel) {
        RT.CurrentDataSharingMode = Mode;
        return;
      }
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
    }
  } Action(*this, EST, IsBareKernel, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}
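
// Note (illustrative): a kernel carrying the extension clause `ompx_bare`,
// e.g. `#pragma omp target teams ompx_bare`, takes the IsBareKernel paths in
// the pre/post action above, so it skips emitKernelInit/emitKernelDeinit and
// is emitted without the usual SPMD runtime preamble.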
void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
  if (Mode || IsBareKernel)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
}
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  llvm::OpenMPIRBuilderConfig Config(
      CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
      CGM.getLangOpts().OpenMPOffloadMandatory,
      /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
      hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
  OMPBuilder.setConfig(Config);

  if (!CGM.getLangOpts().OpenMPIsTargetDevice)
    llvm_unreachable("OpenMP can only handle device code.");

  if (CGM.getLangOpts().OpenMPCUDAMode)
    CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;

  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
    return;

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                              "__omp_rtl_debug_kind");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
                              "__omp_rtl_assume_teams_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                              "__omp_rtl_assume_threads_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
                              "__omp_rtl_assume_no_thread_state");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
                              "__omp_rtl_assume_no_nested_parallelism");
}
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Nothing to do.
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {
  // Nothing to do.
}

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}
llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          CGF, D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}
/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}
/// Get list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), std::nullopt, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      CGF, D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variables actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr =
        CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress();
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }

  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
  }
  I->getSecond().MappedParams->apply(CGF);
}
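
// Rough shape of the code emitted above for one escaped variable `x`
// (illustrative sketch, names simplified):
//   %x = call ptr @__kmpc_alloc_shared(i64 sizeof(x))  ; team-visible storage
//   ; ... region uses %x_on_stack instead of the original local ...
// The matching __kmpc_free_shared calls are emitted in emitGenericVarsEpilog.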
bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                                     const VarDecl *VD) const {
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return false;

  // Check variable declaration is delayed:
  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
}
std::pair<llvm::Value *, llvm::Value *>
CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
                                       const VarDecl *VD) {
  CGBuilderTy &Bld = CGF.Builder;

  // Compute size and alignment.
  llvm::Value *Size = CGF.getTypeSize(VD->getType());
  CharUnits Align = CGM.getContext().getDeclAlign(VD);
  Size = Bld.CreateNUWAdd(
      Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
  llvm::Value *AlignVal =
      llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
  Size = Bld.CreateUDiv(Size, AlignVal);
  Size = Bld.CreateNUWMul(Size, AlignVal);

  // Allocate space for this VLA object to be globalized.
  llvm::Value *AllocArgs[] = {Size};
  llvm::CallBase *VoidPtr =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                          AllocArgs, VD->getName());
  VoidPtr->addRetAttr(llvm::Attribute::get(
      CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));

  return std::make_pair(VoidPtr, Size);
}
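
// Worked example of the rounding above (illustrative): for a VLA of 13 bytes
// with a declared alignment of 8, Size becomes (13 + 7) / 8 * 8 = 16, i.e. the
// __kmpc_alloc_shared request is padded up to a multiple of the alignment.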
void CGOpenMPRuntimeGPU::getKmpcFreeShared(
    CodeGenFunction &CGF,
    const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
  // Deallocate the memory for each globalized VLA object
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_free_shared),
                      {AddrSizePair.first, AddrSizePair.second});
}
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Deallocate the memory for each globalized VLA object that was
    // globalized in the prolog (i.e. emitGenericVarsProlog).
    for (const auto &AddrSizePair :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          {AddrSizePair.first, AddrSizePair.second});
    }
    // Deallocate the memory for each globalized value
    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
      const auto *VD = cast<VarDecl>(Rec.first);
      I->getSecond().MappedParams->restore(CGF);

      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
                                 CGF.getTypeSize(VD->getType())};
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          FreeArgs);
    }
  }
}
void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();

  RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                         /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  // We don't emit any thread id function call in bare kernel, but because the
  // outlined function has a pointer argument, we emit a nullptr here.
  if (IsBareKernel)
    OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
  else
    OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
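
// Net effect of ParallelGen above is a single runtime call of the form
//   __kmpc_parallel_51(loc, tid, if_cond, num_threads, /*proc_bind=*/-1,
//                      fn, wrapper_fn, captured_vars_addrs, nargs);
// (parameter names here are descriptive only), with the captured values boxed
// into the captured_vars_addrs array so the runtime can forward them.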
void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
  llvm::Value *Args[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
                      Args);
}
void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_cancel_barrier(loc, thread_id);
  unsigned Flags = getDefaultFlagsForBarriers(Kind);
  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                         getThreadID(CGF, Loc)};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier),
                      Args);
}
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
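
// The blocks emitted above implement, per team, roughly the following
// serialization loop (pseudocode, illustrative):
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     __kmpc_syncwarp(mask);   // reconverge before the next iteration
//   }
// so exactly one thread at a time executes the critical body.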
/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CastItem.withElementType(Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}
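
// For example (illustrative): casting a 16-bit integer value to a 32-bit
// reduction element takes the CreateIntCast path, same-size but
// differently-typed values take the bitcast path, and only size-mismatched
// non-integer types take the store/load detour through a temporary.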
1414 /// Design of OpenMP reductions on the GPU
1416 /// Consider a typical OpenMP program with one or more reduction
1421 /// #pragma omp target teams distribute parallel for \
1422 /// reduction(+:foo) reduction(*:bar)
1423 /// for (int i = 0; i < N; i++) {
1424 /// foo += A[i]; bar *= B[i];
1427 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1428 /// all teams. In our OpenMP implementation on the NVPTX device an
1429 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1430 /// within a team are mapped to CUDA threads within a threadblock.
1431 /// Our goal is to efficiently aggregate values across all OpenMP
1432 /// threads such that:
1434 /// - the compiler and runtime are logically concise, and
1435 /// - the reduction is performed efficiently in a hierarchical
1436 /// manner as follows: within OpenMP threads in the same warp,
1437 /// across warps in a threadblock, and finally across teams on
1438 /// the NVPTX device.
1440 /// Introduction to Decoupling
1442 /// We would like to decouple the compiler and the runtime so that the
1443 /// latter is ignorant of the reduction variables (number, data types)
1444 /// and the reduction operators. This allows a simpler interface
1445 /// and implementation while still attaining good performance.
1447 /// Pseudocode for the aforementioned OpenMP program generated by the
1448 /// compiler is as follows:
1450 /// 1. Create private copies of reduction variables on each OpenMP
1451 /// thread: 'foo_private', 'bar_private'
1452 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1453 /// to it and writes the result in 'foo_private' and 'bar_private'
1455 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1456 /// and store the result on the team master:
1458 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1459 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1462 /// struct ReduceData {
1466 /// reduceData.foo = &foo_private
1467 /// reduceData.bar = &bar_private
1469 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1470 /// auxiliary functions generated by the compiler that operate on
1471 /// variables of type 'ReduceData'. They aid the runtime perform
1472 /// algorithmic steps in a data agnostic manner.
1474 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1475 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1476 /// same warp. It takes the following arguments as input:
1478 /// a. variable of type 'ReduceData' on the calling lane,
1480 /// c. an offset relative to the current lane_id to generate a
1481 /// remote_lane_id. The remote lane contains the second
1482 /// variable of type 'ReduceData' that is to be reduced.
1483 /// d. an algorithm version parameter determining which reduction
1484 /// algorithm to use.
1486 /// 'shuffleReduceFn' retrieves data from the remote lane using
1487 /// efficient GPU shuffle intrinsics and reduces, using the
1488 /// algorithm specified by the 4th parameter, the two operands
1489 /// element-wise. The result is written to the first operand.
1491 /// Different reduction algorithms are implemented in different
1492 /// runtime functions, all calling 'shuffleReduceFn' to perform
1493 /// the essential reduction step. Therefore, based on the 4th
1494 /// parameter, this function behaves slightly differently to
1495 /// cooperate with the runtime to ensure correctness under
1496 /// different circumstances.
1498 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1499 /// reduced variables across warps. It tunnels, through CUDA
1500 /// shared memory, the thread-private data of type 'ReduceData'
1501 /// from lane 0 of each warp to a lane in the first warp.
1502 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1503 /// The last team writes the global reduced value to memory.
1505 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1506 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1507 /// scratchpadCopyFn, loadAndReduceFn)
1509 /// 'scratchpadCopyFn' is a helper that stores reduced
1510 /// data from the team master to a scratchpad array in
1513 /// 'loadAndReduceFn' is a helper that loads data from
1514 /// the scratchpad array and reduces it with the input
1517 /// These compiler generated functions hide address
1518 /// calculation and alignment information from the runtime.
1520 /// The team master of the last team stores the reduced
1521 /// result to the globals in memory.
1522 /// foo += reduceData.foo; bar *= reduceData.bar
1525 /// Warp Reduction Algorithms
1527 /// On the warp level, we have three algorithms implemented in the
1528 /// OpenMP runtime depending on the number of active lanes:
1530 /// Full Warp Reduction
1532 /// The reduce algorithm within a warp where all lanes are active
1533 /// is implemented in the runtime as follows:
1535 /// full_warp_reduce(void *reduce_data,
1536 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1537 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1538 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1541 /// The algorithm completes in log(2, WARPSIZE) steps.
1543 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1544 /// not used therefore we save instructions by not retrieving lane_id
1545 /// from the corresponding special registers. The 4th parameter, which
1546 /// represents the version of the algorithm being used, is set to 0 to
1547 /// signify full warp reduction.
1549 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1551 /// #reduce_elem refers to an element in the local lane's data structure
1552 /// #remote_elem is retrieved from a remote lane
1553 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1554 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1556 /// Contiguous Partial Warp Reduction
1558 /// This reduce algorithm is used within a warp where only the first
1559 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1560 /// number of OpenMP threads in a parallel region is not a multiple of
1561 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1564 /// contiguous_partial_reduce(void *reduce_data,
1565 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1566 /// int size, int lane_id) {
1569 /// curr_size = size;
1570 /// mask = curr_size/2;
1571 /// while (offset>0) {
1572 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1573 /// curr_size = (curr_size+1)/2;
1574 /// offset = curr_size/2;
1578 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1580 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1581 /// if (lane_id < offset)
1582 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1584 /// reduce_elem = remote_elem
1586 /// This algorithm assumes that the data to be reduced are located in a
1587 /// contiguous subset of lanes starting from the first. When there is
1588 /// an odd number of active lanes, the data in the last lane is not
1589 /// aggregated with any other lane's dat but is instead copied over.
1591 /// Dispersed Partial Warp Reduction
1593 /// This algorithm is used within a warp when any discontiguous subset of
1594 /// lanes are active. It is used to implement the reduction operation
1595 /// across lanes in an OpenMP simd region or in a nested parallel region.
1598 /// dispersed_partial_reduce(void *reduce_data,
1599 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1600 /// int size, remote_id;
1601 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1603 /// remote_id = next_active_lane_id_right_after_me();
1604 /// # the above function returns 0 of no active lane
1605 /// # is present right after the current lane.
1606 /// size = number_of_active_lanes_in_this_warp();
1607 /// logical_lane_id /= 2;
1608 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1609 /// remote_id-1-threadIdx.x, 2);
1610 /// } while (logical_lane_id % 2 == 0 && size > 1);
1613 /// There is no assumption made about the initial state of the reduction.
1614 /// Any number of lanes (>=1) could be active at any position. The reduction
1615 /// result is returned in the first active lane.
1617 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1619 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1620 /// if (lane_id % 2 == 0 && offset > 0)
1621 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1623 /// reduce_elem = remote_elem
1626 /// Intra-Team Reduction
1628 /// This function, as implemented in the runtime call
1629 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1630 /// threads in a team. It first reduces within a warp using the
1631 /// aforementioned algorithms. We then proceed to gather all such
1632 /// reduced values at the first warp.
1634 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1635 /// data from each of the "warp master" (zeroth lane of each warp, where
1636 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1637 /// a mathematical sense) the problem of reduction across warp masters in
1638 /// a block to the problem of warp reduction.
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
/// reduce across workers and compute a globally reduced value.
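///
/// Illustrative pseudocode for that scratchpad phase (a sketch only; the
/// names 'scratchpad', 'teams_done' and 'num_teams' are invented here, and
/// the real implementation lives in the device runtime):
///
/// inter_team_reduce(void *reduce_data, loadAndReduceDataFn) {
///   scratchpad[team_id] = team_reduced_value;
///   if (atomic_increment(&teams_done) == num_teams - 1) {
///     # Last team: worker k loads and reduces every k'th slot.
///     loadAndReduceDataFn(reduce_data, scratchpad, thread_id, num_workers);
///     # Combine the per-worker partial results across the team.
///     __kmpc_nvptx_parallel_reduce_nowait_v2(...);
///   }
/// }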
void CGOpenMPRuntimeGPU::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool DistributeReduction =
      isOpenMPDistributeDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);

  ASTContext &C = CGM.getContext();

  if (Options.SimpleReduction) {
    assert(!TeamsReduction && !ParallelReduction &&
           "Invalid reduction selection in emitReduction.");
    (void)ParallelReduction;
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
  int Cnt = 0;
  for (const Expr *DRE : Privates) {
    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
    ++Cnt;
  }

  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
      CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);

  if (TeamsReduction)
    TeamsReductions.push_back(ReductionRec);

  // Source location for the ident struct
  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);

  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
  InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
                         CGF.AllocaInsertPt->getIterator());
  InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
                          CGF.Builder.GetInsertPoint());
  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
      CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
  llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;

  CodeGenFunction::OMPPrivateScope Scope(CGF);
  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    llvm::Type *ElementType;
    llvm::Value *Variable;
    llvm::Value *PrivateVariable;
    llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
    ElementType = CGF.ConvertTypeForMem(Private->getType());
    const auto *RHSVar =
        cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
    PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
    const auto *LHSVar =
        cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
    Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
    llvm::OpenMPIRBuilder::EvalKind EvalKind;
    switch (CGF.getEvaluationKind(Private->getType())) {
    case TEK_Scalar:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
      break;
    case TEK_Complex:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
      break;
    case TEK_Aggregate:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
      break;
    }
    auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
                            llvm::Value **LHSPtr, llvm::Value **RHSPtr,
                            llvm::Function *NewFunc) {
      CGF.Builder.restoreIP(CodeGenIP);
      auto *CurFn = CGF.CurFn;
      CGF.CurFn = NewFunc;

      *LHSPtr = CGF.GetAddrOfLocalVar(
                       cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
                    .emitRawPointer(CGF);
      *RHSPtr = CGF.GetAddrOfLocalVar(
                       cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
                    .emitRawPointer(CGF);

      emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
                                  cast<DeclRefExpr>(LHSExprs[I]),
                                  cast<DeclRefExpr>(RHSExprs[I]));

      CGF.CurFn = CurFn;

      return InsertPointTy(CGF.Builder.GetInsertBlock(),
                           CGF.Builder.GetInsertPoint());
    };
    ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
        ElementType, Variable, PrivateVariable, EvalKind,
        /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
    ++Idx;
  }

  CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
      OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
      DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
      CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
      RTLoc));
  return;
}
const VarDecl *
CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                       const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam))
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getBeginLoc(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
                                                 TargetTy, SourceLocation());
  // Cast to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr,
      llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType = OutlinedFn.getFunctionType();
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
/// Emit function which wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
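///
/// Roughly (a sketch only; 'foo_omp_outlined' is a made-up name and the
/// emitted IR differs in detail), the generated wrapper behaves like:
///
///   void foo_omp_outlined_wrapper(uint16_t ParallelLevel, uint32_t ThreadID) {
///     int32_t Zero = 0;
///     void **GlobalArgs;
///     __kmpc_get_shared_variables(&GlobalArgs);
///     // Forward loop bounds (if shared) and the captured variables read
///     // from GlobalArgs to the outlined parallel function.
///     foo_omp_outlined(&ThreadID, &Zero, /* args loaded from GlobalArgs */);
///   }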
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamKind::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamKind::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
  Args.emplace_back(ZeroAddr.emitRawPointer(CGF));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  RawAddress GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
                      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().VoidPtrTy)
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
          CGF.ConvertTypeForMem(ElemTy));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
      VarChecker.getDelayedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
      DelayedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  I->getSecond().DelayedVariableLengthDecls.append(
      DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    Data.insert(std::make_pair(VD, MappedVarData()));
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc());
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
      // Use the default allocator here as by default local vars are
      // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        VarTy, Align);
  }

  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
                                                      E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
}
void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(
      CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const ValueDecl *VD = LC.getCapturedVar();
      // FIXME: For now VD is always a VarDecl because OpenMP does not support
      // capturing structured bindings in lambdas yet.
      if (!CS->capturesVariable(cast<VarDecl>(VD)))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress();
      CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
    }
  }
}
bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch (A->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    // Not supported, fallback to the default mem space.
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for the variables with the "
                     "static storage.");
  }
  return false;
}
// Get current OffloadArch and ignore any unknown values
static OffloadArch getOffloadArch(CodeGenModule &CGM) {
  if (!CGM.getTarget().hasFeature("ptx"))
    return OffloadArch::UNKNOWN;
  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
    if (Feature.getValue()) {
      OffloadArch Arch = StringToOffloadArch(Feature.getKey());
      if (Arch != OffloadArch::UNKNOWN)
        return Arch;
    }
  }
  return OffloadArch::UNKNOWN;
}
/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
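/// For example, a translation unit compiled for sm_35 that contains
///   #pragma omp requires unified_shared_memory
/// reaches this hook and is diagnosed, because sm_35 appears in the list of
/// architectures without unified addressing handled below.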
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      OffloadArch Arch = getOffloadArch(CGM);
      switch (Arch) {
      case OffloadArch::SM_20:
      case OffloadArch::SM_21:
      case OffloadArch::SM_30:
      case OffloadArch::SM_32_:
      case OffloadArch::SM_35:
      case OffloadArch::SM_37:
      case OffloadArch::SM_50:
      case OffloadArch::SM_52:
      case OffloadArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << OffloadArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      case OffloadArch::SM_60:
      case OffloadArch::SM_61:
      case OffloadArch::SM_62:
      case OffloadArch::SM_70:
      case OffloadArch::SM_72:
      case OffloadArch::SM_75:
      case OffloadArch::SM_80:
      case OffloadArch::SM_86:
      case OffloadArch::SM_87:
      case OffloadArch::SM_89:
      case OffloadArch::SM_90:
      case OffloadArch::SM_90a:
      case OffloadArch::GFX600:
      case OffloadArch::GFX601:
      case OffloadArch::GFX602:
      case OffloadArch::GFX700:
      case OffloadArch::GFX701:
      case OffloadArch::GFX702:
      case OffloadArch::GFX703:
      case OffloadArch::GFX704:
      case OffloadArch::GFX705:
      case OffloadArch::GFX801:
      case OffloadArch::GFX802:
      case OffloadArch::GFX803:
      case OffloadArch::GFX805:
      case OffloadArch::GFX810:
      case OffloadArch::GFX9_GENERIC:
      case OffloadArch::GFX900:
      case OffloadArch::GFX902:
      case OffloadArch::GFX904:
      case OffloadArch::GFX906:
      case OffloadArch::GFX908:
      case OffloadArch::GFX909:
      case OffloadArch::GFX90a:
      case OffloadArch::GFX90c:
      case OffloadArch::GFX940:
      case OffloadArch::GFX941:
      case OffloadArch::GFX942:
      case OffloadArch::GFX10_1_GENERIC:
      case OffloadArch::GFX1010:
      case OffloadArch::GFX1011:
      case OffloadArch::GFX1012:
      case OffloadArch::GFX1013:
      case OffloadArch::GFX10_3_GENERIC:
      case OffloadArch::GFX1030:
      case OffloadArch::GFX1031:
      case OffloadArch::GFX1032:
      case OffloadArch::GFX1033:
      case OffloadArch::GFX1034:
      case OffloadArch::GFX1035:
      case OffloadArch::GFX1036:
      case OffloadArch::GFX11_GENERIC:
      case OffloadArch::GFX1100:
      case OffloadArch::GFX1101:
      case OffloadArch::GFX1102:
      case OffloadArch::GFX1103:
      case OffloadArch::GFX1150:
      case OffloadArch::GFX1151:
      case OffloadArch::GFX1152:
      case OffloadArch::GFX12_GENERIC:
      case OffloadArch::GFX1200:
      case OffloadArch::GFX1201:
      case OffloadArch::AMDGCNSPIRV:
      case OffloadArch::Generic:
      case OffloadArch::UNUSED:
      case OffloadArch::UNKNOWN:
        break;
      case OffloadArch::LAST:
        llvm_unreachable("Unexpected GPU arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Module *M = &CGF.CGM.getModule();
  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
  llvm::Function *F = M->getFunction(LocSize);
  if (!F) {
    F = llvm::Function::Create(
        llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
  }
  return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
}
llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
  ArrayRef<llvm::Value *> Args{};
  return CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(
          CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),