1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
12 //===----------------------------------------------------------------------===//
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/Attr.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/OpenMPClause.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
25 using namespace clang
;
26 using namespace CodeGen
;
27 using namespace llvm::omp
;
30 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
31 class NVPTXActionTy final
: public PrePostActionTy
{
32 llvm::FunctionCallee EnterCallee
= nullptr;
33 ArrayRef
<llvm::Value
*> EnterArgs
;
34 llvm::FunctionCallee ExitCallee
= nullptr;
35 ArrayRef
<llvm::Value
*> ExitArgs
;
36 bool Conditional
= false;
37 llvm::BasicBlock
*ContBlock
= nullptr;
40 NVPTXActionTy(llvm::FunctionCallee EnterCallee
,
41 ArrayRef
<llvm::Value
*> EnterArgs
,
42 llvm::FunctionCallee ExitCallee
,
43 ArrayRef
<llvm::Value
*> ExitArgs
, bool Conditional
= false)
44 : EnterCallee(EnterCallee
), EnterArgs(EnterArgs
), ExitCallee(ExitCallee
),
45 ExitArgs(ExitArgs
), Conditional(Conditional
) {}
46 void Enter(CodeGenFunction
&CGF
) override
{
47 llvm::Value
*EnterRes
= CGF
.EmitRuntimeCall(EnterCallee
, EnterArgs
);
49 llvm::Value
*CallBool
= CGF
.Builder
.CreateIsNotNull(EnterRes
);
50 auto *ThenBlock
= CGF
.createBasicBlock("omp_if.then");
51 ContBlock
= CGF
.createBasicBlock("omp_if.end");
52 // Generate the branch (If-stmt)
53 CGF
.Builder
.CreateCondBr(CallBool
, ThenBlock
, ContBlock
);
54 CGF
.EmitBlock(ThenBlock
);
57 void Done(CodeGenFunction
&CGF
) {
58 // Emit the rest of blocks/branches
59 CGF
.EmitBranch(ContBlock
);
60 CGF
.EmitBlock(ContBlock
, true);
62 void Exit(CodeGenFunction
&CGF
) override
{
63 CGF
.EmitRuntimeCall(ExitCallee
, ExitArgs
);
67 /// A class to track the execution mode when codegening directives within
68 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
69 /// to the target region and used by containing directives such as 'parallel'
70 /// to emit optimized code.
71 class ExecutionRuntimeModesRAII
{
73 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode
=
74 CGOpenMPRuntimeGPU::EM_Unknown
;
75 CGOpenMPRuntimeGPU::ExecutionMode
&ExecMode
;
78 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode
&ExecMode
,
79 CGOpenMPRuntimeGPU::ExecutionMode EntryMode
)
80 : ExecMode(ExecMode
) {
81 SavedExecMode
= ExecMode
;
84 ~ExecutionRuntimeModesRAII() { ExecMode
= SavedExecMode
; }
87 static const ValueDecl
*getPrivateItem(const Expr
*RefExpr
) {
88 RefExpr
= RefExpr
->IgnoreParens();
89 if (const auto *ASE
= dyn_cast
<ArraySubscriptExpr
>(RefExpr
)) {
90 const Expr
*Base
= ASE
->getBase()->IgnoreParenImpCasts();
91 while (const auto *TempASE
= dyn_cast
<ArraySubscriptExpr
>(Base
))
92 Base
= TempASE
->getBase()->IgnoreParenImpCasts();
94 } else if (auto *OASE
= dyn_cast
<ArraySectionExpr
>(RefExpr
)) {
95 const Expr
*Base
= OASE
->getBase()->IgnoreParenImpCasts();
96 while (const auto *TempOASE
= dyn_cast
<ArraySectionExpr
>(Base
))
97 Base
= TempOASE
->getBase()->IgnoreParenImpCasts();
98 while (const auto *TempASE
= dyn_cast
<ArraySubscriptExpr
>(Base
))
99 Base
= TempASE
->getBase()->IgnoreParenImpCasts();
102 RefExpr
= RefExpr
->IgnoreParenImpCasts();
103 if (const auto *DE
= dyn_cast
<DeclRefExpr
>(RefExpr
))
104 return cast
<ValueDecl
>(DE
->getDecl()->getCanonicalDecl());
105 const auto *ME
= cast
<MemberExpr
>(RefExpr
);
106 return cast
<ValueDecl
>(ME
->getMemberDecl()->getCanonicalDecl());
109 static RecordDecl
*buildRecordForGlobalizedVars(
110 ASTContext
&C
, ArrayRef
<const ValueDecl
*> EscapedDecls
,
111 ArrayRef
<const ValueDecl
*> EscapedDeclsForTeams
,
112 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
115 using VarsDataTy
= std::pair
<CharUnits
/*Align*/, const ValueDecl
*>;
116 if (EscapedDecls
.empty() && EscapedDeclsForTeams
.empty())
118 SmallVector
<VarsDataTy
, 4> GlobalizedVars
;
119 for (const ValueDecl
*D
: EscapedDecls
)
120 GlobalizedVars
.emplace_back(C
.getDeclAlign(D
), D
);
121 for (const ValueDecl
*D
: EscapedDeclsForTeams
)
122 GlobalizedVars
.emplace_back(C
.getDeclAlign(D
), D
);
124 // Build struct _globalized_locals_ty {
125 // /* globalized vars */[WarSize] align (decl_align)
126 // /* globalized vars */ for EscapedDeclsForTeams
128 RecordDecl
*GlobalizedRD
= C
.buildImplicitRecord("_globalized_locals_ty");
129 GlobalizedRD
->startDefinition();
130 llvm::SmallPtrSet
<const ValueDecl
*, 16> SingleEscaped(
131 EscapedDeclsForTeams
.begin(), EscapedDeclsForTeams
.end());
132 for (const auto &Pair
: GlobalizedVars
) {
133 const ValueDecl
*VD
= Pair
.second
;
134 QualType Type
= VD
->getType();
135 if (Type
->isLValueReferenceType())
136 Type
= C
.getPointerType(Type
.getNonReferenceType());
138 Type
= Type
.getNonReferenceType();
139 SourceLocation Loc
= VD
->getLocation();
141 if (SingleEscaped
.count(VD
)) {
142 Field
= FieldDecl::Create(
143 C
, GlobalizedRD
, Loc
, Loc
, VD
->getIdentifier(), Type
,
144 C
.getTrivialTypeSourceInfo(Type
, SourceLocation()),
145 /*BW=*/nullptr, /*Mutable=*/false,
146 /*InitStyle=*/ICIS_NoInit
);
147 Field
->setAccess(AS_public
);
148 if (VD
->hasAttrs()) {
149 for (specific_attr_iterator
<AlignedAttr
> I(VD
->getAttrs().begin()),
150 E(VD
->getAttrs().end());
156 llvm::APInt
ArraySize(32, BufSize
);
157 Type
= C
.getConstantArrayType(Type
, ArraySize
, nullptr,
158 ArraySizeModifier::Normal
, 0);
160 Field
= FieldDecl::Create(
161 C
, GlobalizedRD
, Loc
, Loc
, VD
->getIdentifier(), Type
,
162 C
.getTrivialTypeSourceInfo(Type
, SourceLocation()),
163 /*BW=*/nullptr, /*Mutable=*/false,
164 /*InitStyle=*/ICIS_NoInit
);
165 Field
->setAccess(AS_public
);
166 llvm::APInt
Align(32, Pair
.first
.getQuantity());
167 Field
->addAttr(AlignedAttr::CreateImplicit(
168 C
, /*IsAlignmentExpr=*/true,
169 IntegerLiteral::Create(C
, Align
,
170 C
.getIntTypeForBitwidth(32, /*Signed=*/0),
172 {}, AlignedAttr::GNU_aligned
));
174 GlobalizedRD
->addDecl(Field
);
175 MappedDeclsFields
.try_emplace(VD
, Field
);
177 GlobalizedRD
->completeDefinition();
181 /// Get the list of variables that can escape their declaration context.
182 class CheckVarsEscapingDeclContext final
183 : public ConstStmtVisitor
<CheckVarsEscapingDeclContext
> {
184 CodeGenFunction
&CGF
;
185 llvm::SetVector
<const ValueDecl
*> EscapedDecls
;
186 llvm::SetVector
<const ValueDecl
*> EscapedVariableLengthDecls
;
187 llvm::SetVector
<const ValueDecl
*> DelayedVariableLengthDecls
;
188 llvm::SmallPtrSet
<const Decl
*, 4> EscapedParameters
;
189 RecordDecl
*GlobalizedRD
= nullptr;
190 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*> MappedDeclsFields
;
191 bool AllEscaped
= false;
192 bool IsForCombinedParallelRegion
= false;
194 void markAsEscaped(const ValueDecl
*VD
) {
195 // Do not globalize declare target variables.
196 if (!isa
<VarDecl
>(VD
) ||
197 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD
))
199 VD
= cast
<ValueDecl
>(VD
->getCanonicalDecl());
200 // Use user-specified allocation.
201 if (VD
->hasAttrs() && VD
->hasAttr
<OMPAllocateDeclAttr
>())
203 // Variables captured by value must be globalized.
204 bool IsCaptured
= false;
205 if (auto *CSI
= CGF
.CapturedStmtInfo
) {
206 if (const FieldDecl
*FD
= CSI
->lookup(cast
<VarDecl
>(VD
))) {
207 // Check if need to capture the variable that was already captured by
208 // value in the outer region.
210 if (!IsForCombinedParallelRegion
) {
213 const auto *Attr
= FD
->getAttr
<OMPCaptureKindAttr
>();
216 if (((Attr
->getCaptureKind() != OMPC_map
) &&
217 !isOpenMPPrivate(Attr
->getCaptureKind())) ||
218 ((Attr
->getCaptureKind() == OMPC_map
) &&
219 !FD
->getType()->isAnyPointerType()))
222 if (!FD
->getType()->isReferenceType()) {
223 assert(!VD
->getType()->isVariablyModifiedType() &&
224 "Parameter captured by value with variably modified type");
225 EscapedParameters
.insert(VD
);
226 } else if (!IsForCombinedParallelRegion
) {
231 if ((!CGF
.CapturedStmtInfo
||
232 (IsForCombinedParallelRegion
&& CGF
.CapturedStmtInfo
)) &&
233 VD
->getType()->isReferenceType())
234 // Do not globalize variables with reference type.
236 if (VD
->getType()->isVariablyModifiedType()) {
237 // If not captured at the target region level then mark the escaped
238 // variable as delayed.
240 EscapedVariableLengthDecls
.insert(VD
);
242 DelayedVariableLengthDecls
.insert(VD
);
244 EscapedDecls
.insert(VD
);
247 void VisitValueDecl(const ValueDecl
*VD
) {
248 if (VD
->getType()->isLValueReferenceType())
250 if (const auto *VarD
= dyn_cast
<VarDecl
>(VD
)) {
251 if (!isa
<ParmVarDecl
>(VarD
) && VarD
->hasInit()) {
252 const bool SavedAllEscaped
= AllEscaped
;
253 AllEscaped
= VD
->getType()->isLValueReferenceType();
254 Visit(VarD
->getInit());
255 AllEscaped
= SavedAllEscaped
;
259 void VisitOpenMPCapturedStmt(const CapturedStmt
*S
,
260 ArrayRef
<OMPClause
*> Clauses
,
261 bool IsCombinedParallelRegion
) {
264 for (const CapturedStmt::Capture
&C
: S
->captures()) {
265 if (C
.capturesVariable() && !C
.capturesVariableByCopy()) {
266 const ValueDecl
*VD
= C
.getCapturedVar();
267 bool SavedIsForCombinedParallelRegion
= IsForCombinedParallelRegion
;
268 if (IsCombinedParallelRegion
) {
269 // Check if the variable is privatized in the combined construct and
270 // those private copies must be shared in the inner parallel
272 IsForCombinedParallelRegion
= false;
273 for (const OMPClause
*C
: Clauses
) {
274 if (!isOpenMPPrivate(C
->getClauseKind()) ||
275 C
->getClauseKind() == OMPC_reduction
||
276 C
->getClauseKind() == OMPC_linear
||
277 C
->getClauseKind() == OMPC_private
)
279 ArrayRef
<const Expr
*> Vars
;
280 if (const auto *PC
= dyn_cast
<OMPFirstprivateClause
>(C
))
281 Vars
= PC
->getVarRefs();
282 else if (const auto *PC
= dyn_cast
<OMPLastprivateClause
>(C
))
283 Vars
= PC
->getVarRefs();
285 llvm_unreachable("Unexpected clause.");
286 for (const auto *E
: Vars
) {
288 cast
<DeclRefExpr
>(E
)->getDecl()->getCanonicalDecl();
289 if (D
== VD
->getCanonicalDecl()) {
290 IsForCombinedParallelRegion
= true;
294 if (IsForCombinedParallelRegion
)
299 if (isa
<OMPCapturedExprDecl
>(VD
))
301 IsForCombinedParallelRegion
= SavedIsForCombinedParallelRegion
;
306 void buildRecordForGlobalizedVars(bool IsInTTDRegion
) {
307 assert(!GlobalizedRD
&&
308 "Record for globalized variables is built already.");
309 ArrayRef
<const ValueDecl
*> EscapedDeclsForParallel
, EscapedDeclsForTeams
;
310 unsigned WarpSize
= CGF
.getTarget().getGridValue().GV_Warp_Size
;
312 EscapedDeclsForTeams
= EscapedDecls
.getArrayRef();
314 EscapedDeclsForParallel
= EscapedDecls
.getArrayRef();
315 GlobalizedRD
= ::buildRecordForGlobalizedVars(
316 CGF
.getContext(), EscapedDeclsForParallel
, EscapedDeclsForTeams
,
317 MappedDeclsFields
, WarpSize
);
321 CheckVarsEscapingDeclContext(CodeGenFunction
&CGF
,
322 ArrayRef
<const ValueDecl
*> TeamsReductions
)
323 : CGF(CGF
), EscapedDecls(TeamsReductions
.begin(), TeamsReductions
.end()) {
325 virtual ~CheckVarsEscapingDeclContext() = default;
326 void VisitDeclStmt(const DeclStmt
*S
) {
329 for (const Decl
*D
: S
->decls())
330 if (const auto *VD
= dyn_cast_or_null
<ValueDecl
>(D
))
333 void VisitOMPExecutableDirective(const OMPExecutableDirective
*D
) {
336 if (!D
->hasAssociatedStmt())
339 dyn_cast_or_null
<CapturedStmt
>(D
->getAssociatedStmt())) {
340 // Do not analyze directives that do not actually require capturing,
341 // like `omp for` or `omp simd` directives.
342 llvm::SmallVector
<OpenMPDirectiveKind
, 4> CaptureRegions
;
343 getOpenMPCaptureRegions(CaptureRegions
, D
->getDirectiveKind());
344 if (CaptureRegions
.size() == 1 && CaptureRegions
.back() == OMPD_unknown
) {
345 VisitStmt(S
->getCapturedStmt());
348 VisitOpenMPCapturedStmt(
350 CaptureRegions
.back() == OMPD_parallel
&&
351 isOpenMPDistributeDirective(D
->getDirectiveKind()));
354 void VisitCapturedStmt(const CapturedStmt
*S
) {
357 for (const CapturedStmt::Capture
&C
: S
->captures()) {
358 if (C
.capturesVariable() && !C
.capturesVariableByCopy()) {
359 const ValueDecl
*VD
= C
.getCapturedVar();
361 if (isa
<OMPCapturedExprDecl
>(VD
))
366 void VisitLambdaExpr(const LambdaExpr
*E
) {
369 for (const LambdaCapture
&C
: E
->captures()) {
370 if (C
.capturesVariable()) {
371 if (C
.getCaptureKind() == LCK_ByRef
) {
372 const ValueDecl
*VD
= C
.getCapturedVar();
374 if (E
->isInitCapture(&C
) || isa
<OMPCapturedExprDecl
>(VD
))
380 void VisitBlockExpr(const BlockExpr
*E
) {
383 for (const BlockDecl::Capture
&C
: E
->getBlockDecl()->captures()) {
385 const VarDecl
*VD
= C
.getVariable();
387 if (isa
<OMPCapturedExprDecl
>(VD
) || VD
->isInitCapture())
392 void VisitCallExpr(const CallExpr
*E
) {
395 for (const Expr
*Arg
: E
->arguments()) {
398 if (Arg
->isLValue()) {
399 const bool SavedAllEscaped
= AllEscaped
;
402 AllEscaped
= SavedAllEscaped
;
407 Visit(E
->getCallee());
409 void VisitDeclRefExpr(const DeclRefExpr
*E
) {
412 const ValueDecl
*VD
= E
->getDecl();
415 if (isa
<OMPCapturedExprDecl
>(VD
))
417 else if (VD
->isInitCapture())
420 void VisitUnaryOperator(const UnaryOperator
*E
) {
423 if (E
->getOpcode() == UO_AddrOf
) {
424 const bool SavedAllEscaped
= AllEscaped
;
426 Visit(E
->getSubExpr());
427 AllEscaped
= SavedAllEscaped
;
429 Visit(E
->getSubExpr());
432 void VisitImplicitCastExpr(const ImplicitCastExpr
*E
) {
435 if (E
->getCastKind() == CK_ArrayToPointerDecay
) {
436 const bool SavedAllEscaped
= AllEscaped
;
438 Visit(E
->getSubExpr());
439 AllEscaped
= SavedAllEscaped
;
441 Visit(E
->getSubExpr());
444 void VisitExpr(const Expr
*E
) {
447 bool SavedAllEscaped
= AllEscaped
;
450 for (const Stmt
*Child
: E
->children())
453 AllEscaped
= SavedAllEscaped
;
455 void VisitStmt(const Stmt
*S
) {
458 for (const Stmt
*Child
: S
->children())
463 /// Returns the record that handles all the escaped local variables and used
464 /// instead of their original storage.
465 const RecordDecl
*getGlobalizedRecord(bool IsInTTDRegion
) {
467 buildRecordForGlobalizedVars(IsInTTDRegion
);
471 /// Returns the field in the globalized record for the escaped variable.
472 const FieldDecl
*getFieldForGlobalizedVar(const ValueDecl
*VD
) const {
473 assert(GlobalizedRD
&&
474 "Record for globalized variables must be generated already.");
475 return MappedDeclsFields
.lookup(VD
);
478 /// Returns the list of the escaped local variables/parameters.
479 ArrayRef
<const ValueDecl
*> getEscapedDecls() const {
480 return EscapedDecls
.getArrayRef();
483 /// Checks if the escaped local variable is actually a parameter passed by
485 const llvm::SmallPtrSetImpl
<const Decl
*> &getEscapedParameters() const {
486 return EscapedParameters
;
489 /// Returns the list of the escaped variables with the variably modified
491 ArrayRef
<const ValueDecl
*> getEscapedVariableLengthDecls() const {
492 return EscapedVariableLengthDecls
.getArrayRef();
495 /// Returns the list of the delayed variables with the variably modified
497 ArrayRef
<const ValueDecl
*> getDelayedVariableLengthDecls() const {
498 return DelayedVariableLengthDecls
.getArrayRef();
501 } // anonymous namespace
503 CGOpenMPRuntimeGPU::ExecutionMode
504 CGOpenMPRuntimeGPU::getExecutionMode() const {
505 return CurrentExecutionMode
;
508 CGOpenMPRuntimeGPU::DataSharingMode
509 CGOpenMPRuntimeGPU::getDataSharingMode() const {
510 return CurrentDataSharingMode
;
513 /// Check for inner (nested) SPMD construct, if any
514 static bool hasNestedSPMDDirective(ASTContext
&Ctx
,
515 const OMPExecutableDirective
&D
) {
516 const auto *CS
= D
.getInnermostCapturedStmt();
518 CS
->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
519 const Stmt
*ChildStmt
= CGOpenMPRuntime::getSingleCompoundChild(Ctx
, Body
);
521 if (const auto *NestedDir
=
522 dyn_cast_or_null
<OMPExecutableDirective
>(ChildStmt
)) {
523 OpenMPDirectiveKind DKind
= NestedDir
->getDirectiveKind();
524 switch (D
.getDirectiveKind()) {
526 if (isOpenMPParallelDirective(DKind
))
528 if (DKind
== OMPD_teams
) {
529 Body
= NestedDir
->getInnermostCapturedStmt()->IgnoreContainers(
530 /*IgnoreCaptured=*/true);
533 ChildStmt
= CGOpenMPRuntime::getSingleCompoundChild(Ctx
, Body
);
534 if (const auto *NND
=
535 dyn_cast_or_null
<OMPExecutableDirective
>(ChildStmt
)) {
536 DKind
= NND
->getDirectiveKind();
537 if (isOpenMPParallelDirective(DKind
))
542 case OMPD_target_teams
:
543 return isOpenMPParallelDirective(DKind
);
544 case OMPD_target_simd
:
545 case OMPD_target_parallel
:
546 case OMPD_target_parallel_for
:
547 case OMPD_target_parallel_for_simd
:
548 case OMPD_target_teams_distribute
:
549 case OMPD_target_teams_distribute_simd
:
550 case OMPD_target_teams_distribute_parallel_for
:
551 case OMPD_target_teams_distribute_parallel_for_simd
:
554 case OMPD_parallel_for
:
555 case OMPD_parallel_master
:
556 case OMPD_parallel_sections
:
558 case OMPD_parallel_for_simd
:
560 case OMPD_cancellation_point
:
562 case OMPD_threadprivate
:
580 case OMPD_target_data
:
581 case OMPD_target_exit_data
:
582 case OMPD_target_enter_data
:
583 case OMPD_distribute
:
584 case OMPD_distribute_simd
:
585 case OMPD_distribute_parallel_for
:
586 case OMPD_distribute_parallel_for_simd
:
587 case OMPD_teams_distribute
:
588 case OMPD_teams_distribute_simd
:
589 case OMPD_teams_distribute_parallel_for
:
590 case OMPD_teams_distribute_parallel_for_simd
:
591 case OMPD_target_update
:
592 case OMPD_declare_simd
:
593 case OMPD_declare_variant
:
594 case OMPD_begin_declare_variant
:
595 case OMPD_end_declare_variant
:
596 case OMPD_declare_target
:
597 case OMPD_end_declare_target
:
598 case OMPD_declare_reduction
:
599 case OMPD_declare_mapper
:
601 case OMPD_taskloop_simd
:
602 case OMPD_master_taskloop
:
603 case OMPD_master_taskloop_simd
:
604 case OMPD_parallel_master_taskloop
:
605 case OMPD_parallel_master_taskloop_simd
:
609 llvm_unreachable("Unexpected directive.");
616 static bool supportsSPMDExecutionMode(ASTContext
&Ctx
,
617 const OMPExecutableDirective
&D
) {
618 OpenMPDirectiveKind DirectiveKind
= D
.getDirectiveKind();
619 switch (DirectiveKind
) {
621 case OMPD_target_teams
:
622 return hasNestedSPMDDirective(Ctx
, D
);
623 case OMPD_target_parallel_loop
:
624 case OMPD_target_parallel
:
625 case OMPD_target_parallel_for
:
626 case OMPD_target_parallel_for_simd
:
627 case OMPD_target_teams_distribute_parallel_for
:
628 case OMPD_target_teams_distribute_parallel_for_simd
:
629 case OMPD_target_simd
:
630 case OMPD_target_teams_distribute_simd
:
632 case OMPD_target_teams_distribute
:
634 case OMPD_target_teams_loop
:
635 // Whether this is true or not depends on how the directive will
636 // eventually be emitted.
637 if (auto *TTLD
= dyn_cast
<OMPTargetTeamsGenericLoopDirective
>(&D
))
638 return TTLD
->canBeParallelFor();
642 case OMPD_parallel_for
:
643 case OMPD_parallel_master
:
644 case OMPD_parallel_sections
:
646 case OMPD_parallel_for_simd
:
648 case OMPD_cancellation_point
:
650 case OMPD_threadprivate
:
668 case OMPD_target_data
:
669 case OMPD_target_exit_data
:
670 case OMPD_target_enter_data
:
671 case OMPD_distribute
:
672 case OMPD_distribute_simd
:
673 case OMPD_distribute_parallel_for
:
674 case OMPD_distribute_parallel_for_simd
:
675 case OMPD_teams_distribute
:
676 case OMPD_teams_distribute_simd
:
677 case OMPD_teams_distribute_parallel_for
:
678 case OMPD_teams_distribute_parallel_for_simd
:
679 case OMPD_target_update
:
680 case OMPD_declare_simd
:
681 case OMPD_declare_variant
:
682 case OMPD_begin_declare_variant
:
683 case OMPD_end_declare_variant
:
684 case OMPD_declare_target
:
685 case OMPD_end_declare_target
:
686 case OMPD_declare_reduction
:
687 case OMPD_declare_mapper
:
689 case OMPD_taskloop_simd
:
690 case OMPD_master_taskloop
:
691 case OMPD_master_taskloop_simd
:
692 case OMPD_parallel_master_taskloop
:
693 case OMPD_parallel_master_taskloop_simd
:
700 "Unknown programming model for OpenMP directive on NVPTX target.");
703 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective
&D
,
704 StringRef ParentName
,
705 llvm::Function
*&OutlinedFn
,
706 llvm::Constant
*&OutlinedFnID
,
708 const RegionCodeGenTy
&CodeGen
) {
709 ExecutionRuntimeModesRAII
ModeRAII(CurrentExecutionMode
, EM_NonSPMD
);
710 EntryFunctionState EST
;
711 WrapperFunctionsMap
.clear();
713 [[maybe_unused
]] bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
714 assert(!IsBareKernel
&& "bare kernel should not be at generic mode");
716 // Emit target region as a standalone region.
717 class NVPTXPrePostActionTy
: public PrePostActionTy
{
718 CGOpenMPRuntimeGPU::EntryFunctionState
&EST
;
719 const OMPExecutableDirective
&D
;
722 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState
&EST
,
723 const OMPExecutableDirective
&D
)
725 void Enter(CodeGenFunction
&CGF
) override
{
726 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
727 RT
.emitKernelInit(D
, CGF
, EST
, /* IsSPMD */ false);
728 // Skip target region initialization.
729 RT
.setLocThreadIdInsertPt(CGF
, /*AtCurrentPoint=*/true);
731 void Exit(CodeGenFunction
&CGF
) override
{
732 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
733 RT
.clearLocThreadIdInsertPt(CGF
);
734 RT
.emitKernelDeinit(CGF
, EST
, /* IsSPMD */ false);
737 CodeGen
.setAction(Action
);
738 IsInTTDRegion
= true;
739 emitTargetOutlinedFunctionHelper(D
, ParentName
, OutlinedFn
, OutlinedFnID
,
740 IsOffloadEntry
, CodeGen
);
741 IsInTTDRegion
= false;
744 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective
&D
,
745 CodeGenFunction
&CGF
,
746 EntryFunctionState
&EST
, bool IsSPMD
) {
747 int32_t MinThreadsVal
= 1, MaxThreadsVal
= -1, MinTeamsVal
= 1,
749 computeMinAndMaxThreadsAndTeams(D
, CGF
, MinThreadsVal
, MaxThreadsVal
,
750 MinTeamsVal
, MaxTeamsVal
);
752 CGBuilderTy
&Bld
= CGF
.Builder
;
753 Bld
.restoreIP(OMPBuilder
.createTargetInit(
754 Bld
, IsSPMD
, MinThreadsVal
, MaxThreadsVal
, MinTeamsVal
, MaxTeamsVal
));
756 emitGenericVarsProlog(CGF
, EST
.Loc
);
759 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction
&CGF
,
760 EntryFunctionState
&EST
,
763 emitGenericVarsEpilog(CGF
);
765 // This is temporary until we remove the fixed sized buffer.
766 ASTContext
&C
= CGM
.getContext();
767 RecordDecl
*StaticRD
= C
.buildImplicitRecord(
768 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union
);
769 StaticRD
->startDefinition();
770 for (const RecordDecl
*TeamReductionRec
: TeamsReductions
) {
771 QualType RecTy
= C
.getRecordType(TeamReductionRec
);
772 auto *Field
= FieldDecl::Create(
773 C
, StaticRD
, SourceLocation(), SourceLocation(), nullptr, RecTy
,
774 C
.getTrivialTypeSourceInfo(RecTy
, SourceLocation()),
775 /*BW=*/nullptr, /*Mutable=*/false,
776 /*InitStyle=*/ICIS_NoInit
);
777 Field
->setAccess(AS_public
);
778 StaticRD
->addDecl(Field
);
780 StaticRD
->completeDefinition();
781 QualType StaticTy
= C
.getRecordType(StaticRD
);
782 llvm::Type
*LLVMReductionsBufferTy
=
783 CGM
.getTypes().ConvertTypeForMem(StaticTy
);
784 const auto &DL
= CGM
.getModule().getDataLayout();
785 uint64_t ReductionDataSize
=
786 TeamsReductions
.empty()
788 : DL
.getTypeAllocSize(LLVMReductionsBufferTy
).getFixedValue();
789 CGBuilderTy
&Bld
= CGF
.Builder
;
790 OMPBuilder
.createTargetDeinit(Bld
, ReductionDataSize
,
791 C
.getLangOpts().OpenMPCUDAReductionBufNum
);
792 TeamsReductions
.clear();
795 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective
&D
,
796 StringRef ParentName
,
797 llvm::Function
*&OutlinedFn
,
798 llvm::Constant
*&OutlinedFnID
,
800 const RegionCodeGenTy
&CodeGen
) {
801 ExecutionRuntimeModesRAII
ModeRAII(CurrentExecutionMode
, EM_SPMD
);
802 EntryFunctionState EST
;
804 bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
806 // Emit target region as a standalone region.
807 class NVPTXPrePostActionTy
: public PrePostActionTy
{
808 CGOpenMPRuntimeGPU
&RT
;
809 CGOpenMPRuntimeGPU::EntryFunctionState
&EST
;
811 DataSharingMode Mode
;
812 const OMPExecutableDirective
&D
;
815 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU
&RT
,
816 CGOpenMPRuntimeGPU::EntryFunctionState
&EST
,
817 bool IsBareKernel
, const OMPExecutableDirective
&D
)
818 : RT(RT
), EST(EST
), IsBareKernel(IsBareKernel
),
819 Mode(RT
.CurrentDataSharingMode
), D(D
) {}
820 void Enter(CodeGenFunction
&CGF
) override
{
822 RT
.CurrentDataSharingMode
= DataSharingMode::DS_CUDA
;
825 RT
.emitKernelInit(D
, CGF
, EST
, /* IsSPMD */ true);
826 // Skip target region initialization.
827 RT
.setLocThreadIdInsertPt(CGF
, /*AtCurrentPoint=*/true);
829 void Exit(CodeGenFunction
&CGF
) override
{
831 RT
.CurrentDataSharingMode
= Mode
;
834 RT
.clearLocThreadIdInsertPt(CGF
);
835 RT
.emitKernelDeinit(CGF
, EST
, /* IsSPMD */ true);
837 } Action(*this, EST
, IsBareKernel
, D
);
838 CodeGen
.setAction(Action
);
839 IsInTTDRegion
= true;
840 emitTargetOutlinedFunctionHelper(D
, ParentName
, OutlinedFn
, OutlinedFnID
,
841 IsOffloadEntry
, CodeGen
);
842 IsInTTDRegion
= false;
845 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
846 const OMPExecutableDirective
&D
, StringRef ParentName
,
847 llvm::Function
*&OutlinedFn
, llvm::Constant
*&OutlinedFnID
,
848 bool IsOffloadEntry
, const RegionCodeGenTy
&CodeGen
) {
849 if (!IsOffloadEntry
) // Nothing to do.
852 assert(!ParentName
.empty() && "Invalid target region parent name!");
854 bool Mode
= supportsSPMDExecutionMode(CGM
.getContext(), D
);
855 bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
856 if (Mode
|| IsBareKernel
)
857 emitSPMDKernel(D
, ParentName
, OutlinedFn
, OutlinedFnID
, IsOffloadEntry
,
860 emitNonSPMDKernel(D
, ParentName
, OutlinedFn
, OutlinedFnID
, IsOffloadEntry
,
864 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule
&CGM
)
865 : CGOpenMPRuntime(CGM
) {
866 llvm::OpenMPIRBuilderConfig
Config(
867 CGM
.getLangOpts().OpenMPIsTargetDevice
, isGPU(),
868 CGM
.getLangOpts().OpenMPOffloadMandatory
,
869 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
870 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
871 OMPBuilder
.setConfig(Config
);
873 if (!CGM
.getLangOpts().OpenMPIsTargetDevice
)
874 llvm_unreachable("OpenMP can only handle device code.");
876 if (CGM
.getLangOpts().OpenMPCUDAMode
)
877 CurrentDataSharingMode
= CGOpenMPRuntimeGPU::DS_CUDA
;
879 llvm::OpenMPIRBuilder
&OMPBuilder
= getOMPBuilder();
880 if (CGM
.getLangOpts().NoGPULib
|| CGM
.getLangOpts().OMPHostIRFile
.empty())
883 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPTargetDebug
,
884 "__omp_rtl_debug_kind");
885 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPTeamSubscription
,
886 "__omp_rtl_assume_teams_oversubscription");
887 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPThreadSubscription
,
888 "__omp_rtl_assume_threads_oversubscription");
889 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPNoThreadState
,
890 "__omp_rtl_assume_no_thread_state");
891 OMPBuilder
.createGlobalFlag(CGM
.getLangOpts().OpenMPNoNestedParallelism
,
892 "__omp_rtl_assume_no_nested_parallelism");
895 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction
&CGF
,
896 ProcBindKind ProcBind
,
897 SourceLocation Loc
) {
901 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction
&CGF
,
902 llvm::Value
*NumThreads
,
903 SourceLocation Loc
) {
907 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction
&CGF
,
908 const Expr
*NumTeams
,
909 const Expr
*ThreadLimit
,
910 SourceLocation Loc
) {}
912 llvm::Function
*CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
913 CodeGenFunction
&CGF
, const OMPExecutableDirective
&D
,
914 const VarDecl
*ThreadIDVar
, OpenMPDirectiveKind InnermostKind
,
915 const RegionCodeGenTy
&CodeGen
) {
916 // Emit target region as a standalone region.
917 bool PrevIsInTTDRegion
= IsInTTDRegion
;
918 IsInTTDRegion
= false;
920 cast
<llvm::Function
>(CGOpenMPRuntime::emitParallelOutlinedFunction(
921 CGF
, D
, ThreadIDVar
, InnermostKind
, CodeGen
));
922 IsInTTDRegion
= PrevIsInTTDRegion
;
923 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD
) {
924 llvm::Function
*WrapperFun
=
925 createParallelDataSharingWrapper(OutlinedFun
, D
);
926 WrapperFunctionsMap
[OutlinedFun
] = WrapperFun
;
932 /// Get list of lastprivate variables from the teams distribute ... or
933 /// teams {distribute ...} directives.
935 getDistributeLastprivateVars(ASTContext
&Ctx
, const OMPExecutableDirective
&D
,
936 llvm::SmallVectorImpl
<const ValueDecl
*> &Vars
) {
937 assert(isOpenMPTeamsDirective(D
.getDirectiveKind()) &&
938 "expected teams directive.");
939 const OMPExecutableDirective
*Dir
= &D
;
940 if (!isOpenMPDistributeDirective(D
.getDirectiveKind())) {
941 if (const Stmt
*S
= CGOpenMPRuntime::getSingleCompoundChild(
943 D
.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
944 /*IgnoreCaptured=*/true))) {
945 Dir
= dyn_cast_or_null
<OMPExecutableDirective
>(S
);
946 if (Dir
&& !isOpenMPDistributeDirective(Dir
->getDirectiveKind()))
952 for (const auto *C
: Dir
->getClausesOfKind
<OMPLastprivateClause
>()) {
953 for (const Expr
*E
: C
->getVarRefs())
954 Vars
.push_back(getPrivateItem(E
));
958 /// Get list of reduction variables from the teams ... directives.
960 getTeamsReductionVars(ASTContext
&Ctx
, const OMPExecutableDirective
&D
,
961 llvm::SmallVectorImpl
<const ValueDecl
*> &Vars
) {
962 assert(isOpenMPTeamsDirective(D
.getDirectiveKind()) &&
963 "expected teams directive.");
964 for (const auto *C
: D
.getClausesOfKind
<OMPReductionClause
>()) {
965 for (const Expr
*E
: C
->privates())
966 Vars
.push_back(getPrivateItem(E
));
970 llvm::Function
*CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
971 CodeGenFunction
&CGF
, const OMPExecutableDirective
&D
,
972 const VarDecl
*ThreadIDVar
, OpenMPDirectiveKind InnermostKind
,
973 const RegionCodeGenTy
&CodeGen
) {
974 SourceLocation Loc
= D
.getBeginLoc();
976 const RecordDecl
*GlobalizedRD
= nullptr;
977 llvm::SmallVector
<const ValueDecl
*, 4> LastPrivatesReductions
;
978 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*> MappedDeclsFields
;
979 unsigned WarpSize
= CGM
.getTarget().getGridValue().GV_Warp_Size
;
980 // Globalize team reductions variable unconditionally in all modes.
981 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD
)
982 getTeamsReductionVars(CGM
.getContext(), D
, LastPrivatesReductions
);
983 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD
) {
984 getDistributeLastprivateVars(CGM
.getContext(), D
, LastPrivatesReductions
);
985 if (!LastPrivatesReductions
.empty()) {
986 GlobalizedRD
= ::buildRecordForGlobalizedVars(
987 CGM
.getContext(), {}, LastPrivatesReductions
, MappedDeclsFields
,
990 } else if (!LastPrivatesReductions
.empty()) {
991 assert(!TeamAndReductions
.first
&&
992 "Previous team declaration is not expected.");
993 TeamAndReductions
.first
= D
.getCapturedStmt(OMPD_teams
)->getCapturedDecl();
994 std::swap(TeamAndReductions
.second
, LastPrivatesReductions
);
997 // Emit target region as a standalone region.
998 class NVPTXPrePostActionTy
: public PrePostActionTy
{
1000 const RecordDecl
*GlobalizedRD
;
1001 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
1005 NVPTXPrePostActionTy(
1006 SourceLocation
&Loc
, const RecordDecl
*GlobalizedRD
,
1007 llvm::SmallDenseMap
<const ValueDecl
*, const FieldDecl
*>
1009 : Loc(Loc
), GlobalizedRD(GlobalizedRD
),
1010 MappedDeclsFields(MappedDeclsFields
) {}
1011 void Enter(CodeGenFunction
&CGF
) override
{
1013 static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
1015 auto I
= Rt
.FunctionGlobalizedDecls
.try_emplace(CGF
.CurFn
).first
;
1016 I
->getSecond().MappedParams
=
1017 std::make_unique
<CodeGenFunction::OMPMapVars
>();
1018 DeclToAddrMapTy
&Data
= I
->getSecond().LocalVarData
;
1019 for (const auto &Pair
: MappedDeclsFields
) {
1020 assert(Pair
.getFirst()->isCanonicalDecl() &&
1021 "Expected canonical declaration");
1022 Data
.insert(std::make_pair(Pair
.getFirst(), MappedVarData()));
1025 Rt
.emitGenericVarsProlog(CGF
, Loc
);
1027 void Exit(CodeGenFunction
&CGF
) override
{
1028 static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime())
1029 .emitGenericVarsEpilog(CGF
);
1031 } Action(Loc
, GlobalizedRD
, MappedDeclsFields
);
1032 CodeGen
.setAction(Action
);
1033 llvm::Function
*OutlinedFun
= CGOpenMPRuntime::emitTeamsOutlinedFunction(
1034 CGF
, D
, ThreadIDVar
, InnermostKind
, CodeGen
);
1039 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction
&CGF
,
1040 SourceLocation Loc
) {
1041 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic
)
1044 CGBuilderTy
&Bld
= CGF
.Builder
;
1046 const auto I
= FunctionGlobalizedDecls
.find(CGF
.CurFn
);
1047 if (I
== FunctionGlobalizedDecls
.end())
1050 for (auto &Rec
: I
->getSecond().LocalVarData
) {
1051 const auto *VD
= cast
<VarDecl
>(Rec
.first
);
1052 bool EscapedParam
= I
->getSecond().EscapedParameters
.count(Rec
.first
);
1053 QualType VarTy
= VD
->getType();
1055 // Get the local allocation of a firstprivate variable before sharing
1056 llvm::Value
*ParValue
;
1059 CGF
.MakeAddrLValue(CGF
.GetAddrOfLocalVar(VD
), VD
->getType());
1060 ParValue
= CGF
.EmitLoadOfScalar(ParLVal
, Loc
);
1063 // Allocate space for the variable to be globalized
1064 llvm::Value
*AllocArgs
[] = {CGF
.getTypeSize(VD
->getType())};
1065 llvm::CallBase
*VoidPtr
=
1066 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1067 CGM
.getModule(), OMPRTL___kmpc_alloc_shared
),
1068 AllocArgs
, VD
->getName());
1069 // FIXME: We should use the variables actual alignment as an argument.
1070 VoidPtr
->addRetAttr(llvm::Attribute::get(
1071 CGM
.getLLVMContext(), llvm::Attribute::Alignment
,
1072 CGM
.getContext().getTargetInfo().getNewAlign() / 8));
1074 // Cast the void pointer and get the address of the globalized variable.
1075 llvm::Value
*CastedVoidPtr
= Bld
.CreatePointerBitCastOrAddrSpaceCast(
1076 VoidPtr
, Bld
.getPtrTy(0), VD
->getName() + "_on_stack");
1078 CGF
.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr
, VarTy
);
1079 Rec
.second
.PrivateAddr
= VarAddr
.getAddress();
1080 Rec
.second
.GlobalizedVal
= VoidPtr
;
1082 // Assign the local allocation to the newly globalized location.
1084 CGF
.EmitStoreOfScalar(ParValue
, VarAddr
);
1085 I
->getSecond().MappedParams
->setVarAddr(CGF
, VD
, VarAddr
.getAddress());
1087 if (auto *DI
= CGF
.getDebugInfo())
1088 VoidPtr
->setDebugLoc(DI
->SourceLocToDebugLoc(VD
->getLocation()));
1091 for (const auto *ValueD
: I
->getSecond().EscapedVariableLengthDecls
) {
1092 const auto *VD
= cast
<VarDecl
>(ValueD
);
1093 std::pair
<llvm::Value
*, llvm::Value
*> AddrSizePair
=
1094 getKmpcAllocShared(CGF
, VD
);
1095 I
->getSecond().EscapedVariableLengthDeclsAddrs
.emplace_back(AddrSizePair
);
1096 LValue Base
= CGF
.MakeAddrLValue(AddrSizePair
.first
, VD
->getType(),
1097 CGM
.getContext().getDeclAlign(VD
),
1098 AlignmentSource::Decl
);
1099 I
->getSecond().MappedParams
->setVarAddr(CGF
, VD
, Base
.getAddress());
1101 I
->getSecond().MappedParams
->apply(CGF
);
1104 bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction
&CGF
,
1105 const VarDecl
*VD
) const {
1106 const auto I
= FunctionGlobalizedDecls
.find(CGF
.CurFn
);
1107 if (I
== FunctionGlobalizedDecls
.end())
1110 // Check variable declaration is delayed:
1111 return llvm::is_contained(I
->getSecond().DelayedVariableLengthDecls
, VD
);
1114 std::pair
<llvm::Value
*, llvm::Value
*>
1115 CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction
&CGF
,
1116 const VarDecl
*VD
) {
1117 CGBuilderTy
&Bld
= CGF
.Builder
;
1119 // Compute size and alignment.
1120 llvm::Value
*Size
= CGF
.getTypeSize(VD
->getType());
1121 CharUnits Align
= CGM
.getContext().getDeclAlign(VD
);
1122 Size
= Bld
.CreateNUWAdd(
1123 Size
, llvm::ConstantInt::get(CGF
.SizeTy
, Align
.getQuantity() - 1));
1124 llvm::Value
*AlignVal
=
1125 llvm::ConstantInt::get(CGF
.SizeTy
, Align
.getQuantity());
1126 Size
= Bld
.CreateUDiv(Size
, AlignVal
);
1127 Size
= Bld
.CreateNUWMul(Size
, AlignVal
);
1129 // Allocate space for this VLA object to be globalized.
1130 llvm::Value
*AllocArgs
[] = {Size
};
1131 llvm::CallBase
*VoidPtr
=
1132 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1133 CGM
.getModule(), OMPRTL___kmpc_alloc_shared
),
1134 AllocArgs
, VD
->getName());
1135 VoidPtr
->addRetAttr(llvm::Attribute::get(
1136 CGM
.getLLVMContext(), llvm::Attribute::Alignment
, Align
.getQuantity()));
1138 return std::make_pair(VoidPtr
, Size
);
1141 void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1142 CodeGenFunction
&CGF
,
1143 const std::pair
<llvm::Value
*, llvm::Value
*> &AddrSizePair
) {
1144 // Deallocate the memory for each globalized VLA object
1145 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1146 CGM
.getModule(), OMPRTL___kmpc_free_shared
),
1147 {AddrSizePair
.first
, AddrSizePair
.second
});
1150 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction
&CGF
) {
1151 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic
)
1154 const auto I
= FunctionGlobalizedDecls
.find(CGF
.CurFn
);
1155 if (I
!= FunctionGlobalizedDecls
.end()) {
1156 // Deallocate the memory for each globalized VLA object that was
1157 // globalized in the prolog (i.e. emitGenericVarsProlog).
1158 for (const auto &AddrSizePair
:
1159 llvm::reverse(I
->getSecond().EscapedVariableLengthDeclsAddrs
)) {
1160 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1161 CGM
.getModule(), OMPRTL___kmpc_free_shared
),
1162 {AddrSizePair
.first
, AddrSizePair
.second
});
1164 // Deallocate the memory for each globalized value
1165 for (auto &Rec
: llvm::reverse(I
->getSecond().LocalVarData
)) {
1166 const auto *VD
= cast
<VarDecl
>(Rec
.first
);
1167 I
->getSecond().MappedParams
->restore(CGF
);
1169 llvm::Value
*FreeArgs
[] = {Rec
.second
.GlobalizedVal
,
1170 CGF
.getTypeSize(VD
->getType())};
1171 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1172 CGM
.getModule(), OMPRTL___kmpc_free_shared
),
1178 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction
&CGF
,
1179 const OMPExecutableDirective
&D
,
1181 llvm::Function
*OutlinedFn
,
1182 ArrayRef
<llvm::Value
*> CapturedVars
) {
1183 if (!CGF
.HaveInsertPoint())
1186 bool IsBareKernel
= D
.getSingleClause
<OMPXBareClause
>();
1188 RawAddress ZeroAddr
= CGF
.CreateDefaultAlignTempAlloca(CGF
.Int32Ty
,
1189 /*Name=*/".zero.addr");
1190 CGF
.Builder
.CreateStore(CGF
.Builder
.getInt32(/*C*/ 0), ZeroAddr
);
1191 llvm::SmallVector
<llvm::Value
*, 16> OutlinedFnArgs
;
1192 // We don't emit any thread id function call in bare kernel, but because the
1193 // outlined function has a pointer argument, we emit a nullptr here.
1195 OutlinedFnArgs
.push_back(llvm::ConstantPointerNull::get(CGM
.VoidPtrTy
));
1197 OutlinedFnArgs
.push_back(emitThreadIDAddress(CGF
, Loc
).emitRawPointer(CGF
));
1198 OutlinedFnArgs
.push_back(ZeroAddr
.getPointer());
1199 OutlinedFnArgs
.append(CapturedVars
.begin(), CapturedVars
.end());
1200 emitOutlinedFunctionCall(CGF
, Loc
, OutlinedFn
, OutlinedFnArgs
);
1203 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction
&CGF
,
1205 llvm::Function
*OutlinedFn
,
1206 ArrayRef
<llvm::Value
*> CapturedVars
,
1208 llvm::Value
*NumThreads
) {
1209 if (!CGF
.HaveInsertPoint())
1212 auto &&ParallelGen
= [this, Loc
, OutlinedFn
, CapturedVars
, IfCond
,
1213 NumThreads
](CodeGenFunction
&CGF
,
1214 PrePostActionTy
&Action
) {
1215 CGBuilderTy
&Bld
= CGF
.Builder
;
1216 llvm::Value
*NumThreadsVal
= NumThreads
;
1217 llvm::Function
*WFn
= WrapperFunctionsMap
[OutlinedFn
];
1218 llvm::Value
*ID
= llvm::ConstantPointerNull::get(CGM
.Int8PtrTy
);
1220 ID
= Bld
.CreateBitOrPointerCast(WFn
, CGM
.Int8PtrTy
);
1221 llvm::Value
*FnPtr
= Bld
.CreateBitOrPointerCast(OutlinedFn
, CGM
.Int8PtrTy
);
1223 // Create a private scope that will globalize the arguments
1224 // passed from the outside of the target region.
1225 // TODO: Is that needed?
1226 CodeGenFunction::OMPPrivateScope
PrivateArgScope(CGF
);
1228 Address CapturedVarsAddrs
= CGF
.CreateDefaultAlignTempAlloca(
1229 llvm::ArrayType::get(CGM
.VoidPtrTy
, CapturedVars
.size()),
1230 "captured_vars_addrs");
1231 // There's something to share.
1232 if (!CapturedVars
.empty()) {
1233 // Prepare for parallel region. Indicate the outlined function.
1234 ASTContext
&Ctx
= CGF
.getContext();
1236 for (llvm::Value
*V
: CapturedVars
) {
1237 Address Dst
= Bld
.CreateConstArrayGEP(CapturedVarsAddrs
, Idx
);
1239 if (V
->getType()->isIntegerTy())
1240 PtrV
= Bld
.CreateIntToPtr(V
, CGF
.VoidPtrTy
);
1242 PtrV
= Bld
.CreatePointerBitCastOrAddrSpaceCast(V
, CGF
.VoidPtrTy
);
1243 CGF
.EmitStoreOfScalar(PtrV
, Dst
, /*Volatile=*/false,
1244 Ctx
.getPointerType(Ctx
.VoidPtrTy
));
1249 llvm::Value
*IfCondVal
= nullptr;
1251 IfCondVal
= Bld
.CreateIntCast(CGF
.EvaluateExprAsBool(IfCond
), CGF
.Int32Ty
,
1252 /* isSigned */ false);
1254 IfCondVal
= llvm::ConstantInt::get(CGF
.Int32Ty
, 1);
1257 NumThreadsVal
= llvm::ConstantInt::get(CGF
.Int32Ty
, -1);
1259 NumThreadsVal
= Bld
.CreateZExtOrTrunc(NumThreadsVal
, CGF
.Int32Ty
),
1261 assert(IfCondVal
&& "Expected a value");
1262 llvm::Value
*RTLoc
= emitUpdateLocation(CGF
, Loc
);
1263 llvm::Value
*Args
[] = {
1265 getThreadID(CGF
, Loc
),
1268 llvm::ConstantInt::get(CGF
.Int32Ty
, -1),
1271 Bld
.CreateBitOrPointerCast(CapturedVarsAddrs
.emitRawPointer(CGF
),
1273 llvm::ConstantInt::get(CGM
.SizeTy
, CapturedVars
.size())};
1274 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1275 CGM
.getModule(), OMPRTL___kmpc_parallel_51
),
1279 RegionCodeGenTy
RCG(ParallelGen
);
1283 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction
&CGF
) {
1284 // Always emit simple barriers!
1285 if (!CGF
.HaveInsertPoint())
1287 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1288 // This function does not use parameters, so we can emit just default values.
1289 llvm::Value
*Args
[] = {
1290 llvm::ConstantPointerNull::get(
1291 cast
<llvm::PointerType
>(getIdentTyPointerTy())),
1292 llvm::ConstantInt::get(CGF
.Int32Ty
, /*V=*/0, /*isSigned=*/true)};
1293 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1294 CGM
.getModule(), OMPRTL___kmpc_barrier_simple_spmd
),
1298 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction
&CGF
,
1300 OpenMPDirectiveKind Kind
, bool,
1302 // Always emit simple barriers!
1303 if (!CGF
.HaveInsertPoint())
1305 // Build call __kmpc_cancel_barrier(loc, thread_id);
1306 unsigned Flags
= getDefaultFlagsForBarriers(Kind
);
1307 llvm::Value
*Args
[] = {emitUpdateLocation(CGF
, Loc
, Flags
),
1308 getThreadID(CGF
, Loc
)};
1310 CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1311 CGM
.getModule(), OMPRTL___kmpc_barrier
),
1315 void CGOpenMPRuntimeGPU::emitCriticalRegion(
1316 CodeGenFunction
&CGF
, StringRef CriticalName
,
1317 const RegionCodeGenTy
&CriticalOpGen
, SourceLocation Loc
,
1319 llvm::BasicBlock
*LoopBB
= CGF
.createBasicBlock("omp.critical.loop");
1320 llvm::BasicBlock
*TestBB
= CGF
.createBasicBlock("omp.critical.test");
1321 llvm::BasicBlock
*SyncBB
= CGF
.createBasicBlock("omp.critical.sync");
1322 llvm::BasicBlock
*BodyBB
= CGF
.createBasicBlock("omp.critical.body");
1323 llvm::BasicBlock
*ExitBB
= CGF
.createBasicBlock("omp.critical.exit");
1325 auto &RT
= static_cast<CGOpenMPRuntimeGPU
&>(CGF
.CGM
.getOpenMPRuntime());
1327 // Get the mask of active threads in the warp.
1328 llvm::Value
*Mask
= CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1329 CGM
.getModule(), OMPRTL___kmpc_warp_active_thread_mask
));
1330 // Fetch team-local id of the thread.
1331 llvm::Value
*ThreadID
= RT
.getGPUThreadID(CGF
);
1333 // Get the width of the team.
1334 llvm::Value
*TeamWidth
= RT
.getGPUNumThreads(CGF
);
1336 // Initialize the counter variable for the loop.
1338 CGF
.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1339 Address Counter
= CGF
.CreateMemTemp(Int32Ty
, "critical_counter");
1340 LValue CounterLVal
= CGF
.MakeAddrLValue(Counter
, Int32Ty
);
1341 CGF
.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM
.Int32Ty
), CounterLVal
,
1344 // Block checks if loop counter exceeds upper bound.
1345 CGF
.EmitBlock(LoopBB
);
1346 llvm::Value
*CounterVal
= CGF
.EmitLoadOfScalar(CounterLVal
, Loc
);
1347 llvm::Value
*CmpLoopBound
= CGF
.Builder
.CreateICmpSLT(CounterVal
, TeamWidth
);
1348 CGF
.Builder
.CreateCondBr(CmpLoopBound
, TestBB
, ExitBB
);
1350 // Block tests which single thread should execute region, and which threads
1351 // should go straight to synchronisation point.
1352 CGF
.EmitBlock(TestBB
);
1353 CounterVal
= CGF
.EmitLoadOfScalar(CounterLVal
, Loc
);
1354 llvm::Value
*CmpThreadToCounter
=
1355 CGF
.Builder
.CreateICmpEQ(ThreadID
, CounterVal
);
1356 CGF
.Builder
.CreateCondBr(CmpThreadToCounter
, BodyBB
, SyncBB
);
1358 // Block emits the body of the critical region.
1359 CGF
.EmitBlock(BodyBB
);
1361 // Output the critical statement.
1362 CGOpenMPRuntime::emitCriticalRegion(CGF
, CriticalName
, CriticalOpGen
, Loc
,
1365 // After the body surrounded by the critical region, the single executing
1366 // thread will jump to the synchronisation point.
1367 // Block waits for all threads in current team to finish then increments the
1368 // counter variable and returns to the loop.
1369 CGF
.EmitBlock(SyncBB
);
1370 // Reconverge active threads in the warp.
1371 (void)CGF
.EmitRuntimeCall(OMPBuilder
.getOrCreateRuntimeFunction(
1372 CGM
.getModule(), OMPRTL___kmpc_syncwarp
),
1375 llvm::Value
*IncCounterVal
=
1376 CGF
.Builder
.CreateNSWAdd(CounterVal
, CGF
.Builder
.getInt32(1));
1377 CGF
.EmitStoreOfScalar(IncCounterVal
, CounterLVal
);
1378 CGF
.EmitBranch(LoopBB
);
1380 // Block that is reached when all threads in the team complete the region.
1381 CGF
.EmitBlock(ExitBB
, /*IsFinished=*/true);
1384 /// Cast value to the specified type.
1385 static llvm::Value
*castValueToType(CodeGenFunction
&CGF
, llvm::Value
*Val
,
1386 QualType ValTy
, QualType CastTy
,
1387 SourceLocation Loc
) {
1388 assert(!CGF
.getContext().getTypeSizeInChars(CastTy
).isZero() &&
1389 "Cast type must sized.");
1390 assert(!CGF
.getContext().getTypeSizeInChars(ValTy
).isZero() &&
1391 "Val type must sized.");
1392 llvm::Type
*LLVMCastTy
= CGF
.ConvertTypeForMem(CastTy
);
1393 if (ValTy
== CastTy
)
1395 if (CGF
.getContext().getTypeSizeInChars(ValTy
) ==
1396 CGF
.getContext().getTypeSizeInChars(CastTy
))
1397 return CGF
.Builder
.CreateBitCast(Val
, LLVMCastTy
);
1398 if (CastTy
->isIntegerType() && ValTy
->isIntegerType())
1399 return CGF
.Builder
.CreateIntCast(Val
, LLVMCastTy
,
1400 CastTy
->hasSignedIntegerRepresentation());
1401 Address CastItem
= CGF
.CreateMemTemp(CastTy
);
1402 Address ValCastItem
= CastItem
.withElementType(Val
->getType());
1403 CGF
.EmitStoreOfScalar(Val
, ValCastItem
, /*Volatile=*/false, ValTy
,
1404 LValueBaseInfo(AlignmentSource::Type
),
1406 return CGF
.EmitLoadOfScalar(CastItem
, /*Volatile=*/false, CastTy
, Loc
,
1407 LValueBaseInfo(AlignmentSource::Type
),
1412 /// Design of OpenMP reductions on the GPU
1414 /// Consider a typical OpenMP program with one or more reduction
1419 /// #pragma omp target teams distribute parallel for \
1420 /// reduction(+:foo) reduction(*:bar)
1421 /// for (int i = 0; i < N; i++) {
1422 /// foo += A[i]; bar *= B[i];
1425 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1426 /// all teams. In our OpenMP implementation on the NVPTX device an
1427 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1428 /// within a team are mapped to CUDA threads within a threadblock.
1429 /// Our goal is to efficiently aggregate values across all OpenMP
1430 /// threads such that:
1432 /// - the compiler and runtime are logically concise, and
1433 /// - the reduction is performed efficiently in a hierarchical
1434 /// manner as follows: within OpenMP threads in the same warp,
1435 /// across warps in a threadblock, and finally across teams on
1436 /// the NVPTX device.
1438 /// Introduction to Decoupling
1440 /// We would like to decouple the compiler and the runtime so that the
1441 /// latter is ignorant of the reduction variables (number, data types)
1442 /// and the reduction operators. This allows a simpler interface
1443 /// and implementation while still attaining good performance.
1445 /// Pseudocode for the aforementioned OpenMP program generated by the
1446 /// compiler is as follows:
1448 /// 1. Create private copies of reduction variables on each OpenMP
1449 /// thread: 'foo_private', 'bar_private'
1450 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1451 /// to it and writes the result in 'foo_private' and 'bar_private'
1453 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1454 /// and store the result on the team master:
1456 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1457 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1460 /// struct ReduceData {
1464 /// reduceData.foo = &foo_private
1465 /// reduceData.bar = &bar_private
1467 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1468 /// auxiliary functions generated by the compiler that operate on
1469 /// variables of type 'ReduceData'. They aid the runtime perform
1470 /// algorithmic steps in a data agnostic manner.
1472 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1473 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1474 /// same warp. It takes the following arguments as input:
1476 /// a. variable of type 'ReduceData' on the calling lane,
1478 /// c. an offset relative to the current lane_id to generate a
1479 /// remote_lane_id. The remote lane contains the second
1480 /// variable of type 'ReduceData' that is to be reduced.
1481 /// d. an algorithm version parameter determining which reduction
1482 /// algorithm to use.
1484 /// 'shuffleReduceFn' retrieves data from the remote lane using
1485 /// efficient GPU shuffle intrinsics and reduces, using the
1486 /// algorithm specified by the 4th parameter, the two operands
1487 /// element-wise. The result is written to the first operand.
1489 /// Different reduction algorithms are implemented in different
1490 /// runtime functions, all calling 'shuffleReduceFn' to perform
1491 /// the essential reduction step. Therefore, based on the 4th
1492 /// parameter, this function behaves slightly differently to
1493 /// cooperate with the runtime to ensure correctness under
1494 /// different circumstances.
1496 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1497 /// reduced variables across warps. It tunnels, through CUDA
1498 /// shared memory, the thread-private data of type 'ReduceData'
1499 /// from lane 0 of each warp to a lane in the first warp.
1500 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1501 /// The last team writes the global reduced value to memory.
1503 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1504 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1505 /// scratchpadCopyFn, loadAndReduceFn)
1507 /// 'scratchpadCopyFn' is a helper that stores reduced
1508 /// data from the team master to a scratchpad array in
1511 /// 'loadAndReduceFn' is a helper that loads data from
1512 /// the scratchpad array and reduces it with the input
1515 /// These compiler generated functions hide address
1516 /// calculation and alignment information from the runtime.
1518 /// The team master of the last team stores the reduced
1519 /// result to the globals in memory.
1520 /// foo += reduceData.foo; bar *= reduceData.bar
1523 /// Warp Reduction Algorithms
1525 /// On the warp level, we have three algorithms implemented in the
1526 /// OpenMP runtime depending on the number of active lanes:
1528 /// Full Warp Reduction
1530 /// The reduce algorithm within a warp where all lanes are active
1531 /// is implemented in the runtime as follows:
1533 /// full_warp_reduce(void *reduce_data,
1534 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1535 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1536 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1539 /// The algorithm completes in log(2, WARPSIZE) steps.
1541 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1542 /// not used therefore we save instructions by not retrieving lane_id
1543 /// from the corresponding special registers. The 4th parameter, which
1544 /// represents the version of the algorithm being used, is set to 0 to
1545 /// signify full warp reduction.
1547 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1549 /// #reduce_elem refers to an element in the local lane's data structure
1550 /// #remote_elem is retrieved from a remote lane
1551 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1552 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1554 /// Contiguous Partial Warp Reduction
1556 /// This reduce algorithm is used within a warp where only the first
1557 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1558 /// number of OpenMP threads in a parallel region is not a multiple of
1559 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1562 /// contiguous_partial_reduce(void *reduce_data,
1563 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1564 /// int size, int lane_id) {
1567 /// curr_size = size;
1568 /// mask = curr_size/2;
1569 /// while (offset>0) {
1570 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1571 /// curr_size = (curr_size+1)/2;
1572 /// offset = curr_size/2;
1576 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1578 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1579 /// if (lane_id < offset)
1580 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1582 /// reduce_elem = remote_elem
1584 /// This algorithm assumes that the data to be reduced are located in a
1585 /// contiguous subset of lanes starting from the first. When there is
1586 /// an odd number of active lanes, the data in the last lane is not
1587 /// aggregated with any other lane's dat but is instead copied over.
1589 /// Dispersed Partial Warp Reduction
1591 /// This algorithm is used within a warp when any discontiguous subset of
1592 /// lanes are active. It is used to implement the reduction operation
1593 /// across lanes in an OpenMP simd region or in a nested parallel region.
1596 /// dispersed_partial_reduce(void *reduce_data,
1597 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1598 /// int size, remote_id;
1599 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1601 /// remote_id = next_active_lane_id_right_after_me();
1602 /// # the above function returns 0 of no active lane
1603 /// # is present right after the current lane.
1604 /// size = number_of_active_lanes_in_this_warp();
1605 /// logical_lane_id /= 2;
1606 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1607 /// remote_id-1-threadIdx.x, 2);
1608 /// } while (logical_lane_id % 2 == 0 && size > 1);
1611 /// There is no assumption made about the initial state of the reduction.
1612 /// Any number of lanes (>=1) could be active at any position. The reduction
1613 /// result is returned in the first active lane.
1615 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1617 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1618 /// if (lane_id % 2 == 0 && offset > 0)
1619 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1621 /// reduce_elem = remote_elem
1624 /// Intra-Team Reduction
1626 /// This function, as implemented in the runtime call
1627 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1628 /// threads in a team. It first reduces within a warp using the
1629 /// aforementioned algorithms. We then proceed to gather all such
1630 /// reduced values at the first warp.
1632 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1633 /// data from each of the "warp master" (zeroth lane of each warp, where
1634 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1635 /// a mathematical sense) the problem of reduction across warp masters in
1636 /// a block to the problem of warp reduction.
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
/// reduce across workers and compute a globally reduced value.
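///
/// A small example (illustrative only): with 8 teams and 4 workers in the
/// last team, slots 0..7 of the scratchpad are filled independently; once
/// the eighth team arrives, worker k of that team reduces slots k, k+4,
/// and so on, and the 4 partial results are combined by the intra-team
/// reduction described above, yielding the final value.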
void CGOpenMPRuntimeGPU::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);

  ASTContext &C = CGM.getContext();

  if (Options.SimpleReduction) {
    assert(!TeamsReduction && !ParallelReduction &&
           "Invalid reduction selection in emitReduction.");
    (void)ParallelReduction;
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
  unsigned Cnt = 0;
  for (const Expr *DRE : Privates) {
    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
    ++Cnt;
  }

  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
      CGM.getContext(), PrivatesReductions, {}, VarFieldMap, 1);

  if (TeamsReduction)
    TeamsReductions.push_back(ReductionRec);

  // Source location for the ident struct
  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);

  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
  InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
                         CGF.AllocaInsertPt->getIterator());
  InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
                          CGF.Builder.GetInsertPoint());
  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
      CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
  llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;

  CodeGenFunction::OMPPrivateScope Scope(CGF);
  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    llvm::Type *ElementType;
    llvm::Value *Variable;
    llvm::Value *PrivateVariable;
    llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
    ElementType = CGF.ConvertTypeForMem(Private->getType());
    const auto *RHSVar =
        cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
    PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
    const auto *LHSVar =
        cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
    Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
    llvm::OpenMPIRBuilder::EvalKind EvalKind;
    switch (CGF.getEvaluationKind(Private->getType())) {
    case TEK_Scalar:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
      break;
    case TEK_Complex:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
      break;
    case TEK_Aggregate:
      EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
      break;
    }
    auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
                            llvm::Value **LHSPtr, llvm::Value **RHSPtr,
                            llvm::Function *NewFunc) {
      CGF.Builder.restoreIP(CodeGenIP);
      auto *CurFn = CGF.CurFn;
      CGF.CurFn = NewFunc;

      *LHSPtr = CGF.GetAddrOfLocalVar(
                       cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
                    .emitRawPointer(CGF);
      *RHSPtr = CGF.GetAddrOfLocalVar(
                       cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
                    .emitRawPointer(CGF);

      emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
                                  cast<DeclRefExpr>(LHSExprs[I]),
                                  cast<DeclRefExpr>(RHSExprs[I]));

      CGF.CurFn = CurFn;

      return InsertPointTy(CGF.Builder.GetInsertBlock(),
                           CGF.Builder.GetInsertPoint());
    };
    ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
        ElementType, Variable, PrivateVariable, EvalKind,
        /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
    Idx++;
  }

  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.createReductionsGPU(
          OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
          DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
          CGF.getTarget().getGridValue(),
          C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc);
  assert(AfterIP && "unexpected error creating GPU reductions");
  CGF.Builder.restoreIP(*AfterIP);
  return;
}
const VarDecl *
CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                       const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam))
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getBeginLoc(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}
Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
                                                 TargetTy, SourceLocation());
  // Cast to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr,
      llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType = OutlinedFn.getFunctionType();
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
/// Emit function which wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
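///
/// Conceptually (an illustrative sketch only; the names below are not part
/// of the generated IR), the wrapper has the shape:
///
///   void <outlined>_wrapper(uint16_t parallel_level, uint32_t tid) {
///     uint32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     <outlined>(&tid, &zero, shared_args[0], ..., shared_args[n-1]);
///   }
///
/// i.e. it recovers the team-shared argument list from the runtime and
/// forwards each entry, loaded and cast to the expected parameter type, to
/// the real outlined function.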
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamKind::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamKind::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not perfect but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
  Args.emplace_back(ZeroAddr.emitRawPointer(CGF));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  RawAddress GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
                      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().VoidPtrTy)
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, Bld.getPtrTy(0), CGF.SizeTy);
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0),
                                                           CGF.SizeTy);
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
          CGF.ConvertTypeForMem(ElemTy));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
      VarChecker.getDelayedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
      DelayedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  I->getSecond().DelayedVariableLengthDecls.append(
      DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    Data.insert(std::make_pair(VD, MappedVarData()));
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc());
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      break;
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        VarTy, Align);
  }

  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}
void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
      CGF, S, ScheduleKind, Chunk);
}
void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(
      CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}
void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const ValueDecl *VD = LC.getCapturedVar();
      // FIXME: For now VD is always a VarDecl because OpenMP does not support
      // capturing structured bindings in lambdas yet.
      if (!CS->capturesVariable(cast<VarDecl>(VD)))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress();
      CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
    }
  }
}
bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch (A->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
  // Not supported, fallback to the default mem space.
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for the variables with the "
                     "static storage.");
  }
  return false;
}
// Get current OffloadArch and ignore any unknown values
static OffloadArch getOffloadArch(CodeGenModule &CGM) {
  if (!CGM.getTarget().hasFeature("ptx"))
    return OffloadArch::UNKNOWN;
  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
    if (Feature.getValue()) {
      OffloadArch Arch = StringToOffloadArch(Feature.getKey());
      if (Arch != OffloadArch::UNKNOWN)
        return Arch;
    }
  }
  return OffloadArch::UNKNOWN;
}
/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
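///
/// For example (informative note only): a translation unit containing
///
///   #pragma omp requires unified_shared_memory
///
/// is rejected below when compiling for sm_35, while sm_70 and the AMDGCN
/// targets listed in the switch are accepted.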
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      OffloadArch Arch = getOffloadArch(CGM);
      switch (Arch) {
      case OffloadArch::SM_20:
      case OffloadArch::SM_21:
      case OffloadArch::SM_30:
      case OffloadArch::SM_32_:
      case OffloadArch::SM_35:
      case OffloadArch::SM_37:
      case OffloadArch::SM_50:
      case OffloadArch::SM_52:
      case OffloadArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << OffloadArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      case OffloadArch::SM_60:
      case OffloadArch::SM_61:
      case OffloadArch::SM_62:
      case OffloadArch::SM_70:
      case OffloadArch::SM_72:
      case OffloadArch::SM_75:
      case OffloadArch::SM_80:
      case OffloadArch::SM_86:
      case OffloadArch::SM_87:
      case OffloadArch::SM_89:
      case OffloadArch::SM_90:
      case OffloadArch::SM_90a:
      case OffloadArch::SM_100:
      case OffloadArch::GFX600:
      case OffloadArch::GFX601:
      case OffloadArch::GFX602:
      case OffloadArch::GFX700:
      case OffloadArch::GFX701:
      case OffloadArch::GFX702:
      case OffloadArch::GFX703:
      case OffloadArch::GFX704:
      case OffloadArch::GFX705:
      case OffloadArch::GFX801:
      case OffloadArch::GFX802:
      case OffloadArch::GFX803:
      case OffloadArch::GFX805:
      case OffloadArch::GFX810:
      case OffloadArch::GFX9_GENERIC:
      case OffloadArch::GFX900:
      case OffloadArch::GFX902:
      case OffloadArch::GFX904:
      case OffloadArch::GFX906:
      case OffloadArch::GFX908:
      case OffloadArch::GFX909:
      case OffloadArch::GFX90a:
      case OffloadArch::GFX90c:
      case OffloadArch::GFX9_4_GENERIC:
      case OffloadArch::GFX940:
      case OffloadArch::GFX941:
      case OffloadArch::GFX942:
      case OffloadArch::GFX950:
      case OffloadArch::GFX10_1_GENERIC:
      case OffloadArch::GFX1010:
      case OffloadArch::GFX1011:
      case OffloadArch::GFX1012:
      case OffloadArch::GFX1013:
      case OffloadArch::GFX10_3_GENERIC:
      case OffloadArch::GFX1030:
      case OffloadArch::GFX1031:
      case OffloadArch::GFX1032:
      case OffloadArch::GFX1033:
      case OffloadArch::GFX1034:
      case OffloadArch::GFX1035:
      case OffloadArch::GFX1036:
      case OffloadArch::GFX11_GENERIC:
      case OffloadArch::GFX1100:
      case OffloadArch::GFX1101:
      case OffloadArch::GFX1102:
      case OffloadArch::GFX1103:
      case OffloadArch::GFX1150:
      case OffloadArch::GFX1151:
      case OffloadArch::GFX1152:
      case OffloadArch::GFX1153:
      case OffloadArch::GFX12_GENERIC:
      case OffloadArch::GFX1200:
      case OffloadArch::GFX1201:
      case OffloadArch::AMDGCNSPIRV:
      case OffloadArch::Generic:
      case OffloadArch::UNUSED:
      case OffloadArch::UNKNOWN:
        break;
      case OffloadArch::LAST:
        llvm_unreachable("Unexpected GPU arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Module *M = &CGF.CGM.getModule();
  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
  llvm::Function *F = M->getFunction(LocSize);
  if (!F) {
    F = llvm::Function::Create(llvm::FunctionType::get(CGF.Int32Ty, {}, false),
                               llvm::GlobalVariable::ExternalLinkage, LocSize,
                               &CGF.CGM.getModule());
  }
  return Bld.CreateCall(F, {}, "nvptx_num_threads");
}
llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
  ArrayRef<llvm::Value *> Args{};
  return CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(
          CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),