//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
namespace {

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};
/// GPU Configuration:  This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code.  For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
  /// specific Grid Values like GV_Warp_Size, GV_Slot_Size

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,
};
static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields, int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
    return L.first > R.first;
  });

  // Build struct _globalized_locals_ty {
  //         /*  globalized vars  */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /*  globalized vars  */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, BufSize);
      Type = C.getConstantArrayType(Type, ArraySize, nullptr,
                                    ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
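// Illustrative sketch (not from the original source): for a parallel-region
// variable 'int x' that escapes into this record with BufSize equal to the
// warp size, the field built above is roughly equivalent to
//   struct _globalized_locals_ty { int x[32] __attribute__((aligned(128))); };
// whereas a teams-level (single-escaped) variable keeps its scalar type and
// natural alignment.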
/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Use user-specified allocation.
    if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if need to capture the variable that was already captured by
        // value in the outer region.
        if (!IsForCombinedParallelRegion) {
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (((Attr->getCaptureKind() != OMPC_map) &&
               !isOpenMPPrivate(Attr->getCaptureKind())) ||
              ((Attr->getCaptureKind() == OMPC_map) &&
               !FD->getType()->isAnyPointerType()))
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }

  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }
  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields, WarpSize);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> TeamsReductions)
      : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
  }
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (VD->isInitCapture())
      VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and used
  /// instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    return MappedDeclsFields.lookup(VD);
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace
/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}
/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
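// Worked example (derived from the code above): with GV_Warp_Size == 32,
// LaneIDMask is 0x1f, so a GPU thread id of 70 yields lane id 70 & 31 == 6.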
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeGPU::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
                                          : CGOpenMPRuntimeGPU::Generic;
}
/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir =
          dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
        if (const auto *NND =
                dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel_for:
    case OMPD_parallel_master:
    case OMPD_parallel_sections:
    case OMPD_parallel_for_simd:
    case OMPD_cancellation_point:
    case OMPD_threadprivate:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_variant:
    case OMPD_begin_declare_variant:
    case OMPD_end_declare_variant:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_declare_mapper:
    case OMPD_taskloop_simd:
    case OMPD_master_taskloop:
    case OMPD_master_taskloop_simd:
    case OMPD_parallel_master_taskloop:
    case OMPD_parallel_master_taskloop_simd:
    default:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
  case OMPD_target_simd:
  case OMPD_target_teams_distribute_simd:
    return true;
  case OMPD_target_teams_distribute:
    return false;
  case OMPD_parallel_for:
  case OMPD_parallel_master:
  case OMPD_parallel_sections:
  case OMPD_parallel_for_simd:
  case OMPD_cancellation_point:
  case OMPD_threadprivate:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_variant:
  case OMPD_begin_declare_variant:
  case OMPD_end_declare_variant:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_declare_mapper:
  case OMPD_taskloop_simd:
  case OMPD_master_taskloop:
  case OMPD_master_taskloop_simd:
  case OMPD_parallel_master_taskloop:
  case OMPD_parallel_master_taskloop_simd:
  default:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}
void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, IsSPMD);
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
                                        StringRef ParentName,
                                        llvm::Function *&OutlinedFn,
                                        llvm::Constant *&OutlinedFnID,
                                        bool IsOffloadEntry,
                                        const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU &RT;
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
                         CGOpenMPRuntimeGPU::EntryFunctionState &EST)
        : RT(RT), EST(EST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
    }
  } Action(*this, EST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}
// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode = new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
                                              : OMP_TGT_EXEC_MODE_GENERIC),
      Twine(Name, "_exec_mode"));
  GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
  CGM.addCompilerUsedGlobal(GVMode);
}
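// Illustrative sketch (kernel name is hypothetical, not from this source): for
// a kernel named __omp_offloading_xxx_foo_l10 compiled in SPMD mode this emits
// roughly
//   @__omp_offloading_xxx_foo_l10_exec_mode =
//       weak protected constant i8 <OMP_TGT_EXEC_MODE_SPMD>
// which the offload runtime reads before launching the kernel.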
void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsDevice, true,
                                     hasRequiresUnifiedSharedMemory(),
                                     CGM.getLangOpts().OpenMPOffloadMandatory);
  OMPBuilder.setConfig(Config);

  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP can only handle device code.");

  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
    return;

  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                              "__omp_rtl_debug_kind");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
                              "__omp_rtl_assume_teams_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
                              "__omp_rtl_assume_threads_oversubscription");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
                              "__omp_rtl_assume_no_thread_state");
  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
                              "__omp_rtl_assume_no_nested_parallelism");
}
void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
                                            ProcBindKind ProcBind,
                                            SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
                                              llvm::Value *NumThreads,
                                              SourceLocation Loc) {}

void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
                                            const Expr *NumTeams,
                                            const Expr *ThreadLimit,
                                            SourceLocation Loc) {}
llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          CGF, D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}
/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
            Ctx,
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs())
      Vars.push_back(getPrivateItem(E));
  }
}
/// Get list of reduction variables from the teams ... directives.
static void
getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
                      llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
    for (const Expr *E : C->privates())
      Vars.push_back(getPrivateItem(E));
  }
}
llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
    CodeGenFunction &CGF, const OMPExecutableDirective &D,
    const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
    const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
  // Globalize team reductions variable unconditionally in all modes.
  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
    if (!LastPrivatesReductions.empty()) {
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), std::nullopt, LastPrivatesReductions,
          MappedDeclsFields, WarpSize);
    }
  } else if (!LastPrivatesReductions.empty()) {
    assert(!TeamAndReductions.first &&
           "Previous team declaration is not expected.");
    TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
    std::swap(TeamAndReductions.second, LastPrivatesReductions);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().MappedParams =
            std::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      CGF, D, ThreadIDVar, InnermostKind, CodeGen);

  return OutlinedFun;
}
void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variables actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }
  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Use actual memory size of the VLA object including the padding
    // for alignment purposes.
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());

    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);

    // Allocate space for this VLA object to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    VoidPtr->addRetAttr(
        llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
                             CGM.getContext().getTargetInfo().getNewAlign()));

    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
        std::pair<llvm::Value *, llvm::Value *>(
            {VoidPtr, CGF.getTypeSize(VD->getType())}));
    LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress(CGF));
  }
  I->getSecond().MappedParams->apply(CGF);
}
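// Illustrative sketch of the IR this prolog produces for a globalized scalar
// (variable name is hypothetical; exact attribute/cast spelling may differ):
//   %x = call align 8 ptr @__kmpc_alloc_shared(i64 4)
//   %x_on_stack = ... pointer cast of %x ...
// The matching __kmpc_free_shared calls are emitted by emitGenericVarsEpilog
// below.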
void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                               bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
      getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Deallocate the memory for each globalized VLA object
    for (const auto &AddrSizePair :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          {AddrSizePair.first, AddrSizePair.second});
    }
    // Deallocate the memory for each globalized value
    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
      const auto *VD = cast<VarDecl>(Rec.first);
      I->getSecond().MappedParams->restore(CGF);

      llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
                                 CGF.getTypeSize(VD->getType())};
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_free_shared),
                          FreeArgs);
    }
  }
}
void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
                                       const OMPExecutableDirective &D,
                                       SourceLocation Loc,
                                       llvm::Function *OutlinedFn,
                                       ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}
void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
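// Note on the call emitted above: __kmpc_parallel_51 is the device runtime
// entry point for launching a parallel region; as used here its arguments are
// roughly the source location, thread id, if-condition value, requested number
// of threads, a hard-coded proc_bind of -1, the outlined function, its
// data-sharing wrapper, the array of captured-variable pointers
// ("captured_vars_addrs"), and the number of captured variables.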
void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
  // This function does not use parameters, so we can emit just default values.
  llvm::Value *Args[] = {
      llvm::ConstantPointerNull::get(
          cast<llvm::PointerType>(getIdentTyPointerTy())),
      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
                      Args);
}
void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDirectiveKind Kind, bool,
                                         bool) {
  // Always emit simple barriers!
  if (!CGF.HaveInsertPoint())
    return;
  // Build call __kmpc_cancel_barrier(loc, thread_id);
  unsigned Flags = getDefaultFlagsForBarriers(Kind);
  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
                         getThreadID(CGF, Loc)};

  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_barrier),
                      Args);
}
void CGOpenMPRuntimeGPU::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());

  // Get the mask of active threads in the warp.
  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
      CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks if loop counter exceeds upper bound.
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute region, and which threads
  // should go straight to synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
                                      Hint);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in current team to finish then increments the
  // counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  // Reconverge active threads in the warp.
  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_syncwarp),
                            Mask);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
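// The control flow built above serializes the critical region within a team:
// every thread loops a counter from 0 up to the team width, only the thread
// whose team-local id equals the counter runs the body, and all threads
// reconverge via __kmpc_syncwarp before the counter is advanced.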
/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
      Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}
/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
                                                  IntTy);
    ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtr, IntTy->getPointerTo(), IntTy);
    if (Size.getQuantity() / IntSize > 1) {
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
      ElemPtr =
          Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          CGF.Int8Ty, PtrEnd.getPointer(),
          Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
                                                  CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF,
          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
                               LValueBaseInfo(AlignmentSource::Type),
                               TBAAAccessInfo()),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      Ptr = Bld.CreateConstGEP(Ptr, 1);
      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
    }
    Size = Size % IntSize;
  }
}
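// Worked example (derived from the loop above): for a 12-byte element the
// IntSize == 8 pass shuffles one 64-bit chunk and the IntSize == 4 pass
// shuffles the remaining 32-bit chunk; an element of 16 bytes or more instead
// goes through the ".shuffle.pre_cond" loop to shuffle several chunks of the
// same width.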
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};
1517 /// Emit instructions to copy a Reduce list, which contains partially
1518 /// aggregated values, in the specified direction.
1519 static void emitReductionListCopy(
1520 CopyAction Action
, CodeGenFunction
&CGF
, QualType ReductionArrayTy
,
1521 ArrayRef
<const Expr
*> Privates
, Address SrcBase
, Address DestBase
,
1522 CopyOptionsTy CopyOptions
= {nullptr, nullptr, nullptr}) {
1524 CodeGenModule
&CGM
= CGF
.CGM
;
1525 ASTContext
&C
= CGM
.getContext();
1526 CGBuilderTy
&Bld
= CGF
.Builder
;
1528 llvm::Value
*RemoteLaneOffset
= CopyOptions
.RemoteLaneOffset
;
1529 llvm::Value
*ScratchpadIndex
= CopyOptions
.ScratchpadIndex
;
1530 llvm::Value
*ScratchpadWidth
= CopyOptions
.ScratchpadWidth
;
1532 // Iterates, element-by-element, through the source Reduce list and
1535 unsigned Size
= Privates
.size();
1536 for (const Expr
*Private
: Privates
) {
1537 Address SrcElementAddr
= Address::invalid();
1538 Address DestElementAddr
= Address::invalid();
1539 Address DestElementPtrAddr
= Address::invalid();
1540 // Should we shuffle in an element from a remote lane?
1541 bool ShuffleInElement
= false;
1542 // Set to true to update the pointer in the dest Reduce list to a
1543 // newly created element.
1544 bool UpdateDestListPtr
= false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;
    QualType PrivatePtrType = C.getPointerType(Private->getType());
    llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    SrcElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    SrcElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    DestElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
      SrcElementAddr =
          CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
                                    SrcElementPtrAddr, PrivateLlvmPtrType),
                                PrivatePtrType->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr =
        Bld.CreateElementBitCast(DestElementAddr,
                                 SrcElementAddr.getElementType());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      switch (CGF.getEvaluationKind(Private->getType())) {
      case TEK_Scalar: {
        llvm::Value *Elem = CGF.EmitLoadOfScalar(
            SrcElementAddr, /*Volatile=*/false, Private->getType(),
            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
            TBAAAccessInfo());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(
            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
        break;
      }
      case TEK_Complex: {
        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getExprLoc());
        CGF.EmitStoreOfComplex(
            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            /*isInit=*/false);
        break;
      }
      case TEK_Aggregate:
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
        break;
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      // FIXME: This code doesn't make any sense, it's trying to perform
      // integer arithmetic on pointers.
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase =
            Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase =
            Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
    }

    ++Idx;
  }
}

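// Illustrative sketch only: for the RemoteLaneToThread action above, the
// generated per-element copy behaves roughly like the loop below, assuming
// 64-bit scalar elements. 'remote_list_copy' and 'alloca_on_stack' are
// hypothetical names; the shuffle corresponds to what shuffleAndStore emits
// via the __kmpc_shuffle_int64 runtime entry (signature abbreviated).
//
//   void remote_list_copy(void **LocalRL, void **RemoteRL, unsigned N,
//                         int16_t RemoteLaneOffset) {
//     for (unsigned I = 0; I < N; ++I) {
//       // Pull the remote lane's element over the warp.
//       int64_t Remote = __kmpc_shuffle_int64(*(int64_t *)LocalRL[I],
//                                             RemoteLaneOffset, WARPSIZE);
//       int64_t *Tmp = alloca_on_stack();  // ".omp.reduction.element"
//       *Tmp = Remote;
//       RemoteRL[I] = Tmp;                 // UpdateDestListPtr step
//     }
//   }
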
/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     sync
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
                                    llvm::GlobalValue::InternalLinkage,
                                    "_omp_reduction_inter_warp_copy_func", &M);
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
        llvm::UndefValue::get(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(
              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
          ElemTy->getPointerTo()),
      ElemTy, CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    unsigned RealTySize =
        C.getTypeSizeInChars(Private->getType())
            .alignTo(C.getTypeAlignInChars(Private->getType()))
            .getQuantity();
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
      unsigned NumIters = RealTySize / TySize;
      if (NumIters == 0)
        continue;
      QualType CType = C.getIntTypeForBitwidth(
          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
      CharUnits Align = CharUnits::fromQuantity(TySize);
      llvm::Value *Cnt = nullptr;
      Address CntAddr = Address::invalid();
      llvm::BasicBlock *PrecondBB = nullptr;
      llvm::BasicBlock *ExitBB = nullptr;
      if (NumIters > 1) {
        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
                              /*Volatile=*/false, C.IntTy);
        PrecondBB = CGF.createBasicBlock("precond");
        ExitBB = CGF.createBasicBlock("exit");
        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(PrecondBB);
        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
        llvm::Value *Cmp =
            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
        CGF.EmitBlock(BodyBB);
      }
      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

      // if (lane_id == 0)
      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
      CGF.EmitBlock(ThenBB);

      // Reduce element = LocalReduceList[i]
      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      // elemptr = ((CopyType*)(elemptrptr)) + I
      Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
      ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
      if (NumIters > 1)
        ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);

      // Get pointer to location in transfer medium.
      // MediumPtr = &medium[warp_id]
      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
      // Casting to actual data type.
      // MediumPtr = (CopyType*)MediumPtrAddr;
      Address MediumPtr(
          Bld.CreateBitCast(
              MediumPtrVal,
              CopyType->getPointerTo(
                  MediumPtrVal->getType()->getPointerAddressSpace())),
          CopyType, Align);

      // elem = *elemptr
      // *MediumPtr = elem
      llvm::Value *Elem = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, CType, Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      // Store the source element value to the dest element address.
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());

      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(ElseBB);
      Bld.CreateBr(MergeBB);

      CGF.EmitBlock(MergeBB);

      // kmpc_barrier.
      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
                                             /*EmitChecks=*/false,
                                             /*ForceSimpleCall=*/true);

      //
      // Warp 0 copies reduce element from transfer medium.
      //
      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);

      // Up to 32 threads in warp 0 are active.
      llvm::Value *IsActiveThread =
          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

      CGF.EmitBlock(W0ThenBB);

      // SrcMediumPtr = &medium[tid]
      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
          TransferMedium->getValueType(), TransferMedium,
          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
      // SrcMediumVal = *SrcMediumPtr;
      Address SrcMediumPtr(
          Bld.CreateBitCast(
              SrcMediumPtrVal,
              CopyType->getPointerTo(
                  SrcMediumPtrVal->getType()->getPointerAddressSpace())),
          CopyType, Align);

      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
      Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
      TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
      if (NumIters > 1)
        TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);

      // *TargetElemPtr = SrcMediumVal;
      llvm::Value *SrcMediumValue =
          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            CType);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0ElseBB);
      Bld.CreateBr(W0MergeBB);

      CGF.EmitBlock(W0MergeBB);

      if (NumIters > 1) {
        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
        CGF.EmitBranch(PrecondBB);
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(ExitBB);
      }
      RealTySize %= TySize;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}

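// In CUDA-like terms, the helper generated above corresponds roughly to the
// sketch below; the names are illustrative and the per-32-bit-chunk copy loop
// used for elements larger than 4 bytes is omitted.
//
//   __shared__ int32_t transfer_medium[WARPSIZE];
//   __device__ void inter_warp_copy(void **reduce_list, int num_warps) {
//     for (unsigned i = 0; i < NumReduceElements; ++i) {
//       barrier();                    // team-wide __kmpc_barrier
//       if (lane_id == 0)             // warp master publishes its element
//         transfer_medium[warp_id] = *(int32_t *)reduce_list[i];
//       barrier();
//       if (thread_id < num_warps)    // first warp gathers all warp results
//         *(int32_t *)reduce_list[i] = transfer_medium[thread_id];
//     }
//   }
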
/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active. This happens for the last warp in a parallel region
///   when the user specified num_threads is not an integer multiple of
///   32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane. The set
/// of problems every algorithm addresses is a super set of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function. The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner. Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive. But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off their
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. contiguity assumption). When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced. However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation. Therefore
///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
///   that the contiguity assumption still holds.
static llvm::Function *emitShuffleAndReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
  ASTContext &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
                              ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                        C.ShortTy, ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                               C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          ElemTy->getPointerTo()),
      ElemTy, CGF.getPointerAlign());

  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});

  // The actions to be performed on the Remote Reduce list is dependent
  // on the algorithm version.
  //
  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  //  LaneId % 2 == 0 && Offset > 0):
  //    do the reduction value aggregation
  //
  //  The thread local variable Reduce list is mutated in place to host the
  //  reduced data, which is the aggregated value produced from local and
  //  remote lanes.
  //
  //  Note that AlgoVer is expected to be a constant integer known at compile
  //  time.
  //  When AlgoVer==0, the first conjunction evaluates to true, making
  //    the entire predicate true during compile time.
  //  When AlgoVer==1, the second conjunction has only the second part to be
  //    evaluated during runtime. Other conjunctions evaluate to false
  //    during compile time.
  //  When AlgoVer==2, the third conjunction has only the second part to be
  //    evaluated during runtime. Other conjunctions evaluate to false
  //    during compile time.
  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);

  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  llvm::Value *CondAlgo2 = Bld.CreateAnd(
      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}

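// The helper emitted above is equivalent in spirit to the following sketch;
// 'reduce_fn' stands for the outlined combiner passed in as ReduceFn, and
// 'copy_from_remote_lane'/'copy_list' are shorthand for the two
// emitReductionListCopy calls.
//
//   void shuffle_and_reduce(void **local_rl, int16_t lane_id,
//                           int16_t remote_offset, int16_t algo_ver) {
//     void *remote_rl[NumReduceElements];
//     copy_from_remote_lane(local_rl, remote_rl, remote_offset);
//     if (algo_ver == 0 ||
//         (algo_ver == 1 && lane_id < remote_offset) ||
//         (algo_ver == 2 && lane_id % 2 == 0 && remote_offset > 0))
//       reduce_fn(local_rl, remote_rl);
//     if (algo_ver == 1 && lane_id >= remote_offset)
//       copy_list(remote_rl, local_rl); // preserve the contiguity assumption
//   }
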
/// This function emits a helper that copies all the reduction variables from
/// the team into the provided global buffer for the reduction variables.
///
/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
///   For all data entries D in reduce_data:
///     Copy local D to buffer.D[Idx]
static llvm::Value *emitListToGlobalCopyFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap) {
  ASTContext &C = CGM.getContext();

  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          ElemTy->getPointerTo()),
      ElemTy, CGF.getPointerAlign());
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = ((CopyType*)(elemptrptr)) + I
    ElemTy = CGF.ConvertTypeForMem(Private->getType());
    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtrPtr, ElemTy->getPointerTo());
    Address ElemPtr =
        Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
    // Global = Buffer.VD[Idx];
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
                                                   GlobAddr.getPointer(), Idxs);
    GlobLVal.setAddress(Address(BufferPtr,
                                CGF.ConvertTypeForMem(Private->getType()),
                                GlobAddr.getAlignment()));
    switch (CGF.getEvaluationKind(Private->getType())) {
    case TEK_Scalar: {
      llvm::Value *V = CGF.EmitLoadOfScalar(
          ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
      CGF.EmitStoreOfScalar(V, GlobLVal);
      break;
    }
    case TEK_Complex: {
      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
      break;
    }
    case TEK_Aggregate:
      CGF.EmitAggregateCopy(GlobLVal,
                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                            Private->getType(), AggValueSlot::DoesNotOverlap);
      break;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}

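// For the two-variable example used in the design comment further below, the
// copy helper generated above behaves roughly like this sketch; the buffer
// layout (one array per field) is assumed for illustration only.
//
//   void list_to_global_copy(void *buffer, int idx, void *reduce_data) {
//     struct Buffer *b = (struct Buffer *)buffer;
//     struct ReduceData *rd = (struct ReduceData *)reduce_data;
//     b->foo[idx] = *rd->foo;  // copy each thread-local element to its slot
//     b->bar[idx] = *rd->bar;
//   }
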
/// This function emits a helper that reduces all the reduction variables from
/// the team into the provided global buffer for the reduction variables.
///
/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
///  void *GlobPtrs[];
///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
///  ...
///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
///  reduce_function(GlobPtrs, reduce_data);
static llvm::Value *emitListToGlobalReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap,
    llvm::Function *ReduceFn) {
  ASTContext &C = CGM.getContext();

  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    // Global = Buffer.VD[Idx];
    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // Call reduce_function(GlobalReduceList, ReduceList)
  llvm::Value *GlobalReduceList =
      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
  CGF.FinishFunction();
  return Fn;
}

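// The matching reduce helper builds a Reduce list of pointers into the global
// buffer and forwards both lists to the outlined combiner; roughly, and again
// only as an illustrative sketch:
//
//   void list_to_global_reduce(void *buffer, int idx, void *reduce_data) {
//     struct Buffer *b = (struct Buffer *)buffer;
//     void *glob_ptrs[] = {&b->foo[idx], &b->bar[idx]};
//     reduce_function(glob_ptrs, reduce_data);  // buffer.D[idx] op= local D
//   }
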
/// This function emits a helper that copies all the reduction variables from
/// the provided global buffer back into the team's reduction variables.
///
/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
///   For all data entries D in reduce_data:
///     Copy buffer.D[Idx] to local D;
static llvm::Value *emitGlobalToListCopyFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap) {
  ASTContext &C = CGM.getContext();

  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          ElemTy->getPointerTo()),
      ElemTy, CGF.getPointerAlign());
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());

  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = ((CopyType*)(elemptrptr)) + I
    ElemTy = CGF.ConvertTypeForMem(Private->getType());
    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtrPtr, ElemTy->getPointerTo());
    Address ElemPtr =
        Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
    // Global = Buffer.VD[Idx];
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
                                                   GlobAddr.getPointer(), Idxs);
    GlobLVal.setAddress(Address(BufferPtr,
                                CGF.ConvertTypeForMem(Private->getType()),
                                GlobAddr.getAlignment()));
    switch (CGF.getEvaluationKind(Private->getType())) {
    case TEK_Scalar: {
      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
                            LValueBaseInfo(AlignmentSource::Type),
                            TBAAAccessInfo());
      break;
    }
    case TEK_Complex: {
      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                             /*isInit=*/false);
      break;
    }
    case TEK_Aggregate:
      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                            GlobLVal, Private->getType(),
                            AggValueSlot::DoesNotOverlap);
      break;
    }
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}

/// This function emits a helper that reduces the reduction variables in the
/// provided global buffer into the team's reduction variables.
///
/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
///  void *GlobPtrs[];
///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
///  ...
///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
///  reduce_function(reduce_data, GlobPtrs);
static llvm::Value *emitGlobalToListReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, SourceLocation Loc,
    const RecordDecl *TeamReductionRec,
    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &VarFieldMap,
    llvm::Function *ReduceFn) {
  ASTContext &C = CGM.getContext();

  // Buffer: global reduction buffer.
  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                              C.VoidPtrTy, ImplicitParamDecl::Other);
  // Idx: index of the buffer.
  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
                           ImplicitParamDecl::Other);
  // ReduceList: thread local Reduce list.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&BufferArg);
  Args.push_back(&IdxArg);
  Args.push_back(&ReduceListArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
  QualType StaticTy = C.getRecordType(TeamReductionRec);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
      LLVMReductionsBufferTy->getPointerTo());

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
                         CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
                                              /*Volatile=*/false, C.IntTy,
                                              Loc)};
  unsigned Idx = 0;
  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    // Global = Buffer.VD[Idx];
    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
    const FieldDecl *FD = VarFieldMap.lookup(VD);
    LValue GlobLVal = CGF.EmitLValueForField(
        CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
    Address GlobAddr = GlobLVal.getAddress(CGF);
    llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
        GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
    llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
    CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // Call reduce_function(ReduceList, GlobalReduceList)
  llvm::Value *GlobalReduceList =
      CGF.EmitCastToVoidPtr(ReductionList.getPointer());
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
  CGF.FinishFunction();
  return Fn;
}

/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams. In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators. This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
///        reduceData, shuffleReduceFn, interWarpCpyFn)
///
///     where:
///       struct ReduceData {
///         float *foo;
///         double *bar;
///       } reduceData;
///       reduceData.foo = &foo_private
///       reduceData.bar = &bar_private
///
///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'. They help the runtime perform
///     algorithmic steps in a data agnostic manner.
///
///     'shuffleReduceFn' is a pointer to a function that reduces data
///     of type 'ReduceData' across two OpenMP threads (lanes) in the
///     same warp. It takes the following arguments as input:
///
///     a. variable of type 'ReduceData' on the calling lane,
///     b. its lane_id,
///     c. an offset relative to the current lane_id to generate a
///        remote_lane_id. The remote lane contains the second
///        variable of type 'ReduceData' that is to be reduced.
///     d. an algorithm version parameter determining which reduction
///        algorithm to use.
///
///     'shuffleReduceFn' retrieves data from the remote lane using
///     efficient GPU shuffle intrinsics and reduces, using the
///     algorithm specified by the 4th parameter, the two operands
///     element-wise. The result is written to the first operand.
///
///     Different reduction algorithms are implemented in different
///     runtime functions, all calling 'shuffleReduceFn' to perform
///     the essential reduction step. Therefore, based on the 4th
///     parameter, this function behaves slightly differently to
///     cooperate with the runtime to ensure correctness under
///     different circumstances.
///
///     'InterWarpCpyFn' is a pointer to a function that transfers
///     reduced variables across warps. It tunnels, through CUDA
///     shared memory, the thread-private data of type 'ReduceData'
///     from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn)
///
///     'scratchpadCopyFn' is a helper that stores reduced
///     data from the team master to a scratchpad array in
///     global memory.
///
///     'loadAndReduceFn' is a helper that loads data from
///     the scratchpad array and reduces it with the input
///     operand.
///
///     These compiler generated functions hide address
///     calculation and alignment information from the runtime.
/// 5. if ret == 1:
///     The team master of the last team stores the reduced
///     result to the globals in memory.
///     foo += reduceData.foo; bar *= reduceData.bar
///
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log(2, WARPSIZE) steps.
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
/// not used; we therefore save instructions by not retrieving lane_id
/// from the corresponding special registers. The 4th parameter, which
/// represents the version of the algorithm being used, is set to 0 to
/// signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset>0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///     reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///     reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes are active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///       remote_id = next_active_lane_id_right_after_me();
///       # the above function returns 0 if no active lane
///       # is present right after the current lane.
///       size = number_of_active_lanes_in_this_warp();
///       logical_lane_id /= 2;
///       ShuffleReduceFn(reduce_data, logical_lane_id,
///                       remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///     reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///     reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp master" (zeroth lane of each warp, where
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
/// a mathematical sense) the problem of reduction across warp masters in
/// a block to the problem of warp reduction.
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
/// reduce across workers and compute a globally reduced value.
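///
/// Worked example for the contiguous partial reduce: with 5 active lanes
/// the loop above issues ShuffleReduceFn with offsets 2, 1, 1. After the
/// offset-2 step, lanes 0 and 1 hold the sums of lanes {0,2} and {1,3},
/// and lane 2 holds a copy of lane 4's value. After the first offset-1
/// step, lane 0 holds the sum of lanes 0-3 and lane 1 holds lane 4's
/// copied value; the final offset-1 step folds that last value into
/// lane 0.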
2854 void CGOpenMPRuntimeGPU::emitReduction(
2855 CodeGenFunction
&CGF
, SourceLocation Loc
, ArrayRef
<const Expr
*> Privates
,
2856 ArrayRef
<const Expr
*> LHSExprs
, ArrayRef
<const Expr
*> RHSExprs
,
2857 ArrayRef
<const Expr
*> ReductionOps
, ReductionOptionsTy Options
) {
2858 if (!CGF
.HaveInsertPoint())
2861 bool ParallelReduction
= isOpenMPParallelDirective(Options
.ReductionKind
);
2863 bool TeamsReduction
= isOpenMPTeamsDirective(Options
.ReductionKind
);
2866 if (Options
.SimpleReduction
) {
2867 assert(!TeamsReduction
&& !ParallelReduction
&&
2868 "Invalid reduction selection in emitReduction.");
2869 CGOpenMPRuntime::emitReduction(CGF
, Loc
, Privates
, LHSExprs
, RHSExprs
,
2870 ReductionOps
, Options
);
2874 assert((TeamsReduction
|| ParallelReduction
) &&
2875 "Invalid reduction selection in emitReduction.");
2877 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2878 // RedList, shuffle_reduce_func, interwarp_copy_func);
2880 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2881 llvm::Value
*RTLoc
= emitUpdateLocation(CGF
, Loc
);
2882 llvm::Value
*ThreadId
= getThreadID(CGF
, Loc
);
2885 ASTContext
&C
= CGM
.getContext();
2886 // 1. Build a list of reduction variables.
2887 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2888 auto Size
= RHSExprs
.size();
2889 for (const Expr
*E
: Privates
) {
2890 if (E
->getType()->isVariablyModifiedType())
2891 // Reserve place for array size.
2894 llvm::APInt
ArraySize(/*unsigned int numBits=*/32, Size
);
2895 QualType ReductionArrayTy
=
2896 C
.getConstantArrayType(C
.VoidPtrTy
, ArraySize
, nullptr, ArrayType::Normal
,
2897 /*IndexTypeQuals=*/0);
2898 Address ReductionList
=
2899 CGF
.CreateMemTemp(ReductionArrayTy
, ".omp.reduction.red_list");
2900 auto IPriv
= Privates
.begin();
2902 for (unsigned I
= 0, E
= RHSExprs
.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);
  llvm::Function *ReductionFn = emitReductionFunction(
      CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
      Privates, LHSExprs, RHSExprs, ReductionOps);
  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  llvm::Value *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  llvm::Value *Res;
  if (ParallelReduction) {
    llvm::Value *Args[] = {RTLoc,
                           ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
        Args);
  } else {
    assert(TeamsReduction && "expected teams reduction.");
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
    llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
    unsigned Cnt = 0;
    for (const Expr *DRE : Privates) {
      PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
      ++Cnt;
    }
    const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
        CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
        C.getLangOpts().OpenMPCUDAReductionBufNum);
    TeamsReductions.push_back(TeamReductionRec);
    if (!KernelTeamsReductionPtr) {
      KernelTeamsReductionPtr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
          llvm::GlobalValue::InternalLinkage, nullptr,
          "_openmp_teams_reductions_buffer_$_$ptr");
    }
    llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
        Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
        /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        ReductionFn);
    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
        CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
        ReductionFn);

    llvm::Value *Args[] = {
        RTLoc,
        ThreadId,
        GlobalBufferPtr,
        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
        RL,
        ShuffleAndReduceFn,
        InterWarpCopyFn,
        GlobalToBufferCpyFn,
        GlobalToBufferRedFn,
        BufferToGlobalCpyFn,
        BufferToGlobalRedFn};

    Res = CGF.EmitRuntimeCall(
        OMPBuilder.getOrCreateRuntimeFunction(
            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
        Args);
  }

  // 5. Build if (res == 1)
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_end_reduce{_nowait}(<gtid>);
  CGF.EmitBlock(ThenBB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (const Expr *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  llvm::Value *EndArgs[] = {ThreadId};
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, std::nullopt,
      OMPBuilder.getOrCreateRuntimeFunction(
          CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  // There is no need to emit line number for unconditional branch.
  (void)ApplyDebugLocation::CreateEmpty(CGF);
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
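// Illustrative sketch (not emitted verbatim; exact arguments follow the Args
// arrays built above): for a directive such as
//   #pragma omp target teams distribute parallel for reduction(+ : sum)
// the parallel path lowers roughly to
//   %res = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(
//              ptr %rtloc, i32 %gtid, i32 <num reduction vars>,
//              i64 <reduction array size>, ptr %reduction_list,
//              ptr @<shuffle_and_reduce>, ptr @<inter_warp_copy>)
// while the teams path calls __kmpc_nvptx_teams_reduce_nowait_v2 with the
// additional list<->global-buffer copy/reduce helpers, followed in both cases
// by the "if (res == 1)" guard emitted above.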

const VarDecl *
CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
                                       const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam))
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getBeginLoc(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}
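// Example of the translation above (a sketch; the concrete types depend on
// the capture kind): a map-captured reference parameter such as
//   void outlined(int &x);
// is rewritten so the device-side parameter becomes a restrict-qualified
// pointer whose pointee carries the global address space, roughly
//   void outlined(int __global *__restrict x);
// with the NVPTX local address-space qualifier (5) applied to the parameter
// type itself via QC.apply().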

Address
CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
                                        const VarDecl *NativeParam,
                                        const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
                                                 TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr,
      llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr,
      llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}

void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType = OutlinedFn.getFunctionType();
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg,
        llvm::PointerType::get(CGF.getLLVMContext(), /*AddrSpace*/ 0));
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
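// Sketch of the effect (illustrative IR, not emitted verbatim): a pointer
// argument living in a non-generic address space, e.g.
//   %arg = ... ; ptr addrspace(3)
// passed to an outlined function that expects a differently qualified pointer
// is forwarded roughly as
//   %arg.generic = addrspacecast ptr addrspace(3) %arg to ptr
//   %arg.target  = addrspacecast ptr %arg.generic to <param type>
// while non-pointer arguments are passed through unchanged.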

/// Emit function which wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());

  // Ensure we do not inline the function. This is trivially true for the ones
  // passed to __kmpc_fork_call but the ones called in serialized regions
  // could be inlined. This is not perfect, but it is closer to the invariant
  // we want, namely, every data environment starts with a new function.
  // TODO: We should pass the if condition to the runtime function and do the
  // handling there. Much cleaner code.
  Fn->addFnAttr(llvm::Attribute::NoInline);

  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
                                                      /*Name=*/".zero.addr");
  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                          CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
                      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().VoidPtrTy)
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
          CGF.ConvertTypeForMem(ElemTy));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
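// The generated wrapper has roughly this shape (names are illustrative, a
// sketch rather than the exact IR):
//   define internal void @<outlined>_wrapper(i16 %parallel_level, i32 %tid) {
//     %zero.addr   = alloca i32
//     %global_args = alloca ptr
//     store i32 0, ptr %zero.addr
//     call void @__kmpc_get_shared_variables(ptr %global_args)
//     ; load loop bounds and/or captured variables from the shared list
//     call void @<outlined>(ptr %tid.addr, ptr %zero.addr, ...captures...)
//     ret void
//   }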

void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                            const Decl *D) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
         "Team is set but not processed.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  TeamAndReductions.first = nullptr;
  TeamAndReductions.second.clear();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      std::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    Data.insert(std::make_pair(VD, MappedVarData()));
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
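// Example of what gets recorded here (a sketch): in
//   #pragma omp target teams
//   { int escaping = 0; #pragma omp parallel { use(escaping); } }
// 'escaping' is detected by CheckVarsEscapingDeclContext and lands in
// LocalVarData, so emitGenericVarsProlog() can globalize it (move it off the
// thread-private stack where worker threads could not see it), and the
// GlobalizationScope cleanup releases the storage in the epilog.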

Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                      const VarDecl *VD) {
  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
    const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
    auto AS = LangAS::Default;
    switch (A->getAllocatorType()) {
      // Use the default allocator here as by default local vars are
      // threadlocal.
    case OMPAllocateDeclAttr::OMPNullMemAlloc:
    case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    case OMPAllocateDeclAttr::OMPThreadMemAlloc:
    case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
    case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
      // Follow the user decision - use default allocation.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
      // TODO: implement support for user-defined allocators.
      return Address::invalid();
    case OMPAllocateDeclAttr::OMPConstMemAlloc:
      AS = LangAS::cuda_constant;
      break;
    case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
      AS = LangAS::cuda_shared;
      break;
    case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
    case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
      break;
    }
    llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), VarTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
        VD->getName(),
        /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(AS));
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    GV->setAlignment(Align.getAsAlign());
    return Address(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
                    VD->getType().getAddressSpace()))),
        VarTy, Align);
  }

  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }

  return Address::invalid();
}

void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}

void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        RT.getGPUNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
}
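// In SPMD mode this amounts to a default of dist_schedule(static, <threads
// per block>): e.g. with 128 threads in the block, each 'distribute' chunk
// covers 128 consecutive iterations, so every thread of the enclosed
// 'parallel for' handles one iteration per chunk. This is a codegen
// heuristic, not something mandated by the OpenMP specification.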

void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(
      CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}

void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const ValueDecl *VD = LC.getCapturedVar();
      // FIXME: For now VD is always a VarDecl because OpenMP does not support
      // capturing structured bindings in lambdas yet.
      if (!CS->capturesVariable(cast<VarDecl>(VD)))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress(CGF);
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}

bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
                                                          LangAS &AS) {
  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
    return false;
  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
  switch (A->getAllocatorType()) {
  case OMPAllocateDeclAttr::OMPNullMemAlloc:
  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
    // Not supported, fallback to the default mem space.
  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
    AS = LangAS::Default;
    return true;
  case OMPAllocateDeclAttr::OMPConstMemAlloc:
    AS = LangAS::cuda_constant;
    return true;
  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
    AS = LangAS::cuda_shared;
    return true;
  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
    llvm_unreachable("Expected predefined allocator for the variables with the "
                     "static storage.");
  }
  return false;
}
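// Illustrative mapping for the switch above (a sketch): a declaration such as
//   int gvar;
//   #pragma omp allocate(gvar) allocator(omp_const_mem_alloc)
// reports AS == LangAS::cuda_constant, so the global is emitted into CUDA
// __constant__ memory; omp_pteam_mem_alloc maps to __shared__ memory, and the
// remaining predefined allocators fall back to the default address space.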

// Get current CudaArch and ignore any unknown values
static CudaArch getCudaArch(CodeGenModule &CGM) {
  if (!CGM.getTarget().hasFeature("ptx"))
    return CudaArch::UNKNOWN;
  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
    if (Feature.getValue()) {
      CudaArch Arch = StringToCudaArch(Feature.getKey());
      if (Arch != CudaArch::UNKNOWN)
        return Arch;
    }
  }
  return CudaArch::UNKNOWN;
}

/// Check to see if target architecture supports unified addressing which is
/// a restriction for OpenMP requires clause "unified_shared_memory".
void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
  for (const OMPClause *Clause : D->clauselists()) {
    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
      CudaArch Arch = getCudaArch(CGM);
      switch (Arch) {
      case CudaArch::SM_20:
      case CudaArch::SM_21:
      case CudaArch::SM_30:
      case CudaArch::SM_32:
      case CudaArch::SM_35:
      case CudaArch::SM_37:
      case CudaArch::SM_50:
      case CudaArch::SM_52:
      case CudaArch::SM_53: {
        SmallString<256> Buffer;
        llvm::raw_svector_ostream Out(Buffer);
        Out << "Target architecture " << CudaArchToString(Arch)
            << " does not support unified addressing";
        CGM.Error(Clause->getBeginLoc(), Out.str());
        return;
      }
      case CudaArch::SM_60:
      case CudaArch::SM_61:
      case CudaArch::SM_62:
      case CudaArch::SM_70:
      case CudaArch::SM_72:
      case CudaArch::SM_75:
      case CudaArch::SM_80:
      case CudaArch::SM_86:
      case CudaArch::SM_87:
      case CudaArch::SM_89:
      case CudaArch::SM_90:
      case CudaArch::GFX600:
      case CudaArch::GFX601:
      case CudaArch::GFX602:
      case CudaArch::GFX700:
      case CudaArch::GFX701:
      case CudaArch::GFX702:
      case CudaArch::GFX703:
      case CudaArch::GFX704:
      case CudaArch::GFX705:
      case CudaArch::GFX801:
      case CudaArch::GFX802:
      case CudaArch::GFX803:
      case CudaArch::GFX805:
      case CudaArch::GFX810:
      case CudaArch::GFX900:
      case CudaArch::GFX902:
      case CudaArch::GFX904:
      case CudaArch::GFX906:
      case CudaArch::GFX908:
      case CudaArch::GFX909:
      case CudaArch::GFX90a:
      case CudaArch::GFX90c:
      case CudaArch::GFX940:
      case CudaArch::GFX941:
      case CudaArch::GFX942:
      case CudaArch::GFX1010:
      case CudaArch::GFX1011:
      case CudaArch::GFX1012:
      case CudaArch::GFX1013:
      case CudaArch::GFX1030:
      case CudaArch::GFX1031:
      case CudaArch::GFX1032:
      case CudaArch::GFX1033:
      case CudaArch::GFX1034:
      case CudaArch::GFX1035:
      case CudaArch::GFX1036:
      case CudaArch::GFX1100:
      case CudaArch::GFX1101:
      case CudaArch::GFX1102:
      case CudaArch::GFX1103:
      case CudaArch::Generic:
      case CudaArch::UNUSED:
      case CudaArch::UNKNOWN:
        break;
      case CudaArch::LAST:
        llvm_unreachable("Unexpected Cuda arch.");
      }
    }
  }
  CGOpenMPRuntime::processRequiresDirective(D);
}

void CGOpenMPRuntimeGPU::clear() {

  if (!TeamsReductions.empty()) {
    ASTContext &C = CGM.getContext();
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    for (const RecordDecl *TeamReductionRec : TeamsReductions) {
      QualType RecTy = C.getRecordType(TeamReductionRec);
      auto *Field = FieldDecl::Create(
          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
          C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      StaticRD->addDecl(Field);
    }
    StaticRD->completeDefinition();
    QualType StaticTy = C.getRecordType(StaticRD);
    llvm::Type *LLVMReductionsBufferTy =
        CGM.getTypes().ConvertTypeForMem(StaticTy);
    // FIXME: nvlink does not handle weak linkage correctly (objects with
    // different sizes are reported as erroneous).
    // Restore CommonLinkage as soon as nvlink is fixed.
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), LLVMReductionsBufferTy,
        /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
        llvm::Constant::getNullValue(LLVMReductionsBufferTy),
        "_openmp_teams_reductions_buffer_$_");
    KernelTeamsReductionPtr->setInitializer(
        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
                                                             CGM.VoidPtrTy));
  }
  CGOpenMPRuntime::clear();
}

llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Module *M = &CGF.CGM.getModule();
  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
  llvm::Function *F = M->getFunction(LocSize);
  if (!F) {
    F = llvm::Function::Create(
        llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
        llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
  }
  return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
}

llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
  ArrayRef<llvm::Value *> Args{};
  return CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(
          CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
      Args);
}

llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
  ArrayRef<llvm::Value *> Args{};
  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
                             Args);
}