1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
12 //===----------------------------------------------------------------------===//
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/Attr.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/OpenMPClause.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
25 using namespace clang;
26 using namespace CodeGen;
27 using namespace llvm::omp;
29 namespace {
30 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
31 class NVPTXActionTy final : public PrePostActionTy {
32 llvm::FunctionCallee EnterCallee = nullptr;
33 ArrayRef<llvm::Value *> EnterArgs;
34 llvm::FunctionCallee ExitCallee = nullptr;
35 ArrayRef<llvm::Value *> ExitArgs;
36 bool Conditional = false;
37 llvm::BasicBlock *ContBlock = nullptr;
39 public:
40 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
41 ArrayRef<llvm::Value *> EnterArgs,
42 llvm::FunctionCallee ExitCallee,
43 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
44 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
45 ExitArgs(ExitArgs), Conditional(Conditional) {}
46 void Enter(CodeGenFunction &CGF) override {
47 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
48 if (Conditional) {
49 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
50 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
51 ContBlock = CGF.createBasicBlock("omp_if.end");
52 // Generate the branch (If-stmt)
53 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
54 CGF.EmitBlock(ThenBlock);
57 void Done(CodeGenFunction &CGF) {
58 // Emit the rest of blocks/branches
59 CGF.EmitBranch(ContBlock);
60 CGF.EmitBlock(ContBlock, true);
62 void Exit(CodeGenFunction &CGF) override {
63 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
67 /// A class to track the execution mode when codegening directives within
68 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
69 /// to the target region and used by containing directives such as 'parallel'
70 /// to emit optimized code.
71 class ExecutionRuntimeModesRAII {
72 private:
73 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
74 CGOpenMPRuntimeGPU::EM_Unknown;
75 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77 public:
78 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
79 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
80 : ExecMode(ExecMode) {
81 SavedExecMode = ExecMode;
82 ExecMode = EntryMode;
84 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
87 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
88 RefExpr = RefExpr->IgnoreParens();
89 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
90 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
91 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
92 Base = TempASE->getBase()->IgnoreParenImpCasts();
93 RefExpr = Base;
94 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
95 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
96 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
97 Base = TempOASE->getBase()->IgnoreParenImpCasts();
98 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
99 Base = TempASE->getBase()->IgnoreParenImpCasts();
100 RefExpr = Base;
102 RefExpr = RefExpr->IgnoreParenImpCasts();
103 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
104 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
105 const auto *ME = cast<MemberExpr>(RefExpr);
106 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
109 static RecordDecl *buildRecordForGlobalizedVars(
110 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
111 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
112 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
113 &MappedDeclsFields,
114 int BufSize) {
115 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
116 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
117 return nullptr;
118 SmallVector<VarsDataTy, 4> GlobalizedVars;
119 for (const ValueDecl *D : EscapedDecls)
120 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
121 for (const ValueDecl *D : EscapedDeclsForTeams)
122 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124 // Build struct _globalized_locals_ty {
125 //   /* globalized vars */[WarpSize] align (decl_align)
126 // /* globalized vars */ for EscapedDeclsForTeams
127 // };
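// As an illustration only (hypothetical names, not emitted verbatim): for one
// escaped parallel-level variable 'int x' with BufSize == WarpSize and one
// teams-level variable 'double d', the record built below is roughly
//   struct _globalized_locals_ty {
//     int x[WarpSize] __attribute__((aligned(4)));
//     double d;
//   };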
128 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
129 GlobalizedRD->startDefinition();
130 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
131 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
132 for (const auto &Pair : GlobalizedVars) {
133 const ValueDecl *VD = Pair.second;
134 QualType Type = VD->getType();
135 if (Type->isLValueReferenceType())
136 Type = C.getPointerType(Type.getNonReferenceType());
137 else
138 Type = Type.getNonReferenceType();
139 SourceLocation Loc = VD->getLocation();
140 FieldDecl *Field;
141 if (SingleEscaped.count(VD)) {
142 Field = FieldDecl::Create(
143 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
144 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
145 /*BW=*/nullptr, /*Mutable=*/false,
146 /*InitStyle=*/ICIS_NoInit);
147 Field->setAccess(AS_public);
148 if (VD->hasAttrs()) {
149 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
150 E(VD->getAttrs().end());
151 I != E; ++I)
152 Field->addAttr(*I);
154 } else {
155 if (BufSize > 1) {
156 llvm::APInt ArraySize(32, BufSize);
157 Type = C.getConstantArrayType(Type, ArraySize, nullptr,
158 ArraySizeModifier::Normal, 0);
160 Field = FieldDecl::Create(
161 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
162 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
163 /*BW=*/nullptr, /*Mutable=*/false,
164 /*InitStyle=*/ICIS_NoInit);
165 Field->setAccess(AS_public);
166 llvm::APInt Align(32, Pair.first.getQuantity());
167 Field->addAttr(AlignedAttr::CreateImplicit(
168 C, /*IsAlignmentExpr=*/true,
169 IntegerLiteral::Create(C, Align,
170 C.getIntTypeForBitwidth(32, /*Signed=*/0),
171 SourceLocation()),
172 {}, AlignedAttr::GNU_aligned));
174 GlobalizedRD->addDecl(Field);
175 MappedDeclsFields.try_emplace(VD, Field);
177 GlobalizedRD->completeDefinition();
178 return GlobalizedRD;
181 /// Get the list of variables that can escape their declaration context.
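/// A variable "escapes" its declaration context when it may be accessed from
/// outside the stack frame that declares it, e.g. (illustrative example) when
/// a nested parallel region captures it by reference:
///   #pragma omp target
///   {
///     int x = 0;
///     #pragma omp parallel
///     {
///       #pragma omp atomic
///       x += 1; // 'x' escapes and is globalized in generic (non-SPMD) mode
///     }
///   }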
182 class CheckVarsEscapingDeclContext final
183 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
184 CodeGenFunction &CGF;
185 llvm::SetVector<const ValueDecl *> EscapedDecls;
186 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
187 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
188 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
189 RecordDecl *GlobalizedRD = nullptr;
190 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
191 bool AllEscaped = false;
192 bool IsForCombinedParallelRegion = false;
194 void markAsEscaped(const ValueDecl *VD) {
195 // Do not globalize declare target variables.
196 if (!isa<VarDecl>(VD) ||
197 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
198 return;
199 VD = cast<ValueDecl>(VD->getCanonicalDecl());
200 // Use user-specified allocation.
201 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
202 return;
203 // Variables captured by value must be globalized.
204 bool IsCaptured = false;
205 if (auto *CSI = CGF.CapturedStmtInfo) {
206 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
207 // Check if we need to capture the variable that was already captured by
208 // value in the outer region.
209 IsCaptured = true;
210 if (!IsForCombinedParallelRegion) {
211 if (!FD->hasAttrs())
212 return;
213 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
214 if (!Attr)
215 return;
216 if (((Attr->getCaptureKind() != OMPC_map) &&
217 !isOpenMPPrivate(Attr->getCaptureKind())) ||
218 ((Attr->getCaptureKind() == OMPC_map) &&
219 !FD->getType()->isAnyPointerType()))
220 return;
222 if (!FD->getType()->isReferenceType()) {
223 assert(!VD->getType()->isVariablyModifiedType() &&
224 "Parameter captured by value with variably modified type");
225 EscapedParameters.insert(VD);
226 } else if (!IsForCombinedParallelRegion) {
227 return;
231 if ((!CGF.CapturedStmtInfo ||
232 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
233 VD->getType()->isReferenceType())
234 // Do not globalize variables with reference type.
235 return;
236 if (VD->getType()->isVariablyModifiedType()) {
237 // If not captured at the target region level then mark the escaped
238 // variable as delayed.
239 if (IsCaptured)
240 EscapedVariableLengthDecls.insert(VD);
241 else
242 DelayedVariableLengthDecls.insert(VD);
243 } else
244 EscapedDecls.insert(VD);
247 void VisitValueDecl(const ValueDecl *VD) {
248 if (VD->getType()->isLValueReferenceType())
249 markAsEscaped(VD);
250 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
251 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
252 const bool SavedAllEscaped = AllEscaped;
253 AllEscaped = VD->getType()->isLValueReferenceType();
254 Visit(VarD->getInit());
255 AllEscaped = SavedAllEscaped;
259 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
260 ArrayRef<OMPClause *> Clauses,
261 bool IsCombinedParallelRegion) {
262 if (!S)
263 return;
264 for (const CapturedStmt::Capture &C : S->captures()) {
265 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
266 const ValueDecl *VD = C.getCapturedVar();
267 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
268 if (IsCombinedParallelRegion) {
269 // Check whether the variable is privatized in the combined construct and
270 // whether those private copies must be shared in the inner parallel
271 // directive.
272 IsForCombinedParallelRegion = false;
273 for (const OMPClause *C : Clauses) {
274 if (!isOpenMPPrivate(C->getClauseKind()) ||
275 C->getClauseKind() == OMPC_reduction ||
276 C->getClauseKind() == OMPC_linear ||
277 C->getClauseKind() == OMPC_private)
278 continue;
279 ArrayRef<const Expr *> Vars;
280 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
281 Vars = PC->getVarRefs();
282 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
283 Vars = PC->getVarRefs();
284 else
285 llvm_unreachable("Unexpected clause.");
286 for (const auto *E : Vars) {
287 const Decl *D =
288 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
289 if (D == VD->getCanonicalDecl()) {
290 IsForCombinedParallelRegion = true;
291 break;
294 if (IsForCombinedParallelRegion)
295 break;
298 markAsEscaped(VD);
299 if (isa<OMPCapturedExprDecl>(VD))
300 VisitValueDecl(VD);
301 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
306 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
307 assert(!GlobalizedRD &&
308 "Record for globalized variables is built already.");
309 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
310 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
311 if (IsInTTDRegion)
312 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
313 else
314 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
315 GlobalizedRD = ::buildRecordForGlobalizedVars(
316 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
317 MappedDeclsFields, WarpSize);
320 public:
321 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
322 ArrayRef<const ValueDecl *> TeamsReductions)
323 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 virtual ~CheckVarsEscapingDeclContext() = default;
326 void VisitDeclStmt(const DeclStmt *S) {
327 if (!S)
328 return;
329 for (const Decl *D : S->decls())
330 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
331 VisitValueDecl(VD);
333 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
334 if (!D)
335 return;
336 if (!D->hasAssociatedStmt())
337 return;
338 if (const auto *S =
339 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
340 // Do not analyze directives that do not actually require capturing,
341 // like `omp for` or `omp simd` directives.
342 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
343 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
344 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
345 VisitStmt(S->getCapturedStmt());
346 return;
348 VisitOpenMPCapturedStmt(
349 S, D->clauses(),
350 CaptureRegions.back() == OMPD_parallel &&
351 isOpenMPDistributeDirective(D->getDirectiveKind()));
354 void VisitCapturedStmt(const CapturedStmt *S) {
355 if (!S)
356 return;
357 for (const CapturedStmt::Capture &C : S->captures()) {
358 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
359 const ValueDecl *VD = C.getCapturedVar();
360 markAsEscaped(VD);
361 if (isa<OMPCapturedExprDecl>(VD))
362 VisitValueDecl(VD);
366 void VisitLambdaExpr(const LambdaExpr *E) {
367 if (!E)
368 return;
369 for (const LambdaCapture &C : E->captures()) {
370 if (C.capturesVariable()) {
371 if (C.getCaptureKind() == LCK_ByRef) {
372 const ValueDecl *VD = C.getCapturedVar();
373 markAsEscaped(VD);
374 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
375 VisitValueDecl(VD);
380 void VisitBlockExpr(const BlockExpr *E) {
381 if (!E)
382 return;
383 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
384 if (C.isByRef()) {
385 const VarDecl *VD = C.getVariable();
386 markAsEscaped(VD);
387 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
388 VisitValueDecl(VD);
392 void VisitCallExpr(const CallExpr *E) {
393 if (!E)
394 return;
395 for (const Expr *Arg : E->arguments()) {
396 if (!Arg)
397 continue;
398 if (Arg->isLValue()) {
399 const bool SavedAllEscaped = AllEscaped;
400 AllEscaped = true;
401 Visit(Arg);
402 AllEscaped = SavedAllEscaped;
403 } else {
404 Visit(Arg);
407 Visit(E->getCallee());
409 void VisitDeclRefExpr(const DeclRefExpr *E) {
410 if (!E)
411 return;
412 const ValueDecl *VD = E->getDecl();
413 if (AllEscaped)
414 markAsEscaped(VD);
415 if (isa<OMPCapturedExprDecl>(VD))
416 VisitValueDecl(VD);
417 else if (VD->isInitCapture())
418 VisitValueDecl(VD);
420 void VisitUnaryOperator(const UnaryOperator *E) {
421 if (!E)
422 return;
423 if (E->getOpcode() == UO_AddrOf) {
424 const bool SavedAllEscaped = AllEscaped;
425 AllEscaped = true;
426 Visit(E->getSubExpr());
427 AllEscaped = SavedAllEscaped;
428 } else {
429 Visit(E->getSubExpr());
432 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
433 if (!E)
434 return;
435 if (E->getCastKind() == CK_ArrayToPointerDecay) {
436 const bool SavedAllEscaped = AllEscaped;
437 AllEscaped = true;
438 Visit(E->getSubExpr());
439 AllEscaped = SavedAllEscaped;
440 } else {
441 Visit(E->getSubExpr());
444 void VisitExpr(const Expr *E) {
445 if (!E)
446 return;
447 bool SavedAllEscaped = AllEscaped;
448 if (!E->isLValue())
449 AllEscaped = false;
450 for (const Stmt *Child : E->children())
451 if (Child)
452 Visit(Child);
453 AllEscaped = SavedAllEscaped;
455 void VisitStmt(const Stmt *S) {
456 if (!S)
457 return;
458 for (const Stmt *Child : S->children())
459 if (Child)
460 Visit(Child);
463 /// Returns the record that handles all the escaped local variables and is
464 /// used instead of their original storage.
465 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
466 if (!GlobalizedRD)
467 buildRecordForGlobalizedVars(IsInTTDRegion);
468 return GlobalizedRD;
471 /// Returns the field in the globalized record for the escaped variable.
472 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
473 assert(GlobalizedRD &&
474 "Record for globalized variables must be generated already.");
475 return MappedDeclsFields.lookup(VD);
478 /// Returns the list of the escaped local variables/parameters.
479 ArrayRef<const ValueDecl *> getEscapedDecls() const {
480 return EscapedDecls.getArrayRef();
483 /// Returns the set of escaped local variables that are actually parameters
484 /// passed by value.
485 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
486 return EscapedParameters;
489 /// Returns the list of the escaped variables with the variably modified
490 /// types.
491 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
492 return EscapedVariableLengthDecls.getArrayRef();
495 /// Returns the list of the delayed variables with the variably modified
496 /// types.
497 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
498 return DelayedVariableLengthDecls.getArrayRef();
501 } // anonymous namespace
503 CGOpenMPRuntimeGPU::ExecutionMode
504 CGOpenMPRuntimeGPU::getExecutionMode() const {
505 return CurrentExecutionMode;
508 CGOpenMPRuntimeGPU::DataSharingMode
509 CGOpenMPRuntimeGPU::getDataSharingMode() const {
510 return CurrentDataSharingMode;
513 /// Check for inner (nested) SPMD construct, if any
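/// For example (illustrative), a region such as
///   #pragma omp target
///   #pragma omp parallel for
///   for (int i = 0; i < N; ++i)
///     A[i] = B[i];
/// has a nested 'parallel' construct and can be emitted in SPMD mode, whereas
/// a '#pragma omp target' whose body contains no parallel construct cannot.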
514 static bool hasNestedSPMDDirective(ASTContext &Ctx,
515 const OMPExecutableDirective &D) {
516 const auto *CS = D.getInnermostCapturedStmt();
517 const auto *Body =
518 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
519 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
521 if (const auto *NestedDir =
522 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
523 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
524 switch (D.getDirectiveKind()) {
525 case OMPD_target:
526 if (isOpenMPParallelDirective(DKind))
527 return true;
528 if (DKind == OMPD_teams) {
529 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
530 /*IgnoreCaptured=*/true);
531 if (!Body)
532 return false;
533 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
534 if (const auto *NND =
535 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
536 DKind = NND->getDirectiveKind();
537 if (isOpenMPParallelDirective(DKind))
538 return true;
541 return false;
542 case OMPD_target_teams:
543 return isOpenMPParallelDirective(DKind);
544 case OMPD_target_simd:
545 case OMPD_target_parallel:
546 case OMPD_target_parallel_for:
547 case OMPD_target_parallel_for_simd:
548 case OMPD_target_teams_distribute:
549 case OMPD_target_teams_distribute_simd:
550 case OMPD_target_teams_distribute_parallel_for:
551 case OMPD_target_teams_distribute_parallel_for_simd:
552 case OMPD_parallel:
553 case OMPD_for:
554 case OMPD_parallel_for:
555 case OMPD_parallel_master:
556 case OMPD_parallel_sections:
557 case OMPD_for_simd:
558 case OMPD_parallel_for_simd:
559 case OMPD_cancel:
560 case OMPD_cancellation_point:
561 case OMPD_ordered:
562 case OMPD_threadprivate:
563 case OMPD_allocate:
564 case OMPD_task:
565 case OMPD_simd:
566 case OMPD_sections:
567 case OMPD_section:
568 case OMPD_single:
569 case OMPD_master:
570 case OMPD_critical:
571 case OMPD_taskyield:
572 case OMPD_barrier:
573 case OMPD_taskwait:
574 case OMPD_taskgroup:
575 case OMPD_atomic:
576 case OMPD_flush:
577 case OMPD_depobj:
578 case OMPD_scan:
579 case OMPD_teams:
580 case OMPD_target_data:
581 case OMPD_target_exit_data:
582 case OMPD_target_enter_data:
583 case OMPD_distribute:
584 case OMPD_distribute_simd:
585 case OMPD_distribute_parallel_for:
586 case OMPD_distribute_parallel_for_simd:
587 case OMPD_teams_distribute:
588 case OMPD_teams_distribute_simd:
589 case OMPD_teams_distribute_parallel_for:
590 case OMPD_teams_distribute_parallel_for_simd:
591 case OMPD_target_update:
592 case OMPD_declare_simd:
593 case OMPD_declare_variant:
594 case OMPD_begin_declare_variant:
595 case OMPD_end_declare_variant:
596 case OMPD_declare_target:
597 case OMPD_end_declare_target:
598 case OMPD_declare_reduction:
599 case OMPD_declare_mapper:
600 case OMPD_taskloop:
601 case OMPD_taskloop_simd:
602 case OMPD_master_taskloop:
603 case OMPD_master_taskloop_simd:
604 case OMPD_parallel_master_taskloop:
605 case OMPD_parallel_master_taskloop_simd:
606 case OMPD_requires:
607 case OMPD_unknown:
608 default:
609 llvm_unreachable("Unexpected directive.");
613 return false;
616 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
617 const OMPExecutableDirective &D) {
618 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
619 switch (DirectiveKind) {
620 case OMPD_target:
621 case OMPD_target_teams:
622 return hasNestedSPMDDirective(Ctx, D);
623 case OMPD_target_parallel_loop:
624 case OMPD_target_parallel:
625 case OMPD_target_parallel_for:
626 case OMPD_target_parallel_for_simd:
627 case OMPD_target_teams_distribute_parallel_for:
628 case OMPD_target_teams_distribute_parallel_for_simd:
629 case OMPD_target_simd:
630 case OMPD_target_teams_distribute_simd:
631 return true;
632 case OMPD_target_teams_distribute:
633 return false;
634 case OMPD_target_teams_loop:
635 // Whether this is true or not depends on how the directive will
636 // eventually be emitted.
637 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
638 return TTLD->canBeParallelFor();
639 return false;
640 case OMPD_parallel:
641 case OMPD_for:
642 case OMPD_parallel_for:
643 case OMPD_parallel_master:
644 case OMPD_parallel_sections:
645 case OMPD_for_simd:
646 case OMPD_parallel_for_simd:
647 case OMPD_cancel:
648 case OMPD_cancellation_point:
649 case OMPD_ordered:
650 case OMPD_threadprivate:
651 case OMPD_allocate:
652 case OMPD_task:
653 case OMPD_simd:
654 case OMPD_sections:
655 case OMPD_section:
656 case OMPD_single:
657 case OMPD_master:
658 case OMPD_critical:
659 case OMPD_taskyield:
660 case OMPD_barrier:
661 case OMPD_taskwait:
662 case OMPD_taskgroup:
663 case OMPD_atomic:
664 case OMPD_flush:
665 case OMPD_depobj:
666 case OMPD_scan:
667 case OMPD_teams:
668 case OMPD_target_data:
669 case OMPD_target_exit_data:
670 case OMPD_target_enter_data:
671 case OMPD_distribute:
672 case OMPD_distribute_simd:
673 case OMPD_distribute_parallel_for:
674 case OMPD_distribute_parallel_for_simd:
675 case OMPD_teams_distribute:
676 case OMPD_teams_distribute_simd:
677 case OMPD_teams_distribute_parallel_for:
678 case OMPD_teams_distribute_parallel_for_simd:
679 case OMPD_target_update:
680 case OMPD_declare_simd:
681 case OMPD_declare_variant:
682 case OMPD_begin_declare_variant:
683 case OMPD_end_declare_variant:
684 case OMPD_declare_target:
685 case OMPD_end_declare_target:
686 case OMPD_declare_reduction:
687 case OMPD_declare_mapper:
688 case OMPD_taskloop:
689 case OMPD_taskloop_simd:
690 case OMPD_master_taskloop:
691 case OMPD_master_taskloop_simd:
692 case OMPD_parallel_master_taskloop:
693 case OMPD_parallel_master_taskloop_simd:
694 case OMPD_requires:
695 case OMPD_unknown:
696 default:
697 break;
699 llvm_unreachable(
700 "Unknown programming model for OpenMP directive on NVPTX target.");
703 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
704 StringRef ParentName,
705 llvm::Function *&OutlinedFn,
706 llvm::Constant *&OutlinedFnID,
707 bool IsOffloadEntry,
708 const RegionCodeGenTy &CodeGen) {
709 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
710 EntryFunctionState EST;
711 WrapperFunctionsMap.clear();
713 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
714 assert(!IsBareKernel && "bare kernel should not be at generic mode");
716 // Emit target region as a standalone region.
717 class NVPTXPrePostActionTy : public PrePostActionTy {
718 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
719 const OMPExecutableDirective &D;
721 public:
722 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
723 const OMPExecutableDirective &D)
724 : EST(EST), D(D) {}
725 void Enter(CodeGenFunction &CGF) override {
726 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
727 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
728 // Skip target region initialization.
729 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
731 void Exit(CodeGenFunction &CGF) override {
732 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
733 RT.clearLocThreadIdInsertPt(CGF);
734 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
736 } Action(EST, D);
737 CodeGen.setAction(Action);
738 IsInTTDRegion = true;
739 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
740 IsOffloadEntry, CodeGen);
741 IsInTTDRegion = false;
744 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
745 CodeGenFunction &CGF,
746 EntryFunctionState &EST, bool IsSPMD) {
747 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
748 MaxTeamsVal = -1;
749 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
750 MinTeamsVal, MaxTeamsVal);
752 CGBuilderTy &Bld = CGF.Builder;
753 Bld.restoreIP(OMPBuilder.createTargetInit(
754 Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
755 if (!IsSPMD)
756 emitGenericVarsProlog(CGF, EST.Loc);
759 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
760 EntryFunctionState &EST,
761 bool IsSPMD) {
762 if (!IsSPMD)
763 emitGenericVarsEpilog(CGF);
765 // This is temporary until we remove the fixed-size buffer.
766 ASTContext &C = CGM.getContext();
767 RecordDecl *StaticRD = C.buildImplicitRecord(
768 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
769 StaticRD->startDefinition();
770 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
771 QualType RecTy = C.getRecordType(TeamReductionRec);
772 auto *Field = FieldDecl::Create(
773 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
774 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
775 /*BW=*/nullptr, /*Mutable=*/false,
776 /*InitStyle=*/ICIS_NoInit);
777 Field->setAccess(AS_public);
778 StaticRD->addDecl(Field);
780 StaticRD->completeDefinition();
781 QualType StaticTy = C.getRecordType(StaticRD);
782 llvm::Type *LLVMReductionsBufferTy =
783 CGM.getTypes().ConvertTypeForMem(StaticTy);
784 const auto &DL = CGM.getModule().getDataLayout();
785 uint64_t ReductionDataSize =
786 TeamsReductions.empty()
787 ? 0
788 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
789 CGBuilderTy &Bld = CGF.Builder;
790 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
791 C.getLangOpts().OpenMPCUDAReductionBufNum);
792 TeamsReductions.clear();
795 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
796 StringRef ParentName,
797 llvm::Function *&OutlinedFn,
798 llvm::Constant *&OutlinedFnID,
799 bool IsOffloadEntry,
800 const RegionCodeGenTy &CodeGen) {
801 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
802 EntryFunctionState EST;
804 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
806 // Emit target region as a standalone region.
807 class NVPTXPrePostActionTy : public PrePostActionTy {
808 CGOpenMPRuntimeGPU &RT;
809 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
810 bool IsBareKernel;
811 DataSharingMode Mode;
812 const OMPExecutableDirective &D;
814 public:
815 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
816 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
817 bool IsBareKernel, const OMPExecutableDirective &D)
818 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
819 Mode(RT.CurrentDataSharingMode), D(D) {}
820 void Enter(CodeGenFunction &CGF) override {
821 if (IsBareKernel) {
822 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
823 return;
825 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
826 // Skip target region initialization.
827 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
829 void Exit(CodeGenFunction &CGF) override {
830 if (IsBareKernel) {
831 RT.CurrentDataSharingMode = Mode;
832 return;
834 RT.clearLocThreadIdInsertPt(CGF);
835 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
837 } Action(*this, EST, IsBareKernel, D);
838 CodeGen.setAction(Action);
839 IsInTTDRegion = true;
840 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
841 IsOffloadEntry, CodeGen);
842 IsInTTDRegion = false;
845 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
846 const OMPExecutableDirective &D, StringRef ParentName,
847 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
848 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
849 if (!IsOffloadEntry) // Nothing to do.
850 return;
852 assert(!ParentName.empty() && "Invalid target region parent name!");
854 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
855 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
856 if (Mode || IsBareKernel)
857 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
858 CodeGen);
859 else
860 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
861 CodeGen);
864 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
865 : CGOpenMPRuntime(CGM) {
866 llvm::OpenMPIRBuilderConfig Config(
867 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
868 CGM.getLangOpts().OpenMPOffloadMandatory,
869 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
870 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
871 OMPBuilder.setConfig(Config);
873 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
874 llvm_unreachable("OpenMP can only handle device code.");
876 if (CGM.getLangOpts().OpenMPCUDAMode)
877 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
879 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
880 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
881 return;
883 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
884 "__omp_rtl_debug_kind");
885 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
886 "__omp_rtl_assume_teams_oversubscription");
887 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
888 "__omp_rtl_assume_threads_oversubscription");
889 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
890 "__omp_rtl_assume_no_thread_state");
891 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
892 "__omp_rtl_assume_no_nested_parallelism");
895 void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
896 ProcBindKind ProcBind,
897 SourceLocation Loc) {
898 // Nothing to do.
901 void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
902 llvm::Value *NumThreads,
903 SourceLocation Loc) {
904 // Nothing to do.
907 void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
908 const Expr *NumTeams,
909 const Expr *ThreadLimit,
910 SourceLocation Loc) {}
912 llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
913 CodeGenFunction &CGF, const OMPExecutableDirective &D,
914 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
915 const RegionCodeGenTy &CodeGen) {
916 // Emit target region as a standalone region.
917 bool PrevIsInTTDRegion = IsInTTDRegion;
918 IsInTTDRegion = false;
919 auto *OutlinedFun =
920 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
921 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
922 IsInTTDRegion = PrevIsInTTDRegion;
923 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
924 llvm::Function *WrapperFun =
925 createParallelDataSharingWrapper(OutlinedFun, D);
926 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
929 return OutlinedFun;
932 /// Get list of lastprivate variables from the teams distribute ... or
933 /// teams {distribute ...} directives.
934 static void
935 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
936 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
937 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
938 "expected teams directive.");
939 const OMPExecutableDirective *Dir = &D;
940 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
941 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
942 Ctx,
943 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
944 /*IgnoreCaptured=*/true))) {
945 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
946 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
947 Dir = nullptr;
950 if (!Dir)
951 return;
952 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
953 for (const Expr *E : C->getVarRefs())
954 Vars.push_back(getPrivateItem(E));
958 /// Get list of reduction variables from the teams ... directives.
959 static void
960 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
961 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
962 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
963 "expected teams directive.");
964 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
965 for (const Expr *E : C->privates())
966 Vars.push_back(getPrivateItem(E));
970 llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
971 CodeGenFunction &CGF, const OMPExecutableDirective &D,
972 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
973 const RegionCodeGenTy &CodeGen) {
974 SourceLocation Loc = D.getBeginLoc();
976 const RecordDecl *GlobalizedRD = nullptr;
977 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
978 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
979 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
980 // Globalize teams reduction variables unconditionally in all modes.
981 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
982 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
983 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
984 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
985 if (!LastPrivatesReductions.empty()) {
986 GlobalizedRD = ::buildRecordForGlobalizedVars(
987 CGM.getContext(), {}, LastPrivatesReductions, MappedDeclsFields,
988 WarpSize);
990 } else if (!LastPrivatesReductions.empty()) {
991 assert(!TeamAndReductions.first &&
992 "Previous team declaration is not expected.");
993 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
994 std::swap(TeamAndReductions.second, LastPrivatesReductions);
997 // Emit target region as a standalone region.
998 class NVPTXPrePostActionTy : public PrePostActionTy {
999 SourceLocation &Loc;
1000 const RecordDecl *GlobalizedRD;
1001 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1002 &MappedDeclsFields;
1004 public:
1005 NVPTXPrePostActionTy(
1006 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1007 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1008 &MappedDeclsFields)
1009 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1010 MappedDeclsFields(MappedDeclsFields) {}
1011 void Enter(CodeGenFunction &CGF) override {
1012 auto &Rt =
1013 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1014 if (GlobalizedRD) {
1015 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1016 I->getSecond().MappedParams =
1017 std::make_unique<CodeGenFunction::OMPMapVars>();
1018 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1019 for (const auto &Pair : MappedDeclsFields) {
1020 assert(Pair.getFirst()->isCanonicalDecl() &&
1021 "Expected canonical declaration");
1022 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1025 Rt.emitGenericVarsProlog(CGF, Loc);
1027 void Exit(CodeGenFunction &CGF) override {
1028 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1029 .emitGenericVarsEpilog(CGF);
1031 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1032 CodeGen.setAction(Action);
1033 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1034 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1036 return OutlinedFun;
1039 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1040 SourceLocation Loc) {
1041 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1042 return;
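  // Sketch of what this prolog conceptually emits for each globalized
  // variable VD (illustrative, not the exact IR):
  //   %ptr = call ptr @__kmpc_alloc_shared(i64 sizeof(VD))
  //   store <original value of VD>, ptr %ptr ; only for escaped by-value
  //                                          ; parameters
  //   ; subsequent uses of VD are redirected to %ptr via MappedParams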
1044 CGBuilderTy &Bld = CGF.Builder;
1046 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1047 if (I == FunctionGlobalizedDecls.end())
1048 return;
1050 for (auto &Rec : I->getSecond().LocalVarData) {
1051 const auto *VD = cast<VarDecl>(Rec.first);
1052 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1053 QualType VarTy = VD->getType();
1055 // Get the local allocation of a firstprivate variable before sharing
1056 llvm::Value *ParValue;
1057 if (EscapedParam) {
1058 LValue ParLVal =
1059 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1060 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1063 // Allocate space for the variable to be globalized
1064 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1065 llvm::CallBase *VoidPtr =
1066 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1067 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1068 AllocArgs, VD->getName());
1069 // FIXME: We should use the variable's actual alignment as an argument.
1070 VoidPtr->addRetAttr(llvm::Attribute::get(
1071 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1072 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1074 // Cast the void pointer and get the address of the globalized variable.
1075 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1076 VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack");
1077 LValue VarAddr =
1078 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
1079 Rec.second.PrivateAddr = VarAddr.getAddress();
1080 Rec.second.GlobalizedVal = VoidPtr;
1082 // Assign the local allocation to the newly globalized location.
1083 if (EscapedParam) {
1084 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1085 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1087 if (auto *DI = CGF.getDebugInfo())
1088 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1091 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1092 const auto *VD = cast<VarDecl>(ValueD);
1093 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1094 getKmpcAllocShared(CGF, VD);
1095 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1096 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1097 CGM.getContext().getDeclAlign(VD),
1098 AlignmentSource::Decl);
1099 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
1101 I->getSecond().MappedParams->apply(CGF);
1104 bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
1105 const VarDecl *VD) const {
1106 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1107 if (I == FunctionGlobalizedDecls.end())
1108 return false;
1110 // Check whether the variable declaration is delayed:
1111 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1114 std::pair<llvm::Value *, llvm::Value *>
1115 CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
1116 const VarDecl *VD) {
1117 CGBuilderTy &Bld = CGF.Builder;
1119 // Compute size and alignment.
1120 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1121 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1122 Size = Bld.CreateNUWAdd(
1123 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1124 llvm::Value *AlignVal =
1125 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1126 Size = Bld.CreateUDiv(Size, AlignVal);
1127 Size = Bld.CreateNUWMul(Size, AlignVal);
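  // The three instructions above round Size up to a multiple of the
  // alignment, e.g. (illustrative) Size = 10, Align = 8: (10 + 7) / 8 * 8 == 16.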
1129 // Allocate space for this VLA object to be globalized.
1130 llvm::Value *AllocArgs[] = {Size};
1131 llvm::CallBase *VoidPtr =
1132 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1133 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1134 AllocArgs, VD->getName());
1135 VoidPtr->addRetAttr(llvm::Attribute::get(
1136 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1138 return std::make_pair(VoidPtr, Size);
1141 void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1142 CodeGenFunction &CGF,
1143 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1144 // Deallocate the memory for each globalized VLA object
1145 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1146 CGM.getModule(), OMPRTL___kmpc_free_shared),
1147 {AddrSizePair.first, AddrSizePair.second});
1150 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1151 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1152 return;
1154 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1155 if (I != FunctionGlobalizedDecls.end()) {
1156 // Deallocate the memory for each globalized VLA object that was
1157 // globalized in the prolog (i.e. emitGenericVarsProlog).
1158 for (const auto &AddrSizePair :
1159 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1160 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1161 CGM.getModule(), OMPRTL___kmpc_free_shared),
1162 {AddrSizePair.first, AddrSizePair.second});
1164 // Deallocate the memory for each globalized value
1165 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1166 const auto *VD = cast<VarDecl>(Rec.first);
1167 I->getSecond().MappedParams->restore(CGF);
1169 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1170 CGF.getTypeSize(VD->getType())};
1171 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1172 CGM.getModule(), OMPRTL___kmpc_free_shared),
1173 FreeArgs);
1178 void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1179 const OMPExecutableDirective &D,
1180 SourceLocation Loc,
1181 llvm::Function *OutlinedFn,
1182 ArrayRef<llvm::Value *> CapturedVars) {
1183 if (!CGF.HaveInsertPoint())
1184 return;
1186 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1188 RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1189 /*Name=*/".zero.addr");
1190 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1191 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1192 // We don't emit any thread id function call in a bare kernel, but because the
1193 // outlined function has a pointer argument, we emit a nullptr here.
1194 if (IsBareKernel)
1195 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1196 else
1197 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1198 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1199 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1200 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1203 void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1204 SourceLocation Loc,
1205 llvm::Function *OutlinedFn,
1206 ArrayRef<llvm::Value *> CapturedVars,
1207 const Expr *IfCond,
1208 llvm::Value *NumThreads) {
1209 if (!CGF.HaveInsertPoint())
1210 return;
1212 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1213 NumThreads](CodeGenFunction &CGF,
1214 PrePostActionTy &Action) {
1215 CGBuilderTy &Bld = CGF.Builder;
1216 llvm::Value *NumThreadsVal = NumThreads;
1217 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1218 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1219 if (WFn)
1220 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1221 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1223 // Create a private scope that will globalize the arguments
1224 // passed from the outside of the target region.
1225 // TODO: Is that needed?
1226 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1228 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1229 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1230 "captured_vars_addrs");
1231 // There's something to share.
1232 if (!CapturedVars.empty()) {
1233 // Prepare for parallel region. Indicate the outlined function.
1234 ASTContext &Ctx = CGF.getContext();
1235 unsigned Idx = 0;
1236 for (llvm::Value *V : CapturedVars) {
1237 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1238 llvm::Value *PtrV;
1239 if (V->getType()->isIntegerTy())
1240 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1241 else
1242 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1243 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1244 Ctx.getPointerType(Ctx.VoidPtrTy));
1245 ++Idx;
1249 llvm::Value *IfCondVal = nullptr;
1250 if (IfCond)
1251 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1252 /* isSigned */ false);
1253 else
1254 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1256 if (!NumThreadsVal)
1257 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1258 else
1259 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1261 assert(IfCondVal && "Expected a value");
1262 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
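    // The arguments below map onto __kmpc_parallel_51 roughly as follows (see
    // the OpenMP device runtime for the authoritative signature): source
    // location, global thread id, if-condition, num_threads (-1 if
    // unspecified), proc_bind (-1 if unspecified), outlined function, wrapper
    // function (null when no wrapper is needed), array of captured-variable
    // pointers, and the number of captured variables.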
1263 llvm::Value *Args[] = {
1264 RTLoc,
1265 getThreadID(CGF, Loc),
1266 IfCondVal,
1267 NumThreadsVal,
1268 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1269 FnPtr,
1270 ID,
1271 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1272 CGF.VoidPtrPtrTy),
1273 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1274 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1275 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1276 Args);
1279 RegionCodeGenTy RCG(ParallelGen);
1280 RCG(CGF);
1283 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1284 // Always emit simple barriers!
1285 if (!CGF.HaveInsertPoint())
1286 return;
1287 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1288 // This function does not use parameters, so we can emit just default values.
1289 llvm::Value *Args[] = {
1290 llvm::ConstantPointerNull::get(
1291 cast<llvm::PointerType>(getIdentTyPointerTy())),
1292 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1293 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1294 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1295 Args);
1298 void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1299 SourceLocation Loc,
1300 OpenMPDirectiveKind Kind, bool,
1301 bool) {
1302 // Always emit simple barriers!
1303 if (!CGF.HaveInsertPoint())
1304 return;
1305 // Build call __kmpc_cancel_barrier(loc, thread_id);
1306 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1307 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1308 getThreadID(CGF, Loc)};
1310 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1311 CGM.getModule(), OMPRTL___kmpc_barrier),
1312 Args);
1315 void CGOpenMPRuntimeGPU::emitCriticalRegion(
1316 CodeGenFunction &CGF, StringRef CriticalName,
1317 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1318 const Expr *Hint) {
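  // The basic blocks created below implement, in effect, the following loop
  // (sketch): the threads of the team take turns executing the critical body,
  // one thread per iteration, with a syncwarp between turns.
  //   for (counter = 0; counter < team_width; ++counter) {
  //     if (thread_id == counter)
  //       <critical body>;
  //     __kmpc_syncwarp(mask);
  //   }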
1319 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1320 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1321 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1322 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1323 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1325 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1327 // Get the mask of active threads in the warp.
1328 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1329 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1330 // Fetch team-local id of the thread.
1331 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1333 // Get the width of the team.
1334 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1336 // Initialize the counter variable for the loop.
1337 QualType Int32Ty =
1338 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1339 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1340 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1341 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1342 /*isInit=*/true);
1344 // Block checks whether the loop counter is still below the upper bound.
1345 CGF.EmitBlock(LoopBB);
1346 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1347 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1348 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1350 // Block tests which single thread should execute the region, and which
1351 // threads should go straight to the synchronisation point.
1352 CGF.EmitBlock(TestBB);
1353 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1354 llvm::Value *CmpThreadToCounter =
1355 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1356 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1358 // Block emits the body of the critical region.
1359 CGF.EmitBlock(BodyBB);
1361 // Output the critical statement.
1362 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1363 Hint);
1365 // After the body, surrounded by the critical region, the single executing
1366 // thread will jump to the synchronisation point.
1367 // Block waits for all threads in the current team to finish, then increments
1368 // the counter variable and returns to the loop.
1369 CGF.EmitBlock(SyncBB);
1370 // Reconverge active threads in the warp.
1371 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1372 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1373 Mask);
1375 llvm::Value *IncCounterVal =
1376 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1377 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1378 CGF.EmitBranch(LoopBB);
1380 // Block that is reached when all threads in the team complete the region.
1381 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1384 /// Cast value to the specified type.
1385 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1386 QualType ValTy, QualType CastTy,
1387 SourceLocation Loc) {
1388 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1389 "Cast type must sized.");
1390 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1391 "Val type must sized.");
1392 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1393 if (ValTy == CastTy)
1394 return Val;
1395 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1396 CGF.getContext().getTypeSizeInChars(CastTy))
1397 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1398 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1399 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1400 CastTy->hasSignedIntegerRepresentation());
1401 Address CastItem = CGF.CreateMemTemp(CastTy);
1402 Address ValCastItem = CastItem.withElementType(Val->getType());
1403 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1404 LValueBaseInfo(AlignmentSource::Type),
1405 TBAAAccessInfo());
1406 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1407 LValueBaseInfo(AlignmentSource::Type),
1408 TBAAAccessInfo());
1412 /// Design of OpenMP reductions on the GPU
1414 /// Consider a typical OpenMP program with one or more reduction
1415 /// clauses:
1417 /// float foo;
1418 /// double bar;
1419 /// #pragma omp target teams distribute parallel for \
1420 /// reduction(+:foo) reduction(*:bar)
1421 /// for (int i = 0; i < N; i++) {
1422 /// foo += A[i]; bar *= B[i];
1423 /// }
1425 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1426 /// all teams. In our OpenMP implementation on the NVPTX device an
1427 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1428 /// within a team are mapped to CUDA threads within a threadblock.
1429 /// Our goal is to efficiently aggregate values across all OpenMP
1430 /// threads such that:
1432 /// - the compiler and runtime are logically concise, and
1433 /// - the reduction is performed efficiently in a hierarchical
1434 /// manner as follows: within OpenMP threads in the same warp,
1435 /// across warps in a threadblock, and finally across teams on
1436 /// the NVPTX device.
1438 /// Introduction to Decoupling
1440 /// We would like to decouple the compiler and the runtime so that the
1441 /// latter is ignorant of the reduction variables (number, data types)
1442 /// and the reduction operators. This allows a simpler interface
1443 /// and implementation while still attaining good performance.
1445 /// Pseudocode for the aforementioned OpenMP program generated by the
1446 /// compiler is as follows:
1448 /// 1. Create private copies of reduction variables on each OpenMP
1449 /// thread: 'foo_private', 'bar_private'
1450 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1451 /// to it and writes the result in 'foo_private' and 'bar_private'
1452 /// respectively.
1453 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1454 /// and store the result on the team master:
1456 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1457 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1459 /// where:
1460 /// struct ReduceData {
1461 /// double *foo;
1462 /// double *bar;
1463 /// } reduceData
1464 /// reduceData.foo = &foo_private
1465 /// reduceData.bar = &bar_private
1467 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1468 /// auxiliary functions generated by the compiler that operate on
1469 /// variables of type 'ReduceData'. They help the runtime perform the
1470 /// algorithmic steps in a data-agnostic manner (an illustrative sketch of
///  such a 'shuffleReduceFn' is given after step 5 below).
1472 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1473 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1474 /// same warp. It takes the following arguments as input:
1476 /// a. variable of type 'ReduceData' on the calling lane,
1477 /// b. its lane_id,
1478 /// c. an offset relative to the current lane_id to generate a
1479 /// remote_lane_id. The remote lane contains the second
1480 /// variable of type 'ReduceData' that is to be reduced.
1481 /// d. an algorithm version parameter determining which reduction
1482 /// algorithm to use.
1484 /// 'shuffleReduceFn' retrieves data from the remote lane using
1485 /// efficient GPU shuffle intrinsics and reduces, using the
1486 /// algorithm specified by the 4th parameter, the two operands
1487 /// element-wise. The result is written to the first operand.
1489 /// Different reduction algorithms are implemented in different
1490 /// runtime functions, all calling 'shuffleReduceFn' to perform
1491 /// the essential reduction step. Therefore, based on the 4th
1492 /// parameter, this function behaves slightly differently to
1493 /// cooperate with the runtime to ensure correctness under
1494 /// different circumstances.
1496 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1497 /// reduced variables across warps. It tunnels, through CUDA
1498 /// shared memory, the thread-private data of type 'ReduceData'
1499 /// from lane 0 of each warp to a lane in the first warp.
1500 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1501 /// The last team writes the global reduced value to memory.
1503 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1504 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1505 /// scratchpadCopyFn, loadAndReduceFn)
1507 /// 'scratchpadCopyFn' is a helper that stores reduced
1508 /// data from the team master to a scratchpad array in
1509 /// global memory.
1511 /// 'loadAndReduceFn' is a helper that loads data from
1512 /// the scratchpad array and reduces it with the input
1513 /// operand.
1515 /// These compiler generated functions hide address
1516 /// calculation and alignment information from the runtime.
1517 /// 5. if ret == 1:
1518 /// The team master of the last team stores the reduced
1519 /// result to the globals in memory.
1520 /// foo += reduceData.foo; bar *= reduceData.bar
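///
/// As an illustration only (simplified names and types, not what the compiler
/// literally emits), a 'shuffleReduceFn' for the example above could look
/// like:
///
///    void shuffleReduceFn(void *reduce_data, int16_t lane_id,
///                         int16_t lane_offset, int16_t algo_version) {
///      ReduceData *data = (ReduceData *)reduce_data;
///      // Fetch the remote lane's elements using GPU shuffle intrinsics.
///      double remote_foo = shuffle_down(*data->foo, lane_offset, WARPSIZE);
///      double remote_bar = shuffle_down(*data->bar, lane_offset, WARPSIZE);
///      // Combine element-wise into the local copy; whether a lane combines
///      // or merely copies the remote value depends on lane_id, lane_offset,
///      // and algo_version (see the algorithms below).
///      *data->foo += remote_foo;
///      *data->bar *= remote_bar;
///    }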
1523 /// Warp Reduction Algorithms
1525 /// On the warp level, we have three algorithms implemented in the
1526 /// OpenMP runtime depending on the number of active lanes:
1528 /// Full Warp Reduction
1530 /// The reduce algorithm within a warp where all lanes are active
1531 /// is implemented in the runtime as follows:
1533 /// full_warp_reduce(void *reduce_data,
1534 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1535 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1536 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1537 /// }
1539 /// The algorithm completes in log(2, WARPSIZE) steps.
1541 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1542 /// not used; we therefore save instructions by not retrieving lane_id
1543 /// from the corresponding special registers. The 4th parameter, which
1544 /// represents the version of the algorithm being used, is set to 0 to
1545 /// signify full warp reduction.
1547 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1549 /// #reduce_elem refers to an element in the local lane's data structure
1550 /// #remote_elem is retrieved from a remote lane
1551 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1552 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
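///
/// As an illustration only (the device runtime provides its own shuffle
/// helpers), a 32-bit 'shuffle_down' on an NVPTX target could be expressed
/// with the CUDA warp intrinsic as follows; 'shuffle_down' here is a
/// hypothetical helper, not the runtime's actual implementation:
///
///    __device__ int32_t shuffle_down(int32_t reduce_elem, int offset) {
///      // Pull the element held by the lane 'offset' positions to the right.
///      return __shfl_down_sync(0xffffffff, reduce_elem, offset, WARPSIZE);
///    }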
1554 /// Contiguous Partial Warp Reduction
1556 /// This reduce algorithm is used within a warp where only the first
1557 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1558 /// number of OpenMP threads in a parallel region is not a multiple of
1559 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1561 /// void
1562 /// contiguous_partial_reduce(void *reduce_data,
1563 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1564 /// int size, int lane_id) {
1565 /// int curr_size;
1566 /// int offset;
1567 /// curr_size = size;
1568 /// offset = curr_size/2;
1569 /// while (offset>0) {
1570 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1571 /// curr_size = (curr_size+1)/2;
1572 /// offset = curr_size/2;
1573 /// }
1574 /// }
1576 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1578 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1579 /// if (lane_id < offset)
1580 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1581 /// else
1582 /// reduce_elem = remote_elem
1584 /// This algorithm assumes that the data to be reduced are located in a
1585 /// contiguous subset of lanes starting from the first. When there is
1586 /// an odd number of active lanes, the data in the last lane is not
1587 /// aggregated with any other lane's data but is instead copied over.
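///
/// As a concrete illustration (values are the lanes' original contributions
/// and REDUCE_OP is written as '+'), consider size = 5 with active lanes
/// 0..4. The loop runs with offset = 2, 1, 1:
///
///    offset = 2: lane 0 <- lane0 + lane2,  lane 1 <- lane1 + lane3,
///                lane 2 <- copy of lane 4
///    offset = 1: lane 0 <- (lane0 + lane2) + (lane1 + lane3),
///                lane 1 <- copy of lane 2 (i.e. lane 4's value)
///    offset = 1: lane 0 <- sum of all five lanes
///
/// After three steps the fully reduced value resides in lane 0.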
1589 /// Dispersed Partial Warp Reduction
1591 /// This algorithm is used within a warp when any discontiguous subset of
1592 /// lanes is active. It is used to implement the reduction operation
1593 /// across lanes in an OpenMP simd region or in a nested parallel region.
1595 /// void
1596 /// dispersed_partial_reduce(void *reduce_data,
1597 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1598 /// int size, remote_id;
1599 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1600 /// do {
1601 /// remote_id = next_active_lane_id_right_after_me();
1602 /// # the above function returns 0 if no active lane
1603 /// # is present right after the current lane.
1604 /// size = number_of_active_lanes_in_this_warp();
1605 /// logical_lane_id /= 2;
1606 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1607 /// remote_id-1-threadIdx.x, 2);
1608 /// } while (logical_lane_id % 2 == 0 && size > 1);
1609 /// }
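///
/// As a small illustration of the logical renumbering (not taken from the
/// runtime sources): if the active physical lanes are {1, 4, 6}, then
/// number_of_active_lanes_before_me() yields {0, 1, 2}, so the initial
/// logical_lane_id values are {0, 2, 4}; the first halving inside the loop
/// renumbers the active lanes to the contiguous set {0, 1, 2}.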
1611 /// There is no assumption made about the initial state of the reduction.
1612 /// Any number of lanes (>=1) could be active at any position. The reduction
1613 /// result is returned in the first active lane.
1615 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1617 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1618 /// if (lane_id % 2 == 0 && offset > 0)
1619 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1620 /// else
1621 /// reduce_elem = remote_elem
1624 /// Intra-Team Reduction
1626 /// This function, as implemented in the runtime call
1627 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1628 /// threads in a team. It first reduces within a warp using the
1629 /// aforementioned algorithms. All such warp-reduced values are then
1630 /// gathered at the first warp.
1632 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1633 /// data from each of the "warp masters" (the zeroth lane of each warp, where
1634 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1635 /// a mathematical sense) the problem of reduction across warp masters in
1636 /// a block to the problem of warp reduction.
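///
/// Conceptually (a sketch only, not the runtime source), the intra-team
/// phase performed by '__kmpc_nvptx_parallel_reduce_nowait_v2' is:
///
///    warp_reduce(reduce_data, ShuffleReduceFn); // one of the three warp
///                                               // algorithms above
///    InterWarpCpyFn(reduce_data, num_warps);    // warp masters -> warp 0
///    if (warp_id == 0)
///      warp_reduce(reduce_data, ShuffleReduceFn); // reduce the warp masters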
1639 /// Inter-Team Reduction
1641 /// Once a team has reduced its data to a single value, it is stored in
1642 /// a global scratchpad array. Since each team has a distinct slot, this
1643 /// can be done without locking.
1645 /// The last team to write to the scratchpad array proceeds to reduce the
1646 /// scratchpad array. One or more workers in the last team use the helper
1647 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
1648 /// the k'th worker reduces every k'th element.
1650 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1651 /// reduce across workers and compute a globally reduced value.
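///
/// A sketch of the inter-team phase (illustrative only; the helper names
/// 'last_team_to_arrive', 'num_teams', and 'num_workers' are placeholders):
///
///    scratchpadCopyFn(scratchpad, team_id, reduce_data); // per-team slot
///    if (last_team_to_arrive()) {
///      for (int k = worker_id; k < num_teams; k += num_workers)
///        loadAndReduceFn(scratchpad, k, reduce_data);
///      // Reduce across the workers of the last team and publish the result.
///      __kmpc_nvptx_parallel_reduce_nowait_v2(...);
///    }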
1653 void CGOpenMPRuntimeGPU::emitReduction(
1654 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1655 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1656 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1657 if (!CGF.HaveInsertPoint())
1658 return;
1660 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
1661 bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
1662 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
1664 ASTContext &C = CGM.getContext();
1666 if (Options.SimpleReduction) {
1667 assert(!TeamsReduction && !ParallelReduction &&
1668 "Invalid reduction selection in emitReduction.");
1669 (void)ParallelReduction;
1670 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1671 ReductionOps, Options);
1672 return;
1675 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1676 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1677 int Cnt = 0;
1678 for (const Expr *DRE : Privates) {
1679 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
1680 ++Cnt;
1682 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1683 CGM.getContext(), PrivatesReductions, {}, VarFieldMap, 1);
1685 if (TeamsReduction)
1686 TeamsReductions.push_back(ReductionRec);
1688 // Source location for the ident struct
1689 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1691 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1692 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1693 CGF.AllocaInsertPt->getIterator());
1694 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1695 CGF.Builder.GetInsertPoint());
1696 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1697 CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
1698 llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
1700 CodeGenFunction::OMPPrivateScope Scope(CGF);
1701 unsigned Idx = 0;
1702 for (const Expr *Private : Privates) {
1703 llvm::Type *ElementType;
1704 llvm::Value *Variable;
1705 llvm::Value *PrivateVariable;
1706 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1707 ElementType = CGF.ConvertTypeForMem(Private->getType());
1708 const auto *RHSVar =
1709 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
1710 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
1711 const auto *LHSVar =
1712 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
1713 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
1714 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1715 switch (CGF.getEvaluationKind(Private->getType())) {
1716 case TEK_Scalar:
1717 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1718 break;
1719 case TEK_Complex:
1720 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1721 break;
1722 case TEK_Aggregate:
1723 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1724 break;
1726 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1727 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1728 llvm::Function *NewFunc) {
1729 CGF.Builder.restoreIP(CodeGenIP);
1730 auto *CurFn = CGF.CurFn;
1731 CGF.CurFn = NewFunc;
1733 *LHSPtr = CGF.GetAddrOfLocalVar(
1734 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
1735 .emitRawPointer(CGF);
1736 *RHSPtr = CGF.GetAddrOfLocalVar(
1737 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
1738 .emitRawPointer(CGF);
1740 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
1741 cast<DeclRefExpr>(LHSExprs[I]),
1742 cast<DeclRefExpr>(RHSExprs[I]));
1744 CGF.CurFn = CurFn;
1746 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1747 CGF.Builder.GetInsertPoint());
1749 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
1750 ElementType, Variable, PrivateVariable, EvalKind,
1751 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
1752 Idx++;
1755 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
1756 OMPBuilder.createReductionsGPU(
1757 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
1758 DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1759 CGF.getTarget().getGridValue(),
1760 C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc);
1761 assert(AfterIP && "unexpected error creating GPU reductions");
1762 CGF.Builder.restoreIP(*AfterIP);
1763 return;
1766 const VarDecl *
1767 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
1768 const VarDecl *NativeParam) const {
1769 if (!NativeParam->getType()->isReferenceType())
1770 return NativeParam;
1771 QualType ArgType = NativeParam->getType();
1772 QualifierCollector QC;
1773 const Type *NonQualTy = QC.strip(ArgType);
1774 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1775 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1776 if (Attr->getCaptureKind() == OMPC_map) {
1777 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
1778 LangAS::opencl_global);
1781 ArgType = CGM.getContext().getPointerType(PointeeTy);
1782 QC.addRestrict();
1783 enum { NVPTX_local_addr = 5 };
1784 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
1785 ArgType = QC.apply(CGM.getContext(), ArgType);
1786 if (isa<ImplicitParamDecl>(NativeParam))
1787 return ImplicitParamDecl::Create(
1788 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
1789 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
1790 return ParmVarDecl::Create(
1791 CGM.getContext(),
1792 const_cast<DeclContext *>(NativeParam->getDeclContext()),
1793 NativeParam->getBeginLoc(), NativeParam->getLocation(),
1794 NativeParam->getIdentifier(), ArgType,
1795 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
1798 Address
1799 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
1800 const VarDecl *NativeParam,
1801 const VarDecl *TargetParam) const {
1802 assert(NativeParam != TargetParam &&
1803 NativeParam->getType()->isReferenceType() &&
1804 "Native arg must not be the same as target arg.");
1805 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
1806 QualType NativeParamType = NativeParam->getType();
1807 QualifierCollector QC;
1808 const Type *NonQualTy = QC.strip(NativeParamType);
1809 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1810 unsigned NativePointeeAddrSpace =
1811 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
1812 QualType TargetTy = TargetParam->getType();
1813 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
1814 TargetTy, SourceLocation());
1815 // Cast to native address space.
1816 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1817 TargetAddr,
1818 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
1819 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
1820 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
1821 NativeParamType);
1822 return NativeParamAddr;
1825 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
1826 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1827 ArrayRef<llvm::Value *> Args) const {
1828 SmallVector<llvm::Value *, 4> TargetArgs;
1829 TargetArgs.reserve(Args.size());
1830 auto *FnType = OutlinedFn.getFunctionType();
1831 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1832 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1833 TargetArgs.append(std::next(Args.begin(), I), Args.end());
1834 break;
1836 llvm::Type *TargetType = FnType->getParamType(I);
1837 llvm::Value *NativeArg = Args[I];
1838 if (!TargetType->isPointerTy()) {
1839 TargetArgs.emplace_back(NativeArg);
1840 continue;
1842 TargetArgs.emplace_back(
1843 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
1845 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
1848 /// Emit a function that wraps the outlined parallel region
1849 /// and controls the arguments which are passed to this function.
1850 /// The wrapper ensures that the outlined function is called
1851 /// with the correct arguments when data is shared.
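///
/// A minimal sketch of the wrapper that is emitted (argument and helper
/// names are illustrative; the real code is generated below):
///
///    void outlined_parallel_fn_wrapper(uint16_t ParallelLevel,
///                                      uint32_t ThreadID) {
///      uint32_t Zero = 0;
///      void **GlobalArgs;
///      __kmpc_get_shared_variables(&GlobalArgs);
///      // Forward the loop bounds (if any) and the shared captures that the
///      // caller stored in the globalized argument list.
///      outlined_parallel_fn(&ThreadID, &Zero, GlobalArgs[0], ...);
///    }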
1852 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1853 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1854 ASTContext &Ctx = CGM.getContext();
1855 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
1857 // Create a function that takes as argument the source thread.
1858 FunctionArgList WrapperArgs;
1859 QualType Int16QTy =
1860 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1861 QualType Int32QTy =
1862 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1863 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1864 /*Id=*/nullptr, Int16QTy,
1865 ImplicitParamKind::Other);
1866 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1867 /*Id=*/nullptr, Int32QTy,
1868 ImplicitParamKind::Other);
1869 WrapperArgs.emplace_back(&ParallelLevelArg);
1870 WrapperArgs.emplace_back(&WrapperArg);
1872 const CGFunctionInfo &CGFI =
1873 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
1875 auto *Fn = llvm::Function::Create(
1876 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1877 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
1879 // Ensure we do not inline the function. This is trivially true for the ones
1880 // passed to __kmpc_fork_call, but the ones called in serialized regions
1881 // could be inlined. This is not perfect, but it is closer to the invariant
1882 // we want, namely, every data environment starts with a new function.
1883 // TODO: We should pass the if condition to the runtime function and do the
1884 // handling there. Much cleaner code.
1885 Fn->addFnAttr(llvm::Attribute::NoInline);
1887 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1888 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1889 Fn->setDoesNotRecurse();
1891 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1892 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
1893 D.getBeginLoc(), D.getBeginLoc());
1895 const auto *RD = CS.getCapturedRecordDecl();
1896 auto CurField = RD->field_begin();
1898 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1899 /*Name=*/".zero.addr");
1900 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1901 // Get the array of arguments.
1902 SmallVector<llvm::Value *, 8> Args;
1904 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
1905 Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
1907 CGBuilderTy &Bld = CGF.Builder;
1908 auto CI = CS.capture_begin();
1910 // Use global memory for data sharing.
1911 // Handle passing of global args to workers.
1912 RawAddress GlobalArgs =
1913 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
1914 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1915 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1916 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1917 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
1918 DataSharingArgs);
1920 // Retrieve the shared variables from the list of references returned
1921 // by the runtime. Pass the variables to the outlined function.
1922 Address SharedArgListAddress = Address::invalid();
1923 if (CS.capture_size() > 0 ||
1924 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1925 SharedArgListAddress = CGF.EmitLoadOfPointer(
1926 GlobalArgs, CGF.getContext()
1927 .getPointerType(CGF.getContext().VoidPtrTy)
1928 .castAs<PointerType>());
1930 unsigned Idx = 0;
1931 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1932 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1933 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1934 Src, Bld.getPtrTy(0), CGF.SizeTy);
1935 llvm::Value *LB = CGF.EmitLoadOfScalar(
1936 TypedAddress,
1937 /*Volatile=*/false,
1938 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
1939 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
1940 Args.emplace_back(LB);
1941 ++Idx;
1942 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1943 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0),
1944 CGF.SizeTy);
1945 llvm::Value *UB = CGF.EmitLoadOfScalar(
1946 TypedAddress,
1947 /*Volatile=*/false,
1948 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
1949 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
1950 Args.emplace_back(UB);
1951 ++Idx;
1953 if (CS.capture_size() > 0) {
1954 ASTContext &CGFContext = CGF.getContext();
1955 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1956 QualType ElemTy = CurField->getType();
1957 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
1958 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
1959 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
1960 CGF.ConvertTypeForMem(ElemTy));
1961 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
1962 /*Volatile=*/false,
1963 CGFContext.getPointerType(ElemTy),
1964 CI->getLocation());
1965 if (CI->capturesVariableByCopy() &&
1966 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
1967 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
1968 CI->getLocation());
1970 Args.emplace_back(Arg);
1974 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
1975 CGF.FinishFunction();
1976 return Fn;
1979 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
1980 const Decl *D) {
1981 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1982 return;
1984 assert(D && "Expected function or captured|block decl.");
1985 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
1986 "Function is registered already.");
1987 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
1988 "Team is set but not processed.");
1989 const Stmt *Body = nullptr;
1990 bool NeedToDelayGlobalization = false;
1991 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
1992 Body = FD->getBody();
1993 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
1994 Body = BD->getBody();
1995 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
1996 Body = CD->getBody();
1997 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
1998 if (NeedToDelayGlobalization &&
1999 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
2000 return;
2002 if (!Body)
2003 return;
2004 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2005 VarChecker.Visit(Body);
2006 const RecordDecl *GlobalizedVarsRecord =
2007 VarChecker.getGlobalizedRecord(IsInTTDRegion);
2008 TeamAndReductions.first = nullptr;
2009 TeamAndReductions.second.clear();
2010 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2011 VarChecker.getEscapedVariableLengthDecls();
2012 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2013 VarChecker.getDelayedVariableLengthDecls();
2014 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2015 DelayedVariableLengthDecls.empty())
2016 return;
2017 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2018 I->getSecond().MappedParams =
2019 std::make_unique<CodeGenFunction::OMPMapVars>();
2020 I->getSecond().EscapedParameters.insert(
2021 VarChecker.getEscapedParameters().begin(),
2022 VarChecker.getEscapedParameters().end());
2023 I->getSecond().EscapedVariableLengthDecls.append(
2024 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
2025 I->getSecond().DelayedVariableLengthDecls.append(
2026 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
2027 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2028 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2029 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2030 Data.insert(std::make_pair(VD, MappedVarData()));
2032 if (!NeedToDelayGlobalization) {
2033 emitGenericVarsProlog(CGF, D->getBeginLoc());
2034 struct GlobalizationScope final : EHScopeStack::Cleanup {
2035 GlobalizationScope() = default;
2037 void Emit(CodeGenFunction &CGF, Flags flags) override {
2038 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2039 .emitGenericVarsEpilog(CGF);
2042 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
2046 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
2047 const VarDecl *VD) {
2048 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2049 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2050 auto AS = LangAS::Default;
2051 switch (A->getAllocatorType()) {
2052 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2053 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2054 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2055 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2056 break;
2057 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2058 return Address::invalid();
2059 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2060 // TODO: implement support for user-defined allocators.
2061 return Address::invalid();
2062 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2063 AS = LangAS::cuda_constant;
2064 break;
2065 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2066 AS = LangAS::cuda_shared;
2067 break;
2068 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2069 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2070 break;
2072 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
2073 auto *GV = new llvm::GlobalVariable(
2074 CGM.getModule(), VarTy, /*isConstant=*/false,
2075 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
2076 VD->getName(),
2077 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2078 CGM.getContext().getTargetAddressSpace(AS));
2079 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2080 GV->setAlignment(Align.getAsAlign());
2081 return Address(
2082 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2083 GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace(
2084 VD->getType().getAddressSpace()))),
2085 VarTy, Align);
2088 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2089 return Address::invalid();
2091 VD = VD->getCanonicalDecl();
2092 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2093 if (I == FunctionGlobalizedDecls.end())
2094 return Address::invalid();
2095 auto VDI = I->getSecond().LocalVarData.find(VD);
2096 if (VDI != I->getSecond().LocalVarData.end())
2097 return VDI->second.PrivateAddr;
2098 if (VD->hasAttrs()) {
2099 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
2100 E(VD->attr_end());
2101 IT != E; ++IT) {
2102 auto VDI = I->getSecond().LocalVarData.find(
2103 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
2104 ->getCanonicalDecl());
2105 if (VDI != I->getSecond().LocalVarData.end())
2106 return VDI->second.PrivateAddr;
2110 return Address::invalid();
2113 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
2114 FunctionGlobalizedDecls.erase(CGF.CurFn);
2115 CGOpenMPRuntime::functionFinished(CGF);
2118 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
2119 CodeGenFunction &CGF, const OMPLoopDirective &S,
2120 OpenMPDistScheduleClauseKind &ScheduleKind,
2121 llvm::Value *&Chunk) const {
2122 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2123 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2124 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2125 Chunk = CGF.EmitScalarConversion(
2126 RT.getGPUNumThreads(CGF),
2127 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2128 S.getIterationVariable()->getType(), S.getBeginLoc());
2129 return;
2131 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
2132 CGF, S, ScheduleKind, Chunk);
2135 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
2136 CodeGenFunction &CGF, const OMPLoopDirective &S,
2137 OpenMPScheduleClauseKind &ScheduleKind,
2138 const Expr *&ChunkExpr) const {
2139 ScheduleKind = OMPC_SCHEDULE_static;
2140 // Chunk size is 1 in this case.
2141 llvm::APInt ChunkSize(32, 1);
2142 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
2143 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2144 SourceLocation());
2147 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
2148 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2149 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2150 "Expected target-based directive.");
2151 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
2152 for (const CapturedStmt::Capture &C : CS->captures()) {
2153 // Capture variables captured by reference in lambdas for target-based
2154 // directives.
2155 if (!C.capturesVariable())
2156 continue;
2157 const VarDecl *VD = C.getCapturedVar();
2158 const auto *RD = VD->getType()
2159 .getCanonicalType()
2160 .getNonReferenceType()
2161 ->getAsCXXRecordDecl();
2162 if (!RD || !RD->isLambda())
2163 continue;
2164 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2165 LValue VDLVal;
2166 if (VD->getType().getCanonicalType()->isReferenceType())
2167 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
2168 else
2169 VDLVal = CGF.MakeAddrLValue(
2170 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
2171 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2172 FieldDecl *ThisCapture = nullptr;
2173 RD->getCaptureFields(Captures, ThisCapture);
2174 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2175 LValue ThisLVal =
2176 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
2177 llvm::Value *CXXThis = CGF.LoadCXXThis();
2178 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
2180 for (const LambdaCapture &LC : RD->captures()) {
2181 if (LC.getCaptureKind() != LCK_ByRef)
2182 continue;
2183 const ValueDecl *VD = LC.getCapturedVar();
2184 // FIXME: For now VD is always a VarDecl because OpenMP does not support
2185 // capturing structured bindings in lambdas yet.
2186 if (!CS->capturesVariable(cast<VarDecl>(VD)))
2187 continue;
2188 auto It = Captures.find(VD);
2189 assert(It != Captures.end() && "Found lambda capture without field.");
2190 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
2191 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
2192 if (VD->getType().getCanonicalType()->isReferenceType())
2193 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
2194 VD->getType().getCanonicalType())
2195 .getAddress();
2196 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
2201 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
2202 LangAS &AS) {
2203 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2204 return false;
2205 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2206 switch (A->getAllocatorType()) {
2207 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2208 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2209 // Not supported; fall back to the default memory space.
2210 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2211 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2212 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2213 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2214 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2215 AS = LangAS::Default;
2216 return true;
2217 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2218 AS = LangAS::cuda_constant;
2219 return true;
2220 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2221 AS = LangAS::cuda_shared;
2222 return true;
2223 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2224 llvm_unreachable("Expected predefined allocator for the variables with the "
2225 "static storage.");
2227 return false;
2230 // Get current OffloadArch and ignore any unknown values
2231 static OffloadArch getOffloadArch(CodeGenModule &CGM) {
2232 if (!CGM.getTarget().hasFeature("ptx"))
2233 return OffloadArch::UNKNOWN;
2234 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2235 if (Feature.getValue()) {
2236 OffloadArch Arch = StringToOffloadArch(Feature.getKey());
2237 if (Arch != OffloadArch::UNKNOWN)
2238 return Arch;
2241 return OffloadArch::UNKNOWN;
2244 /// Check whether the target architecture supports unified addressing, which
2245 /// is required by the OpenMP 'requires' clause "unified_shared_memory".
2246 void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) {
2247 for (const OMPClause *Clause : D->clauselists()) {
2248 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2249 OffloadArch Arch = getOffloadArch(CGM);
2250 switch (Arch) {
2251 case OffloadArch::SM_20:
2252 case OffloadArch::SM_21:
2253 case OffloadArch::SM_30:
2254 case OffloadArch::SM_32_:
2255 case OffloadArch::SM_35:
2256 case OffloadArch::SM_37:
2257 case OffloadArch::SM_50:
2258 case OffloadArch::SM_52:
2259 case OffloadArch::SM_53: {
2260 SmallString<256> Buffer;
2261 llvm::raw_svector_ostream Out(Buffer);
2262 Out << "Target architecture " << OffloadArchToString(Arch)
2263 << " does not support unified addressing";
2264 CGM.Error(Clause->getBeginLoc(), Out.str());
2265 return;
2267 case OffloadArch::SM_60:
2268 case OffloadArch::SM_61:
2269 case OffloadArch::SM_62:
2270 case OffloadArch::SM_70:
2271 case OffloadArch::SM_72:
2272 case OffloadArch::SM_75:
2273 case OffloadArch::SM_80:
2274 case OffloadArch::SM_86:
2275 case OffloadArch::SM_87:
2276 case OffloadArch::SM_89:
2277 case OffloadArch::SM_90:
2278 case OffloadArch::SM_90a:
2279 case OffloadArch::SM_100:
2280 case OffloadArch::GFX600:
2281 case OffloadArch::GFX601:
2282 case OffloadArch::GFX602:
2283 case OffloadArch::GFX700:
2284 case OffloadArch::GFX701:
2285 case OffloadArch::GFX702:
2286 case OffloadArch::GFX703:
2287 case OffloadArch::GFX704:
2288 case OffloadArch::GFX705:
2289 case OffloadArch::GFX801:
2290 case OffloadArch::GFX802:
2291 case OffloadArch::GFX803:
2292 case OffloadArch::GFX805:
2293 case OffloadArch::GFX810:
2294 case OffloadArch::GFX9_GENERIC:
2295 case OffloadArch::GFX900:
2296 case OffloadArch::GFX902:
2297 case OffloadArch::GFX904:
2298 case OffloadArch::GFX906:
2299 case OffloadArch::GFX908:
2300 case OffloadArch::GFX909:
2301 case OffloadArch::GFX90a:
2302 case OffloadArch::GFX90c:
2303 case OffloadArch::GFX9_4_GENERIC:
2304 case OffloadArch::GFX940:
2305 case OffloadArch::GFX941:
2306 case OffloadArch::GFX942:
2307 case OffloadArch::GFX950:
2308 case OffloadArch::GFX10_1_GENERIC:
2309 case OffloadArch::GFX1010:
2310 case OffloadArch::GFX1011:
2311 case OffloadArch::GFX1012:
2312 case OffloadArch::GFX1013:
2313 case OffloadArch::GFX10_3_GENERIC:
2314 case OffloadArch::GFX1030:
2315 case OffloadArch::GFX1031:
2316 case OffloadArch::GFX1032:
2317 case OffloadArch::GFX1033:
2318 case OffloadArch::GFX1034:
2319 case OffloadArch::GFX1035:
2320 case OffloadArch::GFX1036:
2321 case OffloadArch::GFX11_GENERIC:
2322 case OffloadArch::GFX1100:
2323 case OffloadArch::GFX1101:
2324 case OffloadArch::GFX1102:
2325 case OffloadArch::GFX1103:
2326 case OffloadArch::GFX1150:
2327 case OffloadArch::GFX1151:
2328 case OffloadArch::GFX1152:
2329 case OffloadArch::GFX1153:
2330 case OffloadArch::GFX12_GENERIC:
2331 case OffloadArch::GFX1200:
2332 case OffloadArch::GFX1201:
2333 case OffloadArch::AMDGCNSPIRV:
2334 case OffloadArch::Generic:
2335 case OffloadArch::UNUSED:
2336 case OffloadArch::UNKNOWN:
2337 break;
2338 case OffloadArch::LAST:
2339 llvm_unreachable("Unexpected GPU arch.");
2343 CGOpenMPRuntime::processRequiresDirective(D);
2346 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
2347 CGBuilderTy &Bld = CGF.Builder;
2348 llvm::Module *M = &CGF.CGM.getModule();
2349 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2350 llvm::Function *F = M->getFunction(LocSize);
2351 if (!F) {
2352 F = llvm::Function::Create(llvm::FunctionType::get(CGF.Int32Ty, {}, false),
2353 llvm::GlobalVariable::ExternalLinkage, LocSize,
2354 &CGF.CGM.getModule());
2356 return Bld.CreateCall(F, {}, "nvptx_num_threads");
2359 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
2360 ArrayRef<llvm::Value *> Args{};
2361 return CGF.EmitRuntimeCall(
2362 OMPBuilder.getOrCreateRuntimeFunction(
2363 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
2364 Args);