//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#include <cstdint>
#include <optional>
#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;
static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));
static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is inserted.
/// For instance, if both point to the same instruction, two IRBuilders
/// alternately creating instructions will cause those instructions to be
/// interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
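// Note that the switch masks off the monotonicity bits first, so, e.g.,
// UnorderedDynamicChunked combined with ModifierMonotonic is still accepted;
// only a value carrying both monotonicity modifiers at once is rejected.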
static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}
/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return llvm::omp::OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}
/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
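// For example, schedule(dynamic, 4) with neither a monotonicity modifier nor
// an ordered clause composes as BaseDynamicChunked | ModifierUnordered, and
// the OpenMP 5.1 default rule above then adds ModifierNonmonotonic on top.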
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}
void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}
void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}
BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}
BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}
BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}
BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
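// E.g., splitting at the end of a block named "omp.region" with Suffix
// ".split" yields a fall-through successor named "omp.region.split".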
// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilder<> &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               std::stack<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push(UseFakeVal);
  return FakeVal;
}
//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
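// RequiresFlags below is a bitmask over these values; e.g., a module compiled
// with "#pragma omp requires unified_shared_memory, dynamic_allocators"
// carries OMP_REQ_UNIFIED_SHARED_MEMORY | OMP_REQ_DYNAMIC_ALLOCATORS (0x018).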
OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}
bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}
void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//
void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();
  Triple T(M.getTargetTriple());

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}
FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);
  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
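// E.g., getOrCreateRuntimeFunction(M, OMPRTL___kmpc_barrier) returns the
// module's existing declaration of __kmpc_barrier if there is one, and
// otherwise creates it with the prototype and attributes recorded for that
// entry in OMPKinds.def.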
Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}
void OpenMPIRBuilder::initialize() { initializeTypes(M); }
void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate
    // struct for aggregate params in the device default alloca address space.
    // OpenMP runtime requires that the params of the extracted functions are
    // passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);
}
OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}
GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          return Ident = &GV;

    auto *GV = new GlobalVariable(
        M, OpenMPIRBuilder::Ident,
        /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
        nullptr, GlobalValue::NotThreadLocal,
        M.getDataLayout().getDefaultGlobalsAddressSpace());
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
    GV->setAlignment(Align(8));
    Ident = GV;
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
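// The resulting location string thus has the shape
// ";<file>;<function>;<line>;<column>;;", e.g. ";test.c;foo;4;11;;".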
Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}
Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
                                 bool ForceSimpleCall, bool CheckCancelFlag) {
  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);
  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
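// For an explicit barrier outside a cancellable region this emits roughly:
//   %tid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident, i32 %tid)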
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams, Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
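// At the former call site this leaves, roughly:
//   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 %if_cond,
//                                 i32 %num_threads, i32 -1, ptr @outlined.fn,
//                                 ptr null, ptr %args, i64 <num captured vars>)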
// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1287 IRBuilder
<>::InsertPoint
OpenMPIRBuilder::createParallel(
1288 const LocationDescription
&Loc
, InsertPointTy OuterAllocaIP
,
1289 BodyGenCallbackTy BodyGenCB
, PrivatizeCallbackTy PrivCB
,
1290 FinalizeCallbackTy FiniCB
, Value
*IfCondition
, Value
*NumThreads
,
1291 omp::ProcBindKind ProcBind
, bool IsCancellable
) {
1292 assert(!isConflictIP(Loc
.IP
, OuterAllocaIP
) && "IPs must not be ambiguous");
1294 if (!updateToLocation(Loc
))
1297 uint32_t SrcLocStrSize
;
1298 Constant
*SrcLocStr
= getOrCreateSrcLocStr(Loc
, SrcLocStrSize
);
1299 Value
*Ident
= getOrCreateIdent(SrcLocStr
, SrcLocStrSize
);
1300 Value
*ThreadID
= getOrCreateThreadID(Ident
);
1301 // If we generate code for the target device, we need to allocate
1302 // struct for aggregate params in the device default alloca address space.
1303 // OpenMP runtime requires that the params of the extracted functions are
1304 // passed as zero address space pointers. This flag ensures that extracted
1305 // function arguments are declared in zero address space
1306 bool ArgsInZeroAddressSpace
= Config
.isTargetDevice();
1308 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1309 // only if we compile for host side.
1310 if (NumThreads
&& !Config
.isTargetDevice()) {
1313 Builder
.CreateIntCast(NumThreads
, Int32
, /*isSigned*/ false)};
1315 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads
), Args
);
1318 if (ProcBind
!= OMP_PROC_BIND_default
) {
1319 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1322 ConstantInt::get(Int32
, unsigned(ProcBind
), /*isSigned=*/true)};
1324 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind
), Args
);
1327 BasicBlock
*InsertBB
= Builder
.GetInsertBlock();
1328 Function
*OuterFn
= InsertBB
->getParent();
1330 // Save the outer alloca block because the insertion iterator may get
1331 // invalidated and we still need this later.
1332 BasicBlock
*OuterAllocaBlock
= OuterAllocaIP
.getBlock();
1334 // Vector to remember instructions we used only during the modeling but which
1335 // we want to delete at the end.
1336 SmallVector
<Instruction
*, 4> ToBeDeleted
;
1338 // Change the location to the outer alloca insertion point to create and
1339 // initialize the allocas we pass into the parallel region.
1340 Builder
.restoreIP(OuterAllocaIP
);
1341 AllocaInst
*TIDAddrAlloca
= Builder
.CreateAlloca(Int32
, nullptr, "tid.addr");
1342 AllocaInst
*ZeroAddrAlloca
=
1343 Builder
.CreateAlloca(Int32
, nullptr, "zero.addr");
1344 Instruction
*TIDAddr
= TIDAddrAlloca
;
1345 Instruction
*ZeroAddr
= ZeroAddrAlloca
;
1346 if (ArgsInZeroAddressSpace
&& M
.getDataLayout().getAllocaAddrSpace() != 0) {
1347 // Add additional casts to enforce pointers in zero address space
1348 TIDAddr
= new AddrSpaceCastInst(
1349 TIDAddrAlloca
, PointerType ::get(M
.getContext(), 0), "tid.addr.ascast");
1350 TIDAddr
->insertAfter(TIDAddrAlloca
);
1351 ToBeDeleted
.push_back(TIDAddr
);
1352 ZeroAddr
= new AddrSpaceCastInst(ZeroAddrAlloca
,
1353 PointerType ::get(M
.getContext(), 0),
1354 "zero.addr.ascast");
1355 ZeroAddr
->insertAfter(ZeroAddrAlloca
);
1356 ToBeDeleted
.push_back(ZeroAddr
);
1359 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1360 // associated arguments in the outlined function, so we delete them later.
1361 ToBeDeleted
.push_back(TIDAddrAlloca
);
1362 ToBeDeleted
.push_back(ZeroAddrAlloca
);
1364 // Create an artificial insertion point that will also ensure the blocks we
1365 // are about to split are not degenerated.
1366 auto *UI
= new UnreachableInst(Builder
.getContext(), InsertBB
);
1368 BasicBlock
*EntryBB
= UI
->getParent();
1369 BasicBlock
*PRegEntryBB
= EntryBB
->splitBasicBlock(UI
, "omp.par.entry");
1370 BasicBlock
*PRegBodyBB
= PRegEntryBB
->splitBasicBlock(UI
, "omp.par.region");
1371 BasicBlock
*PRegPreFiniBB
=
1372 PRegBodyBB
->splitBasicBlock(UI
, "omp.par.pre_finalize");
1373 BasicBlock
*PRegExitBB
= PRegPreFiniBB
->splitBasicBlock(UI
, "omp.par.exit");
1375 auto FiniCBWrapper
= [&](InsertPointTy IP
) {
1376 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1377 // target to the region exit block.
1378 if (IP
.getBlock()->end() == IP
.getPoint()) {
1379 IRBuilder
<>::InsertPointGuard
IPG(Builder
);
1380 Builder
.restoreIP(IP
);
1381 Instruction
*I
= Builder
.CreateBr(PRegExitBB
);
1382 IP
= InsertPointTy(I
->getParent(), I
->getIterator());
1384 assert(IP
.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1385 IP
.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB
&&
1386 "Unexpected insertion point for finalization call!");
1390 FinalizationStack
.push_back({FiniCBWrapper
, OMPD_parallel
, IsCancellable
});
1392 // Generate the privatization allocas in the block that will become the entry
1393 // of the outlined function.
1394 Builder
.SetInsertPoint(PRegEntryBB
->getTerminator());
1395 InsertPointTy InnerAllocaIP
= Builder
.saveIP();
1397 AllocaInst
*PrivTIDAddr
=
1398 Builder
.CreateAlloca(Int32
, nullptr, "tid.addr.local");
1399 Instruction
*PrivTID
= Builder
.CreateLoad(Int32
, PrivTIDAddr
, "tid");
1401 // Add some fake uses for OpenMP provided arguments.
1402 ToBeDeleted
.push_back(Builder
.CreateLoad(Int32
, TIDAddr
, "tid.addr.use"));
1403 Instruction
*ZeroAddrUse
=
1404 Builder
.CreateLoad(Int32
, ZeroAddr
, "zero.addr.use");
1405 ToBeDeleted
.push_back(ZeroAddrUse
);
1410 // PRegionEntryBB <- Privatization allocas are placed here.
1413 // PRegionBodyBB <- BodeGen is invoked here.
1416 // PRegPreFiniBB <- The block we will start finalization from.
1419 // PRegionExitBB <- A common exit to simplify block collection.
1422 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn
<< "\n");
1424 // Let the caller create the body.
1425 assert(BodyGenCB
&& "Expected body generation callback!");
1426 InsertPointTy
CodeGenIP(PRegBodyBB
, PRegBodyBB
->begin());
1427 BodyGenCB(InnerAllocaIP
, CodeGenIP
);
1429 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn
<< "\n");
1432 if (Config
.isTargetDevice()) {
1433 // Generate OpenMP target specific runtime call
1434 OI
.PostOutlineCB
= [=, ToBeDeletedVec
=
1435 std::move(ToBeDeleted
)](Function
&OutlinedFn
) {
1436 targetParallelCallback(this, OutlinedFn
, OuterFn
, OuterAllocaBlock
, Ident
,
1437 IfCondition
, NumThreads
, PrivTID
, PrivTIDAddr
,
1438 ThreadID
, ToBeDeletedVec
);
1441 // Generate OpenMP host runtime call
1442 OI
.PostOutlineCB
= [=, ToBeDeletedVec
=
1443 std::move(ToBeDeleted
)](Function
&OutlinedFn
) {
1444 hostParallelCallback(this, OutlinedFn
, OuterFn
, Ident
, IfCondition
,
1445 PrivTID
, PrivTIDAddr
, ToBeDeletedVec
);
1449 // Adjust the finalization stack, verify the adjustment, and call the
1450 // finalize function a last time to finalize values between the pre-fini
1451 // block and the exit block if we left the parallel "the normal way".
1452 auto FiniInfo
= FinalizationStack
.pop_back_val();
1454 assert(FiniInfo
.DK
== OMPD_parallel
&&
1455 "Unexpected finalization stack state!");
1457 Instruction
*PRegPreFiniTI
= PRegPreFiniBB
->getTerminator();
1459 InsertPointTy
PreFiniIP(PRegPreFiniBB
, PRegPreFiniTI
->getIterator());
1462 OI
.OuterAllocaBB
= OuterAllocaBlock
;
1463 OI
.EntryBB
= PRegEntryBB
;
1464 OI
.ExitBB
= PRegExitBB
;
1466 SmallPtrSet
<BasicBlock
*, 32> ParallelRegionBlockSet
;
1467 SmallVector
<BasicBlock
*, 32> Blocks
;
1468 OI
.collectBlocks(ParallelRegionBlockSet
, Blocks
);
1470 // Ensure a single exit node for the outlined region by creating one.
1471 // We might have multiple incoming edges to the exit now due to finalizations,
1472 // e.g., cancel calls that cause the control flow to leave the region.
1473 BasicBlock
*PRegOutlinedExitBB
= PRegExitBB
;
1474 PRegExitBB
= SplitBlock(PRegExitBB
, &*PRegExitBB
->getFirstInsertionPt());
1475 PRegOutlinedExitBB
->setName("omp.par.outlined.exit");
1476 Blocks
.push_back(PRegOutlinedExitBB
);
1478 CodeExtractorAnalysisCache
CEAC(*OuterFn
);
1479 CodeExtractor
Extractor(Blocks
, /* DominatorTree */ nullptr,
1480 /* AggregateArgs */ false,
1481 /* BlockFrequencyInfo */ nullptr,
1482 /* BranchProbabilityInfo */ nullptr,
1483 /* AssumptionCache */ nullptr,
1484 /* AllowVarArgs */ true,
1485 /* AllowAlloca */ true,
1486 /* AllocationBlock */ OuterAllocaBlock
,
1487 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace
);
1489 // Find inputs to, outputs from the code region.
1490 BasicBlock
*CommonExit
= nullptr;
1491 SetVector
<Value
*> Inputs
, Outputs
, SinkingCands
, HoistingCands
;
1492 Extractor
.findAllocas(CEAC
, SinkingCands
, HoistingCands
, CommonExit
);
1493 Extractor
.findInputsOutputs(Inputs
, Outputs
, SinkingCands
);
1495 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn
<< "\n");
1497 FunctionCallee TIDRTLFn
=
1498 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num
);
  auto PrivHelper = [&](Value &V) {
    if (&V == TIDAddr || &V == ZeroAddr) {
      OI.ExcludeArgsFromAggregate.push_back(&V);
      return;
    }

    SetVector<Use *> Uses;
    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))
          Uses.insert(&U);

    // __kmpc_fork_call expects extra arguments as pointers. If the input
    // already has a pointer type, everything is fine. Otherwise, store the
    // value onto stack and load it back inside the to-be-outlined region. This
    // will ensure only the pointer will be passed to the function.
    // FIXME: if there are more than 15 trailing arguments, they must be
    // additionally packed in a struct.
    Value *Inner = &V;
    if (!V.getType()->isPointerTy()) {
      IRBuilder<>::InsertPointGuard Guard(Builder);
      LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");

      Builder.restoreIP(OuterAllocaIP);
      Value *Ptr =
          Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");

      // Store to stack at end of the block that currently branches to the entry
      // block of the to-be-outlined region.
      Builder.SetInsertPoint(InsertBB,
                             InsertBB->getTerminator()->getIterator());
      Builder.CreateStore(&V, Ptr);

      // Load back next to allocations in the to-be-outlined region.
      Builder.restoreIP(InnerAllocaIP);
      Inner = Builder.CreateLoad(V.getType(), Ptr);
    }

    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
      ReplacementValue = PrivTID;
    } else {
      Builder.restoreIP(
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)
        return;
    }

    for (Use *UPtr : Uses)
      UPtr->set(ReplacementValue);
  };
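  // For illustration, forwarding a non-pointer input %v through PrivHelper
  // produces IR along these lines (value names are made up for the sketch):
  //   %v.reloaded = alloca i32                 ; at OuterAllocaIP
  //   store i32 %v, ptr %v.reloaded            ; before entering the region
  //   %v.inner = load i32, ptr %v.reloaded     ; at InnerAllocaIP
  // so that only the pointer %v.reloaded needs to be passed to the outlined
  // function.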
  // Reset the inner alloca insertion as it will be used for loading the values
  // wrapped into pointers before passing them into the to-be-outlined region.
  // Configure it to insert immediately after the fake use of zero address so
  // that they are available in the generated body and so that the
  // OpenMP-related values (thread ID and zero address pointers) remain leading
  // in the argument list.
  InnerAllocaIP = IRBuilder<>::InsertPoint(
      ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());

  // Reset the outer alloca insertion point to the entry of the relevant block
  // in case it was invalidated.
  OuterAllocaIP = IRBuilder<>::InsertPoint(
      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
  for (Value *Input : Inputs) {
    LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
    PrivHelper(*Input);
  }
  for (Value *Output : Outputs)
    LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");

  assert(Outputs.empty() &&
         "OpenMP outlining should not produce live-out values!");

  LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
  LLVM_DEBUG({
    for (auto *BB : Blocks)
      dbgs() << " PBR: " << BB->getName() << "\n";
  });

  // Register the outlined info.
  addOutlineInfo(std::move(OI));

  InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
  UI->eraseFromParent();

  return AfterIP;
}
void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
  // Build call void __kmpc_flush(ident_t *loc)
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};

  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
}
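// The call above lowers to a single runtime invocation, roughly:
//   call void @__kmpc_flush(ptr @<ident>)
// where @<ident> stands for the source-location ident_t global.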
void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitFlush(Loc);
}
void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
  // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
  // global_tid)
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident)};

  // Ignore return result until untied tasks are supported.
  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
                     Args);
}
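// For reference, the emitted call has the shape (result currently unused):
//   %res = call i32 @__kmpc_omp_taskwait(ptr @<ident>, i32 %gtid)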
void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitTaskwaitImpl(Loc);
}
void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
  // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Constant *I32Null = ConstantInt::getNullValue(Int32);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};

  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
                     Args);
}
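// For reference, the emitted call has the shape (the trailing 0 is the
// end_part argument from the comment above):
//   call i32 @__kmpc_omp_taskyield(ptr @<ident>, i32 %gtid, i32 0)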
void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitTaskyieldImpl(Loc);
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTask(const LocationDescription &Loc,
                            InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
                            bool Tied, Value *Final, Value *IfCondition,
                            SmallVector<DependData> Dependencies) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  // The current basic block is split into four basic blocks. After outlining,
  // they will be mapped as follows:
  //
  // def current_fn() {
  //   current_basic_block:
  //     br label %task.exit
  //   task.exit:
  //     ; instructions after task
  // }
  //
  // def outlined_fn() {
  //   task.alloca:
  //     br label %task.body
  //   task.body:
  //     ret void
  // }
  BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
  BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
  BasicBlock *TaskAllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "task.alloca");

  InsertPointTy TaskAllocaIP =
      InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
  InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
  BodyGenCB(TaskAllocaIP, TaskBodyIP);
  OutlineInfo OI;
  OI.EntryBB = TaskAllocaBB;
  OI.OuterAllocaBB = AllocaIP.getBlock();
  OI.ExitBB = TaskExitBB;

  // Add the thread ID argument.
  std::stack<Instruction *> ToBeDeleted;
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
  OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
                      TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
    // Replace the Stale CI by appropriate RTL function call.
    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());

    // HasShareds is true if any variables are captured in the outlined region,
    // false otherwise.
    bool HasShareds = StaleCI->arg_size() > 1;
    Builder.SetInsertPoint(StaleCI);

    // Gather the arguments for emitting the runtime call for
    // @__kmpc_omp_task_alloc
    Function *TaskAllocFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);

    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
    Value *ThreadID = getOrCreateThreadID(Ident);
    // Argument - `flags`
    // Task is tied iff (Flags & 1) == 1.
    // Task is untied iff (Flags & 1) == 0.
    // Task is final iff (Flags & 2) == 2.
    // Task is not final iff (Flags & 2) == 0.
    // TODO: Handle the other flags.
    Value *Flags = Builder.getInt32(Tied);
    if (Final) {
      Value *FinalFlag =
          Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
      Flags = Builder.CreateOr(FinalFlag, Flags);
    }
    // Argument - `sizeof_kmp_task_t` (TaskSize)
    // Tasksize refers to the size in bytes of kmp_task_t data structure
    // including private vars accessed in task.
    // TODO: add kmp_task_t_with_privates (privates)
    Value *TaskSize = Builder.getInt64(
        divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));

    // Argument - `sizeof_shareds` (SharedsSize)
    // SharedsSize refers to the shareds array size in the kmp_task_t data
    // structure.
    Value *SharedsSize = Builder.getInt64(0);
    if (HasShareds) {
      AllocaInst *ArgStructAlloca =
          dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
      assert(ArgStructAlloca &&
             "Unable to find the alloca instruction corresponding to arguments "
             "for extracted function");
      StructType *ArgStructType =
          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
      assert(ArgStructType && "Unable to find struct type corresponding to "
                              "arguments for extracted function");
      SharedsSize =
          Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
    }
    // Emit the @__kmpc_omp_task_alloc runtime call
    // The runtime call returns a pointer to an area where the task captured
    // variables must be copied before the task is run (TaskData)
    CallInst *TaskData = Builder.CreateCall(
        TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
                      /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
                      /*task_func=*/&OutlinedFn});

    // Copy the arguments for outlined function
    if (HasShareds) {
      Value *Shareds = StaleCI->getArgOperand(1);
      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
      Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
                           SharedsSize);
    }
    Value *DepArray = nullptr;
    if (Dependencies.size()) {
      InsertPointTy OldIP = Builder.saveIP();
      Builder.SetInsertPoint(
          &OldIP.getBlock()->getParent()->getEntryBlock().back());

      Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
      DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");

      unsigned P = 0;
      for (const DependData &Dep : Dependencies) {
        Value *Base =
            Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
        // Store the pointer to the variable
        Value *Addr = Builder.CreateStructGEP(
            DependInfo, Base,
            static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
        Value *DepValPtr =
            Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
        Builder.CreateStore(DepValPtr, Addr);
        // Store the size of the variable
        Value *Size = Builder.CreateStructGEP(
            DependInfo, Base,
            static_cast<unsigned int>(RTLDependInfoFields::Len));
        Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
                                Dep.DepValueType)),
                            Size);
        // Store the dependency kind
        Value *Flags = Builder.CreateStructGEP(
            DependInfo, Base,
            static_cast<unsigned int>(RTLDependInfoFields::Flags));
        Builder.CreateStore(
            ConstantInt::get(Builder.getInt8Ty(),
                             static_cast<unsigned int>(Dep.DepKind)),
            Flags);
        ++P;
      }

      Builder.restoreIP(OldIP);
    }
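    // Illustrative layout of one entry of the dependency array built above,
    // matching the three stores per dependency:
    //   { i64 base_addr   ; ptrtoint of the dependent variable
    //   , i64 len         ; store size of its type
    //   , i8  flags }     ; dependency kind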
    // In the presence of the `if` clause, the following IR is generated:
    //    ...
    //    %data = call @__kmpc_omp_task_alloc(...)
    //    br i1 %if_condition, label %then, label %else
    //  then:
    //    call @__kmpc_omp_task(...)
    //    br label %exit
    //  else:
    //    call @__kmpc_omp_task_begin_if0(...)
    //    call @outlined_fn(...)
    //    call @__kmpc_omp_task_complete_if0(...)
    //    br label %exit
    //  exit:
    //    ...
    if (IfCondition) {
      // `SplitBlockAndInsertIfThenElse` requires the block to have a
      // terminator.
      splitBB(Builder, /*CreateBranch=*/true, "if.end");
      Instruction *IfTerminator =
          Builder.GetInsertPoint()->getParent()->getTerminator();
      Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
      Builder.SetInsertPoint(IfTerminator);
      SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
                                    &ElseTI);
      Builder.SetInsertPoint(ElseTI);
      Function *TaskBeginFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
      Function *TaskCompleteFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
      Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
      CallInst *CI = nullptr;
      if (HasShareds)
        CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
      else
        CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
      CI->setDebugLoc(StaleCI->getDebugLoc());
      Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
      Builder.SetInsertPoint(ThenTI);
    }
    if (Dependencies.size()) {
      Function *TaskFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
      Builder.CreateCall(
          TaskFn,
          {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
           DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
    } else {
      // Emit the @__kmpc_omp_task runtime call to spawn the task
      Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
      Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
    }

    StaleCI->eraseFromParent();

    Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
    if (HasShareds) {
      LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
      OutlinedFn.getArg(1)->replaceUsesWithIf(
          Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
    }

    while (!ToBeDeleted.empty()) {
      ToBeDeleted.top()->eraseFromParent();
      ToBeDeleted.pop();
    }
  };

  addOutlineInfo(std::move(OI));
  Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());

  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
                                 InsertPointTy AllocaIP,
                                 BodyGenCallbackTy BodyGenCB) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);

  // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
  Function *TaskgroupFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
  Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});

  BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
  BodyGenCB(AllocaIP, Builder.saveIP());

  Builder.SetInsertPoint(TaskgroupExitBB);
  // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
  Function *EndTaskgroupFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
  Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});

  return Builder.saveIP();
}
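// Taken together, the region produced above has the shape:
//   call void @__kmpc_taskgroup(ptr @<ident>, i32 %gtid)
//   ... body, including any tasks spawned in it ...
//   call void @__kmpc_end_taskgroup(ptr @<ident>, i32 %gtid)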
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
  assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");

  if (!updateToLocation(Loc))
    return Loc.IP;

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    // This must be done otherwise any nested constructs using FinalizeOMPRegion
    // will fail because that function requires the Finalization Basic Block to
    // have a terminator, which is already removed by EmitOMPRegionBody.
    // IP is currently at cancelation block.
    // We need to backtrack to the condition block to fetch
    // the exit block and create a branch from cancelation
    // to exit block.
    IRBuilder<>::InsertPointGuard IPG(Builder);
    Builder.restoreIP(IP);
    auto *CaseBB = IP.getBlock()->getSinglePredecessor();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
    Instruction *I = Builder.CreateBr(ExitBB);
    IP = InsertPointTy(I->getParent(), I->getIterator());
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
  // Each section is emitted as a switch case
  // Each finalization callback is handled from clang.EmitOMPSectionDirective()
  // -> OMP.createSection() which generates the IR for each section
  // Iterate through all sections and emit a switch construct:
  // switch (IV) {
  // case 0:
  //   <SectionStmt[0]>;
  //   break;
  // ...
  // case <NumSection> - 1:
  //   <SectionStmt[<NumSection> - 1]>;
  //   break;
  // }
  // ...
  // section_loop.after:
  // <FiniCB>;
  auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
    Builder.restoreIP(CodeGenIP);
    BasicBlock *Continue =
        splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
    Function *CurFn = Continue->getParent();
    SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);

    unsigned CaseNumber = 0;
    for (auto SectionCB : SectionCBs) {
      BasicBlock *CaseBB = BasicBlock::Create(
          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
      SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
      Builder.SetInsertPoint(CaseBB);
      BranchInst *CaseEndBr = Builder.CreateBr(Continue);
      SectionCB(InsertPointTy(),
                {CaseEndBr->getParent(), CaseEndBr->getIterator()});
      CaseNumber++;
    }
    // remove the existing terminator from body BB since there can be no
    // terminators after switch/case
  };
  // Loop body ends here
  // LowerBound, UpperBound, and Stride for createCanonicalLoop
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  Value *LB = ConstantInt::get(I32Ty, 0);
  Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
  Value *ST = ConstantInt::get(I32Ty, 1);
  llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
      Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
  InsertPointTy AfterIP =
      applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
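  // As an example, with three section callbacks the workshare loop above runs
  // its IV over 0, 1, 2 (LB = 0, UB = 3, exclusive stop), and the switch
  // emitted by LoopBodyGenCB dispatches each IV value to its case block.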
  // Apply the finalization callback in LoopAfterBB
  auto FiniInfo = FinalizationStack.pop_back_val();
  assert(FiniInfo.DK == OMPD_sections &&
         "Unexpected finalization stack state!");
  if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
    Builder.restoreIP(AfterIP);
    BasicBlock *FiniBB =
        splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
    CB(Builder.saveIP());
    AfterIP = {FiniBB, FiniBB->begin()};
  }

  return AfterIP;
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createSection(const LocationDescription &Loc,
                               BodyGenCallbackTy BodyGenCB,
                               FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    // This must be done otherwise any nested constructs using FinalizeOMPRegion
    // will fail because that function requires the Finalization Basic Block to
    // have a terminator, which is already removed by EmitOMPRegionBody.
    // IP is currently at cancelation block.
    // We need to backtrack to the condition block to fetch
    // the exit block and create a branch from cancelation
    // to exit block.
    IRBuilder<>::InsertPointGuard IPG(Builder);
    Builder.restoreIP(IP);
    auto *CaseBB = Loc.IP.getBlock();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
    Instruction *I = Builder.CreateBr(ExitBB);
    IP = InsertPointTy(I->getParent(), I->getIterator());
    return FiniCB(IP);
  };

  Directive OMPD = Directive::OMPD_sections;
  // Since we are using Finalization Callback here, HasFinalize
  // and IsCancellable have to be true
  return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
                              /*Conditional*/ false, /*hasFinalize*/ true,
                              /*IsCancellable*/ true);
}
/// Create a function with a unique name and a "void (i8*, i8*)" signature in
/// the given module and return it.
Function *getFreshReductionFunc(Module &M) {
  Type *VoidTy = Type::getVoidTy(M.getContext());
  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
  auto *FuncTy =
      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
  return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                          M.getDataLayout().getDefaultGlobalsAddressSpace(),
                          ".omp.reduction.func", &M);
}
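// The resulting declaration is, schematically (parameter names invented):
//   define internal void @.omp.reduction.func(ptr %lhs.arr, ptr %rhs.arr)
// Its body is populated further down in createReductions.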
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
  for (const ReductionInfo &RI : ReductionInfos) {
    (void)RI;
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert(RI.ReductionGen && "expected non-null reduction generator callback");
    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
           "expected variables and their private equivalents to have the same "
           "type");
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  }
  if (!updateToLocation(Loc))
    return InsertPointTy();

  BasicBlock *InsertBlock = Loc.IP.getBlock();
  BasicBlock *ContinuationBlock =
      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
  InsertBlock->getTerminator()->eraseFromParent();

  // Create and populate array of type-erased pointers to private reduction
  // values.
  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
  Builder.restoreIP(AllocaIP);
  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
    Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
  }
  // Emit a call to the runtime function that orchestrates the reduction.
  // Declare the reduction function in the process.
  Function *Func = Builder.GetInsertBlock()->getParent();
  Module *Module = Func->getParent();
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  bool CanGenerateAtomic =
      llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
        return RI.AtomicReductionGen;
      });
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                  CanGenerateAtomic
                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                                      : IdentFlag(0));
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *NumVariables = Builder.getInt32(NumReductions);
  const DataLayout &DL = Module->getDataLayout();
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
  Function *ReductionFunc = getFreshReductionFunc(*Module);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
  Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall =
      Builder.CreateCall(ReduceFunc,
                         {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
                          ReductionFunc, Lock},
                         "reduce");
  // Create final reduction entry blocks for the atomic and non-atomic case.
  // Emit IR that dispatches control flow to one of the blocks based on the
  // reduction supporting the atomic mode.
  BasicBlock *NonAtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
  BasicBlock *AtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
  SwitchInst *Switch =
      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
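  // Per the switch above: a return value of 1 selects the non-atomic path,
  // 2 selects the atomic path, and any other value (typically 0) means this
  // thread does not participate in the final reduction and falls through to
  // the continuation block.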
  // Populate the non-atomic reduction using the elementwise reduction function.
  // This loads the elements from the global and private variables and reduces
  // them before storing back the result to the global variable.
  Builder.SetInsertPoint(NonAtomicRedBlock);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Type *ValueType = RI.ElementType;
    Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                         "red.value." + Twine(En.index()));
    Value *PrivateRedValue =
        Builder.CreateLoad(ValueType, RI.PrivateVariable,
                           "red.private.value." + Twine(En.index()));
    Value *Reduced;
    Builder.restoreIP(
        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    Builder.CreateStore(Reduced, RI.Variable);
  }
  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
  Builder.CreateBr(ContinuationBlock);
  // Populate the atomic reduction using the atomic elementwise reduction
  // function. There are no loads/stores here because they will be happening
  // inside the atomic elementwise reduction.
  Builder.SetInsertPoint(AtomicRedBlock);
  if (CanGenerateAtomic) {
    for (const ReductionInfo &RI : ReductionInfos) {
      Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
                                              RI.Variable, RI.PrivateVariable));
      if (!Builder.GetInsertBlock())
        return InsertPointTy();
    }
    Builder.CreateBr(ContinuationBlock);
  } else {
    Builder.CreateUnreachable();
  }
  // Populate the outlined reduction function using the elementwise reduction
  // function. Partial values are extracted from the type-erased array of
  // pointers to private variables.
  BasicBlock *ReductionFuncBlock =
      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
  Builder.SetInsertPoint(ReductionFuncBlock);
  Value *LHSArrayPtr = ReductionFunc->getArg(0);
  Value *RHSArrayPtr = ReductionFunc->getArg(1);

  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, LHSArrayPtr, 0, En.index());
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RHSArrayPtr, 0, En.index());
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr =
        Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
    Value *Reduced;
    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    Builder.CreateStore(Reduced, LHSPtr);
  }
  Builder.CreateRetVoid();

  Builder.SetInsertPoint(ContinuationBlock);
  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_master;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}
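// Since Conditional is true, EmitOMPInlinedRegion branches on the i32 result
// of __kmpc_master, so schematically only the master thread runs the body:
//   %r = call i32 @__kmpc_master(ptr @<ident>, i32 %gtid)
//   br i1 <%r != 0>, label %body, label %end
//   ...
//   call void @__kmpc_end_master(ptr @<ident>, i32 %gtid)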
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB, Value *Filter) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_masked;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, Filter};
  Value *ArgsEnd[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}
CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
    BasicBlock *PostInsertBefore, const Twine &Name) {
  Module *M = F->getParent();
  LLVMContext &Ctx = M->getContext();
  Type *IndVarTy = TripCount->getType();

  // Create the basic block structure.
  BasicBlock *Preheader =
      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
  BasicBlock *Header =
      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
  BasicBlock *Cond =
      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
  BasicBlock *Body =
      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
  BasicBlock *Latch =
      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
  BasicBlock *Exit =
      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
  BasicBlock *After =
      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);

  // Use specified DebugLoc for new instructions.
  Builder.SetCurrentDebugLocation(DL);

  Builder.SetInsertPoint(Preheader);
  Builder.CreateBr(Header);

  Builder.SetInsertPoint(Header);
  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
  Builder.CreateBr(Cond);

  Builder.SetInsertPoint(Cond);
  Value *Cmp =
      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
  Builder.CreateCondBr(Cmp, Body, Exit);

  Builder.SetInsertPoint(Body);
  Builder.CreateBr(Latch);

  Builder.SetInsertPoint(Latch);
  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
                                  "omp_" + Name + ".next", /*HasNUW=*/true);
  Builder.CreateBr(Header);
  IndVarPHI->addIncoming(Next, Latch);

  Builder.SetInsertPoint(Exit);
  Builder.CreateBr(After);

  // Remember and return the canonical control flow.
  LoopInfos.emplace_front();
  CanonicalLoopInfo *CL = &LoopInfos.front();

  CL->Header = Header;
  CL->Cond = Cond;
  CL->Latch = Latch;
  CL->Exit = Exit;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
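// The skeleton built above wires the blocks into the canonical shape:
//   preheader -> header -> cond --(iv < tripcount)--> body -> latch -> header
//                            \--(otherwise)--> exit -> after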
CanonicalLoopInfo *
OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
                                     LoopBodyGenCallbackTy BodyGenCB,
                                     Value *TripCount, const Twine &Name) {
  BasicBlock *BB = Loc.IP.getBlock();
  BasicBlock *NextBB = BB->getNextNode();

  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
                                             NextBB, NextBB, Name);
  BasicBlock *After = CL->getAfter();

  // If location is not set, don't connect the loop.
  if (updateToLocation(Loc)) {
    // Split the loop at the insertion point: Branch to the preheader and move
    // every following instruction to after the loop (the After BB). Also, the
    // new successor is the loop's after block.
    spliceBB(Builder, After, /*CreateBranch=*/false);
    Builder.CreateBr(CL->getPreheader());
  }

  // Emit the body content. We do it after connecting the loop to the CFG to
  // avoid that the callback encounters degenerate BBs.
  BodyGenCB(CL->getBodyIP(), CL->getIndVar());

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
    const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
    InsertPointTy ComputeIP, const Twine &Name) {

  // Consider the following difficulties (assuming 8-bit signed integers):
  //  * Adding \p Step to the loop counter which passes \p Stop may overflow:
  //      DO I = 1, 100, 50
  //  * A \p Step of INT_MIN cannot be normalized to a positive direction:
  //      DO I = 100, 0, -128

  // Start, Stop and Step must be of the same integer type.
  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");

  LocationDescription ComputeLoc =
      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
  updateToLocation(ComputeLoc);

  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
  ConstantInt *One = ConstantInt::get(IndVarTy, 1);

  // Like Step, but always positive.
  Value *Incr = Step;

  // Distance between Start and Stop; always positive.
  Value *Span;

  // Condition whether no iterations are executed at all, e.g. because UB < LB.
  Value *ZeroCmp;

  if (IsSigned) {
    // Ensure that increment is positive. If not, negate and invert LB and UB.
    Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
    Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
    Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
    Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
    Span = Builder.CreateSub(UB, LB, "", false, true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
  } else {
    Span = Builder.CreateSub(Stop, Start, "", true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
  }

  Value *CountIfLooping;
  if (InclusiveStop) {
    CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
  } else {
    // Avoid incrementing past stop since it could overflow.
    Value *CountIfTwo = Builder.CreateAdd(
        Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
    Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
    CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
  }

  Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
                                          "omp_" + Name + ".tripcount");

  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
    Builder.restoreIP(CodeGenIP);
    Value *Span = Builder.CreateMul(IV, Step);
    Value *IndVar = Builder.CreateAdd(Span, Start);
    BodyGenCB(Builder.saveIP(), IndVar);
  };
  LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
  return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
}
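// Worked example for the trip-count logic above, taken from the Fortran-style
// comment at the top of this function: DO I = 1, 100, 50 (inclusive stop)
// gives Span = 99 and Incr = 50, so TripCount = 99 / 50 + 1 = 2, i.e. the
// body runs for I = 1 and I = 51.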
// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling depending on `type`. Only i32 and i64 are supported by the
// runtime. Always interpret integers as unsigned similarly to
// CanonicalLoopInfo.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
                                                  OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          bool NeedsBarrier) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");

  // Set up the source location value for OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Constant *Zero = ConstantInt::get(IVTy, 0);
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(Zero, PLowerBound);
  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  Builder.CreateCall(StaticInit,
                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
                      PUpperBound, PStride, One, Zero});
  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
  CLI->setTripCount(TripCount);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
    Builder.SetInsertPoint(CLI->getBody(),
                           CLI->getBody()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(DL);
    return Builder.CreateAdd(OldIV, LowerBound);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(CLI->getExit(),
                         CLI->getExit()->getTerminator()->getIterator());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier)
    createBarrier(LocationDescription(Builder.saveIP(), DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);

  InsertPointTy AfterIP = CLI->getAfterIP();
  CLI->invalidate();

  return AfterIP;
}
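// Schematically, for the static schedule above each thread calls (bounds are
// passed by pointer and clamped by the runtime to that thread's share):
//   call void @__kmpc_for_static_init_4u(ptr @<ident>, i32 %gtid,
//       i32 <sched>, ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound,
//       ptr %p.stride, i32 1, i32 0)
// after which the loop body iterates over the returned inclusive bounds.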
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, Value *ChunkSize) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(ChunkSize && "Chunk size is required");

  LLVMContext &Ctx = CLI->getFunction()->getContext();
  Value *IV = CLI->getIndVar();
  Value *OrigTripCount = CLI->getTripCount();
  Type *IVTy = IV->getType();
  assert(IVTy->getIntegerBitWidth() <= 64 &&
         "Max supported tripcount bitwidth is 64 bits");
  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
                                                        : Type::getInt64Ty(Ctx);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Constant *Zero = ConstantInt::get(InternalIVTy, 0);
  Constant *One = ConstantInt::get(InternalIVTy, 1);

  // Declare useful OpenMP runtime functions.
  FunctionCallee StaticInit =
      getKmpcForStaticInitForType(InternalIVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Builder.SetCurrentDebugLocation(DL);
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
  Value *PUpperBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");

  // Set up the source location value for the OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // TODO: Detect overflow in ubsan or max-out with current tripcount.
  Value *CastedChunkSize =
      Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
  Value *CastedTripCount =
      Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
  Builder.CreateStore(Zero, PLowerBound);
  Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
  Builder.CreateStore(OrigUpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
  Builder.CreateCall(StaticInit,
                     {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
                      /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
                      /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
                      /*pstride=*/PStride, /*incr=*/One,
                      /*chunk=*/CastedChunkSize});

  // Load values written by the "init" function.
  Value *FirstChunkStart =
      Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
  Value *FirstChunkStop =
      Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
  Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
  Value *ChunkRange =
      Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
  Value *NextChunkStride =
      Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");

  // Create outer "dispatch" loop for enumerating the chunks.
  BasicBlock *DispatchEnter = splitBB(Builder, true);
  Value *DispatchCounter;
  CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
      {Builder.saveIP(), DL},
      [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
      FirstChunkStart, CastedTripCount, NextChunkStride,
      /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
      "dispatch");

  // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
  // not have to preserve the canonical invariant.
  BasicBlock *DispatchBody = DispatchCLI->getBody();
  BasicBlock *DispatchLatch = DispatchCLI->getLatch();
  BasicBlock *DispatchExit = DispatchCLI->getExit();
  BasicBlock *DispatchAfter = DispatchCLI->getAfter();
  DispatchCLI->invalidate();

  // Rewire the original loop to become the chunk loop inside the dispatch loop.
  redirectTo(DispatchAfter, CLI->getAfter(), DL);
  redirectTo(CLI->getExit(), DispatchLatch, DL);
  redirectTo(DispatchBody, DispatchEnter, DL);

  // Prepare the prolog of the chunk loop.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // Compute the number of iterations of the chunk loop.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
  Value *IsLastChunk =
      Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
  Value *CountUntilOrigTripCount =
      Builder.CreateSub(CastedTripCount, DispatchCounter);
  Value *ChunkTripCount = Builder.CreateSelect(
      IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
  Value *BackcastedChunkTC =
      Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
  CLI->setTripCount(BackcastedChunkTC);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  Value *BackcastedDispatchCounter =
      Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
  CLI->mapIndVar([&](Instruction *) -> Value * {
    Builder.restoreIP(CLI->getBodyIP());
    return Builder.CreateAdd(IV, BackcastedDispatchCounter);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier)
    createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
                  /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);

#ifndef NDEBUG
  // Even though we currently do not support applying additional methods to it,
  // the chunk loop should remain a canonical loop.
  CLI->assertOK();
#endif

  return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
}
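// After this transformation the loop nest has the shape (chunk loop nested
// inside the dispatch loop):
//   for (counter = firstchunk.lb; counter < tripcount; counter += stride)
//     for (iv = 0; iv < chunk.tripcount; ++iv)
//       body(counter + iv);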
// Returns an LLVM function to call for executing an OpenMP static worksharing
// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
static FunctionCallee
getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
                            WorksharingLoopType LoopType) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  Module &M = OMPBuilder->M;
  switch (LoopType) {
  case WorksharingLoopType::ForStaticLoop:
    if (Bitwidth == 32)
      return OMPBuilder->getOrCreateRuntimeFunction(
          M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
    if (Bitwidth == 64)
      return OMPBuilder->getOrCreateRuntimeFunction(
          M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
    break;
  case WorksharingLoopType::DistributeStaticLoop:
    if (Bitwidth == 32)
      return OMPBuilder->getOrCreateRuntimeFunction(
          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
    if (Bitwidth == 64)
      return OMPBuilder->getOrCreateRuntimeFunction(
          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
    break;
  case WorksharingLoopType::DistributeForStaticLoop:
    if (Bitwidth == 32)
      return OMPBuilder->getOrCreateRuntimeFunction(
          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
    if (Bitwidth == 64)
      return OMPBuilder->getOrCreateRuntimeFunction(
          M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
    break;
  }
  if (Bitwidth != 32 && Bitwidth != 64) {
    llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
  }
  llvm_unreachable("Unknown type of OpenMP worksharing loop");
}
// Inserts a call to proper OpenMP Device RTL function which handles
// loop worksharing.
static void createTargetLoopWorkshareCall(
    OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
    BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
  Type *TripCountTy = TripCount->getType();
  Module &M = OMPBuilder->M;
  IRBuilder<> &Builder = OMPBuilder->Builder;
  FunctionCallee RTLFn =
      getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
  SmallVector<Value *, 8> RealArgs;
  RealArgs.push_back(Ident);
  RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
  RealArgs.push_back(LoopBodyArg);
  RealArgs.push_back(TripCount);
  if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
    Builder.CreateCall(RTLFn, RealArgs);
    return;
  }
  FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
      M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
  Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
  Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});

  RealArgs.push_back(
      Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
  RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
  }

  Builder.CreateCall(RTLFn, RealArgs);
}
static void
workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
                            CanonicalLoopInfo *CLI, Value *Ident,
                            Function &OutlinedFn, Type *ParallelTaskPtr,
                            const SmallVector<Instruction *, 4> &ToBeDeleted,
                            WorksharingLoopType LoopType) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  BasicBlock *Preheader = CLI->getPreheader();
  Value *TripCount = CLI->getTripCount();

  // After loop body outlining, the loop body contains only the setup of the
  // loop body argument structure and the call to the outlined loop body
  // function. First, we need to move the setup of the loop body args into
  // the loop preheader.
  Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
                    CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));

  // The next step is to remove the whole loop. We do not need it anymore.
  // That's why we make an unconditional branch from the loop preheader to the
  // loop exit block.
  Builder.restoreIP({Preheader, Preheader->end()});
  Preheader->getTerminator()->eraseFromParent();
  Builder.CreateBr(CLI->getExit());

  // Delete dead loop blocks.
  OpenMPIRBuilder::OutlineInfo CleanUpInfo;
  SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
  SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
  CleanUpInfo.EntryBB = CLI->getHeader();
  CleanUpInfo.ExitBB = CLI->getExit();
  CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
  DeleteDeadBlocks(BlocksToBeRemoved);

  // Find the instruction which corresponds to the loop body argument structure
  // and remove the call to the loop body function instruction.
  Value *LoopBodyArg;
  User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
  assert(OutlinedFnUser &&
         "Expected unique undroppable user of outlined function");
  CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
  assert(OutlinedFnCallInstruction && "Expected outlined function call");
  assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
         "Expected outlined function call to be located in loop preheader");
  // Check in case no argument structure has been passed.
  if (OutlinedFnCallInstruction->arg_size() > 1)
    LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
  else
    LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
  OutlinedFnCallInstruction->eraseFromParent();

  createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                LoopBodyArg, ParallelTaskPtr, TripCount,
                                OutlinedFn);

  for (auto &ToBeDeletedItem : ToBeDeleted)
    ToBeDeletedItem->eraseFromParent();
  CLI->invalidate();
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          WorksharingLoopType LoopType) {
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  OutlineInfo OI;
  OI.OuterAllocaBB = CLI->getPreheader();
  Function *OuterFn = CLI->getPreheader()->getParent();

  // Instructions which need to be deleted at the end of code generation
  SmallVector<Instruction *, 4> ToBeDeleted;

  OI.OuterAllocaBB = AllocaIP.getBlock();

  // Mark the body loop as region which needs to be extracted
  OI.EntryBB = CLI->getBody();
  OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
                                               "omp.prelatch", true);

  // Prepare loop body for extraction
  Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});

  // Insert new loop counter variable which will be used only in loop
  // body.
  AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
  Instruction *NewLoopCntLoad =
      Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
  // New loop counter instructions are redundant in the loop preheader when
  // code generation for the workshare loop is finished. That's why we mark
  // them as ready for deletion.
  ToBeDeleted.push_back(NewLoopCntLoad);
  ToBeDeleted.push_back(NewLoopCnt);

  // Analyse loop body region. Find all input variables which are used inside
  // loop body region.
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);
  SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
                                        ParallelRegionBlockSet.end());

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks,
                          /* DominatorTree */ nullptr,
                          /* AggregateArgs */ true,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ CLI->getPreheader(),
                          /* Suffix */ ".omp_wsloop",
                          /* AggrArgsIn0AddrSpace */ true);

  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;

  // Find allocas outside the loop body region which are used inside loop
  // body.
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  // We need to model loop body region as the function f(cnt, loop_arg).
  // That's why we replace loop induction variable by the new counter
  // which will be one of loop body function argument
  SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
                            CLI->getIndVar()->user_end());
  for (auto Use : Users) {
    if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
      if (ParallelRegionBlockSet.count(Inst->getParent())) {
        Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
      }
    }
  }
  // Make sure that loop counter variable is not merged into loop body
  // function argument structure and it is passed as separate variable
  OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);

  // PostOutline CB is invoked when loop body function is outlined and
  // loop body is replaced by call to outlined function. We need to add
  // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
  // function will handle loop control logic.
  OI.PostOutlineCB = [=, ToBeDeletedVec =
                             std::move(ToBeDeleted)](Function &OutlinedFn) {
    workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
                                ToBeDeletedVec, LoopType);
  };
  addOutlineInfo(std::move(OI));
  return CLI->getAfterIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
    bool HasSimdModifier, bool HasMonotonicModifier,
    bool HasNonmonotonicModifier, bool HasOrderedClause,
    WorksharingLoopType LoopType) {
  if (Config.isTargetDevice())
    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
  OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
      SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                   OMPScheduleType::ModifierOrdered;
  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
  case OMPScheduleType::BaseStatic:
    assert(!ChunkSize && "No chunk size with static-chunked schedule");
    if (IsOrdered)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);

  case OMPScheduleType::BaseStaticChunked:
    if (IsOrdered)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
                                           ChunkSize);

  case OMPScheduleType::BaseRuntime:
  case OMPScheduleType::BaseAuto:
  case OMPScheduleType::BaseGreedy:
  case OMPScheduleType::BaseBalanced:
  case OMPScheduleType::BaseSteal:
  case OMPScheduleType::BaseGuidedSimd:
  case OMPScheduleType::BaseRuntimeSimd:
    assert(!ChunkSize &&
           "schedule type does not support user-defined chunk sizes");
    [[fallthrough]];
  case OMPScheduleType::BaseDynamicChunked:
  case OMPScheduleType::BaseGuidedChunked:
  case OMPScheduleType::BaseGuidedIterativeChunked:
  case OMPScheduleType::BaseGuidedAnalyticalChunked:
  case OMPScheduleType::BaseStaticBalancedChunked:
    return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                     NeedsBarrier, ChunkSize);

  default:
    llvm_unreachable("Unknown/unimplemented schedule kind");
  }
}
/// Returns an LLVM function to call for initializing loop bounds using OpenMP
/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
/// the runtime. Always interpret integers as unsigned similarly to
/// CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

/// Returns an LLVM function to call for updating the next loop chunk using
/// OpenMP dynamic scheduling depending on `type`. Only i32 and i64 are
/// supported by the runtime. Always interpret integers as unsigned similarly
/// to CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

/// Returns an LLVM function to call for finalizing the dynamic loop depending
/// on `type`. Only i32 and i64 are supported by the runtime. Always interpret
/// integers as unsigned similarly to CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

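// Illustrative note: a canonical loop with an i32 induction variable selects
// the *_4u runtime entry points, an i64 one the *_8u entry points. The "init"
// call emitted below then looks roughly like (argument names abbreviated and
// not literal):
//
//   call void @__kmpc_dispatch_init_4u(ptr %loc, i32 %gtid, i32 %sched,
//                                      i32 %lb, i32 %ub, i32 %st, i32 %chunk)
//
// All bounds are interpreted as unsigned, matching CanonicalLoopInfo.
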
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");
  assert(isValidWorkshareLoopScheduleType(SchedType) &&
         "Require valid schedule type");

  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
                 OMPScheduleType::ModifierOrdered;

  // Set up the source location value for OpenMP runtime.
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(One, PLowerBound);
  Value *UpperBound = CLI->getTripCount();
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  BasicBlock *Latch = CLI->getLatch();
  InsertPointTy AfterIP = CLI->getAfterIP();

  // The CLI will be "broken" in the code below, as the loop is no longer
  // a valid canonical loop.

  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function.
  Builder.CreateCall(DynamicInit,
                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
                      UpperBound, /* step */ One, Chunk});

  // An outer loop around the existing one.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  // This needs to be 32-bit always, so can't use the IVTy Zero above.
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  Value *Res =
      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
                                       PLowerBound, PUpperBound, PStride});
  Constant *Zero32 = ConstantInt::get(I32Type, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  Value *LowerBound =
      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // Change PHI-node in loop header to use outer cond rather than preheader,
  // and set IV to the LowerBound.
  Instruction *Phi = &Header->front();
  auto *PI = cast<PHINode>(Phi);
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  // Then set the pre-header to jump to the OuterCond
  Instruction *Term = PreHeader->getTerminator();
  auto *Br = cast<BranchInst>(Term);
  Br->setSuccessor(0, OuterCond);

  // Modify the inner condition:
  // * Use the UpperBound returned from the DynamicNext call.
  // * Jump to the outer loop when done with one of the inner loops.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
  Instruction *Comp = &*Builder.GetInsertPoint();
  auto *CI = cast<CmpInst>(Comp);
  CI->setOperand(1, UpperBound);
  // Redirect the inner exit to branch to the outer condition.
  Instruction *Branch = &Cond->back();
  auto *BI = cast<BranchInst>(Branch);
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);

  // Call the "fini" function if "ordered" is present in the wsloop directive.
  if (Ordered) {
    Builder.SetInsertPoint(&Latch->back());
    FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
    Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
  }

  // Add the barrier if requested.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    createBarrier(LocationDescription(Builder.saveIP(), DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  }

  CLI->invalidate();
  return AfterIP;
}

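// Illustrative sketch of the control flow produced above (block names follow
// the input loop and are not literal):
//
//   preheader:          store lb/ub/stride; call @__kmpc_dispatch_init_*
//                       br %outer.cond
//   outer.cond:         %res = call @__kmpc_dispatch_next_*(...)
//                       br (%res != 0), %header, %exit
//   header ... latch:   the original loop body, now iterating the chunk
//                       bounded by the values written by "next"
//   cond:               compares against the fetched upper bound; its exit
//                       edge returns to %outer.cond
//
// Each successful "next" call hands this thread one chunk of iterations.
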
/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
/// after this \p OldTarget will be orphaned.
static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
                                      BasicBlock *NewTarget, DebugLoc DL) {
  for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
    redirectTo(Pred, NewTarget, DL);
}

/// Determine which blocks in \p BBs are reachable from outside and remove the
/// ones that are not reachable from the function.
static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
  SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
    for (Use &U : BB->uses()) {
      auto *UseInst = dyn_cast<Instruction>(U.getUser());
      if (!UseInst)
        continue;
      if (BBsToErase.count(UseInst->getParent()))
        continue;
      return true;
    }
    return false;
  };

  while (true) {
    bool Changed = false;
    for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
      if (HasRemainingUses(BB)) {
        BBsToErase.erase(BB);
        Changed = true;
      }
    }
    if (!Changed)
      break;
  }

  SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
  DeleteDeadBlocks(BBVec);
}

CanonicalLoopInfo *
OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                               InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Setup the IRBuilder for inserting the trip count computation.
  Builder.SetCurrentDebugLocation(DL);
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed loop's trip count.
  // TODO: Find common/largest indvar type.
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
                                           {}, /*HasNUW=*/true);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop uses the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // Outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following the direction of
  // the control flow, from the leading in-between code, the loop nest body,
  // the trailing in-between code, and rejoining the collapsed loop's latch.
  // ContinueBlock and ContinuePred keep track of the source(s) of next edge.
  // If the ContinueBlock is set, continue with that block. If ContinuePred,
  // use its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // than the original loop. More sophisticated schemes could keep track of
  // what the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}

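// Concrete example of the divmod scheme above: collapsing two loops with trip
// counts %tc0 and %tc1 produces one loop of %tc0 * %tc1 iterations whose body
// recovers the original induction variables as (illustrative value names):
//
//   %iv1 = urem %iv.collapsed, %tc1   ; innermost: least significant part
//   %iv0 = udiv %iv.collapsed, %tc1   ; outermost: the remaining bits
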
std::vector<CanonicalLoopInfo *>
OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                           ArrayRef<Value *> TileSizes) {
  assert(TileSizes.size() == Loops.size() &&
         "Must pass as many tile sizes as there are loops");
  int NumLoops = Loops.size();
  assert(NumLoops >= 1 && "At least one loop to tile required");

  CanonicalLoopInfo *OutermostLoop = Loops.front();
  CanonicalLoopInfo *InnermostLoop = Loops.back();
  Function *F = OutermostLoop->getBody()->getParent();
  BasicBlock *InnerEnter = InnermostLoop->getBody();
  BasicBlock *InnerLatch = InnermostLoop->getLatch();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Collect original trip counts and induction variables to be accessible by
  // index. Also, the structure of the original loops is not preserved during
  // the construction of the tiled loops, so do it before we scavenge the BBs
  // of any original CanonicalLoopInfo.
  SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All input loops must be valid canonical loops");
    OrigTripCounts.push_back(L->getTripCount());
    OrigIndVars.push_back(L->getIndVar());
  }

  // Collect the code between loop headers. These may contain SSA definitions
  // that are used in the loop nest body. To be usable within the innermost
  // body, these BasicBlocks will be sunk into the loop nest body. That is,
  // these instructions may be executed more often than before the tiling.
  // TODO: It would be sufficient to only sink them into the body of the
  // corresponding tile loop.
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
  for (int i = 0; i < NumLoops - 1; ++i) {
    CanonicalLoopInfo *Surrounding = Loops[i];
    CanonicalLoopInfo *Nested = Loops[i + 1];

    BasicBlock *EnterBB = Surrounding->getBody();
    BasicBlock *ExitBB = Nested->getHeader();
    InbetweenCode.emplace_back(EnterBB, ExitBB);
  }

  // Compute the trip counts of the floor loops.
  Builder.SetCurrentDebugLocation(DL);
  Builder.restoreIP(OutermostLoop->getPreheaderIP());
  SmallVector<Value *, 4> FloorCount, FloorRems;
  for (int i = 0; i < NumLoops; ++i) {
    Value *TileSize = TileSizes[i];
    Value *OrigTripCount = OrigTripCounts[i];
    Type *IVType = OrigTripCount->getType();

    Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
    Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);

    // 0 if the tilesize divides the tripcount, 1 otherwise.
    // 1 means we need an additional iteration for a partial tile.
    //
    // Unfortunately we cannot just use the roundup-formula
    //   (tripcount + tilesize - 1)/tilesize
    // because the summation might overflow. We do not want to introduce
    // undefined behavior when the untiled loop nest did not have it.
    Value *FloorTripOverflow =
        Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));

    FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
    FloorTripCount =
        Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
                          "omp_floor" + Twine(i) + ".tripcount", true);

    // Remember some values for later use.
    FloorCount.push_back(FloorTripCount);
    FloorRems.push_back(FloorTripRem);
  }

  // Generate the new loop nest, from the outermost to the innermost.
  std::vector<CanonicalLoopInfo *> Result;
  Result.reserve(NumLoops * 2);

  // The basic block of the surrounding loop that enters the nest generated
  // loop.
  BasicBlock *Enter = OutermostLoop->getPreheader();

  // The basic block of the surrounding loop where the inner code should
  // continue.
  BasicBlock *Continue = OutermostLoop->getAfter();

  // Where the next loop basic block should be inserted.
  BasicBlock *OutroInsertBefore = InnermostLoop->getExit();

  auto EmbeddNewLoop =
      [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
          Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
    CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
        DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
    redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
    redirectTo(EmbeddedLoop->getAfter(), Continue, DL);

    // Setup the position where the next embedded loop connects to this loop.
    Enter = EmbeddedLoop->getBody();
    Continue = EmbeddedLoop->getLatch();
    OutroInsertBefore = EmbeddedLoop->getLatch();
    return EmbeddedLoop;
  };

  auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
                                                  const Twine &NameBase) {
    for (auto P : enumerate(TripCounts)) {
      CanonicalLoopInfo *EmbeddedLoop =
          EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
      Result.push_back(EmbeddedLoop);
    }
  };

  EmbeddNewLoops(FloorCount, "floor");

  // Within the innermost floor loop, emit the code that computes the tile
  // sizes.
  Builder.SetInsertPoint(Enter->getTerminator());
  SmallVector<Value *, 4> TileCounts;
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    Value *TileSize = TileSizes[i];

    Value *FloorIsEpilogue =
        Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
    Value *TileTripCount =
        Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);

    TileCounts.push_back(TileTripCount);
  }

  // Create the tile loops.
  EmbeddNewLoops(TileCounts, "tile");

  // Insert the inbetween code into the body.
  BasicBlock *BodyEnter = Enter;
  BasicBlock *BodyEntered = nullptr;
  for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
    BasicBlock *EnterBB = P.first;
    BasicBlock *ExitBB = P.second;

    if (BodyEnter)
      redirectTo(BodyEnter, EnterBB, DL);
    else
      redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);

    BodyEnter = nullptr;
    BodyEntered = ExitBB;
  }

  // Append the original loop nest body into the generated loop nest body.
  if (BodyEnter)
    redirectTo(BodyEnter, InnerEnter, DL);
  else
    redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
  redirectAllPredecessorsTo(InnerLatch, Continue, DL);

  // Replace the original induction variable with an induction variable
  // computed from the tile and floor induction variables.
  Builder.restoreIP(Result.back()->getBodyIP());
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
    Value *OrigIndVar = OrigIndVars[i];
    Value *Size = TileSizes[i];

    Value *Scale =
        Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
    Value *Shift =
        Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
    OrigIndVar->replaceAllUsesWith(Shift);
  }

  // Remove unused parts of the original loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  for (CanonicalLoopInfo *GenL : Result)
    GenL->assertOK();
#endif
  return Result;
}

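// For reference, tiling a single loop `for (i = 0; i < N; ++i)` with tile
// size TS yields a nest that is conceptually (pseudocode; the epilogue is
// handled by the select on the floor induction variable above):
//
//   for (f = 0; f < ceil(N/TS); ++f)           // "floor" loop
//     for (t = 0; t < min(TS, N - f*TS); ++t)  // "tile" loop
//       body(f * TS + t);                      // i rebuilt with mul + add
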
/// Attach metadata \p Properties to the basic block described by \p BB. If the
/// basic block already has metadata, the basic block properties are appended.
static void addBasicBlockMetadata(BasicBlock *BB,
                                  ArrayRef<Metadata *> Properties) {
  // Nothing to do if no property to attach.
  if (Properties.empty())
    return;

  LLVMContext &Ctx = BB->getContext();
  SmallVector<Metadata *> NewProperties;
  NewProperties.push_back(nullptr);

  // If the basic block already has metadata, prepend it to the new metadata.
  MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
  if (Existing)
    append_range(NewProperties, drop_begin(Existing->operands(), 1));

  append_range(NewProperties, Properties);
  MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
  BasicBlockID->replaceOperandWith(0, BasicBlockID);

  BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
}

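// Loop metadata must be self-referential in its first operand, which is why
// the node is created with a nullptr placeholder and patched afterwards. The
// result has the shape (metadata numbering illustrative):
//
//   br ..., !llvm.loop !0
//   !0 = distinct !{!0, <existing properties...>, <new properties...>}
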
/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
/// loop already has metadata, the loop properties are appended.
static void addLoopMetadata(CanonicalLoopInfo *Loop,
                            ArrayRef<Metadata *> Properties) {
  assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");

  // Attach metadata to the loop's latch.
  BasicBlock *Latch = Loop->getLatch();
  assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
  addBasicBlockMetadata(Latch, Properties);
}

/// Attach llvm.access.group metadata to the memref instructions of \p Block.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
                            LoopInfo &LI) {
  for (Instruction &I : *Block) {
    if (I.mayReadOrWriteMemory()) {
      // TODO: This instruction may already have an access group from
      // other pragmas, e.g. #pragma clang loop vectorize. Append
      // so that the existing metadata is not overwritten.
      I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
    }
  }
}

void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
             MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
}

void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable"))});
}

void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                      Value *IfCond, ValueToValueMapTy &VMap,
                                      const Twine &NamePrefix) {
  Function *F = CanonicalLoop->getFunction();

  // Define where the if branch should be inserted.
  Instruction *SplitBefore;
  if (Instruction::classof(IfCond)) {
    SplitBefore = dyn_cast<Instruction>(IfCond);
  } else {
    SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
  }

  // TODO: We should not rely on the pass manager. Currently we use it only
  // for getting the llvm::Loop which corresponds to the given
  // CanonicalLoopInfo object. We should have a method which returns all
  // blocks between CanonicalLoopInfo::getHeader() and
  // CanonicalLoopInfo::getAfter().
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  // Get the loop which needs to be cloned.
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());

  // Create additional blocks for the if statement.
  BasicBlock *Head = SplitBefore->getParent();
  Instruction *HeadOldTerm = Head->getTerminator();
  llvm::LLVMContext &C = Head->getContext();
  llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
  llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());

  // Create the if condition branch.
  Builder.SetInsertPoint(HeadOldTerm);
  Instruction *BrInstr =
      Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
  InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
  // The then block contains the branch to the omp loop which needs to be
  // vectorized.
  spliceBB(IP, ThenBlock, false);
  ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);

  Builder.SetInsertPoint(ElseBlock);

  // Clone the loop for the else branch.
  SmallVector<BasicBlock *, 8> NewBlocks;

  VMap[CanonicalLoop->getPreheader()] = ElseBlock;
  for (BasicBlock *Block : L->getBlocks()) {
    BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
    NewBB->moveBefore(CanonicalLoop->getExit());
    VMap[Block] = NewBB;
    NewBlocks.push_back(NewBB);
  }
  remapInstructionsInBlocks(NewBlocks, VMap);
  Builder.CreateBr(NewBlocks.front());
}

unsigned
OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
                                           const StringMap<bool> &Features) {
  if (TargetTriple.isX86()) {
    if (Features.lookup("avx512f"))
      return 512;
    else if (Features.lookup("avx"))
      return 256;
    return 128;
  }
  if (TargetTriple.isPPC())
    return 128;
  if (TargetTriple.isWasm())
    return 128;
  return 0;
}

void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
                                MapVector<Value *, Value *> AlignedVars,
                                Value *IfCond, OrderKind Order,
                                ConstantInt *Simdlen, ConstantInt *Safelen) {
  LLVMContext &Ctx = Builder.getContext();

  Function *F = CanonicalLoop->getFunction();

  // TODO: We should not rely on the pass manager. Currently we use it only
  // for getting the llvm::Loop which corresponds to the given
  // CanonicalLoopInfo object. We should have a method which returns all
  // blocks between CanonicalLoopInfo::getHeader() and
  // CanonicalLoopInfo::getAfter().
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);

  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
  if (AlignedVars.size()) {
    InsertPointTy IP = Builder.saveIP();
    Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
    for (auto &AlignedItem : AlignedVars) {
      Value *AlignedPtr = AlignedItem.first;
      Value *Alignment = AlignedItem.second;
      Builder.CreateAlignmentAssumption(F->getParent()->getDataLayout(),
                                        AlignedPtr, Alignment);
    }
    Builder.restoreIP(IP);
  }

  if (IfCond) {
    ValueToValueMapTy VMap;
    createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
    // Add metadata to the cloned loop which disables vectorization.
    Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
    assert(MappedLatch &&
           "Cannot find value which corresponds to original loop latch");
    assert(isa<BasicBlock>(MappedLatch) &&
           "Cannot cast mapped latch block value to BasicBlock");
    BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
    ConstantAsMetadata *BoolConst =
        ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
    addBasicBlockMetadata(
        NewLatchBlock,
        {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
                           BoolConst})});
  }

  SmallSet<BasicBlock *, 8> Reachable;

  // Get the basic blocks from the loop in which memref instructions
  // are placed.
  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
  // preferably without running any passes.
  for (BasicBlock *Block : L->getBlocks()) {
    if (Block == CanonicalLoop->getCond() ||
        Block == CanonicalLoop->getHeader())
      continue;
    Reachable.insert(Block);
  }

  SmallVector<Metadata *> LoopMDList;

  // In the presence of a finite 'safelen', it may be unsafe to mark all
  // the memory instructions parallel, because loop-carried
  // dependences of 'safelen' iterations are possible.
  // If the clause order(concurrent) is specified then the memory instructions
  // are marked parallel even if 'safelen' is finite.
  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
    // Add access group metadata to memory-access instructions.
    MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
    for (BasicBlock *BB : Reachable)
      addSimdMetadata(BB, AccessGroup, LI);
    // TODO: If the loop has existing parallel access metadata, have
    // to combine the two lists.
    LoopMDList.push_back(MDNode::get(
        Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
  }

  // Use the above access group metadata to create loop-level
  // metadata, which should be distinct for each loop.
  ConstantAsMetadata *BoolConst =
      ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
  LoopMDList.push_back(MDNode::get(
      Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));

  if (Simdlen || Safelen) {
    // If both the simdlen and safelen clauses are specified, the value of the
    // simdlen parameter must be less than or equal to the value of the safelen
    // parameter. Therefore, use safelen only in the absence of simdlen.
    ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
    LoopMDList.push_back(
        MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
                          ConstantAsMetadata::get(VectorizeWidth)}));
  }

  addLoopMetadata(CanonicalLoop, LoopMDList);
}

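// For example (metadata numbering illustrative), `simdlen(8)` with no finite
// safelen yields loop metadata roughly of the form:
//
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}   ; access group attached to the memory instructions
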
/// Create the TargetMachine object to query the backend for optimization
/// preferences.
///
/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
/// needed for the LLVM pass pipeline. We use some default options to avoid
/// having to pass too many settings from the frontend that probably do not
/// matter.
///
/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
/// method. If we are going to use TargetMachine for more purposes, especially
/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
/// might become worth requiring front-ends to pass on their TargetMachine,
/// or at least cache it between methods. Note that while frontends such as
/// Clang have just a single main TargetMachine per translation unit, the
/// "target-cpu" and "target-features" attributes that determine the
/// TargetMachine are per-function and can be overridden using
/// __attribute__((target("OPTIONS"))).
static std::unique_ptr<TargetMachine>
createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
  Module *M = F->getParent();

  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
  const std::string &Triple = M->getTargetTriple();

  std::string Error;
  const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
  if (!TheTarget)
    return {};

  llvm::TargetOptions Options;
  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
      Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
      /*CodeModel=*/std::nullopt, OptLevel));
}

/// Heuristically determine the best-performant unroll factor for \p CLI. This
/// depends on the target processor. We are re-using the same heuristics as
/// the LoopUnrollPass.
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
  Function *F = CLI->getFunction();

  // Assume the user requests the most aggressive unrolling, even if the rest
  // of the code is optimized using a lower setting.
  CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
  std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);

  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return TargetLibraryAnalysis(); });
  FAM.registerPass([]() { return AssumptionAnalysis(); });
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
  TargetIRAnalysis TIRA;
  if (TM)
    TIRA = TargetIRAnalysis(
        [&](const Function &F) { return TM->getTargetTransformInfo(F); });
  FAM.registerPass([&]() { return TIRA; });

  TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
  ScalarEvolutionAnalysis SEA;
  ScalarEvolution &&SE = SEA.run(*F, FAM);
  DominatorTreeAnalysis DTA;
  DominatorTree &&DT = DTA.run(*F, FAM);
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  AssumptionAnalysis ACT;
  AssumptionCache &&AC = ACT.run(*F, FAM);
  OptimizationRemarkEmitter ORE{F};

  Loop *L = LI.getLoopFor(CLI->getHeader());
  assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");

  TargetTransformInfo::UnrollingPreferences UP =
      gatherUnrollingPreferences(L, SE, TTI,
                                 /*BlockFrequencyInfo=*/nullptr,
                                 /*ProfileSummaryInfo=*/nullptr, ORE,
                                 static_cast<int>(OptLevel),
                                 /*UserThreshold=*/std::nullopt,
                                 /*UserCount=*/std::nullopt,
                                 /*UserAllowPartial=*/true,
                                 /*UserAllowRuntime=*/true,
                                 /*UserUpperBound=*/std::nullopt,
                                 /*UserFullUnrollMaxCount=*/std::nullopt);

  // Account for additional optimizations taking place before the
  // LoopUnrollPass would unroll the loop.
  UP.Threshold *= UnrollThresholdFactor;
  UP.PartialThreshold *= UnrollThresholdFactor;

  // Use normal unroll factors even if the rest of the code is optimized for
  // size.
  UP.OptSizeThreshold = UP.Threshold;
  UP.PartialOptSizeThreshold = UP.PartialThreshold;

  LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
                    << "  Threshold=" << UP.Threshold << "\n"
                    << "  PartialThreshold=" << UP.PartialThreshold << "\n"
                    << "  OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
                    << "  PartialOptSizeThreshold="
                    << UP.PartialOptSizeThreshold << "\n");

  // Disable peeling.
  TargetTransformInfo::PeelingPreferences PP =
      gatherPeelingPreferences(L, SE, TTI,
                               /*UserAllowPeeling=*/false,
                               /*UserAllowProfileBasedPeeling=*/false,
                               /*UnrollingSpecficValues=*/false);

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  // Assume that reads and writes to stack variables can be eliminated by
  // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
  // size.
  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : *BB) {
      Value *Ptr;
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        Ptr = Load->getPointerOperand();
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        Ptr = Store->getPointerOperand();
      } else
        continue;

      Ptr = Ptr->stripPointerCasts();

      if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
        if (Alloca->getParent() == &F->getEntryBlock())
          EphValues.insert(&I);
      }
    }
  }

  UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);

  // The loop is not unrollable if it contains certain instructions.
  if (!UCE.canUnroll() || UCE.Convergent) {
    LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
    return 1;
  }

  LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
                    << "\n");

  // TODO: Determine the trip count of \p CLI if constant; computeUnrollCount
  // might be able to use it.
  unsigned TripCount = 0;
  unsigned MaxTripCount = 0;
  bool MaxOrZero = false;
  unsigned TripMultiple = 0;

  bool UseUpperBound = false;
  computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
                     MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
                     UseUpperBound);
  unsigned Factor = UP.Count;
  LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");

  // This function returns 1 to signal to not unroll a loop.
  if (Factor == 0)
    return 1;
  return Factor;
}

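// Worked example (numbers are illustrative defaults, not guaranteed): with a
// base unroll threshold of 300 at aggressive optimization and an
// -openmp-ir-builder-unroll-threshold-factor of 1.5, the unroller's size
// budget becomes 300 * 1.5 = 450, compensating for simplifications (such as
// mem2reg of the entry-block allocas excluded above) that have not run yet at
// this point of the pipeline.
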
void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
                                        int32_t Factor,
                                        CanonicalLoopInfo **UnrolledCLI) {
  assert(Factor >= 0 && "Unroll factor must not be negative");

  Function *F = Loop->getFunction();
  LLVMContext &Ctx = F->getContext();

  // If the unrolled loop is not used for another loop-associated directive,
  // it is sufficient to add metadata for the LoopUnrollPass.
  if (!UnrolledCLI) {
    SmallVector<Metadata *, 2> LoopMetadata;
    LoopMetadata.push_back(
        MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));

    if (Factor >= 1) {
      ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
      LoopMetadata.push_back(MDNode::get(
          Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
    }

    addLoopMetadata(Loop, LoopMetadata);
    return;
  }

  // Heuristically determine the unroll factor.
  if (Factor == 0)
    Factor = computeHeuristicUnrollFactor(Loop);

  // No change required with unroll factor 1.
  if (Factor == 1) {
    *UnrolledCLI = Loop;
    return;
  }

  assert(Factor >= 2 &&
         "unrolling only makes sense with a factor of 2 or larger");

  Type *IndVarTy = Loop->getIndVarType();

  // Apply partial unrolling by tiling the loop by the unroll-factor, then
  // fully unroll the inner loop.
  Value *FactorVal =
      ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
                                       /*isSigned=*/false));
  std::vector<CanonicalLoopInfo *> LoopNest =
      tileLoops(DL, {Loop}, {FactorVal});
  assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
  *UnrolledCLI = LoopNest[0];
  CanonicalLoopInfo *InnerLoop = LoopNest[1];

  // LoopUnrollPass can only fully unroll loops with constant trip count.
  // Unroll by the unroll factor with a fallback epilog for the remainder
  // iterations if necessary.
  ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
  addLoopMetadata(
      InnerLoop,
      {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
       MDNode::get(
           Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});

#ifndef NDEBUG
  (*UnrolledCLI)->assertOK();
#endif
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
                                   llvm::Value *BufSize, llvm::Value *CpyBuf,
                                   llvm::Value *CpyFn, llvm::Value *DidIt) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);

  llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);

  Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
  Builder.CreateCall(Fn, Args);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // If needed (i.e. not null), initialize `DidIt` with 0.
  if (DidIt)
    Builder.CreateStore(Builder.getInt32(0), DidIt);

  Directive OMPD = Directive::OMPD_single;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  // generates the following:
  // if (__kmpc_single()) {
  //   .... single region ...
  //   __kmpc_end_single
  // }
  // __kmpc_barrier

  EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                       /*Conditional*/ true,
                       /*hasFinalize*/ true);
  if (!IsNowait)
    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                  omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_critical;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *LockVar = getOMPCriticalRegionLock(CriticalName);
  Value *Args[] = {Ident, ThreadId, LockVar};

  SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
  Function *RTFn = nullptr;
  if (HintInst) {
    // Add the hint to the entry args and create the call.
    EnterArgs.push_back(HintInst);
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
  } else {
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
  }
  Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);

  Function *ExitRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
                                     InsertPointTy AllocaIP, unsigned NumLoops,
                                     ArrayRef<llvm::Value *> StoreValues,
                                     const Twine &Name, bool IsDependSource) {
  assert(
      llvm::all_of(StoreValues,
                   [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
      "OpenMP runtime requires depend vec with i64 type");

  if (!updateToLocation(Loc))
    return Loc.IP;

  // Allocate space for the vector and generate the alloca instruction.
  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
  ArgsBase->setAlignment(Align(8));
  Builder.restoreIP(Loc.IP);

  // Store the index value with offset in the depend vector.
  for (unsigned I = 0; I < NumLoops; ++I) {
    Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
        ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
    StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
    STInst->setAlignment(Align(8));
  }

  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
      ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};

  Function *RTLFn = nullptr;
  if (IsDependSource)
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
  else
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
  Builder.CreateCall(RTLFn, Args);

  return Builder.saveIP();
}

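// For a doacross nest with two associated loops, `ordered depend(source)`
// emits roughly (illustrative IR, value names not literal):
//
//   %vec = alloca [2 x i64], align 8
//   store i64 %iv0, ptr %vec, align 8
//   %gep = getelementptr inbounds [2 x i64], ptr %vec, i64 0, i64 1
//   store i64 %iv1, ptr %gep, align 8
//   call void @__kmpc_doacross_post(ptr %loc, i32 %gtid, ptr %vec)
//
// `depend(sink : ...)` instead calls @__kmpc_doacross_wait on the sink
// iteration vector.
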
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsThreads) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_ordered;
  Instruction *EntryCall = nullptr;
  Instruction *ExitCall = nullptr;

  if (IsThreads) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    Value *ThreadId = getOrCreateThreadID(Ident);
    Value *Args[] = {Ident, ThreadId};

    Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
    EntryCall = Builder.CreateCall(EntryRTLFn, Args);

    Function *ExitRTLFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
    ExitCall = Builder.CreateCall(ExitRTLFn, Args);
  }

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
    Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
    bool HasFinalize, bool IsCancellable) {

  if (HasFinalize)
    FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});

  // Create the inlined region's entry and body blocks, in preparation
  // for conditional creation.
  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Instruction *SplitPos = EntryBB->getTerminator();
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
  BasicBlock *FiniBB =
      EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");

  Builder.SetInsertPoint(EntryBB->getTerminator());
  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);

  // Generate the body.
  BodyGenCB(/* AllocaIP */ InsertPointTy(),
            /* CodeGenIP */ Builder.saveIP());

  // Emit the exit call and do any needed finalization.
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
         "Unexpected Control Flow State!");
  MergeBlockIntoPredecessor(FiniBB);

  // If we are skipping the region of a non conditional, remove the exit
  // block, and clear the builder's insertion point.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  auto merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  auto InsertBB = merged ? ExitPredBB : ExitBB;
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}

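// Resulting block structure, sketched (the body block is only created for
// Conditional regions; see emitCommonDirectiveEntry below):
//
//   entry:                 <entry call>; direct or conditional branch
//   omp_region.body:       code emitted by BodyGenCB
//   omp_region.finalize:   FiniCB finalization, then <exit call>
//   omp_region.end:        continuation
//
// Blocks that end up trivial are folded back via MergeBlockIntoPredecessor.
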
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // If nothing to do, return the current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit ThenBB and set the Builder's insertion point there for
  // body generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);

  // Move the entry branch to the end of ThenBB, and replace it with a
  // conditional branch (if-stmt).
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // Return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
    bool HasFinalize) {

  Builder.restoreIP(FinIP);

  // If there is finalization to do, emit it before the exit call.
  if (HasFinalize) {
    assert(!FinalizationStack.empty() &&
           "Unexpected finalization stack state!");

    FinalizationInfo Fi = FinalizationStack.pop_back_val();
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

    Fi.FiniCB(FinIP);

    BasicBlock *FiniBB = FinIP.getBlock();
    Instruction *FiniBBTI = FiniBB->getTerminator();

    // Set the Builder IP for call creation.
    Builder.SetInsertPoint(FiniBBTI);
  }

  if (!ExitCall)
    return Builder.saveIP();

  // Place the ExitCall as the last instruction before the finalization block
  // terminator.
  ExitCall->removeFromParent();
  Builder.Insert(ExitCall);

  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
                                  ExitCall->getIterator());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
    InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
    llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
  if (!IP.isSet())
    return IP;

  IRBuilder<>::InsertPointGuard IPG(Builder);

  // creates the following CFG structure:
  //
  //   OMP_Entry : (MasterAddr != PrivateAddr)?
  //      F  \  T
  //      |   copyin.not.master
  //      |  /
  //   copyin.not.master.end
  //      |
  //   OMP.Entry.Next

  BasicBlock *OMP_Entry = IP.getBlock();
  Function *CurFn = OMP_Entry->getParent();
  BasicBlock *CopyBegin =
      BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
  BasicBlock *CopyEnd = nullptr;

  // If the entry block is terminated, split to preserve the branch to the
  // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything
  // as is.
  if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
    CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
                                         "copyin.not.master.end");
    OMP_Entry->getTerminator()->eraseFromParent();
  } else {
    CopyEnd =
        BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
  }

  Builder.SetInsertPoint(OMP_Entry);
  Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
  Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
  Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
  Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);

  Builder.SetInsertPoint(CopyBegin);
  if (BranchtoEnd)
    Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));

  return Builder.saveIP();
}

CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
                                          Value *Size, Value *Allocator,
                                          std::string Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {ThreadId, Size, Allocator};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);

  return Builder.CreateCall(Fn, Args, Name);
}

CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
                                         Value *Addr, Value *Allocator,
                                         std::string Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {ThreadId, Addr, Allocator};
  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
  return Builder.CreateCall(Fn, Args, Name);
}

CallInst *OpenMPIRBuilder::createOMPInteropInit(
    const LocationDescription &Loc, Value *InteropVar,
    omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
    Value *DependenceAddress, bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident, ThreadId, InteropVar, InteropTypeVal,
      Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
    const LocationDescription &Loc, Value *InteropVar, Value *Device,
    Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident, ThreadId, InteropVar, Device,
      NumDependences, DependenceAddress, HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
                                               Value *InteropVar,
                                               Value *Device,
                                               Value *NumDependences,
                                               Value *DependenceAddress,
                                               bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident, ThreadId, InteropVar, Device,
      NumDependences, DependenceAddress, HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
    const LocationDescription &Loc, llvm::Value *Pointer,
    llvm::ConstantInt *Size, const llvm::Twine &Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *ThreadPrivateCache =
      getOrCreateInternalVariable(Int8PtrPtr, Name.str());
  llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};

  Function *Fn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);

  return Builder.CreateCall(Fn, Args);
}
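
// Device-side initialization of a target region kernel: materialize the
// per-kernel dynamic and kernel environment globals, call __kmpc_target_init,
// and route threads that must not run user code (e.g. generic-mode worker
// threads) to a separate "worker.exit" block.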
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
                                  int32_t MinThreadsVal, int32_t MaxThreadsVal,
                                  int32_t MinTeamsVal, int32_t MaxTeamsVal) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Constant *IsSPMDVal = ConstantInt::getSigned(
      Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
  Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
  Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
  Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);

  Function *Kernel = Builder.GetInsertBlock()->getParent();

  // Manifest the launch configuration in the metadata matching the kernel
  // environment.
  if (MinTeamsVal > 1 || MaxTeamsVal > 0)
    writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);

  // For max values, < 0 means unset, == 0 means set but unknown.
  if (MaxThreadsVal < 0)
    MaxThreadsVal = std::max(
        int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);

  if (MaxThreadsVal > 0)
    writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);

  Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
  Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
  Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
  Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
  Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
  Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);

  // We need to strip the debug prefix to get the correct kernel name.
  StringRef KernelName = Kernel->getName();
  const std::string DebugPrefix = "_debug__";
  if (KernelName.ends_with(DebugPrefix))
    KernelName = KernelName.drop_back(DebugPrefix.length());

  Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_init);
  const DataLayout &DL = Fn->getParent()->getDataLayout();

  Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
  Constant *DynamicEnvironmentInitializer =
      ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
  GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
      M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
      DynamicEnvironmentInitializer, DynamicEnvironmentName,
      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace());
  DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);

  Constant *DynamicEnvironment =
      DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
          ? DynamicEnvironmentGV
          : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
                                           DynamicEnvironmentPtr);

  Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
      ConfigurationEnvironment, {
                                    UseGenericStateMachineVal,
                                    MayUseNestedParallelismVal,
                                    IsSPMDVal,
                                    MinThreads,
                                    MaxThreads,
                                    MinTeams,
                                    MaxTeams,
                                    ReductionDataSize,
                                    ReductionBufferLength,
                                });
  Constant *KernelEnvironmentInitializer = ConstantStruct::get(
      KernelEnvironment, {
                             ConfigurationEnvironmentInitializer,
                             Ident,
                             DynamicEnvironment,
                         });
  Twine KernelEnvironmentName = KernelName + "_kernel_environment";
  GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
      M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
      KernelEnvironmentInitializer, KernelEnvironmentName,
      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace());
  KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);

  Constant *KernelEnvironment =
      KernelEnvironmentGV->getType() == KernelEnvironmentPtr
          ? KernelEnvironmentGV
          : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
                                           KernelEnvironmentPtr);
  Value *KernelLaunchEnvironment = Kernel->getArg(0);
  CallInst *ThreadKind =
      Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});

  Value *ExecUserCode = Builder.CreateICmpEQ(
      ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
      "exec_user_code");

  // ThreadKind = __kmpc_target_init(...)
  // if (ThreadKind == -1)
  //   // thread is main thread
  //   user_code
  // else
  //   return;

  auto *UI = Builder.CreateUnreachable();
  BasicBlock *CheckBB = UI->getParent();
  BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");

  BasicBlock *WorkerExitBB = BasicBlock::Create(
      CheckBB->getContext(), "worker.exit", CheckBB->getParent());
  Builder.SetInsertPoint(WorkerExitBB);
  Builder.CreateRetVoid();

  auto *CheckBBTI = CheckBB->getTerminator();
  Builder.SetInsertPoint(CheckBBTI);
  Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);

  CheckBBTI->eraseFromParent();
  UI->eraseFromParent();

  // Continue in the "user_code" block, see diagram above and in
  // openmp/libomptarget/deviceRTLs/common/include/target.h .
  return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
}
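
// Device-side finalization: emit __kmpc_target_deinit and, when a teams
// reduction is present, fold the reduction data size and buffer length into
// the kernel environment initializer created by createTargetInit above.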
void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
                                         int32_t TeamsReductionDataSize,
                                         int32_t TeamsReductionBufferLength) {
  if (!updateToLocation(Loc))
    return;

  Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);

  Builder.CreateCall(Fn, {});

  if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
    return;

  Function *Kernel = Builder.GetInsertBlock()->getParent();
  // We need to strip the debug prefix to get the correct kernel name.
  StringRef KernelName = Kernel->getName();
  const std::string DebugPrefix = "_debug__";
  if (KernelName.ends_with(DebugPrefix))
    KernelName = KernelName.drop_back(DebugPrefix.length());
  auto *KernelEnvironmentGV =
      M.getNamedGlobal((KernelName + "_kernel_environment").str());
  assert(KernelEnvironmentGV && "Expected kernel environment global");
  auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
  auto *NewInitializer = ConstantFoldInsertValueInstruction(
      KernelEnvironmentInitializer,
      ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
  NewInitializer = ConstantFoldInsertValueInstruction(
      NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
      {0, 8});
  KernelEnvironmentGV->setInitializer(NewInitializer);
}
static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
  Module &M = *Kernel.getParent();
  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
  for (auto *Op : MD->operands()) {
    if (Op->getNumOperands() != 3)
      continue;
    auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
    if (!KernelOp || KernelOp->getValue() != &Kernel)
      continue;
    auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
    if (!Prop || Prop->getString() != Name)
      continue;
    return Op;
  }
  return nullptr;
}
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
                                bool Min) {
  // Update the "maxntidx" metadata for NVIDIA, or add it.
  MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
  if (ExistingOp) {
    auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
    int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
    ExistingOp->replaceOperandWith(
        2, ConstantAsMetadata::get(ConstantInt::get(
               OldVal->getValue()->getType(),
               Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
  } else {
    LLVMContext &Ctx = Kernel.getContext();
    Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
                          MDString::get(Ctx, Name),
                          ConstantAsMetadata::get(
                              ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
    // Append metadata to nvvm.annotations.
    Module &M = *Kernel.getParent();
    NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
    MD->addOperand(MDNode::get(Ctx, MDVals));
  }
}
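
// Kernel thread bounds live in different places per target: AMDGPU encodes
// them in the "amdgpu-flat-work-group-size" string attribute, NVPTX in the
// "maxntidx" entry of !nvvm.annotations; the generic "omp_target_thread_limit"
// function attribute is consulted in both cases.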
std::pair<int32_t, int32_t>
OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
  int32_t ThreadLimit =
      Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");

  if (T.isAMDGPU()) {
    const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
    if (!Attr.isValid() || !Attr.isStringAttribute())
      return {0, ThreadLimit};
    auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
    int32_t LB, UB;
    if (!llvm::to_integer(UBStr, UB, 10))
      return {0, ThreadLimit};
    UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
    if (!llvm::to_integer(LBStr, LB, 10))
      return {0, UB};
    return {LB, UB};
  }

  if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
    auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
    int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
    return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
  }
  return {0, ThreadLimit};
}
void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
                                                 Function &Kernel, int32_t LB,
                                                 int32_t UB) {
  Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));

  if (T.isAMDGPU()) {
    Kernel.addFnAttr("amdgpu-flat-work-group-size",
                     llvm::utostr(LB) + "," + llvm::utostr(UB));
    return;
  }

  updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
}
std::pair<int32_t, int32_t>
OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
  // TODO: Read from backend annotations if available.
  return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
}

void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
                                          int32_t LB, int32_t UB) {
  if (T.isNVPTX()) {
    if (UB > 0)
      updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
    updateNVPTXMetadata(Kernel, "minctasm", LB, false);
  }
  Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
}
void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
    Function *OutlinedFn) {
  if (Config.isTargetDevice()) {
    OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
    // TODO: Determine if DSO local can be set to true.
    OutlinedFn->setDSOLocal(false);
    OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
    if (T.isAMDGCN())
      OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
  }
}
Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
                                                    StringRef EntryFnIDName) {
  if (Config.isTargetDevice()) {
    assert(OutlinedFn && "The outlined function must exist if embedded");
    return OutlinedFn;
  }

  return new GlobalVariable(
      M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
}

Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
                                                       StringRef EntryFnName) {
  if (OutlinedFn)
    return OutlinedFn;

  assert(!M.getGlobalVariable(EntryFnName, true) &&
         "Named kernel already exists?");
  return new GlobalVariable(
      M, Builder.getInt8Ty(), /*isConstant=*/true,
      GlobalValue::InternalLinkage,
      Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
}
void OpenMPIRBuilder::emitTargetRegionFunction(
    TargetRegionEntryInfo &EntryInfo,
    FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
    Function *&OutlinedFn, Constant *&OutlinedFnID) {

  SmallString<64> EntryFnName;
  OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);

  OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
                   ? GenerateFunctionCallback(EntryFnName)
                   : nullptr;

  // If this target outline function is not an offload entry, we don't need to
  // register it. This may be in the case of a false if clause, or if there are
  // no OpenMP targets.
  if (!IsOffloadEntry)
    return;

  std::string EntryFnIDName =
      Config.isTargetDevice()
          ? std::string(EntryFnName)
          : createPlatformSpecificName({EntryFnName, "region_id"});

  OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
                                              EntryFnName, EntryFnIDName);
}
Constant *OpenMPIRBuilder::registerTargetRegionFunction(
    TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
    StringRef EntryFnName, StringRef EntryFnIDName) {
  if (OutlinedFn)
    setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
  auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
  auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
  OffloadInfoManager.registerTargetRegionEntryInfo(
      EntryInfo, EntryAddr, OutlinedFnID,
      OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
  return OutlinedFnID;
}
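
// Emits a target data region. The begin/end runtime calls are produced by the
// BeginThenGen/EndThenGen callbacks defined below, the user body (if any) is
// emitted in between, and an 'if' clause wraps the calls in a conditional via
// emitIfClause.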
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
    TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
    omp::RuntimeFunction *MapperFunc,
    function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
        BodyGenCB,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB,
    function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  // Disable TargetData CodeGen on Device pass.
  if (Config.IsTargetDevice.value_or(false))
    return Builder.saveIP();

  Builder.restoreIP(CodeGenIP);
  bool IsStandAlone = !BodyGenCB;
  MapInfosTy *MapInfo;
  // Generate the code for the opening of the data environment. Capture all the
  // arguments of the runtime call by reference because they are used in the
  // closing of the region.
  auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
    MapInfo = &GenMapInfoCB(Builder.saveIP());
    emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
                         /*IsNonContiguous=*/true, DeviceAddrCB,
                         CustomMapperCB);

    TargetDataRTArgs RTArgs;
    emitOffloadingArraysArgument(Builder, RTArgs, Info,
                                 !MapInfo->Names.empty());

    // Emit the number of elements in the offloading arrays.
    Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);

    // Source location for the ident struct.
    if (!SrcLocInfo) {
      uint32_t SrcLocStrSize;
      Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
      SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    }

    Value *OffloadingArgs[] = {SrcLocInfo,           DeviceID,
                               PointerNum,           RTArgs.BasePointersArray,
                               RTArgs.PointersArray, RTArgs.SizesArray,
                               RTArgs.MapTypesArray, RTArgs.MapNamesArray,
                               RTArgs.MappersArray};

    if (IsStandAlone) {
      assert(MapperFunc && "MapperFunc missing for standalone target data");
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
                         OffloadingArgs);
    } else {
      Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
          omp::OMPRTL___tgt_target_data_begin_mapper);

      Builder.CreateCall(BeginMapperFunc, OffloadingArgs);

      for (auto DeviceMap : Info.DevicePtrInfoMap) {
        if (isa<AllocaInst>(DeviceMap.second.second)) {
          auto *LI =
              Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
          Builder.CreateStore(LI, DeviceMap.second.second);
        }
      }

      // If device pointer privatization is required, emit the body of the
      // region here. It will have to be duplicated: with and without
      // privatization.
      Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
    }
  };

  // If we need device pointer privatization, we need to emit the body of the
  // region with no privatization in the 'else' branch of the conditional.
  // Otherwise, we don't have to do anything.
  auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
    Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv));
  };

  // Generate code for the closing of the data region.
  auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
    TargetDataRTArgs RTArgs;
    emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
                                 /*ForEndCall=*/true);

    // Emit the number of elements in the offloading arrays.
    Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);

    // Source location for the ident struct.
    if (!SrcLocInfo) {
      uint32_t SrcLocStrSize;
      Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
      SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    }

    Value *OffloadingArgs[] = {SrcLocInfo,           DeviceID,
                               PointerNum,           RTArgs.BasePointersArray,
                               RTArgs.PointersArray, RTArgs.SizesArray,
                               RTArgs.MapTypesArray, RTArgs.MapNamesArray,
                               RTArgs.MappersArray};
    Function *EndMapperFunc =
        getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);

    Builder.CreateCall(EndMapperFunc, OffloadingArgs);
  };

  // We don't have to do anything to close the region if the if clause
  // evaluates to false.
  auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};

  if (BodyGenCB) {
    if (IfCond) {
      emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
    } else {
      BeginThenGen(AllocaIP, Builder.saveIP());
    }

    // If we don't require privatization of device pointers, we emit the body
    // in between the runtime calls. This avoids duplicating the body code.
    Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));

    if (IfCond) {
      emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
    } else {
      EndThenGen(AllocaIP, Builder.saveIP());
    }
  } else {
    if (IfCond) {
      emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
    } else {
      BeginThenGen(AllocaIP, Builder.saveIP());
    }
  }

  return Builder.saveIP();
}
FunctionCallee
OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
                                             bool IsGPUDistribute) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name;
  if (IsGPUDistribute)
    Name = IVSize == 32
               ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
                           : omp::OMPRTL___kmpc_distribute_static_init_4u)
               : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
                           : omp::OMPRTL___kmpc_distribute_static_init_8u);
  else
    Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
                                    : omp::OMPRTL___kmpc_for_static_init_4u)
                        : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
                                    : omp::OMPRTL___kmpc_for_static_init_8u);

  return getOrCreateRuntimeFunction(M, Name);
}
FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
                                                           bool IVSigned) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name = IVSize == 32
                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
                                         : omp::OMPRTL___kmpc_dispatch_init_4u)
                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
                                         : omp::OMPRTL___kmpc_dispatch_init_8u);

  return getOrCreateRuntimeFunction(M, Name);
}

FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
                                                           bool IVSigned) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name = IVSize == 32
                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
                                         : omp::OMPRTL___kmpc_dispatch_next_4u)
                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
                                         : omp::OMPRTL___kmpc_dispatch_next_8u);

  return getOrCreateRuntimeFunction(M, Name);
}

FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
                                                           bool IVSigned) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name = IVSize == 32
                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
                                         : omp::OMPRTL___kmpc_dispatch_fini_4u)
                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
                                         : omp::OMPRTL___kmpc_dispatch_fini_8u);

  return getOrCreateRuntimeFunction(M, Name);
}
static void replaceConstantExprUsesInFuncWithInstr(ConstantExpr *ConstExpr,
                                                   Function *Func) {
  for (User *User : make_early_inc_range(ConstExpr->users()))
    if (auto *Instr = dyn_cast<Instruction>(User))
      if (Instr->getFunction() == Func)
        Instr->replaceUsesOfWith(ConstExpr, ConstExpr->getAsInstruction(Instr));
}

static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input,
                                                    Function *Func) {
  for (User *User : make_early_inc_range(Input->users()))
    if (auto *Const = dyn_cast<Constant>(User))
      if (auto *ConstExpr = dyn_cast<ConstantExpr>(Const))
        replaceConstantExprUsesInFuncWithInstr(ConstExpr, Func);
}
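
// Creates the outlined function for a target region. On the device an extra
// leading pointer argument carries launch-specific information, and all other
// inputs are passed as pointers or i64 values.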
static Function *createOutlinedFunction(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
    SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
  SmallVector<Type *> ParameterTypes;
  if (OMPBuilder.Config.isTargetDevice()) {
    // Add the "implicit" runtime argument we use to provide launch specific
    // information for target devices.
    auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
    ParameterTypes.push_back(Int8PtrTy);

    // All parameters to target devices are passed as pointers
    // or i64. This assumes 64-bit address spaces/pointers.
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType()->isPointerTy()
                                   ? Arg->getType()
                                   : Type::getInt64Ty(Builder.getContext()));
  } else {
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType());
  }

  auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
                                    /*isVarArg*/ false);
  auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
                               Builder.GetInsertBlock()->getModule());

  // Save insert point.
  auto OldInsertPoint = Builder.saveIP();

  // Generate the region into the function.
  BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
  Builder.SetInsertPoint(EntryBB);

  // Insert target init call in the device compilation pass.
  if (OMPBuilder.Config.isTargetDevice())
    Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));

  BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();

  // Insert target deinit call in the device compilation pass.
  Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
  if (OMPBuilder.Config.isTargetDevice())
    OMPBuilder.createTargetDeinit(Builder);

  // Insert return instruction.
  Builder.CreateRetVoid();

  // New alloca IP at the entry point of the created device function.
  Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
  auto AllocaIP = Builder.saveIP();

  Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());

  // Skip the artificial dyn_ptr on the device.
  const auto &ArgRange =
      OMPBuilder.Config.isTargetDevice()
          ? make_range(Func->arg_begin() + 1, Func->arg_end())
          : Func->args();

  // Rewrite uses of input values to parameters.
  for (auto InArg : zip(Inputs, ArgRange)) {
    Value *Input = std::get<0>(InArg);
    Argument &Arg = std::get<1>(InArg);
    Value *InputCopy = nullptr;

    Builder.restoreIP(
        ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));

    // Things like GEPs can come in the form of Constants. Constants and
    // ConstantExprs do not have access to the knowledge of what they're
    // contained in, so we must dig a little to find an instruction so we can
    // tell if they're used inside of the function we're outlining. We also
    // replace the original constant expression with a new instruction
    // equivalent; an instruction, as it allows easy modification in the
    // following loop: we now know the constant (instruction) is owned by our
    // target function and replaceUsesOfWith can be invoked on it (this does
    // not seem possible with constants). A brand new instruction also allows
    // us to be cautious, as it is perhaps possible the old expression was
    // used inside of the function but also exists and is used externally
    // (unlikely by the nature of a Constant, but still possible).
    replaceConstantValueUsesInFuncWithInstr(Input, Func);

    // Collect all the instructions.
    for (User *User : make_early_inc_range(Input->users()))
      if (auto *Instr = dyn_cast<Instruction>(User))
        if (Instr->getFunction() == Func)
          Instr->replaceUsesOfWith(Input, InputCopy);
  }

  // Restore insert point.
  Builder.restoreIP(OldInsertPoint);

  return Func;
}
static void emitTargetOutlinedFunction(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
    TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
    Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {

  OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
      [&OMPBuilder, &Builder, &Inputs, &CBFunc,
       &ArgAccessorFuncCB](StringRef EntryFnName) {
        return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
                                      CBFunc, ArgAccessorFuncCB);
      };

  OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
                                      OutlinedFn, OutlinedFnID);
}
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
                           OpenMPIRBuilder::InsertPointTy AllocaIP,
                           Function *OutlinedFn, Constant *OutlinedFnID,
                           int32_t NumTeams, int32_t NumThreads,
                           SmallVectorImpl<Value *> &Args,
                           OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {

  OpenMPIRBuilder::TargetDataInfo Info(
      /*RequiresDevicePointerInfo=*/false,
      /*SeparateBeginEndCalls=*/true);

  OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
  OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
                                  /*IsNonContiguous=*/true);

  OpenMPIRBuilder::TargetDataRTArgs RTArgs;
  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
                                          !MapInfo.Names.empty());

  auto &&EmitTargetCallFallbackCB =
      [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
    Builder.restoreIP(IP);
    Builder.CreateCall(OutlinedFn, Args);
    return Builder.saveIP();
  };

  unsigned NumTargetItems = MapInfo.BasePointers.size();
  // TODO: Use correct device ID
  Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
  Value *NumTeamsVal = Builder.getInt32(NumTeams);
  Value *NumThreadsVal = Builder.getInt32(NumThreads);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                             llvm::omp::IdentFlag(0), 0);
  // TODO: Use correct NumIterations
  Value *NumIterations = Builder.getInt64(0);
  // TODO: Use correct DynCGGroupMem
  Value *DynCGGroupMem = Builder.getInt32(0);

  bool HasNoWait = false;

  OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
                                          NumTeamsVal, NumThreadsVal,
                                          DynCGGroupMem, HasNoWait);

  Builder.restoreIP(OMPBuilder.emitKernelLaunch(
      Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
      DeviceID, RTLoc, AllocaIP));
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
    int32_t NumThreads, SmallVectorImpl<Value *> &Args,
    GenMapInfoCallbackTy GenMapInfoCB,
    OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  Builder.restoreIP(CodeGenIP);

  Function *OutlinedFn;
  Constant *OutlinedFnID;
  emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
                             OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
  if (!Config.isTargetDevice())
    emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
                   NumThreads, Args, GenMapInfoCB);

  return Builder.saveIP();
}
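
// Name-mangling helpers: parts are joined with a configurable first separator
// and subsequent separator (both taken from the OpenMP IR builder
// configuration) to form platform-specific, runtime-visible names.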
std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
                                                   StringRef FirstSeparator,
                                                   StringRef Separator) {
  SmallString<128> Buffer;
  llvm::raw_svector_ostream OS(Buffer);
  StringRef Sep = FirstSeparator;
  for (StringRef Part : Parts) {
    OS << Sep << Part;
    Sep = Separator;
  }
  return OS.str().str();
}

std::string
OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
  return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
                                                Config.separator());
}
GlobalVariable *
OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                                             unsigned AddressSpace) {
  auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
  if (Elem.second) {
    assert(Elem.second->getValueType() == Ty &&
           "OMP internal variable has different type than requested");
  } else {
    // TODO: investigate the appropriate linkage type used for the global
    // variable for possibly changing that to internal or private, or maybe
    // create different versions of the function for different OMP internal
    // variables.
    auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
                       ? GlobalValue::ExternalLinkage
                       : GlobalValue::CommonLinkage;
    auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
                                  Constant::getNullValue(Ty), Elem.first(),
                                  /*InsertBefore=*/nullptr,
                                  GlobalValue::NotThreadLocal, AddressSpace);
    const DataLayout &DL = M.getDataLayout();
    const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
    GV->setAlignment(std::max(TypeAlign, PtrAlign));
    Elem.second = GV;
  }

  return Elem.second;
}

Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
  std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
  std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
  return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
}
Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
  LLVMContext &Ctx = Builder.getContext();
  Value *Null =
      Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
  Value *SizeGep =
      Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
  Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
  return SizePtrToInt;
}
GlobalVariable *
OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
                                       std::string VarName) {
  llvm::Constant *MaptypesArrayInit =
      llvm::ConstantDataArray::get(M.getContext(), Mappings);
  auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
      M, MaptypesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
      VarName);
  MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  return MaptypesArrayGlobal;
}
void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
                                          InsertPointTy AllocaIP,
                                          unsigned NumOperands,
                                          struct MapperAllocas &MapperAllocas) {
  if (!updateToLocation(Loc))
    return;

  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(
      ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
  AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
                                          ".offload_ptrs");
  AllocaInst *ArgSizes = Builder.CreateAlloca(
      ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
  Builder.restoreIP(Loc.IP);
  MapperAllocas.ArgsBase = ArgsBase;
  MapperAllocas.Args = Args;
  MapperAllocas.ArgSizes = ArgSizes;
}
void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
                                     Function *MapperFunc, Value *SrcLocInfo,
                                     Value *MaptypesArg, Value *MapnamesArg,
                                     struct MapperAllocas &MapperAllocas,
                                     int64_t DeviceID, unsigned NumOperands) {
  if (!updateToLocation(Loc))
    return;

  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Value *ArgsBaseGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgsGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgSizesGEP =
      Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *NullPtr =
      Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
  Builder.CreateCall(MapperFunc,
                     {SrcLocInfo, Builder.getInt64(DeviceID),
                      Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
                      ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
}
void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                                   TargetDataRTArgs &RTArgs,
                                                   TargetDataInfo &Info,
                                                   bool EmitDebug,
                                                   bool ForEndCall) {
  assert((!ForEndCall || Info.separateBeginEndCalls()) &&
         "expected region end call to runtime only when end call is separate");
  auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
  auto VoidPtrTy = UnqualPtrTy;
  auto VoidPtrPtrTy = UnqualPtrTy;
  auto Int64Ty = Type::getInt64Ty(M.getContext());
  auto Int64PtrTy = UnqualPtrTy;

  if (!Info.NumberOfPtrs) {
    RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
    RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
    RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    return;
  }

  RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
      Info.RTArgs.BasePointersArray,
      /*Idx0=*/0, /*Idx1=*/0);
  RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
      /*Idx0=*/0,
      /*Idx1=*/0);
  RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
      /*Idx0=*/0, /*Idx1=*/0);
  RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(Int64Ty, Info.NumberOfPtrs),
      ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
                                                 : Info.RTArgs.MapTypesArray,
      /*Idx0=*/0,
      /*Idx1=*/0);

  // Only emit the mapper information arrays if debug information is
  // requested.
  if (!EmitDebug)
    RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
  else
    RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
        /*Idx0=*/0,
        /*Idx1=*/0);
  // If there is no user-defined mapper, set the mapper array to nullptr to
  // avoid an unnecessary data privatization.
  if (!Info.HasMapper)
    RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
  else
    RTArgs.MappersArray =
        Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
}
void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
                                                  InsertPointTy CodeGenIP,
                                                  MapInfosTy &CombinedInfo,
                                                  TargetDataInfo &Info) {
  MapInfosTy::StructNonContiguousInfo &NonContigInfo =
      CombinedInfo.NonContigInfo;

  // Build an array of struct descriptor_dim and then assign it to
  // offload_args.
  //
  // struct descriptor_dim {
  //  uint64_t offset;
  //  uint64_t count;
  //  uint64_t stride
  // };
  Type *Int64Ty = Builder.getInt64Ty();
  StructType *DimTy = StructType::create(
      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
      "struct.descriptor_dim");

  enum { OffsetFD = 0, CountFD, StrideFD };
  // We need two index variables here since the size of "Dims" is the same as
  // the size of Components, however, the size of offset, count, and stride is
  // equal to the size of the base declaration that is non-contiguous.
  for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
    // Skip emitting IR if the dimension size is 1 since it cannot be
    // non-contiguous.
    if (NonContigInfo.Dims[I] == 1)
      continue;
    Builder.restoreIP(AllocaIP);
    ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
    AllocaInst *DimsAddr =
        Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
    Builder.restoreIP(CodeGenIP);
    for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
      unsigned RevIdx = EE - II - 1;
      Value *DimsLVal = Builder.CreateInBoundsGEP(
          DimsAddr->getAllocatedType(), DimsAddr,
          {Builder.getInt64(0), Builder.getInt64(II)});
      // Offset
      Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
          M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
      // Count
      Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Counts[L][RevIdx], CountLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
      // Stride
      Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Strides[L][RevIdx], StrideLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
    }
    // args[I] = &dims
    Builder.restoreIP(CodeGenIP);
    Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        DimsAddr, Builder.getPtrTy());
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
        Info.RTArgs.PointersArray, 0, I);
    Builder.CreateAlignedStore(
        DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
    ++L;
  }
}
void OpenMPIRBuilder::emitOffloadingArrays(
    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
    TargetDataInfo &Info, bool IsNonContiguous,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB,
    function_ref<Value *(unsigned int)> CustomMapperCB) {

  // Reset the array information.
  Info.clearArrayInfo();
  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();

  if (Info.NumberOfPtrs == 0)
    return;

  Builder.restoreIP(AllocaIP);
  // Detect if we have any capture size requiring runtime evaluation of the
  // size so that a constant array could be eventually used.
  ArrayType *PointerArrayType =
      ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);

  Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");

  Info.RTArgs.PointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
  AllocaInst *MappersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
  Info.RTArgs.MappersArray = MappersArray;

  // If we don't have any VLA types or other types that require runtime
  // evaluation, we can use a constant array for the map sizes, otherwise we
  // need to fill up the arrays as we do for the pointers.
  Type *Int64Ty = Builder.getInt64Ty();
  SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
                                     ConstantInt::get(Int64Ty, 0));
  SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
  for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
    if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
      if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
        if (IsNonContiguous &&
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                CombinedInfo.Types[I] &
                OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
          ConstSizes[I] =
              ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
        else
          ConstSizes[I] = CI;
        continue;
      }
    }
    RuntimeSizes.set(I);
  }

  if (RuntimeSizes.all()) {
    ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
    Info.RTArgs.SizesArray = Builder.CreateAlloca(
        SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
    Builder.restoreIP(CodeGenIP);
  } else {
    auto *SizesArrayInit = ConstantArray::get(
        ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
    std::string Name = createPlatformSpecificName({"offload_sizes"});
    auto *SizesArrayGbl =
        new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
                           GlobalValue::PrivateLinkage, SizesArrayInit, Name);
    SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);

    if (!RuntimeSizes.any()) {
      Info.RTArgs.SizesArray = SizesArrayGbl;
    } else {
      unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
      Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
      ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
      AllocaInst *Buffer = Builder.CreateAlloca(
          SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
      Buffer->setAlignment(OffloadSizeAlign);
      Builder.restoreIP(CodeGenIP);
      Builder.CreateMemCpy(
          Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
          SizesArrayGbl, OffloadSizeAlign,
          Builder.getIntN(
              IndexSize,
              Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));

      Info.RTArgs.SizesArray = Buffer;
    }
    Builder.restoreIP(CodeGenIP);
  }

  // The map types are always constant so we don't need to generate code to
  // fill arrays. Instead, we create an array constant.
  SmallVector<uint64_t, 4> Mapping;
  for (auto mapFlag : CombinedInfo.Types)
    Mapping.push_back(
        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            mapFlag));
  std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
  auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
  Info.RTArgs.MapTypesArray = MapTypesArrayGbl;

  // The information types are only built if provided.
  if (!CombinedInfo.Names.empty()) {
    std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
    auto *MapNamesArrayGbl =
        createOffloadMapnames(CombinedInfo.Names, MapnamesName);
    Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
  } else {
    Info.RTArgs.MapNamesArray =
        Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
  }

  // If there's a present map type modifier, it must not be applied to the end
  // of a region, so generate a separate map type array in that case.
  if (Info.separateBeginEndCalls()) {
    bool EndMapTypesDiffer = false;
    for (uint64_t &Type : Mapping) {
      if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                     OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
        Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
        EndMapTypesDiffer = true;
      }
    }
    if (EndMapTypesDiffer) {
      MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
      Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
    }
  }

  PointerType *PtrTy = Builder.getPtrTy();
  for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
    Value *BPVal = CombinedInfo.BasePointers[I];
    Value *BP = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
        0, I);
    Builder.CreateAlignedStore(BPVal, BP,
                               M.getDataLayout().getPrefTypeAlign(PtrTy));

    if (Info.requiresDevicePointerInfo()) {
      if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
        CodeGenIP = Builder.saveIP();
        Builder.restoreIP(AllocaIP);
        Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
        Builder.restoreIP(CodeGenIP);
        if (DeviceAddrCB)
          DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
      } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
        Info.DevicePtrInfoMap[BPVal] = {BP, BP};
        if (DeviceAddrCB)
          DeviceAddrCB(I, BP);
      }
    }

    Value *PVal = CombinedInfo.Pointers[I];
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
        I);
    // TODO: Check alignment correct.
    Builder.CreateAlignedStore(PVal, P,
                               M.getDataLayout().getPrefTypeAlign(PtrTy));

    if (RuntimeSizes.test(I)) {
      Value *S = Builder.CreateConstInBoundsGEP2_32(
          ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
          /*Idx0=*/0,
          /*Idx1=*/I);
      Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
                                                       Int64Ty,
                                                       /*isSigned=*/true),
                                 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
    }
    // Fill up the mapper array.
    unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
    Value *MFunc = ConstantPointerNull::get(PtrTy);
    if (CustomMapperCB)
      if (Value *CustomMFunc = CustomMapperCB(I))
        MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
    Value *MAddr = Builder.CreateInBoundsGEP(
        MappersArray->getAllocatedType(), MappersArray,
        {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
    Builder.CreateAlignedStore(
        MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
  }

  if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
      Info.NumberOfPtrs == 0)
    return;
  emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
}
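
// Block-emission helpers in the style of clang's CodeGenFunction: emitBranch
// creates fall-through branches and emitBlock appends blocks to the current
// function.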
void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
  BasicBlock *CurBB = Builder.GetInsertBlock();

  if (!CurBB || CurBB->getTerminator()) {
    // If there is no insert point or the previous block is already
    // terminated, don't touch it.
  } else {
    // Otherwise, create a fall-through branch.
    Builder.CreateBr(Target);
  }

  Builder.ClearInsertionPoint();
}
void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
                                bool IsFinished) {
  BasicBlock *CurBB = Builder.GetInsertBlock();

  // Fall out of the current block (if necessary).
  emitBranch(BB);

  if (IsFinished && BB->use_empty()) {
    BB->eraseFromParent();
    return;
  }

  // Place the block after the current block, if possible, or else at
  // the end of the function.
  if (CurBB && CurBB->getParent())
    CurFn->insert(std::next(CurBB->getIterator()), BB);
  else
    CurFn->insert(CurFn->end(), BB);
  Builder.SetInsertPoint(BB);
}
void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
                                   BodyGenCallbackTy ElseGen,
                                   InsertPointTy AllocaIP) {
  // If the condition constant folds and can be elided, try to avoid emitting
  // the condition and the dead arm of the if/else.
  if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
    auto CondConstant = CI->getSExtValue();
    if (CondConstant)
      ThenGen(AllocaIP, Builder.saveIP());
    else
      ElseGen(AllocaIP, Builder.saveIP());
    return;
  }

  Function *CurFn = Builder.GetInsertBlock()->getParent();

  // Otherwise, the condition did not fold, or we couldn't elide it. Just
  // emit the conditional branch.
  BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
  BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
  BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
  Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
  // Emit the 'then' code.
  emitBlock(ThenBlock, CurFn);
  ThenGen(AllocaIP, Builder.saveIP());
  emitBranch(ContBlock);
  // Emit the 'else' code if present.
  // There is no need to emit line number for unconditional branch.
  emitBlock(ElseBlock, CurFn);
  ElseGen(AllocaIP, Builder.saveIP());
  // There is no need to emit line number for unconditional branch.
  emitBranch(ContBlock);
  // Emit the continuation block for code after the if.
  emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
}
bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
  assert(!(AO == AtomicOrdering::NotAtomic ||
           AO == llvm::AtomicOrdering::Unordered) &&
         "Unexpected Atomic Ordering.");

  bool Flush = false;
  llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;

  switch (AK) {
  case Read:
    if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
    }
    break;
  case Write:
  case Compare:
  case Update:
    if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Release;
      Flush = true;
    }
    break;
  case Capture:
    switch (AO) {
    case AtomicOrdering::Acquire:
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
      break;
    case AtomicOrdering::Release:
      FlushAO = AtomicOrdering::Release;
      Flush = true;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      FlushAO = AtomicOrdering::AcquireRelease;
      Flush = true;
      break;
    default:
      // do nothing - leave silently.
      break;
    }
  }

  if (Flush) {
    // Currently the flush runtime call does not take a memory ordering, so
    // this only resolves which atomic ordering would apply and still issues
    // the plain flush call.
    // TODO: pass `FlushAO` after memory ordering support is added.
    (void)FlushAO;
    emitFlush(Loc);
  }

  // for AO == AtomicOrdering::Monotonic and all other case combinations,
  // do nothing.
  return Flush;
}
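
// Atomic read/write lowering: integer values can use atomic load/store
// instructions directly; float and pointer values are accessed through an
// integer of the same bit width and cast back to the element type.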
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
                                  AtomicOpValue &X, AtomicOpValue &V,
                                  AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic read expected a scalar type");

  Value *XRead = nullptr;

  if (XElemTy->isIntegerTy()) {
    LoadInst *XLD =
        Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
    XLD->setAtomic(AO);
    XRead = cast<Value>(XLD);
  } else {
    // We need to perform the atomic op as an integer.
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    LoadInst *XLoad =
        Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
    XLoad->setAtomic(AO);
    if (XElemTy->isFloatingPointTy()) {
      XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
    } else {
      XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
    }
  }
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
  Builder.CreateStore(XRead, V.Var, V.IsVolatile);
  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
                                   AtomicOpValue &X, Value *Expr,
                                   AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic write expected a scalar type");

  if (XElemTy->isIntegerTy()) {
    StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
    XSt->setAtomic(AO);
  } else {
    // We need to bitcast and perform the atomic op as integers.
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    Value *ExprCast =
        Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
    StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
    XSt->setAtomic(AO);
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
  assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
  if (!updateToLocation(Loc))
    return Loc.IP;

  Type *XTy = X.Var->getType();
  assert(XTy->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic update expected a scalar type");
  assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
         (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
         "OpenMP atomic does not support LT or GT operations");

  emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
                   X.IsVolatile, IsXBinopExpr);
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
  return Builder.saveIP();
}
// FIXME: Duplicating AtomicExpand
Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                               AtomicRMWInst::BinOp RMWOp) {
  switch (RMWOp) {
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Src1, Src2);
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Src1, Src2);
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Src1, Src2);
  case AtomicRMWInst::Nand:
    return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Src1, Src2);
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Src1, Src2);
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::BAD_BINOP:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    llvm_unreachable("Unsupported atomic update operation");
  }
  llvm_unreachable("Unsupported atomic update operation");
}
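
// Atomic updates are emitted as a single atomicrmw when the operation and
// element type allow it; otherwise an atomic load followed by a
// compare-exchange retry loop is generated.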
std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
    InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
  // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
  // or a complex datatype.
  bool emitRMWOp = false;
  switch (RMWOp) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Xchg:
    emitRMWOp = XElemTy;
    break;
  case AtomicRMWInst::Sub:
    emitRMWOp = (IsXBinopExpr && XElemTy);
    break;
  default:
    emitRMWOp = false;
  }
  emitRMWOp &= XElemTy->isIntegerTy();

  std::pair<Value *, Value *> Res;
  if (emitRMWOp) {
    Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
    // Not needed except in case of postfix captures. Generate it anyway for
    // consistency with the else part. It will be removed by any DCE pass.
    // AtomicRMWInst::Xchg does not have a corresponding instruction.
    if (RMWOp == AtomicRMWInst::Xchg)
      Res.second = Res.first;
    else
      Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
  } else {
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    LoadInst *OldVal =
        Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
    OldVal->setAtomic(AO);
    BasicBlock *CurBB = Builder.GetInsertBlock();
    Instruction *CurBBTI = CurBB->getTerminator();
    CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
    BasicBlock *ExitBB =
        CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
    BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
                                                X->getName() + ".atomic.cont");
    ContBB->getTerminator()->eraseFromParent();
    Builder.restoreIP(AllocaIP);
    AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
    NewAtomicAddr->setName(X->getName() + "x.new.val");
    Builder.SetInsertPoint(ContBB);
    llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
    PHI->addIncoming(OldVal, CurBB);
    bool IsIntTy = XElemTy->isIntegerTy();
    Value *OldExprVal = PHI;
    if (!IsIntTy) {
      if (XElemTy->isFloatingPointTy()) {
        OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
                                           X->getName() + ".atomic.fltCast");
      } else {
        OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
                                            X->getName() + ".atomic.ptrCast");
      }
    }

    Value *Upd = UpdateOp(OldExprVal, Builder);
    Builder.CreateStore(Upd, NewAtomicAddr);
    LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
    AtomicOrdering Failure =
        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
    AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
        X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
    Result->setVolatile(VolatileX);
    Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
    Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
    PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
    Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);

    Res.first = OldExprVal;
    Res.second = Upd;

    // Set the insertion point in the exit block.
    if (UnreachableInst *ExitTI =
            dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
      CurBBTI->eraseFromParent();
      Builder.SetInsertPoint(ExitBB);
    } else {
      Builder.SetInsertPoint(ExitTI);
    }
  }

  return Res;
}
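// Illustrative IR shape of the cmpxchg slow path above (editor's addition,
// simplified):
//
//   %old = load atomic iN, ptr %x ...            ; <x>.atomic.load
// <x>.atomic.cont:
//   %phi  = phi iN [ %old, %entry ], [ %prev, %<x>.atomic.cont ]
//   ...                                          ; UpdateOp computes %upd
//   %pair = cmpxchg ptr %x, iN %phi, iN %upd ...
//   %prev = extractvalue { iN, i1 } %pair, 0
//   %ok   = extractvalue { iN, i1 } %pair, 1
//   br i1 %ok, label %<x>.atomic.exit, label %<x>.atomic.cont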
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
    AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Type *XTy = X.Var->getType();
  assert(XTy->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic capture expected a scalar type");
  assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
         "OpenMP atomic does not support LT or GT operations");

  // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
  // 'x' is simply atomically rewritten with 'expr'.
  AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
  std::pair<Value *, Value *> Result =
      emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
                       X.IsVolatile, IsXBinopExpr);

  Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
  Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
  return Builder.saveIP();
}
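// Example (editor's addition): for the postfix capture form of
// `#pragma omp atomic capture`, i.e. `{ v = x; x = x + expr; }`,
// IsPostfixUpdate is true and Result.first (the value of 'x' before the
// update) is stored to 'v'; the prefix form `{ x = x + expr; v = x; }`
// stores Result.second (the updated value) instead.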
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
    bool IsFailOnly) {
  AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
  return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
                             IsPostfixUpdate, IsFailOnly, Failure);
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
    bool IsFailOnly, AtomicOrdering Failure) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP atomic expects a pointer to target memory");
  if (V.Var) {
    assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
    assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
  }

  bool IsInteger = E->getType()->isIntegerTy();

  if (Op == OMPAtomicCompareOp::EQ) {
    AtomicCmpXchgInst *Result = nullptr;
    if (!IsInteger) {
      IntegerType *IntCastTy =
          IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
      Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
      Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
      Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
                                           AO, Failure);
    } else {
      Result =
          Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
    }

    if (V.Var) {
      Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
      if (!IsInteger)
        OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
      assert(OldValue->getType() == V.ElemTy &&
             "OldValue and V must be of same type");
      if (IsPostfixUpdate) {
        Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
      } else {
        Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
        if (IsFailOnly) {
          // CurBB----
          //   |     |
          //   v     |
          // ContBB  |
          //   |     |
          //   v     |
          // ExitBB <-
          //
          // where ContBB only contains the store of old value to 'v'.
          BasicBlock *CurBB = Builder.GetInsertBlock();
          Instruction *CurBBTI = CurBB->getTerminator();
          CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
          BasicBlock *ExitBB = CurBB->splitBasicBlock(
              CurBBTI, X.Var->getName() + ".atomic.exit");
          BasicBlock *ContBB = CurBB->splitBasicBlock(
              CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
          ContBB->getTerminator()->eraseFromParent();
          CurBB->getTerminator()->eraseFromParent();

          Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);

          Builder.SetInsertPoint(ContBB);
          Builder.CreateStore(OldValue, V.Var);
          Builder.CreateBr(ExitBB);

          if (UnreachableInst *ExitTI =
                  dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
            CurBBTI->eraseFromParent();
            Builder.SetInsertPoint(ExitBB);
          } else {
            Builder.SetInsertPoint(ExitTI);
          }
        } else {
          Value *CapturedValue =
              Builder.CreateSelect(SuccessOrFail, E, OldValue);
          Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
        }
      }
    }

    // The comparison result has to be stored.
    if (R.Var) {
      assert(R.Var->getType()->isPointerTy() &&
             "r.var must be of pointer type");
      assert(R.ElemTy->isIntegerTy() && "r must be of integral type");

      Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
      Value *ResultCast = R.IsSigned
                              ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
                              : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
      Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
    }
  } else {
    assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
           "Op should be either max or min at this point");
    assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");

    // Reverse the ordop as the OpenMP forms are different from LLVM forms.
    // Let's take max as example.
    // OpenMP form:
    // x = x > expr ? expr : x;
    // LLVM form:
    // *ptr = *ptr > val ? *ptr : val;
    // We need to transform to LLVM form.
    // x = x <= expr ? x : expr;
    AtomicRMWInst::BinOp NewOp;
    if (IsXBinopExpr) {
      if (IsInteger) {
        if (X.IsSigned)
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
                                                : AtomicRMWInst::Max;
        else
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
                                                : AtomicRMWInst::UMax;
      } else {
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
                                              : AtomicRMWInst::FMax;
      }
    } else {
      if (IsInteger) {
        if (X.IsSigned)
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
                                                : AtomicRMWInst::Min;
        else
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
                                                : AtomicRMWInst::UMin;
      } else {
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
                                              : AtomicRMWInst::FMin;
      }
    }

    AtomicRMWInst *OldValue =
        Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
    if (V.Var) {
      Value *CapturedValue = nullptr;
      if (IsPostfixUpdate) {
        CapturedValue = OldValue;
      } else {
        CmpInst::Predicate Pred;
        switch (NewOp) {
        case AtomicRMWInst::Max:
          Pred = CmpInst::ICMP_SGT;
          break;
        case AtomicRMWInst::UMax:
          Pred = CmpInst::ICMP_UGT;
          break;
        case AtomicRMWInst::FMax:
          Pred = CmpInst::FCMP_OGT;
          break;
        case AtomicRMWInst::Min:
          Pred = CmpInst::ICMP_SLT;
          break;
        case AtomicRMWInst::UMin:
          Pred = CmpInst::ICMP_ULT;
          break;
        case AtomicRMWInst::FMin:
          Pred = CmpInst::FCMP_OLT;
          break;
        default:
          llvm_unreachable("unexpected comparison op");
        }
        Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
        CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
      }
      Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
    }
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);

  return Builder.saveIP();
}
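// Example (editor's addition, illustrative only): on the non-`==` path a
// single atomicrmw carries the whole conditional update. With Op == MAX,
// IsInteger, X.IsSigned and IsXBinopExpr == false, the code above selects
// AtomicRMWInst::Max and emits
//   %old = atomicrmw max ptr %x, iN %expr <ordering>
// and, when a capture into 'v' is requested, recomputes the captured value
// non-atomically from %old and %expr via the matching icmp/select.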
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
                             Value *NumTeamsUpper, Value *ThreadLimit,
                             Value *IfExpr) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Function *CurrentFunction = Builder.GetInsertBlock()->getParent();

  // Outer allocation basicblock is the entry block of the current function.
  BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
  if (&OuterAllocaBB == Builder.GetInsertBlock()) {
    BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
    Builder.SetInsertPoint(BodyBB, BodyBB->begin());
  }

  // The current basic block is split into four basic blocks. After outlining,
  // they will be mapped as follows:
  //
  // def current_fn() {
  //   current_basic_block:
  //     br label %teams.exit
  //   teams.exit:
  //     ; instructions after teams
  // }
  //
  // def outlined_fn() {
  //   teams.alloca:
  //     br label %teams.body
  //   teams.body:
  //     ; instructions within teams body
  // }
  BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
  BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
  BasicBlock *AllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");

  if (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr) {
    assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
           "if lowerbound is non-null, then upperbound must also be non-null "
           "for bounds on num_teams");

    if (NumTeamsUpper == nullptr)
      NumTeamsUpper = Builder.getInt32(0);

    if (NumTeamsLower == nullptr)
      NumTeamsLower = NumTeamsUpper;

    if (IfExpr) {
      assert(IfExpr->getType()->isIntegerTy() &&
             "argument to if clause must be an integer value");

      // upper = ifexpr ? upper : 1
      if (IfExpr->getType() != Int1)
        IfExpr = Builder.CreateICmpNE(IfExpr,
                                      ConstantInt::get(IfExpr->getType(), 0));
      NumTeamsUpper = Builder.CreateSelect(
          IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");

      // lower = ifexpr ? lower : 1
      NumTeamsLower = Builder.CreateSelect(
          IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
    }

    if (ThreadLimit == nullptr)
      ThreadLimit = Builder.getInt32(0);

    Value *ThreadNum = getOrCreateThreadID(Ident);
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
        {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
  }
  // Generate the body of teams.
  InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
  InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
  BodyGenCB(AllocaIP, CodeGenIP);

  OutlineInfo OI;
  OI.EntryBB = AllocaBB;
  OI.ExitBB = ExitBB;
  OI.OuterAllocaBB = &OuterAllocaBB;

  // Insert fake values for global tid and bound tid.
  std::stack<Instruction *> ToBeDeleted;
  InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));

  OI.PostOutlineCB = [this, Ident, ToBeDeleted](Function &OutlinedFn) mutable {
    // The stale call instruction will be replaced with a new call instruction
    // for the runtime call with the outlined function.

    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
    ToBeDeleted.push(StaleCI);

    assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
           "Outlined function must have two or three arguments only");

    bool HasShared = OutlinedFn.arg_size() == 3;

    OutlinedFn.getArg(0)->setName("global.tid.ptr");
    OutlinedFn.getArg(1)->setName("bound.tid.ptr");
    if (HasShared)
      OutlinedFn.getArg(2)->setName("data");

    // Call to the runtime function for teams in the current function.
    assert(StaleCI && "Error while outlining - no CallInst user found for the "
                      "outlined function.");
    Builder.SetInsertPoint(StaleCI);
    SmallVector<Value *> Args = {
        Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
    if (HasShared)
      Args.push_back(StaleCI->getArgOperand(2));
    Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                           omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
                       Args);

    while (!ToBeDeleted.empty()) {
      ToBeDeleted.top()->eraseFromParent();
      ToBeDeleted.pop();
    }
  };

  addOutlineInfo(std::move(OI));

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  return Builder.saveIP();
}
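// Illustrative host-side shape of what createTeams emits (editor's addition,
// simplified): when bounds, a thread limit, or an if-clause are given, a call
//   __kmpc_push_num_teams_51(ident, gtid, lower, upper, thread_limit)
// precedes the region, and after outlining the teams region itself becomes
//   __kmpc_fork_teams(ident, nargs, outlined_fn[, data])
// with the teams.alloca/teams.body blocks moved into outlined_fn.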
GlobalVariable *
OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                                       std::string VarName) {
  llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
      llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
                           Names.size()),
      Names);
  auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
      M, MapNamesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
      VarName);
  return MapNamesArrayGlobal;
}
// Create all simple and struct types exposed by the runtime and remember
// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  StructType *T;
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                            \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize);                            \
  VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                 \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg);           \
  VarName##Ptr = PointerType::getUnqual(VarName);
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...)                     \
  T = StructType::getTypeByName(Ctx, StructName);                             \
  if (!T)                                                                     \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed);           \
  VarName = T;                                                                \
  VarName##Ptr = PointerType::getUnqual(T);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
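// Example expansion (editor's addition; the concrete entry is assumed, see
// OMPKinds.def for the authoritative list): an array-type entry such as
//   __OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8)
// is routed through the OMP_ARRAY_TYPE macro above and expands to
//   KmpCriticalNameTy = ArrayType::get(Int32, 8);
//   KmpCriticalNamePtrTy = PointerType::getUnqual(KmpCriticalNameTy);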
void OpenMPIRBuilder::OutlineInfo::collectBlocks(
    SmallPtrSetImpl<BasicBlock *> &BlockSet,
    SmallVectorImpl<BasicBlock *> &BlockVector) {
  SmallVector<BasicBlock *, 32> Worklist;
  BlockSet.insert(EntryBB);
  BlockSet.insert(ExitBB);

  Worklist.push_back(EntryBB);
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.pop_back_val();
    BlockVector.push_back(BB);
    for (BasicBlock *SuccBB : successors(BB))
      if (BlockSet.insert(SuccBB).second)
        Worklist.push_back(SuccBB);
  }
}
void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
                                         uint64_t Size, int32_t Flags,
                                         GlobalValue::LinkageTypes,
                                         StringRef Name) {
  if (!Config.isGPU()) {
    llvm::offloading::emitOffloadingEntry(
        M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
        "omp_offloading_entries");
    return;
  }
  // TODO: Add support for global variables on the device after declare target
  // support.
  Function *Fn = dyn_cast<Function>(Addr);
  if (!Fn)
    return;

  Module &M = *(Fn->getParent());
  LLVMContext &Ctx = M.getContext();

  // Get "nvvm.annotations" metadata node.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  Metadata *MDVals[] = {
      ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(MDNode::get(Ctx, MDVals));

  // Add a function attribute for the kernel.
  Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
  if (T.isAMDGCN())
    Fn->addFnAttr("uniform-work-group-size", "true");
  Fn->addFnAttr(Attribute::MustProgress);
}
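// Example (editor's addition): for a kernel @foo the code above appends to
// !nvvm.annotations an operand of the form
//   !{ptr @foo, !"kernel", i32 1}
// and additionally tags @foo with the "kernel" function attribute.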
// We only generate metadata for functions that contain target regions.
void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
    EmitMetadataErrorReportFunctionTy &ErrorFn) {

  // If there are no entries, we don't need to do anything.
  if (OffloadInfoManager.empty())
    return;

  LLVMContext &C = M.getContext();
  SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
                        TargetRegionEntryInfo>,
              16>
      OrderedEntries(OffloadInfoManager.size());

  // Auxiliary methods to create metadata values and strings.
  auto &&GetMDInt = [this](unsigned V) {
    return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
  };

  auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };

  // Create the offloading info metadata node.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
  auto &&TargetRegionMetadataEmitter =
      [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
          const TargetRegionEntryInfo &EntryInfo,
          const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
        // Generate metadata for target regions. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (0).
        // - Entry 1 -> Device ID of the file where the entry was identified.
        // - Entry 2 -> File ID of the file where the entry was identified.
        // - Entry 3 -> Mangled name of the function where the entry was
        //   identified.
        // - Entry 4 -> Line in the file where the entry was identified.
        // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
        // - Entry 6 -> Order the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {
            GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
            GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
            GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
            GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);

  // Create function that emits metadata for each device global variable entry;
  auto &&DeviceGlobalVarMetadataEmitter =
      [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
          StringRef MangledName,
          const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
        // Generate metadata for global variables. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (1).
        // - Entry 1 -> Mangled name of the variable.
        // - Entry 2 -> Declare target kind.
        // - Entry 3 -> Order the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
                           GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
        OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
      DeviceGlobalVarMetadataEmitter);

  for (const auto &E : OrderedEntries) {
    assert(E.first && "All ordered entries must exist!");
    if (const auto *CE =
            dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
                E.first)) {
      if (!CE->getID() || !CE->getAddress()) {
        // Do not blame the entry if the parent function is not emitted.
        TargetRegionEntryInfo EntryInfo = E.second;
        StringRef FnName = EntryInfo.ParentName;
        if (!M.getNamedValue(FnName))
          continue;
        ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
        continue;
      }
      createOffloadEntry(CE->getID(), CE->getAddress(),
                         /*Size=*/0, CE->getFlags(),
                         GlobalValue::WeakAnyLinkage);
    } else if (const auto *CE = dyn_cast<
                   OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
                   E.first)) {
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              CE->getFlags());
      switch (Flags) {
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
        if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
          continue;
        }
        // The variable has no definition - no need to add the entry.
        if (CE->getVarSize() == 0)
          continue;
        break;
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
        assert(((Config.isTargetDevice() && !CE->getAddress()) ||
                (!Config.isTargetDevice() && CE->getAddress())) &&
               "Declare target link address is set.");
        if (Config.isTargetDevice())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
          continue;
        }
        break;
      default:
        break;
      }

      // Hidden or internal symbols on the device are not externally visible.
      // We should not attempt to register them by creating an offloading
      // entry. Indirect variables are handled separately on the device.
      if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
        if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
            Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
          continue;

      // Indirect globals need to use a special name that doesn't match the
      // name of the associated host global.
      if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
                           Flags, CE->getLinkage(), CE->getVarName());
      else
        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
                           Flags, CE->getLinkage());
    } else {
      llvm_unreachable("Unsupported entry kind.");
    }
  }
}
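// Example (editor's addition): a target region entry appears in
// !omp_offload.info as
//   !{i32 0, i32 <device-id>, i32 <file-id>, !"<parent>", i32 <line>,
//     i32 <count>, i32 <order>}
// and a declare-target global as
//   !{i32 1, !"<mangled-name>", i32 <kind>, i32 <order>},
// which is exactly the layout loadOffloadInfoMetadata() reads back.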
void TargetRegionEntryInfo::getTargetRegionEntryFnName(
    SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
    unsigned FileID, unsigned Line, unsigned Count) {
  raw_svector_ostream OS(Name);
  OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
     << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
  if (Count)
    OS << "_" << Count;
}
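// Example (editor's addition): DeviceID 0x1234, FileID 0xabcd, ParentName
// "foo", Line 42 and Count 0 produce "__omp_offloading_1234_abcd_foo_l42";
// a non-zero Count appends "_<count>".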
void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
    SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
  unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
  TargetRegionEntryInfo::getTargetRegionEntryFnName(
      Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
      EntryInfo.Line, NewCount);
}
TargetRegionEntryInfo
OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                                          StringRef ParentName) {
  sys::fs::UniqueID ID;
  auto FileIDInfo = CallBack();
  if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
    report_fatal_error(("Unable to get unique ID for file, during "
                        "getTargetEntryUniqueInfo, error message: " +
                        EC.message())
                           .c_str());
  }

  return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
                               std::get<1>(FileIDInfo));
}
unsigned OpenMPIRBuilder::getFlagMemberOffset() {
  unsigned Offset = 0;
  for (uint64_t Remain =
           static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
               omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
       !(Remain & 1); Remain = Remain >> 1)
    Offset++;
  return Offset;
}
omp::OpenMPOffloadMappingFlags
OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
  // Rotate by getFlagMemberOffset() bits.
  return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
                                                     << getFlagMemberOffset());
}
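// Worked example (editor's addition): OMP_MAP_MEMBER_OF occupies the high 16
// bits of the 64-bit flag word (the 0xFFFF placeholder mentioned in
// setCorrectMemberOfFlag below), so getFlagMemberOffset() returns 48 and
// getMemberOfFlag(2) evaluates to (2 + 1) << 48, i.e. the third member of
// the parent mapping encoded in the MEMBER_OF field.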
void OpenMPIRBuilder::setCorrectMemberOfFlag(
    omp::OpenMPOffloadMappingFlags &Flags,
    omp::OpenMPOffloadMappingFlags MemberOfFlag) {
  // If the entry is PTR_AND_OBJ but has not been marked with the special
  // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
  // marked as MEMBER_OF.
  if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
          Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
      static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
          (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
          omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
    return;

  // Reset the placeholder value to prepare the flag for the assignment of the
  // proper MEMBER_OF value.
  Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
  Flags |= MemberOfFlag;
}
Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
  // TODO: convert this to utilise the IRBuilder Config rather than
  // a passed down argument.
  if (OpenMPSIMD)
    return nullptr;

  if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
      ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
        CaptureClause ==
            OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
       Config.hasRequiresUnifiedSharedMemory())) {
    SmallString<64> PtrName;
    {
      raw_svector_ostream OS(PtrName);
      OS << MangledName;
      if (!IsExternallyVisible)
        OS << format("_%x", EntryInfo.FileID);
      OS << "_decl_tgt_ref_ptr";
    }

    Value *Ptr = M.getNamedValue(PtrName);

    if (!Ptr) {
      GlobalValue *GlobalValue = M.getNamedValue(MangledName);
      Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);

      auto *GV = cast<GlobalVariable>(Ptr);
      GV->setLinkage(GlobalValue::WeakAnyLinkage);

      if (!Config.isTargetDevice()) {
        if (GlobalInitializer)
          GV->setInitializer(GlobalInitializer());
        else
          GV->setInitializer(GlobalValue);
      }

      registerTargetGlobalVariable(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
    }

    return cast<Constant>(Ptr);
  }

  return nullptr;
}
void OpenMPIRBuilder::registerTargetGlobalVariable(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
    Constant *Addr) {
  if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
      (TargetTriple.empty() && !Config.isTargetDevice()))
    return;

  OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
  StringRef VarName;
  int64_t VarSize;
  GlobalValue::LinkageTypes Linkage;

  if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
       CaptureClause ==
           OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
      !Config.hasRequiresUnifiedSharedMemory()) {
    Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
    VarName = MangledName;
    GlobalValue *LlvmVal = M.getNamedValue(VarName);

    if (!IsDeclaration)
      VarSize = divideCeil(
          M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
    else
      VarSize = 0;
    Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();

    // This is a workaround carried over from Clang which prevents undesired
    // optimisation of internal variables.
    if (Config.isTargetDevice() &&
        (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
      // Do not create a "ref-variable" if the original is not also available
      // on the host.
      if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
        return;

      std::string RefName = createPlatformSpecificName({VarName, "ref"});

      if (!M.getNamedValue(RefName)) {
        Constant *AddrRef =
            getOrCreateInternalVariable(Addr->getType(), RefName);
        auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
        GvAddrRef->setConstant(true);
        GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
        GvAddrRef->setInitializer(Addr);
        GeneratedRefs.push_back(GvAddrRef);
      }
    }
  } else {
    if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
    else
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;

    if (Config.isTargetDevice()) {
      VarName = (Addr) ? Addr->getName() : "";
      Addr = nullptr;
    } else {
      Addr = getAddrOfDeclareTargetVar(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          LlvmPtrTy, GlobalInitializer, VariableLinkage);
      VarName = (Addr) ? Addr->getName() : "";
    }
    VarSize = M.getDataLayout().getPointerSize();
    Linkage = GlobalValue::WeakAnyLinkage;
  }

  OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
                                                      Flags, Linkage);
}
/// Loads all the offload entries information from the host IR
/// metadata.
void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
  // If we are in target mode, load the metadata from the host IR. This code
  // has to match the metadata creation in
  // createOffloadEntriesAndInfoMetadata().
  NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
  if (!MD)
    return;

  for (MDNode *MN : MD->operands()) {
    auto &&GetMDInt = [MN](unsigned Idx) {
      auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
      return cast<ConstantInt>(V->getValue())->getZExtValue();
    };

    auto &&GetMDString = [MN](unsigned Idx) {
      auto *V = cast<MDString>(MN->getOperand(Idx));
      return V->getString();
    };

    switch (GetMDInt(0)) {
    default:
      llvm_unreachable("Unexpected metadata!");
      break;
    case OffloadEntriesInfoManager::OffloadEntryInfo::
        OffloadingEntryInfoTargetRegion: {
      TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
                                      /*DeviceID=*/GetMDInt(1),
                                      /*FileID=*/GetMDInt(2),
                                      /*Line=*/GetMDInt(4),
                                      /*Count=*/GetMDInt(5));
      OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
                                                         /*Order=*/GetMDInt(6));
      break;
    }
    case OffloadEntriesInfoManager::OffloadEntryInfo::
        OffloadingEntryInfoDeviceGlobalVar:
      OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
          /*MangledName=*/GetMDString(1),
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              /*Flags=*/GetMDInt(2)),
          /*Order=*/GetMDInt(3));
      break;
    }
  }
}
void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
  if (HostFilePath.empty())
    return;

  auto Buf = MemoryBuffer::getFile(HostFilePath);
  if (std::error_code Err = Buf.getError()) {
    report_fatal_error(("error opening host file from host file path inside of "
                        "OpenMPIRBuilder: " +
                        Err.message())
                           .c_str());
  }

  LLVMContext Ctx;
  auto M = expectedToErrorOrAndEmitErrors(
      Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
  if (std::error_code Err = M.getError()) {
    report_fatal_error(
        ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
            .c_str());
  }

  loadOffloadInfoMetadata(*M.get());
}
Function *OpenMPIRBuilder::createRegisterRequires(StringRef Name) {
  // Skip the creation of the registration function if this is device codegen.
  if (Config.isTargetDevice())
    return nullptr;

  Builder.ClearInsertionPoint();

  // Create registration function prototype.
  auto *RegFnTy = FunctionType::get(Builder.getVoidTy(), {});
  auto *RegFn = Function::Create(
      RegFnTy, GlobalVariable::LinkageTypes::InternalLinkage, Name, M);
  RegFn->setSection(".text.startup");
  RegFn->addFnAttr(Attribute::NoInline);
  RegFn->addFnAttr(Attribute::NoUnwind);

  // Create registration function body.
  auto *BB = BasicBlock::Create(M.getContext(), "entry", RegFn);
  ConstantInt *FlagsVal =
      ConstantInt::getSigned(Builder.getInt64Ty(), Config.getRequiresFlags());
  Function *RTLRegFn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___tgt_register_requires);

  Builder.SetInsertPoint(BB);
  Builder.CreateCall(RTLRegFn, {FlagsVal});
  Builder.CreateRetVoid();

  return RegFn;
}
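// Sketch of the registration function emitted above (editor's addition):
//   define internal void @<Name>() section ".text.startup" {
//   entry:
//     call void @__tgt_register_requires(i64 <requires-flags>)
//     ret void
//   }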
//===----------------------------------------------------------------------===//
// OffloadEntriesInfoManager
//===----------------------------------------------------------------------===//

bool OffloadEntriesInfoManager::empty() const {
  return OffloadEntriesTargetRegion.empty() &&
         OffloadEntriesDeviceGlobalVar.empty();
}

unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
    const TargetRegionEntryInfo &EntryInfo) const {
  auto It = OffloadEntriesTargetRegionCount.find(
      getTargetRegionEntryCountKey(EntryInfo));
  if (It == OffloadEntriesTargetRegionCount.end())
    return 0;
  return It->second;
}

void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
    const TargetRegionEntryInfo &EntryInfo) {
  OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
      EntryInfo.Count + 1;
}

/// Initialize target region entry.
void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
    const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
  OffloadEntriesTargetRegion[EntryInfo] =
      OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
                                   OMPTargetRegionEntryTargetRegion);
  ++OffloadingEntriesNum;
}

void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
    TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
    OMPTargetRegionEntryKind Flags) {
  assert(EntryInfo.Count == 0 && "expected default EntryInfo");

  // Update the EntryInfo with the next available count for this location.
  EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);

  // If we are emitting code for a target, the entry is already initialized,
  // and it only has to be registered.
  if (OMPBuilder->Config.isTargetDevice()) {
    // This could happen if the device compilation is invoked standalone.
    if (!hasTargetRegionEntryInfo(EntryInfo)) {
      return;
    }
    auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
    Entry.setAddress(Addr);
    Entry.setID(ID);
    Entry.setFlags(Flags);
  } else {
    if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
        hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
      return;
    assert(!hasTargetRegionEntryInfo(EntryInfo) &&
           "Target region entry already registered!");
    OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
    OffloadEntriesTargetRegion[EntryInfo] = Entry;
    ++OffloadingEntriesNum;
  }
  incrementTargetRegionEntryInfoCount(EntryInfo);
}

bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
    TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {

  // Update the EntryInfo with the next available count for this location.
  EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);

  auto It = OffloadEntriesTargetRegion.find(EntryInfo);
  if (It == OffloadEntriesTargetRegion.end()) {
    return false;
  }
  // Fail if this entry is already registered.
  if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
    return false;
  return true;
}

void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
    const OffloadTargetRegionEntryInfoActTy &Action) {
  // Scan all target region entries and perform the provided action.
  for (const auto &It : OffloadEntriesTargetRegion) {
    Action(It.first, It.second);
  }
}

void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
    StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
  OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
  ++OffloadingEntriesNum;
}

void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
    StringRef VarName, Constant *Addr, int64_t VarSize,
    OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
  if (OMPBuilder->Config.isTargetDevice()) {
    // This could happen if the device compilation is invoked standalone.
    if (!hasDeviceGlobalVarEntryInfo(VarName))
      return;
    auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
    if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    Entry.setVarSize(VarSize);
    Entry.setLinkage(Linkage);
    Entry.setAddress(Addr);
  } else {
    if (hasDeviceGlobalVarEntryInfo(VarName)) {
      auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
      assert(Entry.isValid() && Entry.getFlags() == Flags &&
             "Entry not initialized!");
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
      OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
                                                Addr, VarSize, Flags, Linkage,
                                                VarName);
    else
      OffloadEntriesDeviceGlobalVar.try_emplace(
          VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
    ++OffloadingEntriesNum;
  }
}

void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
    const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
  // Scan all device global variable entries and perform the provided action.
  for (const auto &E : OffloadEntriesDeviceGlobalVar)
    Action(E.getKey(), E.getValue());
}
//===----------------------------------------------------------------------===//
// CanonicalLoopInfo
//===----------------------------------------------------------------------===//

void CanonicalLoopInfo::collectControlBlocks(
    SmallVectorImpl<BasicBlock *> &BBs) {
  // We only count those BBs as control blocks for which we do not need to
  // reverse the CFG, i.e. not the loop body which can contain arbitrary
  // control flow. For consistency, this also means we do not add the Body
  // block, which is just the entry to the body code.
  BBs.reserve(BBs.size() + 6);
  BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
}

BasicBlock *CanonicalLoopInfo::getPreheader() const {
  assert(isValid() && "Requires a valid canonical loop");
  for (BasicBlock *Pred : predecessors(Header)) {
    if (Pred != Latch)
      return Pred;
  }
  llvm_unreachable("Missing preheader");
}

void CanonicalLoopInfo::setTripCount(Value *TripCount) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *CmpI = &getCond()->front();
  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
  CmpI->setOperand(1, TripCount);

#ifndef NDEBUG
  assertOK();
#endif
}

void CanonicalLoopInfo::mapIndVar(
    llvm::function_ref<Value *(Instruction *)> Updater) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *OldIV = getIndVar();

  // Record all uses excluding those introduced by the updater. Uses by the
  // CanonicalLoopInfo itself to keep track of the number of iterations are
  // excluded.
  SmallVector<Use *> ReplacableUses;
  for (Use &U : OldIV->uses()) {
    auto *User = dyn_cast<Instruction>(U.getUser());
    if (!User)
      continue;
    if (User->getParent() == getCond())
      continue;
    if (User->getParent() == getLatch())
      continue;
    ReplacableUses.push_back(&U);
  }

  // Run the updater that may introduce new uses.
  Value *NewIV = Updater(OldIV);

  // Replace the old uses with the value returned by the updater.
  for (Use *U : ReplacableUses)
    U->set(NewIV);

#ifndef NDEBUG
  assertOK();
#endif
}
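// Usage sketch (editor's addition; `Stride` is a hypothetical in-scope
// Value*): remap every body use of the canonical IV to a derived value while
// leaving the uses in Cond/Latch untouched, as mapIndVar guarantees:
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     IRBuilder<> B(OldIV->getParent(),
//                   OldIV->getParent()->getFirstInsertionPt());
//     return B.CreateMul(OldIV, Stride, "scaled.iv");
//   });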
void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
  // No constraints if this object currently does not describe a loop.
  if (!isValid())
    return;

  BasicBlock *Preheader = getPreheader();
  BasicBlock *Body = getBody();
  BasicBlock *After = getAfter();

  // Verify standard control-flow we use for OpenMP loops.
  assert(Preheader);
  assert(isa<BranchInst>(Preheader->getTerminator()) &&
         "Preheader must terminate with unconditional branch");
  assert(Preheader->getSingleSuccessor() == Header &&
         "Preheader must jump to header");

  assert(Header);
  assert(isa<BranchInst>(Header->getTerminator()) &&
         "Header must terminate with unconditional branch");
  assert(Header->getSingleSuccessor() == Cond &&
         "Header must jump to exiting block");

  assert(Cond);
  assert(Cond->getSinglePredecessor() == Header &&
         "Exiting block only reachable from header");

  assert(isa<BranchInst>(Cond->getTerminator()) &&
         "Exiting block must terminate with conditional branch");
  assert(size(successors(Cond)) == 2 &&
         "Exiting block must have two successors");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
         "Exiting block's first successor must jump to the body");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
         "Exiting block's second successor must exit the loop");

  assert(Body);
  assert(Body->getSinglePredecessor() == Cond &&
         "Body only reachable from exiting block");
  assert(!isa<PHINode>(Body->front()));

  assert(Latch);
  assert(isa<BranchInst>(Latch->getTerminator()) &&
         "Latch must terminate with unconditional branch");
  assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
  // TODO: To support simple redirecting of the end of the body code that has
  // multiple exits; introduce another auxiliary basic block like preheader
  // and after.
  assert(Latch->getSinglePredecessor() != nullptr);
  assert(!isa<PHINode>(Latch->front()));

  assert(Exit);
  assert(isa<BranchInst>(Exit->getTerminator()) &&
         "Exit block must terminate with unconditional branch");
  assert(Exit->getSingleSuccessor() == After &&
         "Exit block must jump to after block");

  assert(After);
  assert(After->getSinglePredecessor() == Exit &&
         "After block only reachable from exit block");
  assert(After->empty() || !isa<PHINode>(After->front()));

  Instruction *IndVar = getIndVar();
  assert(IndVar && "Canonical induction variable not found?");
  assert(isa<IntegerType>(IndVar->getType()) &&
         "Induction variable must be an integer");
  assert(cast<PHINode>(IndVar)->getParent() == Header &&
         "Induction variable must be a PHI in the loop header");
  assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
  assert(
      cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
  assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);

  auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
  assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
  assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
  assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
  assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
             ->isOne());

  Value *TripCount = getTripCount();
  assert(TripCount && "Loop trip count not found?");
  assert(IndVar->getType() == TripCount->getType() &&
         "Trip count and induction variable must have the same type");

  auto *CmpI = cast<CmpInst>(&Cond->front());
  assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
         "Exit condition must be an unsigned less-than comparison");
  assert(CmpI->getOperand(0) == IndVar &&
         "Exit condition must compare the induction variable");
  assert(CmpI->getOperand(1) == TripCount &&
         "Exit condition must compare with the trip count");
#endif
}

void CanonicalLoopInfo::invalidate() {