//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
//
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;
static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}
/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return llvm::omp::OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}
/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic is used by default in openmp runtime library, so no need
      // to set it.
      return ScheduleType;
    }
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  }
}
/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
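
// Worked example (illustrative, not part of the computation above): for
// `schedule(dynamic, 4)` with no ordered clause and no monotonicity modifier,
// the three steps yield
//   BaseDynamicChunked -> BaseDynamicChunked | ModifierUnordered
//                      -> ... | ModifierNonmonotonic
// since OpenMP 5.1 makes nonmonotonic the default for non-static, unordered
// schedules.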
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///       the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}
void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}
void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}
BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}
BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}
BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}
BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
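
// Usage sketch (hypothetical caller): the split/splice helpers above carve up
// the block at the current insertion point while keeping the IRBuilder
// usable, e.g.
//   BasicBlock *ContBB =
//       splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".cont");
// moves all trailing instructions into ContBB and leaves the builder
// positioned before the new terminator of the old block.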
// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
Value *createFakeIntVal(IRBuilder<> &Builder,
                        OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                        std::stack<Instruction *> &ToBeDeleted,
                        OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                        const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push(UseFakeVal);
  return FakeVal;
}
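
// Usage sketch (values are illustrative): createTask() below uses this to
// model the implicit global thread ID argument of an outlined task function:
//   Value *TID = createFakeIntVal(Builder, OuterAllocaIP, ToBeDeleted,
//                                 InnerAllocaIP, "global.tid", false);
// The fake value plus its fake use keep the CodeExtractor from dropping the
// argument; everything stacked in ToBeDeleted is erased after outlining.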
//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}
bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
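
// Example (illustrative): a translation unit containing
//   #pragma omp requires unified_shared_memory, dynamic_allocators
// would be modeled by calling setHasRequiresUnifiedSharedMemory(true) and
// setHasRequiresDynamicAllocators(true), leaving getRequiresFlags() equal to
// OMP_REQ_UNIFIED_SHARED_MEMORY | OMP_REQ_DYNAMIC_ALLOCATORS.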
//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//
void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();
  Triple T(M.getTargetTriple());

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)               \
  case Enum:                                                                  \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                          \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                        \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)               \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                        \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}
FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                         \
  case Enum:                                                                  \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},       \
                             IsVarArg);                                       \
    Fn = M.getFunction(Str);                                                  \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);
  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }
void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par");

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);
}
OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}
GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
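
// For example (illustrative values): FunctionName="foo", FileName="bar.c",
// Line=3, Column=7 produces the source-location string ";bar.c;foo;3;7;;",
// matching the default string used below.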
Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}
Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;
  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
                                 bool ForceSimpleCall, bool CheckCancelFlag) {
  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
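
// For a plain explicit barrier in a non-cancellable region, the code above
// reduces to roughly (illustrative IR):
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident, i32 %gtid)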
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
                                      NumThreads, HostPtr, KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams, Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
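
// The resulting control flow is roughly (block names as created above):
//   %rc = call i32 @__tgt_target_kernel(...)
//   br i1 <%rc != 0>, label %omp_offload.failed, label %omp_offload.cont
// with the fallback host call emitted in omp_offload.failed before falling
// through to omp_offload.cont.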
void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}
IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);

  if (NumThreads) {
    // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  Builder.restoreIP(OuterAllocaIP);
  AllocaInst *TIDAddr = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddr = Builder.CreateAlloca(Int32, nullptr, "zero.addr");

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddr);
  ToBeDeleted.push_back(ZeroAddr);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);
  // EntryBB
  //        |
  //        V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //        |
  //        V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //        |
  //        V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //        |
  //        V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //
  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
  FunctionCallee RTLFn;
  if (IfCondition)
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  else
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);

  if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
      llvm::LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(
          llvm::LLVMContext::MD_callback,
          *llvm::MDNode::get(
              Ctx, {MDB.createCallbackEncoding(2, {-1, -1},
                                               /* VarArgsArePassed */ true)}));
    }
  }

  OutlineInfo OI;
  OI.PostOutlineCB = [=](Function &OutlinedFn) {
    // Add some known attributes.
    OutlinedFn.addParamAttr(0, Attribute::NoAlias);
    OutlinedFn.addParamAttr(1, Attribute::NoAlias);
    OutlinedFn.addFnAttr(Attribute::NoUnwind);
    OutlinedFn.addFnAttr(Attribute::NoRecurse);

    assert(OutlinedFn.arg_size() >= 2 &&
           "Expected at least tid and bounded tid as arguments");
    unsigned NumCapturedVars =
        OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

    CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
    CI->getParent()->setName("omp_parallel");
    Builder.SetInsertPoint(CI);

    // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
    Value *ForkCallArgs[] = {
        Ident, Builder.getInt32(NumCapturedVars),
        Builder.CreateBitCast(&OutlinedFn, ParallelTaskPtr)};

    SmallVector<Value *, 16> RealArgs;
    RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
    if (IfCondition) {
      Value *Cond = Builder.CreateSExtOrTrunc(IfCondition,
                                              Type::getInt32Ty(M.getContext()));
      RealArgs.push_back(Cond);
    }
    RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

    // __kmpc_fork_call_if always expects a void ptr as the last argument
    // If there are no arguments, pass a null pointer.
    auto PtrTy = Type::getInt8PtrTy(M.getContext());
    if (IfCondition && NumCapturedVars == 0) {
      llvm::Value *Void = ConstantPointerNull::get(PtrTy);
      RealArgs.push_back(Void);
    }
    if (IfCondition && RealArgs.back()->getType() != PtrTy)
      RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

    Builder.CreateCall(RTLFn, RealArgs);

    LLVM_DEBUG(dbgs() << "With fork_call placed: "
                      << *Builder.GetInsertBlock()->getParent() << "\n");

    InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end());

    // Initialize the local TID stack location with the argument value.
    Builder.SetInsertPoint(PrivTID);
    Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
    Builder.CreateStore(Builder.CreateLoad(Int32, OutlinedAI), PrivTIDAddr);

    CI->eraseFromParent();

    for (Instruction *I : ToBeDeleted)
      I->eraseFromParent();
  };
  // Adjust the finalization stack, verify the adjustment, and call the
  // finalize function a last time to finalize values between the pre-fini
  // block and the exit block if we left the parallel "the normal way".
  auto FiniInfo = FinalizationStack.pop_back_val();
  assert(FiniInfo.DK == OMPD_parallel &&
         "Unexpected finalization stack state!");

  Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();

  InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
  FiniCB(PreFiniIP);

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par");

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);

  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");

  FunctionCallee TIDRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);

  auto PrivHelper = [&](Value &V) {
    if (&V == TIDAddr || &V == ZeroAddr) {
      OI.ExcludeArgsFromAggregate.push_back(&V);
      return;
    }

    SetVector<Use *> Uses;
    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))
          Uses.insert(&U);

    // __kmpc_fork_call expects extra arguments as pointers. If the input
    // already has a pointer type, everything is fine. Otherwise, store the
    // value onto stack and load it back inside the to-be-outlined region. This
    // will ensure only the pointer will be passed to the function.
    // FIXME: if there are more than 15 trailing arguments, they must be
    // additionally packed in a struct.
    Value *Inner = &V;
    if (!V.getType()->isPointerTy()) {
      IRBuilder<>::InsertPointGuard Guard(Builder);
      LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");

      Builder.restoreIP(OuterAllocaIP);
      Value *Ptr =
          Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");

      // Store to stack at end of the block that currently branches to the
      // entry block of the to-be-outlined region.
      Builder.SetInsertPoint(InsertBB,
                             InsertBB->getTerminator()->getIterator());
      Builder.CreateStore(&V, Ptr);

      // Load back next to allocations in the to-be-outlined region.
      Builder.restoreIP(InnerAllocaIP);
      Inner = Builder.CreateLoad(V.getType(), Ptr);
    }

    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
      ReplacementValue = PrivTID;
    } else {
      Builder.restoreIP(
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)
        return;
    }

    for (Use *UPtr : Uses)
      UPtr->set(ReplacementValue);
  };

  // Reset the inner alloca insertion as it will be used for loading the values
  // wrapped into pointers before passing them into the to-be-outlined region.
  // Configure it to insert immediately after the fake use of zero address so
  // that they are available in the generated body and so that the
  // OpenMP-related values (thread ID and zero address pointers) remain leading
  // in the argument list.
  InnerAllocaIP = IRBuilder<>::InsertPoint(
      ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());

  // Reset the outer alloca insertion point to the entry of the relevant block
  // in case it was invalidated.
  OuterAllocaIP = IRBuilder<>::InsertPoint(
      OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());

  for (Value *Input : Inputs) {
    LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
    PrivHelper(*Input);
  }
  for (Value *Output : Outputs)
    LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
  assert(Outputs.empty() &&
         "OpenMP outlining should not produce live-out values!");

  LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
  LLVM_DEBUG({
    for (auto *BB : Blocks)
      dbgs() << " PBR: " << BB->getName() << "\n";
  });

  // Register the outlined info.
  addOutlineInfo(std::move(OI));

  InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
  UI->eraseFromParent();

  return AfterIP;
}
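
// After outlining, the call site produced by PostOutlineCB looks roughly like
// (illustrative IR, no if-clause):
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @ident, i32 %num.captured, ptr @foo..omp_par, ...captures)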
void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
  // Build call void __kmpc_flush(ident_t *loc)
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};

  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
}
void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitFlush(Loc);
}
void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
  // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
  // global_tid);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident)};

  // Ignore return result until untied tasks are supported.
  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
                     Args);
}
void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitTaskwaitImpl(Loc);
}
void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
  // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Constant *I32Null = ConstantInt::getNullValue(Int32);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};

  Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
                     Args);
}
void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
  if (!updateToLocation(Loc))
    return;
  emitTaskyieldImpl(Loc);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTask(const LocationDescription &Loc,
                            InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
                            bool Tied, Value *Final, Value *IfCondition,
                            SmallVector<DependData> Dependencies) {

  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  // The current basic block is split into four basic blocks. After outlining,
  // they will be mapped as follows:
  // ```
  // def current_fn() {
  //   current_basic_block:
  //     br label %task.exit
  //   task.exit:
  //     ; instructions after task
  // }
  // def outlined_fn() {
  //   task.alloca:
  //     br label %task.body
  //   task.body:
  //     ret void
  // }
  // ```
  BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
  BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
  BasicBlock *TaskAllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "task.alloca");

  InsertPointTy TaskAllocaIP =
      InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
  InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
  BodyGenCB(TaskAllocaIP, TaskBodyIP);

  OutlineInfo OI;
  OI.EntryBB = TaskAllocaBB;
  OI.OuterAllocaBB = AllocaIP.getBlock();
  OI.ExitBB = TaskExitBB;

  // Add the thread ID argument.
  std::stack<Instruction *> ToBeDeleted;
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));

  OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
                      TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
    // Replace the Stale CI by appropriate RTL function call.
    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());

    // HasShareds is true if any variables are captured in the outlined region,
    // false otherwise.
    bool HasShareds = StaleCI->arg_size() > 1;
    Builder.SetInsertPoint(StaleCI);

    // Gather the arguments for emitting the runtime call for
    // @__kmpc_omp_task_alloc
    Function *TaskAllocFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);

    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
    Value *ThreadID = getOrCreateThreadID(Ident);

    // Argument - `flags`
    // Task is tied iff (Flags & 1) == 1.
    // Task is untied iff (Flags & 1) == 0.
    // Task is final iff (Flags & 2) == 2.
    // Task is not final iff (Flags & 2) == 0.
    // TODO: Handle the other flags.
    Value *Flags = Builder.getInt32(Tied);
    if (Final) {
      Value *FinalFlag =
          Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
      Flags = Builder.CreateOr(FinalFlag, Flags);
    }

    // Argument - `sizeof_kmp_task_t` (TaskSize)
    // Tasksize refers to the size in bytes of kmp_task_t data structure
    // including private vars accessed in task.
    // TODO: add kmp_task_t_with_privates (privates)
    Value *TaskSize = Builder.getInt64(
        divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));

    // Argument - `sizeof_shareds` (SharedsSize)
    // SharedsSize refers to the shareds array size in the kmp_task_t data
    // structure.
    Value *SharedsSize = Builder.getInt64(0);
    if (HasShareds) {
      AllocaInst *ArgStructAlloca =
          dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
      assert(ArgStructAlloca &&
             "Unable to find the alloca instruction corresponding to arguments "
             "for extracted function");
      StructType *ArgStructType =
          dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
      assert(ArgStructType && "Unable to find struct type corresponding to "
                              "arguments for extracted function");
      SharedsSize =
          Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
    }

    // Emit the @__kmpc_omp_task_alloc runtime call
    // The runtime call returns a pointer to an area where the task captured
    // variables must be copied before the task is run (TaskData)
    CallInst *TaskData = Builder.CreateCall(
        TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
                      /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
                      /*task_func=*/&OutlinedFn});

    // Copy the arguments for outlined function
    if (HasShareds) {
      Value *Shareds = StaleCI->getArgOperand(1);
      Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
      Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
      Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
                           SharedsSize);
    }

    Value *DepArrayPtr = nullptr;
    if (Dependencies.size()) {
      InsertPointTy OldIP = Builder.saveIP();
      Builder.SetInsertPoint(
          &OldIP.getBlock()->getParent()->getEntryBlock().back());

      Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
      Value *DepArray =
          Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");

      unsigned P = 0;
      for (const DependData &Dep : Dependencies) {
        Value *Base =
            Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
        // Store the pointer to the variable
        Value *Addr = Builder.CreateStructGEP(
            DependInfo, Base,
            static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
        Value *DepValPtr =
            Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
        Builder.CreateStore(DepValPtr, Addr);
        // Store the size of the variable
        Value *Size = Builder.CreateStructGEP(
            DependInfo, Base,
            static_cast<unsigned int>(RTLDependInfoFields::Len));
        Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
                                Dep.DepValueType)),
                            Size);
        // Store the dependency kind
        Value *Flags = Builder.CreateStructGEP(
            DependInfo, Base,
            static_cast<unsigned int>(RTLDependInfoFields::Flags));
        Builder.CreateStore(
            ConstantInt::get(Builder.getInt8Ty(),
                             static_cast<unsigned int>(Dep.DepKind)),
            Flags);
        ++P;
      }

      DepArrayPtr = Builder.CreateBitCast(DepArray, Builder.getInt8PtrTy());
      Builder.restoreIP(OldIP);
    }

    // In the presence of the `if` clause, the following IR is generated:
    //    ...
    //    %data = call @__kmpc_omp_task_alloc(...)
    //    br i1 %if_condition, label %then, label %else
    //  then:
    //    call @__kmpc_omp_task(...)
    //    br label %exit
    //  else:
    //    call @__kmpc_omp_task_begin_if0(...)
    //    call @outlined_fn(...)
    //    call @__kmpc_omp_task_complete_if0(...)
    //    br label %exit
    //  exit:
    //    ...
    if (IfCondition) {
      // `SplitBlockAndInsertIfThenElse` requires the block to have a
      // terminator.
      splitBB(Builder, /*CreateBranch=*/true, "if.end");
      Instruction *IfTerminator =
          Builder.GetInsertPoint()->getParent()->getTerminator();
      Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
      Builder.SetInsertPoint(IfTerminator);
      SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
                                    &ElseTI);
      Builder.SetInsertPoint(ElseTI);
      Function *TaskBeginFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
      Function *TaskCompleteFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
      Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
      CallInst *CI = nullptr;
      if (HasShareds)
        CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
      else
        CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
      CI->setDebugLoc(StaleCI->getDebugLoc());
      Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
      Builder.SetInsertPoint(ThenTI);
    }

    if (Dependencies.size()) {
      Function *TaskFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
      Builder.CreateCall(
          TaskFn,
          {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
           DepArrayPtr, ConstantInt::get(Builder.getInt32Ty(), 0),
           ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext()))});
    } else {
      // Emit the @__kmpc_omp_task runtime call to spawn the task
      Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
      Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
    }

    StaleCI->eraseFromParent();

    Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
    if (HasShareds) {
      LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
      OutlinedFn.getArg(1)->replaceUsesWithIf(
          Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
    }

    while (!ToBeDeleted.empty()) {
      ToBeDeleted.top()->eraseFromParent();
      ToBeDeleted.pop();
    }
  };

  addOutlineInfo(std::move(OI));
  Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());

  return Builder.saveIP();
}
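
// As a sketch (sizes depend on the target data layout), for a tied task with
// one dependency the post-outlining rewrite above leaves calls of this shape:
//   %task = call ptr @__kmpc_omp_task_alloc(ptr @loc, i32 %gtid, i32 1,
//                                           i64 %kmp_task_t.size,
//                                           i64 %shareds.size,
//                                           ptr @outlined_fn)
//   ; captured variables are memcpy'd into the shareds area of %task
//   call i32 @__kmpc_omp_task_with_deps(ptr @loc, i32 %gtid, ptr %task,
//                                       i32 1, ptr %.dep.arr.addr,
//                                       i32 0, ptr null)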

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
                                 InsertPointTy AllocaIP,
                                 BodyGenCallbackTy BodyGenCB) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);

  // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
  Function *TaskgroupFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
  Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});

  BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
  BodyGenCB(AllocaIP, Builder.saveIP());

  Builder.SetInsertPoint(TaskgroupExitBB);
  // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
  Function *EndTaskgroupFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
  Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});

  return Builder.saveIP();
}
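
// The resulting region is simply bracketed by the two runtime calls:
//   call void @__kmpc_taskgroup(ptr @loc, i32 %gtid)
//   ; ... body, typically spawning tasks ...
//   call void @__kmpc_end_taskgroup(ptr @loc, i32 %gtid) ; waits for children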

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
  assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");

  if (!updateToLocation(Loc))
    return Loc.IP;

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    // This must be done otherwise any nested constructs using FinalizeOMPRegion
    // will fail because that function requires the Finalization Basic Block to
    // have a terminator, which is already removed by EmitOMPRegionBody.
    // IP is currently at cancelation block.
    // We need to backtrack to the condition block to fetch
    // the exit block and create a branch from cancelation
    // to exit block.
    IRBuilder<>::InsertPointGuard IPG(Builder);
    Builder.restoreIP(IP);
    auto *CaseBB = IP.getBlock()->getSinglePredecessor();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
    Instruction *I = Builder.CreateBr(ExitBB);
    IP = InsertPointTy(I->getParent(), I->getIterator());
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});

  // Each section is emitted as a switch case
  // Each finalization callback is handled from clang.EmitOMPSectionDirective()
  // -> OMP.createSection() which generates the IR for each section
  // Iterate through all sections and emit a switch construct:
  // switch (IV) {
  // case 0:
  //   <SectionStmt[0]>;
  //   break;
  // ...
  // case <NumSection> - 1:
  //   <SectionStmt[<NumSection> - 1]>;
  //   break;
  // }
  // ...
  // section_loop.after:
  // <FiniCB>;
  auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
    Builder.restoreIP(CodeGenIP);
    BasicBlock *Continue =
        splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
    Function *CurFn = Continue->getParent();
    SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);

    unsigned CaseNumber = 0;
    for (auto SectionCB : SectionCBs) {
      BasicBlock *CaseBB = BasicBlock::Create(
          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
      SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
      Builder.SetInsertPoint(CaseBB);
      BranchInst *CaseEndBr = Builder.CreateBr(Continue);
      SectionCB(InsertPointTy(),
                {CaseEndBr->getParent(), CaseEndBr->getIterator()});
      CaseNumber++;
    }
    // remove the existing terminator from body BB since there can be no
    // terminators after switch/case
  };
  // Loop body ends here
  // LowerBound, UpperBound, and Stride for createCanonicalLoop
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  Value *LB = ConstantInt::get(I32Ty, 0);
  Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
  Value *ST = ConstantInt::get(I32Ty, 1);
  llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
      Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
  InsertPointTy AfterIP =
      applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);

  // Apply the finalization callback in LoopAfterBB
  auto FiniInfo = FinalizationStack.pop_back_val();
  assert(FiniInfo.DK == OMPD_sections &&
         "Unexpected finalization stack state!");
  if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
    Builder.restoreIP(AfterIP);
    BasicBlock *FiniBB =
        splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
    CB(Builder.saveIP());
    AfterIP = {FiniBB, FiniBB->begin()};
  }

  return AfterIP;
}
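
// Sketch for two section callbacks: the canonical loop runs IV = 0..1 under a
// static workshare schedule and its body is roughly
//   switch i32 %iv, label %...sections.after [
//     i32 0, label %omp_section_loop.body.case
//     i32 1, label %omp_section_loop.body.case1
//   ]
// so each thread executes exactly the sections whose indices fall into its
// chunk of the iteration space.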

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createSection(const LocationDescription &Loc,
                               BodyGenCallbackTy BodyGenCB,
                               FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    // This must be done otherwise any nested constructs using FinalizeOMPRegion
    // will fail because that function requires the Finalization Basic Block to
    // have a terminator, which is already removed by EmitOMPRegionBody.
    // IP is currently at cancelation block.
    // We need to backtrack to the condition block to fetch
    // the exit block and create a branch from cancelation
    // to exit block.
    IRBuilder<>::InsertPointGuard IPG(Builder);
    Builder.restoreIP(IP);
    auto *CaseBB = Loc.IP.getBlock();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
    Instruction *I = Builder.CreateBr(ExitBB);
    IP = InsertPointTy(I->getParent(), I->getIterator());
    return FiniCB(IP);
  };

  Directive OMPD = Directive::OMPD_sections;
  // Since we are using Finalization Callback here, HasFinalize
  // and IsCancellable have to be true
  return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
                              /*Conditional*/ false, /*hasFinalize*/ true,
                              /*IsCancellable*/ true);
}

/// Create a function with a unique name and a "void (i8*, i8*)" signature in
/// the given module and return it.
Function *getFreshReductionFunc(Module &M) {
  Type *VoidTy = Type::getVoidTy(M.getContext());
  Type *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
  auto *FuncTy =
      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
  return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                          M.getDataLayout().getDefaultGlobalsAddressSpace(),
                          ".omp.reduction.func", &M);
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
  for (const ReductionInfo &RI : ReductionInfos) {
    (void)RI;
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert(RI.ReductionGen && "expected non-null reduction generator callback");
    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
           "expected variables and their private equivalents to have the same "
           "type");
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  }

  if (!updateToLocation(Loc))
    return InsertPointTy();

  BasicBlock *InsertBlock = Loc.IP.getBlock();
  BasicBlock *ContinuationBlock =
      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
  InsertBlock->getTerminator()->eraseFromParent();

  // Create and populate array of type-erased pointers to private reduction
  // values.
  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getInt8PtrTy(), NumReductions);
  Builder.restoreIP(AllocaIP);
  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
    Value *Casted =
        Builder.CreateBitCast(RI.PrivateVariable, Builder.getInt8PtrTy(),
                              "private.red.var." + Twine(Index) + ".casted");
    Builder.CreateStore(Casted, RedArrayElemPtr);
  }

  // Emit a call to the runtime function that orchestrates the reduction.
  // Declare the reduction function in the process.
  Function *Func = Builder.GetInsertBlock()->getParent();
  Module *Module = Func->getParent();
  Value *RedArrayPtr =
      Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr");
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  bool CanGenerateAtomic =
      llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
        return RI.AtomicReductionGen;
      });
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                  CanGenerateAtomic
                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                                      : IdentFlag(0));
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *NumVariables = Builder.getInt32(NumReductions);
  const DataLayout &DL = Module->getDataLayout();
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
  Function *ReductionFunc = getFreshReductionFunc(*Module);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
  Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall =
      Builder.CreateCall(ReduceFunc,
                         {Ident, ThreadId, NumVariables, RedArraySize,
                          RedArrayPtr, ReductionFunc, Lock},
                         "reduce");

  // Create final reduction entry blocks for the atomic and non-atomic case.
  // Emit IR that dispatches control flow to one of the blocks based on the
  // reduction supporting the atomic mode.
  BasicBlock *NonAtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
  BasicBlock *AtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
  SwitchInst *Switch =
      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);

  // Populate the non-atomic reduction using the elementwise reduction function.
  // This loads the elements from the global and private variables and reduces
  // them before storing back the result to the global variable.
  Builder.SetInsertPoint(NonAtomicRedBlock);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Type *ValueType = RI.ElementType;
    Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                         "red.value." + Twine(En.index()));
    Value *PrivateRedValue =
        Builder.CreateLoad(ValueType, RI.PrivateVariable,
                           "red.private.value." + Twine(En.index()));
    Value *Reduced;
    Builder.restoreIP(
        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    Builder.CreateStore(Reduced, RI.Variable);
  }
  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
  Builder.CreateBr(ContinuationBlock);

  // Populate the atomic reduction using the atomic elementwise reduction
  // function. There are no loads/stores here because they will be happening
  // inside the atomic elementwise reduction.
  Builder.SetInsertPoint(AtomicRedBlock);
  if (CanGenerateAtomic) {
    for (const ReductionInfo &RI : ReductionInfos) {
      Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
                                              RI.Variable, RI.PrivateVariable));
      if (!Builder.GetInsertBlock())
        return InsertPointTy();
    }
    Builder.CreateBr(ContinuationBlock);
  } else {
    Builder.CreateUnreachable();
  }

  // Populate the outlined reduction function using the elementwise reduction
  // function. Partial values are extracted from the type-erased array of
  // pointers to private variables.
  BasicBlock *ReductionFuncBlock =
      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
  Builder.SetInsertPoint(ReductionFuncBlock);
  Value *LHSArrayPtr = ReductionFunc->getArg(0);
  Value *RHSArrayPtr = ReductionFunc->getArg(1);

  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, LHSArrayPtr, 0, En.index());
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RHSArrayPtr, 0, En.index());
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getInt8PtrTy(), RHSI8PtrPtr);
    Value *RHSPtr =
        Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
    Value *Reduced;
    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    Builder.CreateStore(Reduced, LHSPtr);
  }
  Builder.CreateRetVoid();

  Builder.SetInsertPoint(ContinuationBlock);
  return Builder.saveIP();
}
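
// The runtime protocol emitted above, in outline:
//   %res = call i32 @__kmpc_reduce[_nowait](loc, gtid, n, size, array,
//                                           @.omp.reduction.func, lock)
//   switch %res: 1 -> non-atomic elementwise reduce, then
//                     @__kmpc_end_reduce[_nowait]
//                2 -> atomic elementwise reduce (only if every ReductionInfo
//                     provides an AtomicReductionGen)
//          default -> reduce.finalize (this thread does not reduce)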

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_master;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}
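
// With /*Conditional*/ true, EmitOMPInlinedRegion turns the entry call into a
// guard, so the emitted IR looks roughly like (block names illustrative):
//   %res = call i32 @__kmpc_master(ptr @loc, i32 %gtid)
//   %guard = icmp ne i32 %res, 0
//   br i1 %guard, label %master.body, label %master.end
// with @__kmpc_end_master emitted when control leaves the body.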

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB, Value *Filter) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_masked;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, Filter};
  Value *ArgsEnd[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}

CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
    BasicBlock *PostInsertBefore, const Twine &Name) {
  Module *M = F->getParent();
  LLVMContext &Ctx = M->getContext();
  Type *IndVarTy = TripCount->getType();

  // Create the basic block structure.
  BasicBlock *Preheader =
      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
  BasicBlock *Header =
      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
  BasicBlock *Cond =
      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
  BasicBlock *Body =
      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
  BasicBlock *Latch =
      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
  BasicBlock *Exit =
      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
  BasicBlock *After =
      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);

  // Use specified DebugLoc for new instructions.
  Builder.SetCurrentDebugLocation(DL);

  Builder.SetInsertPoint(Preheader);
  Builder.CreateBr(Header);

  Builder.SetInsertPoint(Header);
  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
  Builder.CreateBr(Cond);

  Builder.SetInsertPoint(Cond);
  Value *Cmp =
      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
  Builder.CreateCondBr(Cmp, Body, Exit);

  Builder.SetInsertPoint(Body);
  Builder.CreateBr(Latch);

  Builder.SetInsertPoint(Latch);
  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
                                  "omp_" + Name + ".next", /*HasNUW=*/true);
  Builder.CreateBr(Header);
  IndVarPHI->addIncoming(Next, Latch);

  Builder.SetInsertPoint(Exit);
  Builder.CreateBr(After);

  // Remember and return the canonical control flow.
  LoopInfos.emplace_front();
  CanonicalLoopInfo *CL = &LoopInfos.front();

  CL->Header = Header;
  CL->Cond = Cond;
  CL->Latch = Latch;
  CL->Exit = Exit;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
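
// The skeleton created above has the following shape, for trip count %tc:
//   omp_<name>.preheader:
//     br label %omp_<name>.header
//   omp_<name>.header:
//     %iv = phi [ 0, %preheader ], [ %next, %inc ]
//     br label %omp_<name>.cond
//   omp_<name>.cond:
//     %cmp = icmp ult %iv, %tc
//     br i1 %cmp, label %body, label %exit
//   omp_<name>.body:
//     br label %omp_<name>.inc
//   omp_<name>.inc:
//     %next = add nuw %iv, 1
//     br label %omp_<name>.header
//   omp_<name>.exit:
//     br label %omp_<name>.after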

CanonicalLoopInfo *
OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
                                     LoopBodyGenCallbackTy BodyGenCB,
                                     Value *TripCount, const Twine &Name) {
  BasicBlock *BB = Loc.IP.getBlock();
  BasicBlock *NextBB = BB->getNextNode();

  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
                                             NextBB, NextBB, Name);
  BasicBlock *After = CL->getAfter();

  // If location is not set, don't connect the loop.
  if (updateToLocation(Loc)) {
    // Split the loop at the insertion point: Branch to the preheader and move
    // every following instruction to after the loop (the After BB). Also, the
    // new successor is the loop's after block.
    spliceBB(Builder, After, /*CreateBranch=*/false);
    Builder.CreateBr(CL->getPreheader());
  }

  // Emit the body content. We do it after connecting the loop to the CFG to
  // avoid that the callback encounters degenerate BBs.
  BodyGenCB(CL->getBodyIP(), CL->getIndVar());

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}

CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
    const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
    InsertPointTy ComputeIP, const Twine &Name) {

  // Consider the following difficulties (assuming 8-bit signed integers):
  //  * Adding \p Step to the loop counter which passes \p Stop may overflow:
  //      DO I = 1, 100, 50
  //  * A \p Step of INT_MIN cannot be normalized to a positive direction:
  //      DO I = 100, 0, -128

  // Start, Stop and Step must be of the same integer type.
  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");

  LocationDescription ComputeLoc =
      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
  updateToLocation(ComputeLoc);

  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
  ConstantInt *One = ConstantInt::get(IndVarTy, 1);

  // Like Step, but always positive.
  Value *Incr = Step;

  // Distance between Start and Stop; always positive.
  Value *Span;

  // Condition whether no iterations are executed at all, e.g. because UB < LB.
  Value *ZeroCmp;

  if (IsSigned) {
    // Ensure that increment is positive. If not, negate and invert LB and UB.
    Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
    Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
    Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
    Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
    Span = Builder.CreateSub(UB, LB, "", false, true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
  } else {
    Span = Builder.CreateSub(Stop, Start, "", true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
  }

  Value *CountIfLooping;
  if (InclusiveStop) {
    CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
  } else {
    // Avoid incrementing past stop since it could overflow.
    Value *CountIfTwo = Builder.CreateAdd(
        Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
    Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
    CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
  }

  Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
                                          "omp_" + Name + ".tripcount");

  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
    Builder.restoreIP(CodeGenIP);
    Value *Span = Builder.CreateMul(IV, Step);
    Value *IndVar = Builder.CreateAdd(Span, Start);
    BodyGenCB(Builder.saveIP(), IndVar);
  };
  LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
  return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
}
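
// Worked example (unsigned, exclusive stop): Start=0, Stop=10, Step=3 gives
// Span = 10, ZeroCmp = (10 ule 0) = false, OneCmp = (10 ule 3) = false and
// CountIfTwo = (10 - 1) / 3 + 1 = 4, so TripCount = 4; the body then runs for
// the induction values 0, 3, 6 and 9.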

// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling depending on `type`. Only i32 and i64 are supported by the
// runtime. Always interpret integers as unsigned similarly to
// CanonicalLoopInfo.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
                                                  OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                          InsertPointTy AllocaIP,
                                          bool NeedsBarrier) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");

  // Set up the source location value for OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Constant *Zero = ConstantInt::get(IVTy, 0);
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(Zero, PLowerBound);
  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  Builder.CreateCall(StaticInit,
                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
                      PUpperBound, PStride, One, Zero});
  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
  CLI->setTripCount(TripCount);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
    Builder.SetInsertPoint(CLI->getBody(),
                           CLI->getBody()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(DL);
    return Builder.CreateAdd(OldIV, LowerBound);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(CLI->getExit(),
                         CLI->getExit()->getTerminator()->getIterator());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier)
    createBarrier(LocationDescription(Builder.saveIP(), DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);

  InsertPointTy AfterIP = CLI->getAfterIP();
  CLI->invalidate();

  return AfterIP;
}
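
// Worked example: with a trip count of 100 and a team of 4 threads, the
// runtime typically hands thread 2 the inclusive bounds [50, 74]; the loop
// then runs with the updated trip count 25, and every body use of the
// induction variable is remapped to `iv + 50`.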

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, Value *ChunkSize) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(ChunkSize && "Chunk size is required");

  LLVMContext &Ctx = CLI->getFunction()->getContext();
  Value *IV = CLI->getIndVar();
  Value *OrigTripCount = CLI->getTripCount();
  Type *IVTy = IV->getType();
  assert(IVTy->getIntegerBitWidth() <= 64 &&
         "Max supported tripcount bitwidth is 64 bits");
  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
                                                        : Type::getInt64Ty(Ctx);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Constant *Zero = ConstantInt::get(InternalIVTy, 0);
  Constant *One = ConstantInt::get(InternalIVTy, 1);

  // Declare useful OpenMP runtime functions.
  FunctionCallee StaticInit =
      getKmpcForStaticInitForType(InternalIVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Builder.SetCurrentDebugLocation(DL);
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
  Value *PUpperBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");

  // Set up the source location value for the OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // TODO: Detect overflow in ubsan or max-out with current tripcount.
  Value *CastedChunkSize =
      Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
  Value *CastedTripCount =
      Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
  Builder.CreateStore(Zero, PLowerBound);
  Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
  Builder.CreateStore(OrigUpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
  Builder.CreateCall(StaticInit,
                     {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
                      /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
                      /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
                      /*pstride=*/PStride, /*incr=*/One,
                      /*chunk=*/CastedChunkSize});

  // Load values written by the "init" function.
  Value *FirstChunkStart =
      Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
  Value *FirstChunkStop =
      Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
  Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
  Value *ChunkRange =
      Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
  Value *NextChunkStride =
      Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");

  // Create outer "dispatch" loop for enumerating the chunks.
  BasicBlock *DispatchEnter = splitBB(Builder, true);
  Value *DispatchCounter;
  CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
      {Builder.saveIP(), DL},
      [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
      FirstChunkStart, CastedTripCount, NextChunkStride,
      /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
      "dispatch");

  // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
  // not have to preserve the canonical invariant.
  BasicBlock *DispatchBody = DispatchCLI->getBody();
  BasicBlock *DispatchLatch = DispatchCLI->getLatch();
  BasicBlock *DispatchExit = DispatchCLI->getExit();
  BasicBlock *DispatchAfter = DispatchCLI->getAfter();
  DispatchCLI->invalidate();

  // Rewire the original loop to become the chunk loop inside the dispatch loop.
  redirectTo(DispatchAfter, CLI->getAfter(), DL);
  redirectTo(CLI->getExit(), DispatchLatch, DL);
  redirectTo(DispatchBody, DispatchEnter, DL);

  // Prepare the prolog of the chunk loop.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // Compute the number of iterations of the chunk loop.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
  Value *IsLastChunk =
      Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
  Value *CountUntilOrigTripCount =
      Builder.CreateSub(CastedTripCount, DispatchCounter);
  Value *ChunkTripCount = Builder.CreateSelect(
      IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
  Value *BackcastedChunkTC =
      Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
  CLI->setTripCount(BackcastedChunkTC);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  Value *BackcastedDispatchCounter =
      Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
  CLI->mapIndVar([&](Instruction *) -> Value * {
    Builder.restoreIP(CLI->getBodyIP());
    return Builder.CreateAdd(IV, BackcastedDispatchCounter);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier)
    createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
                  /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);

#ifndef NDEBUG
  // Even though we currently do not support applying additional methods to it,
  // the chunk loop should remain a canonical loop.
  CLI->assertOK();
#endif

  return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
}
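
// The generated nest corresponds to (roughly, in C-like pseudocode):
//   for (disp = firstchunk.lb; disp < tripcount; disp += dispatch.stride)
//     for (iv = 0; iv < min(chunk.range, tripcount - disp); ++iv)
//       body(disp + iv);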

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind,
    llvm::Value *ChunkSize, bool HasSimdModifier, bool HasMonotonicModifier,
    bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
      SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                   OMPScheduleType::ModifierOrdered;
  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
  case OMPScheduleType::BaseStatic:
    assert(!ChunkSize && "No chunk size with static-chunked schedule");
    if (IsOrdered)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);

  case OMPScheduleType::BaseStaticChunked:
    if (IsOrdered)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
                                           ChunkSize);

  case OMPScheduleType::BaseRuntime:
  case OMPScheduleType::BaseAuto:
  case OMPScheduleType::BaseGreedy:
  case OMPScheduleType::BaseBalanced:
  case OMPScheduleType::BaseSteal:
  case OMPScheduleType::BaseGuidedSimd:
  case OMPScheduleType::BaseRuntimeSimd:
    assert(!ChunkSize &&
           "schedule type does not support user-defined chunk sizes");
    [[fallthrough]];
  case OMPScheduleType::BaseDynamicChunked:
  case OMPScheduleType::BaseGuidedChunked:
  case OMPScheduleType::BaseGuidedIterativeChunked:
  case OMPScheduleType::BaseGuidedAnalyticalChunked:
  case OMPScheduleType::BaseStaticBalancedChunked:
    return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                     NeedsBarrier, ChunkSize);

  default:
    llvm_unreachable("Unknown/unimplemented schedule kind");
  }
}

/// Returns an LLVM function to call for initializing loop bounds using OpenMP
/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
/// the runtime. Always interpret integers as unsigned similarly to
/// CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

/// Returns an LLVM function to call for fetching the next loop chunk using
/// OpenMP dynamic scheduling depending on `type`. Only i32 and i64 are
/// supported by the runtime. Always interpret integers as unsigned similarly
/// to CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

/// Returns an LLVM function to call for finalizing the dynamic loop, depending
/// on `type`. Only i32 and i64 are supported by the runtime. Always interpret
/// integers as unsigned similarly to CanonicalLoopInfo.
static FunctionCallee
getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");
  assert(isValidWorkshareLoopScheduleType(SchedType) &&
         "Require valid schedule type");

  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
                 OMPScheduleType::ModifierOrdered;

  // Set up the source location value for OpenMP runtime.
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(One, PLowerBound);
  Value *UpperBound = CLI->getTripCount();
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  BasicBlock *Latch = CLI->getLatch();
  InsertPointTy AfterIP = CLI->getAfterIP();

  // The CLI will be "broken" in the code below, as the loop is no longer
  // a valid canonical loop.

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function.
  Builder.CreateCall(DynamicInit,
                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
                      UpperBound, /* step */ One, Chunk});

  // An outer loop around the existing one.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  // This needs to be 32-bit always, so can't use the IVTy Zero above.
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  Value *Res =
      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
                                       PLowerBound, PUpperBound, PStride});
  Constant *Zero32 = ConstantInt::get(I32Type, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  Value *LowerBound =
      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // Change PHI-node in loop header to use outer cond rather than preheader,
  // and set IV to the LowerBound.
  Instruction *Phi = &Header->front();
  auto *PI = cast<PHINode>(Phi);
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  // Then set the pre-header to jump to the OuterCond
  Instruction *Term = PreHeader->getTerminator();
  auto *Br = cast<BranchInst>(Term);
  Br->setSuccessor(0, OuterCond);

  // Modify the inner condition:
  // * Use the UpperBound returned from the DynamicNext call.
  // * Jump to the outer loop when done with one of the inner loops.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
  Instruction *Comp = &*Builder.GetInsertPoint();
  auto *CI = cast<CmpInst>(Comp);
  CI->setOperand(1, UpperBound);
  // Redirect the inner exit to branch to outer condition.
  Instruction *Branch = &Cond->back();
  auto *BI = cast<BranchInst>(Branch);
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);

  // Call the "fini" function if "ordered" is present in wsloop directive.
  if (Ordered) {
    Builder.SetInsertPoint(&Latch->back());
    FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
    Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
  }

  // Add the barrier if requested.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    createBarrier(LocationDescription(Builder.saveIP(), DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  }

  CLI->invalidate();
  return AfterIP;
}
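
// After this rewiring, execution follows (roughly, in C-like pseudocode):
//   __kmpc_dispatch_init(loc, gtid, sched, /*lb=*/1, /*ub=*/tripcount, 1,
//                        chunk);
//   while (__kmpc_dispatch_next(loc, gtid, &last, &lb, &ub, &stride))
//     for (iv = lb - 1; iv < ub; ++iv) // runtime bounds are 1-based, inclusive
//       body(iv);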

/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
/// after this \p OldTarget will be orphaned.
static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
                                      BasicBlock *NewTarget, DebugLoc DL) {
  for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
    redirectTo(Pred, NewTarget, DL);
}

/// Determine which blocks in \p BBs are reachable from outside and remove the
/// ones that are not reachable from the function.
static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
  SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
    for (Use &U : BB->uses()) {
      auto *UseInst = dyn_cast<Instruction>(U.getUser());
      if (!UseInst)
        continue;
      if (BBsToErase.count(UseInst->getParent()))
        continue;
      return true;
    }
    return false;
  };

  while (true) {
    bool Changed = false;
    for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
      if (HasRemainingUses(BB)) {
        BBsToErase.erase(BB);
        Changed = true;
      }
    }
    if (!Changed)
      break;
  }

  SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
  DeleteDeadBlocks(BBVec);
}

CanonicalLoopInfo *
OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                               InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Setup the IRBuilder for inserting the trip count computation.
  Builder.SetCurrentDebugLocation(DL);
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed loop's trip count.
  // TODO: Find common/largest indvar type.
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
                                           {}, /*HasNUW=*/true);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop uses the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // Outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following in direction of
  // the control flow: from the leading in-between code, the loop nest body, the
  // trailing in-between code, to rejoining the collapsed loop's latch.
  // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
  // the ContinueBlock is set, continue with that block. If ContinuePred, use
  // its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // than the original loop. More sophisticated schemes could keep track of what
  // the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}
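
// Worked example: collapsing two loops with trip counts 3 (outer) and 4
// (inner) produces one loop with trip count 12 whose original induction
// variables are recovered as iv_inner = iv urem 4 and iv_outer = iv udiv 4.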

std::vector<CanonicalLoopInfo *>
OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                           ArrayRef<Value *> TileSizes) {
  assert(TileSizes.size() == Loops.size() &&
         "Must pass as many tile sizes as there are loops");
  int NumLoops = Loops.size();
  assert(NumLoops >= 1 && "At least one loop to tile required");

  CanonicalLoopInfo *OutermostLoop = Loops.front();
  CanonicalLoopInfo *InnermostLoop = Loops.back();
  Function *F = OutermostLoop->getBody()->getParent();
  BasicBlock *InnerEnter = InnermostLoop->getBody();
  BasicBlock *InnerLatch = InnermostLoop->getLatch();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Collect original trip counts and induction variable to be accessible by
  // index. Also, the structure of the original loops is not preserved during
  // the construction of the tiled loops, so do it before we scavenge the BBs of
  // any original CanonicalLoopInfo.
  SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All input loops must be valid canonical loops");
    OrigTripCounts.push_back(L->getTripCount());
    OrigIndVars.push_back(L->getIndVar());
  }

  // Collect the code between loop headers. These may contain SSA definitions
  // that are used in the loop nest body. To be usable within the innermost
  // body, these BasicBlocks will be sunk into the loop nest body. That is,
  // these instructions may be executed more often than before the tiling.
  // TODO: It would be sufficient to only sink them into the body of the
  // corresponding tile loop.
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
  for (int i = 0; i < NumLoops - 1; ++i) {
    CanonicalLoopInfo *Surrounding = Loops[i];
    CanonicalLoopInfo *Nested = Loops[i + 1];

    BasicBlock *EnterBB = Surrounding->getBody();
    BasicBlock *ExitBB = Nested->getHeader();
    InbetweenCode.emplace_back(EnterBB, ExitBB);
  }

  // Compute the trip counts of the floor loops.
  Builder.SetCurrentDebugLocation(DL);
  Builder.restoreIP(OutermostLoop->getPreheaderIP());
  SmallVector<Value *, 4> FloorCount, FloorRems;
  for (int i = 0; i < NumLoops; ++i) {
    Value *TileSize = TileSizes[i];
    Value *OrigTripCount = OrigTripCounts[i];
    Type *IVType = OrigTripCount->getType();

    Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
    Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);

    // 0 if tripcount divides the tilesize, 1 otherwise.
    // 1 means we need an additional iteration for a partial tile.
    //
    // Unfortunately we cannot just use the roundup-formula
    //   (tripcount + tilesize - 1)/tilesize
    // because the summation might overflow. We do not want to introduce
    // undefined behavior when the untiled loop nest did not.
    Value *FloorTripOverflow =
        Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));

    FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
    FloorTripCount =
        Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
                          "omp_floor" + Twine(i) + ".tripcount", true);

    // Remember some values for later use.
    FloorCount.push_back(FloorTripCount);
    FloorRems.push_back(FloorTripRem);
  }
3014 // Generate the new loop nest, from the outermost to the innermost.
3015 std::vector
<CanonicalLoopInfo
*> Result
;
3016 Result
.reserve(NumLoops
* 2);
3018 // The basic block of the surrounding loop that enters the nest generated
3020 BasicBlock
*Enter
= OutermostLoop
->getPreheader();
3022 // The basic block of the surrounding loop where the inner code should
3024 BasicBlock
*Continue
= OutermostLoop
->getAfter();
3026 // Where the next loop basic block should be inserted.
3027 BasicBlock
*OutroInsertBefore
= InnermostLoop
->getExit();
3029 auto EmbeddNewLoop
=
3030 [this, DL
, F
, InnerEnter
, &Enter
, &Continue
, &OutroInsertBefore
](
3031 Value
*TripCount
, const Twine
&Name
) -> CanonicalLoopInfo
* {
3032 CanonicalLoopInfo
*EmbeddedLoop
= createLoopSkeleton(
3033 DL
, TripCount
, F
, InnerEnter
, OutroInsertBefore
, Name
);
3034 redirectTo(Enter
, EmbeddedLoop
->getPreheader(), DL
);
3035 redirectTo(EmbeddedLoop
->getAfter(), Continue
, DL
);
3037 // Setup the position where the next embedded loop connects to this loop.
3038 Enter
= EmbeddedLoop
->getBody();
3039 Continue
= EmbeddedLoop
->getLatch();
3040 OutroInsertBefore
= EmbeddedLoop
->getLatch();
3041 return EmbeddedLoop
;
3044 auto EmbeddNewLoops
= [&Result
, &EmbeddNewLoop
](ArrayRef
<Value
*> TripCounts
,
3045 const Twine
&NameBase
) {
3046 for (auto P
: enumerate(TripCounts
)) {
3047 CanonicalLoopInfo
*EmbeddedLoop
=
3048 EmbeddNewLoop(P
.value(), NameBase
+ Twine(P
.index()));
3049 Result
.push_back(EmbeddedLoop
);
3053 EmbeddNewLoops(FloorCount
, "floor");
3055 // Within the innermost floor loop, emit the code that computes the tile
3057 Builder
.SetInsertPoint(Enter
->getTerminator());
3058 SmallVector
<Value
*, 4> TileCounts
;
3059 for (int i
= 0; i
< NumLoops
; ++i
) {
3060 CanonicalLoopInfo
*FloorLoop
= Result
[i
];
3061 Value
*TileSize
= TileSizes
[i
];
3063 Value
*FloorIsEpilogue
=
3064 Builder
.CreateICmpEQ(FloorLoop
->getIndVar(), FloorCount
[i
]);
3065 Value
*TileTripCount
=
3066 Builder
.CreateSelect(FloorIsEpilogue
, FloorRems
[i
], TileSize
);
3068 TileCounts
.push_back(TileTripCount
);
3071 // Create the tile loops.
3072 EmbeddNewLoops(TileCounts
, "tile");
3074 // Insert the inbetween code into the body.
3075 BasicBlock
*BodyEnter
= Enter
;
3076 BasicBlock
*BodyEntered
= nullptr;
3077 for (std::pair
<BasicBlock
*, BasicBlock
*> P
: InbetweenCode
) {
3078 BasicBlock
*EnterBB
= P
.first
;
3079 BasicBlock
*ExitBB
= P
.second
;
3082 redirectTo(BodyEnter
, EnterBB
, DL
);
3084 redirectAllPredecessorsTo(BodyEntered
, EnterBB
, DL
);
3086 BodyEnter
= nullptr;
3087 BodyEntered
= ExitBB
;
3090 // Append the original loop nest body into the generated loop nest body.
3092 redirectTo(BodyEnter
, InnerEnter
, DL
);
3094 redirectAllPredecessorsTo(BodyEntered
, InnerEnter
, DL
);
3095 redirectAllPredecessorsTo(InnerLatch
, Continue
, DL
);
3097 // Replace the original induction variable with an induction variable computed
3098 // from the tile and floor induction variables.
3099 Builder
.restoreIP(Result
.back()->getBodyIP());
3100 for (int i
= 0; i
< NumLoops
; ++i
) {
3101 CanonicalLoopInfo
*FloorLoop
= Result
[i
];
3102 CanonicalLoopInfo
*TileLoop
= Result
[NumLoops
+ i
];
3103 Value
*OrigIndVar
= OrigIndVars
[i
];
3104 Value
*Size
= TileSizes
[i
];
3107 Builder
.CreateMul(Size
, FloorLoop
->getIndVar(), {}, /*HasNUW=*/true);
3109 Builder
.CreateAdd(Scale
, TileLoop
->getIndVar(), {}, /*HasNUW=*/true);
3110 OrigIndVar
->replaceAllUsesWith(Shift
);
3113 // Remove unused parts of the original loops.
3114 removeUnusedBlocksFromParent(OldControlBBs
);
3116 for (CanonicalLoopInfo
*L
: Loops
)
3120 for (CanonicalLoopInfo
*GenL
: Result
)
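// Example of the tiling arithmetic above, with an original trip count %tc and
// a tile size of 4 (names and the constant are illustrative only):
//   %floor.tc  = udiv i32 %tc, 4                  ; complete tiles
//   %rem       = urem i32 %tc, 4                  ; size of the partial tile
//   %overflow  = zext (icmp ne i32 %rem, 0)       ; extra iteration needed?
//   %floor.cnt = add nuw i32 %floor.tc, %overflow ; "omp_floor0.tripcount"
// Inside the generated nest, the original induction variable is recovered as
//   %orig.iv = add nuw (mul nuw i32 4, %floor.iv), %tile.iv
// so all uses of the untiled induction variable keep their meaning.
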
/// Attach metadata \p Properties to the basic block described by \p BB. If the
/// basic block already has metadata, the basic block properties are appended.
static void addBasicBlockMetadata(BasicBlock *BB,
                                  ArrayRef<Metadata *> Properties) {
  // Nothing to do if no property to attach.
  if (Properties.empty())
    return;

  LLVMContext &Ctx = BB->getContext();
  SmallVector<Metadata *> NewProperties;
  NewProperties.push_back(nullptr);

  // If the basic block already has metadata, prepend it to the new metadata.
  MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
  if (Existing)
    append_range(NewProperties, drop_begin(Existing->operands(), 1));

  append_range(NewProperties, Properties);
  MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
  BasicBlockID->replaceOperandWith(0, BasicBlockID);

  BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
}

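// For illustration, attaching {!"llvm.loop.unroll.enable"} to a latch that
// already carries !llvm.loop !0 with !0 = distinct !{!0, !1} results in
// roughly (metadata numbering illustrative):
//   br label %header, !llvm.loop !2
//   !2 = distinct !{!2, !1, !3}
//   !3 = !{!"llvm.loop.unroll.enable"}
// The first operand of the distinct node points at the node itself, which is
// the required self-referential form for loop metadata.
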
/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
/// loop already has metadata, the loop properties are appended.
static void addLoopMetadata(CanonicalLoopInfo *Loop,
                            ArrayRef<Metadata *> Properties) {
  assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");

  // Attach metadata to the loop's latch.
  BasicBlock *Latch = Loop->getLatch();
  assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
  addBasicBlockMetadata(Latch, Properties);
}

/// Attach llvm.access.group metadata to the memref instructions of \p Block
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
                            LoopInfo &LI) {
  for (Instruction &I : *Block) {
    if (I.mayReadOrWriteMemory()) {
      // TODO: This instruction may already have an access group from other
      // pragmas, e.g. #pragma clang loop vectorize. Append so that the
      // existing metadata is not overwritten.
      I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
    }
  }
}

void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
             MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
}

void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
  LLVMContext &Ctx = Builder.getContext();
  addLoopMetadata(
      Loop, {
                MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
            });
}

void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                      Value *IfCond, ValueToValueMapTy &VMap,
                                      const Twine &NamePrefix) {
  Function *F = CanonicalLoop->getFunction();

  // Define where the if branch should be inserted
  Instruction *SplitBefore;
  if (Instruction::classof(IfCond)) {
    SplitBefore = dyn_cast<Instruction>(IfCond);
  } else {
    SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
  }

  // TODO: We should not rely on pass manager. Currently we use pass manager
  // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
  // object. We should have a method which returns all blocks between
  // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  // Get the loop which needs to be cloned
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());

  // Create additional blocks for the if statement
  BasicBlock *Head = SplitBefore->getParent();
  Instruction *HeadOldTerm = Head->getTerminator();
  llvm::LLVMContext &C = Head->getContext();
  llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
  llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
      C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());

  // Create if condition branch.
  Builder.SetInsertPoint(HeadOldTerm);
  Instruction *BrInstr =
      Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
  InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
  // Then block contains branch to omp loop which needs to be vectorized
  spliceBB(IP, ThenBlock, false);
  ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);

  Builder.SetInsertPoint(ElseBlock);

  // Clone loop for the else branch
  SmallVector<BasicBlock *, 8> NewBlocks;

  VMap[CanonicalLoop->getPreheader()] = ElseBlock;
  for (BasicBlock *Block : L->getBlocks()) {
    BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
    NewBB->moveBefore(CanonicalLoop->getExit());
    VMap[Block] = NewBB;
    NewBlocks.push_back(NewBB);
  }
  remapInstructionsInBlocks(NewBlocks, VMap);
  Builder.CreateBr(NewBlocks.front());
}

unsigned
OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
                                           const StringMap<bool> &Features) {
  if (TargetTriple.isX86()) {
    if (Features.lookup("avx512f"))
      return 512;
    else if (Features.lookup("avx"))
      return 256;
    return 128;
  }
  if (TargetTriple.isPPC())
    return 128;
  if (TargetTriple.isWasm())
    return 128;
  return 0;
}

*CanonicalLoop
,
3269 MapVector
<Value
*, Value
*> AlignedVars
,
3270 Value
*IfCond
, OrderKind Order
,
3271 ConstantInt
*Simdlen
, ConstantInt
*Safelen
) {
3272 LLVMContext
&Ctx
= Builder
.getContext();
3274 Function
*F
= CanonicalLoop
->getFunction();
3276 // TODO: We should not rely on pass manager. Currently we use pass manager
3277 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
3278 // object. We should have a method which returns all blocks between
3279 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
3280 FunctionAnalysisManager FAM
;
3281 FAM
.registerPass([]() { return DominatorTreeAnalysis(); });
3282 FAM
.registerPass([]() { return LoopAnalysis(); });
3283 FAM
.registerPass([]() { return PassInstrumentationAnalysis(); });
3286 LoopInfo
&&LI
= LIA
.run(*F
, FAM
);
3288 Loop
*L
= LI
.getLoopFor(CanonicalLoop
->getHeader());
3289 if (AlignedVars
.size()) {
3290 InsertPointTy IP
= Builder
.saveIP();
3291 Builder
.SetInsertPoint(CanonicalLoop
->getPreheader()->getTerminator());
3292 for (auto &AlignedItem
: AlignedVars
) {
3293 Value
*AlignedPtr
= AlignedItem
.first
;
3294 Value
*Alignment
= AlignedItem
.second
;
3295 Builder
.CreateAlignmentAssumption(F
->getParent()->getDataLayout(),
3296 AlignedPtr
, Alignment
);
3298 Builder
.restoreIP(IP
);
3302 ValueToValueMapTy VMap
;
3303 createIfVersion(CanonicalLoop
, IfCond
, VMap
, "simd");
3304 // Add metadata to the cloned loop which disables vectorization
3305 Value
*MappedLatch
= VMap
.lookup(CanonicalLoop
->getLatch());
3306 assert(MappedLatch
&&
3307 "Cannot find value which corresponds to original loop latch");
3308 assert(isa
<BasicBlock
>(MappedLatch
) &&
3309 "Cannot cast mapped latch block value to BasicBlock");
3310 BasicBlock
*NewLatchBlock
= dyn_cast
<BasicBlock
>(MappedLatch
);
3311 ConstantAsMetadata
*BoolConst
=
3312 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx
)));
3313 addBasicBlockMetadata(
3315 {MDNode::get(Ctx
, {MDString::get(Ctx
, "llvm.loop.vectorize.enable"),
3319 SmallSet
<BasicBlock
*, 8> Reachable
;
3321 // Get the basic blocks from the loop in which memref instructions
3323 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
3324 // preferably without running any passes.
3325 for (BasicBlock
*Block
: L
->getBlocks()) {
3326 if (Block
== CanonicalLoop
->getCond() ||
3327 Block
== CanonicalLoop
->getHeader())
3329 Reachable
.insert(Block
);
3332 SmallVector
<Metadata
*> LoopMDList
;
3334 // In presence of finite 'safelen', it may be unsafe to mark all
3335 // the memory instructions parallel, because loop-carried
3336 // dependences of 'safelen' iterations are possible.
3337 // If clause order(concurrent) is specified then the memory instructions
3338 // are marked parallel even if 'safelen' is finite.
3339 if ((Safelen
== nullptr) || (Order
== OrderKind::OMP_ORDER_concurrent
)) {
3340 // Add access group metadata to memory-access instructions.
3341 MDNode
*AccessGroup
= MDNode::getDistinct(Ctx
, {});
3342 for (BasicBlock
*BB
: Reachable
)
3343 addSimdMetadata(BB
, AccessGroup
, LI
);
3344 // TODO: If the loop has existing parallel access metadata, have
3345 // to combine two lists.
3346 LoopMDList
.push_back(MDNode::get(
3347 Ctx
, {MDString::get(Ctx
, "llvm.loop.parallel_accesses"), AccessGroup
}));
3350 // Use the above access group metadata to create loop level
3351 // metadata, which should be distinct for each loop.
3352 ConstantAsMetadata
*BoolConst
=
3353 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx
)));
3354 LoopMDList
.push_back(MDNode::get(
3355 Ctx
, {MDString::get(Ctx
, "llvm.loop.vectorize.enable"), BoolConst
}));
3357 if (Simdlen
|| Safelen
) {
3358 // If both simdlen and safelen clauses are specified, the value of the
3359 // simdlen parameter must be less than or equal to the value of the safelen
3360 // parameter. Therefore, use safelen only in the absence of simdlen.
3361 ConstantInt
*VectorizeWidth
= Simdlen
== nullptr ? Safelen
: Simdlen
;
3362 LoopMDList
.push_back(
3363 MDNode::get(Ctx
, {MDString::get(Ctx
, "llvm.loop.vectorize.width"),
3364 ConstantAsMetadata::get(VectorizeWidth
)}));
3367 addLoopMetadata(CanonicalLoop
, LoopMDList
);
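// For illustration, a loop under '#pragma omp simd simdlen(8)' (no safelen,
// default order) ends up with latch metadata roughly like (numbering
// illustrative):
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}
// while each memory access in the loop body carries !llvm.access.group !4.
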
/// Create the TargetMachine object to query the backend for optimization
/// preferences.
///
/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
/// needed for the LLVM pass pipeline. We use some default options to avoid
/// having to pass too many settings from the frontend that probably do not
/// matter.
///
/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
/// method. If we are going to use TargetMachine for more purposes, especially
/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
/// might become worth requiring front-ends to pass on their TargetMachine, or
/// at least cache it between methods. Note that while frontends such as Clang
/// have just a single main TargetMachine per translation unit, the
/// "target-cpu" and "target-features" attributes that determine the
/// TargetMachine are per-function and can be overridden using
/// __attribute__((target("OPTIONS"))).
static std::unique_ptr<TargetMachine>
createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
  Module *M = F->getParent();

  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
  const std::string &Triple = M->getTargetTriple();

  std::string Error;
  const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
  if (!TheTarget)
    return {};

  llvm::TargetOptions Options;
  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
      Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
      /*CodeModel=*/std::nullopt, OptLevel));
}

/// Heuristically determine the best-performant unroll factor for \p CLI. This
/// depends on the target processor. We are re-using the same heuristics as the
/// LoopUnrollPass.
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
  Function *F = CLI->getFunction();

  // Assume the user requests the most aggressive unrolling, even if the rest
  // of the code is optimized using a lower setting.
  CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
  std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);

  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return TargetLibraryAnalysis(); });
  FAM.registerPass([]() { return AssumptionAnalysis(); });
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
  TargetIRAnalysis TIRA;
  if (TM)
    TIRA = TargetIRAnalysis(
        [&](const Function &F) { return TM->getTargetTransformInfo(F); });
  FAM.registerPass([&]() { return TIRA; });

  TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
  ScalarEvolutionAnalysis SEA;
  ScalarEvolution &&SE = SEA.run(*F, FAM);
  DominatorTreeAnalysis DTA;
  DominatorTree &&DT = DTA.run(*F, FAM);
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  AssumptionAnalysis ACT;
  AssumptionCache &&AC = ACT.run(*F, FAM);
  OptimizationRemarkEmitter ORE{F};

  Loop *L = LI.getLoopFor(CLI->getHeader());
  assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");

  TargetTransformInfo::UnrollingPreferences UP =
      gatherUnrollingPreferences(L, SE, TTI,
                                 /*BlockFrequencyInfo=*/nullptr,
                                 /*ProfileSummaryInfo=*/nullptr, ORE,
                                 static_cast<int>(OptLevel),
                                 /*UserThreshold=*/std::nullopt,
                                 /*UserCount=*/std::nullopt,
                                 /*UserAllowPartial=*/true,
                                 /*UserAllowRuntime=*/true,
                                 /*UserUpperBound=*/std::nullopt,
                                 /*UserFullUnrollMaxCount=*/std::nullopt);

  UP.Force = true;

  // Account for additional optimizations taking place before the
  // LoopUnrollPass would unroll the loop.
  UP.Threshold *= UnrollThresholdFactor;
  UP.PartialThreshold *= UnrollThresholdFactor;

  // Use normal unroll factors even if the rest of the code is optimized for
  // size.
  UP.OptSizeThreshold = UP.Threshold;
  UP.PartialOptSizeThreshold = UP.PartialThreshold;

  LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
                    << "  Threshold=" << UP.Threshold << "\n"
                    << "  PartialThreshold=" << UP.PartialThreshold << "\n"
                    << "  OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
                    << "  PartialOptSizeThreshold="
                    << UP.PartialOptSizeThreshold << "\n");

  // Disable peeling.
  TargetTransformInfo::PeelingPreferences PP =
      gatherPeelingPreferences(L, SE, TTI,
                               /*UserAllowPeeling=*/false,
                               /*UserAllowProfileBasedPeeling=*/false,
                               /*UnrollingSpecficValues=*/false);

  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);

  // Assume that reads and writes to stack variables can be eliminated by
  // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
  // size.
  for (BasicBlock *BB : L->blocks()) {
    for (Instruction &I : *BB) {
      Value *Ptr;
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        Ptr = Load->getPointerOperand();
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        Ptr = Store->getPointerOperand();
      } else
        continue;

      Ptr = Ptr->stripPointerCasts();

      if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
        if (Alloca->getParent() == &F->getEntryBlock())
          EphValues.insert(&I);
      }
    }
  }

  UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);

  // The loop is not unrollable if it contains certain instructions.
  if (!UCE.canUnroll() || UCE.Convergent) {
    LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
    return 1;
  }

  LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
                    << "\n");

  // TODO: Determine the trip count of \p CLI if constant; computeUnrollCount
  // might be able to use it.
  unsigned TripCount = 0;
  int MaxTripCount = 0;
  bool MaxOrZero = false;
  unsigned TripMultiple = 0;

  bool UseUpperBound = false;
  computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
                     MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
                     UseUpperBound);

  unsigned Factor = UP.Count;
  LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");

  // This function returns 1 to signal that the loop should not be unrolled.
  if (Factor == 0)
    return 1;
  return Factor;
}

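// Example of the threshold scaling above (numbers hypothetical): with a base
// Threshold of 300 and -openmp-ir-builder-unroll-threshold-factor=1.5, the
// effective Threshold becomes 450; PartialThreshold is scaled the same way.
// This compensates for simplifications (Mem2Reg, SROA, LICM) that have not
// yet run on the freshly emitted loop body.
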
void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
                                        int32_t Factor,
                                        CanonicalLoopInfo **UnrolledCLI) {
  assert(Factor >= 0 && "Unroll factor must not be negative");

  Function *F = Loop->getFunction();
  LLVMContext &Ctx = F->getContext();

  // If the unrolled loop is not used for another loop-associated directive, it
  // is sufficient to add metadata for the LoopUnrollPass.
  if (!UnrolledCLI) {
    SmallVector<Metadata *, 2> LoopMetadata;
    LoopMetadata.push_back(
        MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));

    if (Factor >= 1) {
      ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
      LoopMetadata.push_back(MDNode::get(
          Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
    }

    addLoopMetadata(Loop, LoopMetadata);
    return;
  }

  // Heuristically determine the unroll factor.
  if (Factor == 0)
    Factor = computeHeuristicUnrollFactor(Loop);

  // No change required with unroll factor 1.
  if (Factor == 1) {
    *UnrolledCLI = Loop;
    return;
  }

  assert(Factor >= 2 &&
         "unrolling only makes sense with a factor of 2 or larger");

  Type *IndVarTy = Loop->getIndVarType();

  // Apply partial unrolling by tiling the loop by the unroll-factor, then
  // fully unroll the inner loop.
  Value *FactorVal =
      ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
                                       /*isSigned=*/false));
  std::vector<CanonicalLoopInfo *> LoopNest =
      tileLoops(DL, {Loop}, {FactorVal});
  assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
  *UnrolledCLI = LoopNest[0];
  CanonicalLoopInfo *InnerLoop = LoopNest[1];

  // LoopUnrollPass can only fully unroll loops with constant trip count.
  // Unroll by the unroll factor with a fallback epilog for the remainder
  // iterations if necessary.
  ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
      ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
  addLoopMetadata(
      InnerLoop,
      {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
       MDNode::get(
           Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});

#ifndef NDEBUG
  (*UnrolledCLI)->assertOK();
#endif
}

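// For illustration, a partial unroll by 4 of
//   for (i = 0; i < n; ++i) body(i);
// is implemented as the tiled nest
//   for (f = 0; f < ceil(n/4); ++f)        // "floor" loop, *UnrolledCLI
//     for (t = 0; t < tile_count(f); ++t)  // "tile" loop, size 4
//       body(4*f + t);
// where the inner tile loop carries llvm.loop.unroll.enable and
// llvm.loop.unroll.count 4 so that the LoopUnrollPass can fully unroll it;
// the last, possibly partial tile provides the epilogue iterations.
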
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
                                   llvm::Value *BufSize, llvm::Value *CpyBuf,
                                   llvm::Value *CpyFn, llvm::Value *DidIt) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);

  llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);

  Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
  Builder.CreateCall(Fn, Args);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // If needed (i.e. not null), initialize `DidIt` with 0
  if (DidIt) {
    Builder.CreateStore(Builder.getInt32(0), DidIt);
  }

  Directive OMPD = Directive::OMPD_single;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
  Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  // generates the following:
  // if (__kmpc_single()) {
  //   .... single region ...
  //   __kmpc_end_single
  // }
  // __kmpc_barrier

  EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                       /*Conditional*/ true,
                       /*hasFinalize*/ true);
  if (!IsNowait)
    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                  omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_critical;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *LockVar = getOMPCriticalRegionLock(CriticalName);
  Value *Args[] = {Ident, ThreadId, LockVar};

  SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
  Function *RTFn = nullptr;
  if (HintInst) {
    // Add Hint to entry Args and create call
    EnterArgs.push_back(HintInst);
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
  } else {
    RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
  }
  Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);

  Function *ExitRTLFn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
  Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
                                     InsertPointTy AllocaIP, unsigned NumLoops,
                                     ArrayRef<llvm::Value *> StoreValues,
                                     const Twine &Name, bool IsDependSource) {
  assert(
      llvm::all_of(StoreValues,
                   [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
      "OpenMP runtime requires depend vec with i64 type");

  if (!updateToLocation(Loc))
    return Loc.IP;

  // Allocate space for vector and generate alloc instruction.
  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
  ArgsBase->setAlignment(Align(8));
  Builder.restoreIP(Loc.IP);

  // Store the index value with offset in depend vector.
  for (unsigned I = 0; I < NumLoops; ++I) {
    Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
        ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
    StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
    STInst->setAlignment(Align(8));
  }

  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
      ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};

  Function *RTLFn = nullptr;
  if (IsDependSource)
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
  else
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
  Builder.CreateCall(RTLFn, Args);

  return Builder.saveIP();
}

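// For illustration, 'ordered depend(source)' in a doacross nest of depth two
// emits roughly (names illustrative):
//   %vec = alloca [2 x i64], align 8
//   store i64 %iv0, ptr %vec            ; index of loop 0 (offset 0)
//   store i64 %iv1, ptr %vec.elt1       ; index of loop 1 (offset 1)
//   call void @__kmpc_doacross_post(ptr @ident, i32 %tid, ptr %vec)
// whereas 'depend(sink: ...)' calls @__kmpc_doacross_wait instead.
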
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
    const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
    FinalizeCallbackTy FiniCB, bool IsThreads) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_ordered;
  Instruction *EntryCall = nullptr;
  Instruction *ExitCall = nullptr;

  if (IsThreads) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    Value *ThreadId = getOrCreateThreadID(Ident);
    Value *Args[] = {Ident, ThreadId};

    Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
    EntryCall = Builder.CreateCall(EntryRTLFn, Args);

    Function *ExitRTLFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
    ExitCall = Builder.CreateCall(ExitRTLFn, Args);
  }

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ false, /*hasFinalize*/ true);
}

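// For 'ordered threads' the region is thus bracketed as
//   call void @__kmpc_ordered(ptr @ident, i32 %tid)
//   ... region body ...
//   call void @__kmpc_end_ordered(ptr @ident, i32 %tid)
// while 'ordered simd' (IsThreads == false) emits the body without any
// runtime calls.
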
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
    Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
    bool HasFinalize, bool IsCancellable) {

  if (HasFinalize)
    FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});

  // Create inlined region's entry and body blocks, in preparation
  // for conditional creation
  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Instruction *SplitPos = EntryBB->getTerminator();
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
  BasicBlock *FiniBB =
      EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");

  Builder.SetInsertPoint(EntryBB->getTerminator());
  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);

  // generate body
  BodyGenCB(/* AllocaIP */ InsertPointTy(),
            /* CodeGenIP */ Builder.saveIP());

  // emit exit call and do any needed finalization.
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
         "Unexpected Control Flow State!");
  MergeBlockIntoPredecessor(FiniBB);

  // If we are skipping the region of a non-conditional, remove the exit
  // block, and clear the builder's insertion point.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  auto merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  auto InsertBB = merged ? ExitPredBB : ExitBB;
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // If there is nothing to do, return the current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit ThenBB and set the Builder's insertion point there for
  // body generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);

  // Move Entry branch to end of ThenBB, and replace with conditional
  // branch (If-stmt)
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
    bool HasFinalize) {

  Builder.restoreIP(FinIP);

  // If there is finalization to do, emit it before the exit call
  if (HasFinalize) {
    assert(!FinalizationStack.empty() &&
           "Unexpected finalization stack state!");

    FinalizationInfo Fi = FinalizationStack.pop_back_val();
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

    Fi.FiniCB(FinIP);

    BasicBlock *FiniBB = FinIP.getBlock();
    Instruction *FiniBBTI = FiniBB->getTerminator();

    // set Builder IP for call creation
    Builder.SetInsertPoint(FiniBBTI);
  }

  if (!ExitCall)
    return Builder.saveIP();

  // place the ExitCall as the last instruction before the finalization block
  // terminator
  ExitCall->removeFromParent();
  Builder.Insert(ExitCall);

  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
                                  ExitCall->getIterator());
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
    InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
    llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
  if (!IP.isSet())
    return IP;

  IRBuilder<>::InsertPointGuard IPG(Builder);

  // creates the following CFG structure
  //    OMP_Entry : (MasterAddr != PrivateAddr)?
  //       F     T
  //       |      \
  //       |     copyin.not.master
  //       |      /
  //       v     /
  //   copyin.not.master.end
  //         |
  //         v
  //   OMP.Entry.Next

  BasicBlock *OMP_Entry = IP.getBlock();
  Function *CurFn = OMP_Entry->getParent();
  BasicBlock *CopyBegin =
      BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
  BasicBlock *CopyEnd = nullptr;

  // If entry block is terminated, split to preserve the branch to the
  // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything
  // as is.
  if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
    CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
                                         "copyin.not.master.end");
    OMP_Entry->getTerminator()->eraseFromParent();
  } else {
    CopyEnd =
        BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
  }

  Builder.SetInsertPoint(OMP_Entry);
  Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
  Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
  Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
  Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);

  Builder.SetInsertPoint(CopyBegin);
  if (BranchtoEnd)
    Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));

  return Builder.saveIP();
}

CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
                                          Value *Size, Value *Allocator,
                                          std::string Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {ThreadId, Size, Allocator};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);

  return Builder.CreateCall(Fn, Args, Name);
}

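// For illustration, an 'omp_alloc'-style allocation lowers to roughly
// (names illustrative):
//   %ptr = call ptr @__kmpc_alloc(i32 %tid, i64 %size, ptr %allocator)
// with the matching deallocation emitted by createOMPFree below.
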
CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
                                         Value *Addr, Value *Allocator,
                                         std::string Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {ThreadId, Addr, Allocator};
  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
  return Builder.CreateCall(Fn, Args, Name);
}

CallInst *OpenMPIRBuilder::createOMPInteropInit(
    const LocationDescription &Loc, Value *InteropVar,
    omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
    Value *DependenceAddress, bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident,  ThreadId,       InteropVar,        InteropTypeVal,
      Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
    const LocationDescription &Loc, Value *InteropVar, Value *Device,
    Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident,          ThreadId,          InteropVar,
      Device,         NumDependences,    DependenceAddress,
      HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
                                               Value *InteropVar, Value *Device,
                                               Value *NumDependences,
                                               Value *DependenceAddress,
                                               bool HaveNowaitClause) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  if (Device == nullptr)
    Device = ConstantInt::get(Int32, -1);
  if (NumDependences == nullptr) {
    NumDependences = ConstantInt::get(Int32, 0);
    PointerType *PointerTypeVar = Type::getInt8PtrTy(M.getContext());
    DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
  }
  Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
  Value *Args[] = {
      Ident,          ThreadId,          InteropVar,
      Device,         NumDependences,    DependenceAddress,
      HaveNowaitClauseVal};

  Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);

  return Builder.CreateCall(Fn, Args);
}

CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
    const LocationDescription &Loc, llvm::Value *Pointer,
    llvm::ConstantInt *Size, const llvm::Twine &Name) {
  IRBuilder<>::InsertPointGuard IPG(Builder);
  Builder.restoreIP(Loc.IP);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *ThreadPrivateCache =
      getOrCreateInternalVariable(Int8PtrPtr, Name.str());
  llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};

  Function *Fn =
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);

  return Builder.CreateCall(Fn, Args);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
                                  int32_t MinThreadsVal, int32_t MaxThreadsVal,
                                  int32_t MinTeamsVal, int32_t MaxTeamsVal) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Constant *IsSPMDVal = ConstantInt::getSigned(
      Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
  Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
  Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
  Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);

  Function *Kernel = Builder.GetInsertBlock()->getParent();

  // Manifest the launch configuration in the metadata matching the kernel
  // environment.
  if (MinTeamsVal > 1 || MaxTeamsVal > 0)
    writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);

  // For max values, < 0 means unset, == 0 means set but unknown.
  if (MaxThreadsVal < 0)
    MaxThreadsVal = std::max(
        int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);

  if (MaxThreadsVal > 0)
    writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);

  Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
  Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
  Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
  Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
  Constant *ReductionBufferSize = ConstantInt::getSigned(Int32, 0);

  // We need to strip the debug prefix to get the correct kernel name.
  StringRef KernelName = Kernel->getName();
  const std::string DebugPrefix = "_debug__";
  if (KernelName.ends_with(DebugPrefix))
    KernelName = KernelName.drop_back(DebugPrefix.length());

  Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_init);
  const DataLayout &DL = Fn->getParent()->getDataLayout();

  Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
  Constant *DynamicEnvironmentInitializer =
      ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
  GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
      M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
      DynamicEnvironmentInitializer, DynamicEnvironmentName,
      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace());
  DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);

  Constant *DynamicEnvironment =
      DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
          ? DynamicEnvironmentGV
          : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
                                           DynamicEnvironmentPtr);

  Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
      ConfigurationEnvironment, {
                                    UseGenericStateMachineVal,
                                    MayUseNestedParallelismVal,
                                    IsSPMDVal,
                                    MinThreads,
                                    MaxThreads,
                                    MinTeams,
                                    MaxTeams,
                                    ReductionBufferSize,
                                });
  Constant *KernelEnvironmentInitializer = ConstantStruct::get(
      KernelEnvironment, {
                             ConfigurationEnvironmentInitializer,
                             Ident,
                             DynamicEnvironment,
                         });
  Twine KernelEnvironmentName = KernelName + "_kernel_environment";
  GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
      M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
      KernelEnvironmentInitializer, KernelEnvironmentName,
      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
      DL.getDefaultGlobalsAddressSpace());
  KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);

  Constant *KernelEnvironment =
      KernelEnvironmentGV->getType() == KernelEnvironmentPtr
          ? KernelEnvironmentGV
          : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
                                           KernelEnvironmentPtr);
  Value *KernelLaunchEnvironment = Kernel->getArg(0);
  CallInst *ThreadKind =
      Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});

  Value *ExecUserCode = Builder.CreateICmpEQ(
      ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
      "exec_user_code");

  // ThreadKind = __kmpc_target_init(...)
  // if (ThreadKind == -1)
  //   UserCode
  // else
  //   return;

  auto *UI = Builder.CreateUnreachable();
  BasicBlock *CheckBB = UI->getParent();
  BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");

  BasicBlock *WorkerExitBB = BasicBlock::Create(
      CheckBB->getContext(), "worker.exit", CheckBB->getParent());
  Builder.SetInsertPoint(WorkerExitBB);
  Builder.CreateRetVoid();

  auto *CheckBBTI = CheckBB->getTerminator();
  Builder.SetInsertPoint(CheckBBTI);
  Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);

  CheckBBTI->eraseFromParent();
  UI->eraseFromParent();

  // Continue in the "user_code" block, see diagram above and in
  // openmp/libomptarget/deviceRTLs/common/include/target.h .
  return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
}

void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
                                         int32_t TeamsReductionBufferSize) {
  if (!updateToLocation(Loc))
    return;

  Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);

  Builder.CreateCall(Fn, {});

  if (!TeamsReductionBufferSize)
    return;

  Function *Kernel = Builder.GetInsertBlock()->getParent();
  // We need to strip the debug prefix to get the correct kernel name.
  StringRef KernelName = Kernel->getName();
  const std::string DebugPrefix = "_debug__";
  if (KernelName.ends_with(DebugPrefix))
    KernelName = KernelName.drop_back(DebugPrefix.length());
  auto *KernelEnvironmentGV =
      M.getNamedGlobal((KernelName + "_kernel_environment").str());
  assert(KernelEnvironmentGV && "Expected kernel environment global\n");
  auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
  auto *NewInitializer = ConstantFoldInsertValueInstruction(
      KernelEnvironmentInitializer,
      ConstantInt::get(Int32, TeamsReductionBufferSize), {0, 7});
  KernelEnvironmentGV->setInitializer(NewInitializer);
}

static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
  Module &M = *Kernel.getParent();
  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
  for (auto *Op : MD->operands()) {
    if (Op->getNumOperands() != 3)
      continue;
    auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
    if (!KernelOp || KernelOp->getValue() != &Kernel)
      continue;
    auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
    if (!Prop || Prop->getString() != Name)
      continue;
    return Op;
  }
  return nullptr;
}

static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
                                bool Min) {
  // Update the "maxntidx" metadata for NVIDIA, or add it.
  MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
  if (ExistingOp) {
    auto *OldVal = dyn_cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
    int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
    ExistingOp->replaceOperandWith(
        2, ConstantAsMetadata::get(ConstantInt::get(
               OldVal->getValue()->getType(),
               Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
  } else {
    LLVMContext &Ctx = Kernel.getContext();
    Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
                          MDString::get(Ctx, Name),
                          ConstantAsMetadata::get(
                              ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
    // Append metadata to nvvm.annotations
    Module &M = *Kernel.getParent();
    NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
    MD->addOperand(MDNode::get(Ctx, MDVals));
  }
}

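// For example, clamping a kernel to 128 threads on NVPTX yields (or tightens)
// an nvvm.annotations entry of the form (numbering illustrative):
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @kernel, !"maxntidx", i32 128}
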
std::pair<int32_t, int32_t>
OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
  int32_t ThreadLimit =
      Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");

  if (T.isAMDGPU()) {
    const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
    if (!Attr.isValid() || !Attr.isStringAttribute())
      return {0, ThreadLimit};
    auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
    int32_t LB, UB;
    if (!llvm::to_integer(UBStr, UB, 10))
      return {0, ThreadLimit};
    UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
    if (!llvm::to_integer(LBStr, LB, 10))
      return {0, UB};
    return {LB, UB};
  }

  if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
    auto *OldVal = dyn_cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
    int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
    return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
  }
  return {0, ThreadLimit};
}

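// For example, a kernel carrying "amdgpu-flat-work-group-size"="1,256" and an
// omp_target_thread_limit of 128 yields the bounds {1, 128}: the OpenMP
// thread limit tightens the upper bound parsed from the backend attribute.
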
void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
                                                 Function &Kernel, int32_t LB,
                                                 int32_t UB) {
  Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));

  if (T.isAMDGPU())
    Kernel.addFnAttr("amdgpu-flat-work-group-size",
                     llvm::utostr(LB) + "," + llvm::utostr(UB));

  if (T.isNVPTX())
    updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
}

std::pair<int32_t, int32_t>
OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
  // TODO: Read from backend annotations if available.
  return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
}

void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
                                          int32_t LB, int32_t UB) {
  if (T.isNVPTX()) {
    if (UB > 0)
      updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
    updateNVPTXMetadata(Kernel, "minctasm", LB, false);
  }
  Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
}

void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
    Function *OutlinedFn) {
  if (Config.isTargetDevice()) {
    OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
    // TODO: Determine if DSO local can be set to true.
    OutlinedFn->setDSOLocal(false);
    OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
    if (T.isAMDGCN())
      OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
  }
}

Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
                                                    StringRef EntryFnIDName) {
  if (Config.isTargetDevice()) {
    assert(OutlinedFn && "The outlined function must exist if embedded");
    return ConstantExpr::getBitCast(OutlinedFn, Builder.getInt8PtrTy());
  }

  return new GlobalVariable(
      M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
}

Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
                                                       StringRef EntryFnName) {
  if (OutlinedFn)
    return OutlinedFn;

  assert(!M.getGlobalVariable(EntryFnName, true) &&
         "Named kernel already exists?");
  return new GlobalVariable(
      M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
      Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
}

void OpenMPIRBuilder::emitTargetRegionFunction(
    TargetRegionEntryInfo &EntryInfo,
    FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
    Function *&OutlinedFn, Constant *&OutlinedFnID) {

  SmallString<64> EntryFnName;
  OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);

  OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
                   ? GenerateFunctionCallback(EntryFnName)
                   : nullptr;

  // If this target outline function is not an offload entry, we don't need to
  // register it. This may be the case for a false if clause, or if there are
  // no OpenMP targets.
  if (!IsOffloadEntry)
    return;

  std::string EntryFnIDName =
      Config.isTargetDevice()
          ? std::string(EntryFnName)
          : createPlatformSpecificName({EntryFnName, "region_id"});

  OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
                                              EntryFnName, EntryFnIDName);
}

Constant *OpenMPIRBuilder::registerTargetRegionFunction(
    TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
    StringRef EntryFnName, StringRef EntryFnIDName) {
  if (OutlinedFn)
    setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
  auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
  auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
  OffloadInfoManager.registerTargetRegionEntryInfo(
      EntryInfo, EntryAddr, OutlinedFnID,
      OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
  return OutlinedFnID;
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
    TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
    omp::RuntimeFunction *MapperFunc,
    function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
        BodyGenCB,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB,
    function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  Builder.restoreIP(CodeGenIP);
  bool IsStandAlone = !BodyGenCB;
  MapInfosTy *MapInfo;
  // Generate the code for the opening of the data environment. Capture all the
  // arguments of the runtime call by reference because they are used in the
  // closing of the region.
  auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
    MapInfo = &GenMapInfoCB(Builder.saveIP());
    emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
                         /*IsNonContiguous=*/true, DeviceAddrCB,
                         CustomMapperCB);

    TargetDataRTArgs RTArgs;
    emitOffloadingArraysArgument(Builder, RTArgs, Info,
                                 !MapInfo->Names.empty());

    // Emit the number of elements in the offloading arrays.
    Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);

    // Source location for the ident struct
    if (!SrcLocInfo) {
      uint32_t SrcLocStrSize;
      Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
      SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    }

    Value *OffloadingArgs[] = {SrcLocInfo,           DeviceID,
                               PointerNum,           RTArgs.BasePointersArray,
                               RTArgs.PointersArray, RTArgs.SizesArray,
                               RTArgs.MapTypesArray, RTArgs.MapNamesArray,
                               RTArgs.MappersArray};

    if (IsStandAlone) {
      assert(MapperFunc && "MapperFunc missing for standalone target data");
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
                         OffloadingArgs);
    } else {
      Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
          omp::OMPRTL___tgt_target_data_begin_mapper);

      Builder.CreateCall(BeginMapperFunc, OffloadingArgs);

      for (auto DeviceMap : Info.DevicePtrInfoMap) {
        if (isa<AllocaInst>(DeviceMap.second.second)) {
          auto *LI =
              Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
          Builder.CreateStore(LI, DeviceMap.second.second);
        }
      }

      // If device pointer privatization is required, emit the body of the
      // region here. It will have to be duplicated: with and without
      // privatization.
      Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
    }
  };

  // If we need device pointer privatization, we need to emit the body of the
  // region with no privatization in the 'else' branch of the conditional.
  // Otherwise, we don't have to do anything.
  auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
    Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv));
  };

  // Generate code for the closing of the data region.
  auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
    TargetDataRTArgs RTArgs;
    emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
                                 /*ForEndCall=*/true);

    // Emit the number of elements in the offloading arrays.
    Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);

    // Source location for the ident struct
    if (!SrcLocInfo) {
      uint32_t SrcLocStrSize;
      Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
      SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
    }

    Value *OffloadingArgs[] = {SrcLocInfo,           DeviceID,
                               PointerNum,           RTArgs.BasePointersArray,
                               RTArgs.PointersArray, RTArgs.SizesArray,
                               RTArgs.MapTypesArray, RTArgs.MapNamesArray,
                               RTArgs.MappersArray};
    Function *EndMapperFunc =
        getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);

    Builder.CreateCall(EndMapperFunc, OffloadingArgs);
  };

  // We don't have to do anything to close the region if the if clause
  // evaluates to false.
  auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};

  if (BodyGenCB) {
    if (IfCond) {
      emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
    } else {
      BeginThenGen(AllocaIP, Builder.saveIP());
    }

    // If we don't require privatization of device pointers, we emit the body
    // in between the runtime calls. This avoids duplicating the body code.
    Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));

    if (IfCond) {
      emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
    } else {
      EndThenGen(AllocaIP, Builder.saveIP());
    }
  } else {
    if (IfCond) {
      emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
    } else {
      BeginThenGen(AllocaIP, Builder.saveIP());
    }
  }

  return Builder.saveIP();
}

FunctionCallee
OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
                                             bool IsGPUDistribute) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name;
  if (IsGPUDistribute)
    Name = IVSize == 32
               ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
                           : omp::OMPRTL___kmpc_distribute_static_init_4u)
               : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
                           : omp::OMPRTL___kmpc_distribute_static_init_8u);
  else
    Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
                                    : omp::OMPRTL___kmpc_for_static_init_4u)
                        : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
                                    : omp::OMPRTL___kmpc_for_static_init_8u);

  return getOrCreateRuntimeFunction(M, Name);
}
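// Illustrative example (not from this file): with IVSize = 32, IVSigned =
// true, and IsGPUDistribute = false, the selection above resolves to the
// runtime entry __kmpc_for_static_init_4. `OMPBuilder` stands for any
// OpenMPIRBuilder instance:
//
//   FunctionCallee Fn = OMPBuilder.createForStaticInitFunction(
//       /*IVSize=*/32, /*IVSigned=*/true, /*IsGPUDistribute=*/false);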
FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
                                                           bool IVSigned) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name = IVSize == 32
                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
                                         : omp::OMPRTL___kmpc_dispatch_init_4u)
                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
                                         : omp::OMPRTL___kmpc_dispatch_init_8u);

  return getOrCreateRuntimeFunction(M, Name);
}
FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
                                                           bool IVSigned) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name = IVSize == 32
                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
                                         : omp::OMPRTL___kmpc_dispatch_next_4u)
                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
                                         : omp::OMPRTL___kmpc_dispatch_next_8u);

  return getOrCreateRuntimeFunction(M, Name);
}
FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
                                                           bool IVSigned) {
  assert((IVSize == 32 || IVSize == 64) &&
         "IV size is not compatible with the omp runtime");
  RuntimeFunction Name = IVSize == 32
                             ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
                                         : omp::OMPRTL___kmpc_dispatch_fini_4u)
                             : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
                                         : omp::OMPRTL___kmpc_dispatch_fini_8u);

  return getOrCreateRuntimeFunction(M, Name);
}
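// Illustrative example (not from this file): the three dispatch helpers
// above pick matching runtime entries for one dynamically scheduled loop.
// For a 64-bit unsigned induction variable they resolve to
// __kmpc_dispatch_init_8u, __kmpc_dispatch_next_8u, and
// __kmpc_dispatch_fini_8u; `OMPBuilder` stands for any OpenMPIRBuilder:
//
//   FunctionCallee Init = OMPBuilder.createDispatchInitFunction(64, false);
//   FunctionCallee Next = OMPBuilder.createDispatchNextFunction(64, false);
//   FunctionCallee Fini = OMPBuilder.createDispatchFiniFunction(64, false);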
static Function *createOutlinedFunction(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
    SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
  SmallVector<Type *> ParameterTypes;
  if (OMPBuilder.Config.isTargetDevice()) {
    // Add the "implicit" runtime argument we use to provide launch specific
    // information for target devices.
    auto *Int8PtrTy = Type::getInt8PtrTy(Builder.getContext());
    ParameterTypes.push_back(Int8PtrTy);

    // All parameters to target devices are passed as pointers
    // or i64. This assumes 64-bit address spaces/pointers.
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType()->isPointerTy()
                                   ? Arg->getType()
                                   : Type::getInt64Ty(Builder.getContext()));
  } else {
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType());
  }

  auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
                                    /*isVarArg*/ false);
  auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
                               Builder.GetInsertBlock()->getModule());

  // Save insert point.
  auto OldInsertPoint = Builder.saveIP();

  // Generate the region into the function.
  BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
  Builder.SetInsertPoint(EntryBB);

  // Insert target init call in the device compilation pass.
  if (OMPBuilder.Config.isTargetDevice())
    Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));

  BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();

  Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));

  // Insert target deinit call in the device compilation pass.
  if (OMPBuilder.Config.isTargetDevice())
    OMPBuilder.createTargetDeinit(Builder);

  // Insert return instruction.
  Builder.CreateRetVoid();

  // New Alloca IP at entry point of created device function.
  Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
  auto AllocaIP = Builder.saveIP();

  Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());

  // Skip the artificial dyn_ptr on the device.
  const auto &ArgRange =
      OMPBuilder.Config.isTargetDevice()
          ? make_range(Func->arg_begin() + 1, Func->arg_end())
          : Func->args();

  // Rewrite uses of input values to parameters.
  for (auto InArg : zip(Inputs, ArgRange)) {
    Value *Input = std::get<0>(InArg);
    Argument &Arg = std::get<1>(InArg);
    Value *InputCopy = nullptr;

    Builder.restoreIP(
        ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));

    // Collect all the instructions using the input value and rewrite them to
    // use the copy instead.
    for (User *User : make_early_inc_range(Input->users()))
      if (auto Instr = dyn_cast<Instruction>(User))
        if (Instr->getFunction() == Func)
          Instr->replaceUsesOfWith(Input, InputCopy);
  }

  // Restore insert point.
  Builder.restoreIP(OldInsertPoint);

  return Func;
}
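// For reference, a sketch of the function created above for a device pass
// with one pointer input (names and mangling are illustrative only):
//
//   define internal void @__omp_offloading_..._region(ptr %dyn_ptr, ptr %a) {
//   entry:
//     ; runtime setup emitted by createTargetInit
//     ; user code generated by CBFunc, with uses of the host value rewritten
//     ; to the copy produced by ArgAccessorFuncCB
//     ; runtime teardown emitted by createTargetDeinit
//     ret void
//   }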
static void emitTargetOutlinedFunction(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
    TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
    Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {

  OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
      [&OMPBuilder, &Builder, &Inputs, &CBFunc,
       &ArgAccessorFuncCB](StringRef EntryFnName) {
        return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
                                      CBFunc, ArgAccessorFuncCB);
      };

  OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
                                      OutlinedFn, OutlinedFnID);
}
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
                           OpenMPIRBuilder::InsertPointTy AllocaIP,
                           Function *OutlinedFn, Constant *OutlinedFnID,
                           int32_t NumTeams, int32_t NumThreads,
                           SmallVectorImpl<Value *> &Args,
                           OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {

  OpenMPIRBuilder::TargetDataInfo Info(
      /*RequiresDevicePointerInfo=*/false,
      /*SeparateBeginEndCalls=*/true);

  OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
  OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
                                  /*IsNonContiguous=*/true);

  OpenMPIRBuilder::TargetDataRTArgs RTArgs;
  OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
                                          !MapInfo.Names.empty());

  // Fallback used by emitKernelLaunch when offloading to the device fails:
  // call the outlined function directly on the host.
  auto &&EmitTargetCallFallbackCB =
      [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
    Builder.restoreIP(IP);
    Builder.CreateCall(OutlinedFn, Args);
    return Builder.saveIP();
  };

  unsigned NumTargetItems = MapInfo.BasePointers.size();
  // TODO: Use correct device ID
  Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
  Value *NumTeamsVal = Builder.getInt32(NumTeams);
  Value *NumThreadsVal = Builder.getInt32(NumThreads);
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                             llvm::omp::IdentFlag(0), 0);
  // TODO: Use correct NumIterations
  Value *NumIterations = Builder.getInt64(0);
  // TODO: Use correct DynCGGroupMem
  Value *DynCGGroupMem = Builder.getInt32(0);

  bool HasNoWait = false;

  OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
                                          NumTeamsVal, NumThreadsVal,
                                          DynCGGroupMem, HasNoWait);

  Builder.restoreIP(OMPBuilder.emitKernelLaunch(
      Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
      DeviceID, RTLoc, AllocaIP));
}
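// Host-side shape of the launch emitted above (rough sketch; the exact
// runtime entry and argument order are chosen inside emitKernelLaunch and
// may differ):
//
//   %rc = call i32 @__tgt_target_kernel(ptr %ident, i64 %device_id,
//                                       i32 %num_teams, i32 %num_threads,
//                                       ptr %outlined_fn_id, ptr %kargs)
//   ; on failure, the host fallback from EmitTargetCallFallbackCB runs:
//   ;   call void @outlined_fn(...)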
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
    int32_t NumThreads, SmallVectorImpl<Value *> &Args,
    GenMapInfoCallbackTy GenMapInfoCB,
    OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  Builder.restoreIP(CodeGenIP);

  Function *OutlinedFn;
  Constant *OutlinedFnID;
  emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
                             OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
  if (!Config.isTargetDevice())
    emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
                   NumThreads, Args, GenMapInfoCB);

  return Builder.saveIP();
}
std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
                                                   StringRef FirstSeparator,
                                                   StringRef Separator) {
  SmallString<128> Buffer;
  llvm::raw_svector_ostream OS(Buffer);
  StringRef Sep = FirstSeparator;
  for (StringRef Part : Parts) {
    OS << Sep << Part;
    Sep = Separator;
  }
  return OS.str().str();
}
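// Example (illustrative): getNameWithSeparators({"omp", "reduction"}, ".",
// "$") yields ".omp$reduction"; the first separator prefixes the first part
// and the regular separator joins the remaining parts.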
std::string
OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
  return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
                                                Config.separator());
}
GlobalVariable *
OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                                             unsigned AddressSpace) {
  auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
  if (Elem.second) {
    assert(Elem.second->getValueType() == Ty &&
           "OMP internal variable has different type than requested");
  } else {
    // TODO: investigate the appropriate linkage type used for the global
    // variable for possibly changing that to internal or private, or maybe
    // create different versions of the function for different OMP internal
    // variables.
    auto *GV = new GlobalVariable(
        M, Ty, /*IsConstant=*/false, GlobalValue::CommonLinkage,
        Constant::getNullValue(Ty), Elem.first(),
        /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, AddressSpace);
    const DataLayout &DL = M.getDataLayout();
    const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
    GV->setAlignment(std::max(TypeAlign, PtrAlign));
    Elem.second = GV;
  }

  return Elem.second;
}
Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
  std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
  std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
  return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
}
Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
  LLVMContext &Ctx = Builder.getContext();
  Value *Null =
      Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
  Value *SizeGep =
      Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
  Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
  return SizePtrToInt;
}
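// This is the classic "sizeof via GEP" idiom; in IR the computation above is
// (sketch, with opaque pointers):
//
//   %size.gep = getelementptr ptr, ptr null, i32 1
//   %size     = ptrtoint ptr %size.gep to i64
//
// i.e. the byte offset of element 1 from a null pointer of the given type.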
GlobalVariable *
OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
                                       std::string VarName) {
  llvm::Constant *MaptypesArrayInit =
      llvm::ConstantDataArray::get(M.getContext(), Mappings);
  auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
      M, MaptypesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
      VarName);
  MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  return MaptypesArrayGlobal;
}
void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
                                          InsertPointTy AllocaIP,
                                          unsigned NumOperands,
                                          struct MapperAllocas &MapperAllocas) {
  if (!updateToLocation(Loc))
    return;

  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(
      ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
  AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
                                          ".offload_ptrs");
  AllocaInst *ArgSizes = Builder.CreateAlloca(
      ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
  Builder.restoreIP(Loc.IP);
  MapperAllocas.ArgsBase = ArgsBase;
  MapperAllocas.Args = Args;
  MapperAllocas.ArgSizes = ArgSizes;
}
void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
                                     Function *MapperFunc, Value *SrcLocInfo,
                                     Value *MaptypesArg, Value *MapnamesArg,
                                     struct MapperAllocas &MapperAllocas,
                                     int64_t DeviceID, unsigned NumOperands) {
  if (!updateToLocation(Loc))
    return;

  auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
  auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
  Value *ArgsBaseGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgsGEP =
      Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *ArgSizesGEP =
      Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
                                {Builder.getInt32(0), Builder.getInt32(0)});
  Value *NullPtr =
      Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
  Builder.CreateCall(MapperFunc,
                     {SrcLocInfo, Builder.getInt64(DeviceID),
                      Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
                      ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
}
void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                                   TargetDataRTArgs &RTArgs,
                                                   TargetDataInfo &Info,
                                                   bool EmitDebug,
                                                   bool ForEndCall) {
  assert((!ForEndCall || Info.separateBeginEndCalls()) &&
         "expected region end call to runtime only when end call is separate");
  auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
  auto VoidPtrTy = UnqualPtrTy;
  auto VoidPtrPtrTy = UnqualPtrTy;
  auto Int64Ty = Type::getInt64Ty(M.getContext());
  auto Int64PtrTy = UnqualPtrTy;

  if (!Info.NumberOfPtrs) {
    RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
    RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
    RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
    RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
    return;
  }

  RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
      Info.RTArgs.BasePointersArray,
      /*Idx0=*/0, /*Idx1=*/0);
  RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
      /*Idx0=*/0,
      /*Idx1=*/0);
  RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
      /*Idx0=*/0, /*Idx1=*/0);
  RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
      ArrayType::get(Int64Ty, Info.NumberOfPtrs),
      ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
                                                 : Info.RTArgs.MapTypesArray,
      /*Idx0=*/0,
      /*Idx1=*/0);

  // Only emit the mapper information arrays if debug information is
  // requested.
  if (!EmitDebug)
    RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
  else
    RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
        /*Idx0=*/0,
        /*Idx1=*/0);
  // If there is no user-defined mapper, set the mapper array to nullptr to
  // avoid an unnecessary data privatization.
  if (!Info.HasMapper)
    RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
  else
    RTArgs.MappersArray =
        Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
}
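// Usage sketch (mirrors the call made in createTargetData above): when
// emitting the matching "end" runtime call, pass /*ForEndCall=*/true so the
// MapTypesArrayEnd variant is used when present:
//
//   TargetDataRTArgs EndRTArgs;
//   emitOffloadingArraysArgument(Builder, EndRTArgs, Info,
//                                /*EmitDebug=*/!MapInfo->Names.empty(),
//                                /*ForEndCall=*/true);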
void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
                                                  InsertPointTy CodeGenIP,
                                                  MapInfosTy &CombinedInfo,
                                                  TargetDataInfo &Info) {
  MapInfosTy::StructNonContiguousInfo &NonContigInfo =
      CombinedInfo.NonContigInfo;

  // Build an array of struct descriptor_dim and then assign it to
  // offload_args.
  //
  // struct descriptor_dim {
  //  uint64_t offset;
  //  uint64_t count;
  //  uint64_t stride
  // };
  Type *Int64Ty = Builder.getInt64Ty();
  StructType *DimTy = StructType::create(
      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
      "struct.descriptor_dim");

  enum { OffsetFD = 0, CountFD, StrideFD };
  // We need two index variables here since the size of "Dims" is the same as
  // the size of Components, however, the size of offset, count, and stride is
  // equal to the size of base declaration that is non-contiguous.
  for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
    // Skip emitting IR if dimension size is 1 since it cannot be
    // non-contiguous.
    if (NonContigInfo.Dims[I] == 1)
      continue;
    Builder.restoreIP(AllocaIP);
    ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
    AllocaInst *DimsAddr =
        Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
    Builder.restoreIP(CodeGenIP);
    for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
      unsigned RevIdx = EE - II - 1;
      Value *DimsLVal = Builder.CreateInBoundsGEP(
          DimsAddr->getAllocatedType(), DimsAddr,
          {Builder.getInt64(0), Builder.getInt64(II)});
      // Offset
      Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
          M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
      // Count
      Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Counts[L][RevIdx], CountLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
      // Stride
      Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Strides[L][RevIdx], StrideLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
    }
    // args[I] = &dims
    Builder.restoreIP(CodeGenIP);
    Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        DimsAddr, Builder.getInt8PtrTy());
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(Builder.getInt8PtrTy(), Info.NumberOfPtrs),
        Info.RTArgs.PointersArray, 0, I);
    Builder.CreateAlignedStore(
        DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getInt8PtrTy()));
    ++L;
  }
}
void OpenMPIRBuilder::emitOffloadingArrays(
    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
    TargetDataInfo &Info, bool IsNonContiguous,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB,
    function_ref<Value *(unsigned int)> CustomMapperCB) {

  // Reset the array information.
  Info.clearArrayInfo();
  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();

  if (Info.NumberOfPtrs == 0)
    return;

  Builder.restoreIP(AllocaIP);
  // Detect if we have any capture size requiring runtime evaluation of the
  // size so that a constant array could be eventually used.
  ArrayType *PointerArrayType =
      ArrayType::get(Builder.getInt8PtrTy(), Info.NumberOfPtrs);

  Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");

  Info.RTArgs.PointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
  AllocaInst *MappersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
  Info.RTArgs.MappersArray = MappersArray;

  // If we don't have any VLA types or other types that require runtime
  // evaluation, we can use a constant array for the map sizes, otherwise we
  // need to fill up the arrays as we do for the pointers.
  Type *Int64Ty = Builder.getInt64Ty();
  SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
                                     ConstantInt::get(Int64Ty, 0));
  SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
  for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
    if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
      if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
        if (IsNonContiguous &&
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                CombinedInfo.Types[I] &
                OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
          ConstSizes[I] =
              ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
        else
          ConstSizes[I] = CI;
        continue;
      }
    }
    RuntimeSizes.set(I);
  }

  if (RuntimeSizes.all()) {
    ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
    Info.RTArgs.SizesArray = Builder.CreateAlloca(
        SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
    Builder.restoreIP(CodeGenIP);
  } else {
    auto *SizesArrayInit = ConstantArray::get(
        ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
    std::string Name = createPlatformSpecificName({"offload_sizes"});
    auto *SizesArrayGbl =
        new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
                           GlobalValue::PrivateLinkage, SizesArrayInit, Name);
    SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);

    if (!RuntimeSizes.any()) {
      Info.RTArgs.SizesArray = SizesArrayGbl;
    } else {
      unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
      Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
      ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
      AllocaInst *Buffer = Builder.CreateAlloca(
          SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
      Buffer->setAlignment(OffloadSizeAlign);
      Builder.restoreIP(CodeGenIP);
      Builder.CreateMemCpy(
          Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
          SizesArrayGbl, OffloadSizeAlign,
          Builder.getIntN(
              IndexSize,
              Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));

      Info.RTArgs.SizesArray = Buffer;
    }
    Builder.restoreIP(CodeGenIP);
  }

  // The map types are always constant so we don't need to generate code to
  // fill arrays. Instead, we create an array constant.
  SmallVector<uint64_t, 4> Mapping;
  for (auto mapFlag : CombinedInfo.Types)
    Mapping.push_back(
        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            mapFlag));
  std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
  auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
  Info.RTArgs.MapTypesArray = MapTypesArrayGbl;

  // The information types are only built if provided.
  if (!CombinedInfo.Names.empty()) {
    std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
    auto *MapNamesArrayGbl =
        createOffloadMapnames(CombinedInfo.Names, MapnamesName);
    Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
  } else {
    Info.RTArgs.MapNamesArray =
        Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
  }

  // If there's a present map type modifier, it must not be applied to the end
  // of a region, so generate a separate map type array in that case.
  if (Info.separateBeginEndCalls()) {
    bool EndMapTypesDiffer = false;
    for (uint64_t &Type : Mapping) {
      if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                     OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
        Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
        EndMapTypesDiffer = true;
      }
    }
    if (EndMapTypesDiffer) {
      MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
      Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
    }
  }

  PointerType *PtrTy = Builder.getPtrTy();
  for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
    Value *BPVal = CombinedInfo.BasePointers[I];
    Value *BP = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
        0, I);
    Builder.CreateAlignedStore(BPVal, BP,
                               M.getDataLayout().getPrefTypeAlign(PtrTy));

    if (Info.requiresDevicePointerInfo()) {
      if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
        CodeGenIP = Builder.saveIP();
        Builder.restoreIP(AllocaIP);
        Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
        Builder.restoreIP(CodeGenIP);
        if (DeviceAddrCB)
          DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
      } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
        Info.DevicePtrInfoMap[BPVal] = {BP, BP};
        if (DeviceAddrCB)
          DeviceAddrCB(I, BP);
      }
    }

    Value *PVal = CombinedInfo.Pointers[I];
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
        I);
    // TODO: Check alignment correct.
    Builder.CreateAlignedStore(PVal, P,
                               M.getDataLayout().getPrefTypeAlign(PtrTy));

    if (RuntimeSizes.test(I)) {
      Value *S = Builder.CreateConstInBoundsGEP2_32(
          ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
          /*Idx0=*/0,
          /*Idx1=*/I);
      Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
                                                       Int64Ty,
                                                       /*isSigned=*/true),
                                 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
    }
    // Fill up the mapper array.
    unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
    Value *MFunc = ConstantPointerNull::get(PtrTy);
    if (CustomMapperCB)
      if (Value *CustomMFunc = CustomMapperCB(I))
        MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
    Value *MAddr = Builder.CreateInBoundsGEP(
        MappersArray->getAllocatedType(), MappersArray,
        {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
    Builder.CreateAlignedStore(
        MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
  }

  if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
      Info.NumberOfPtrs == 0)
    return;
  emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
}
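// For a host region with two mapped items of constant size, the structures
// emitted above look roughly like this (sketch; 35 = TO | FROM |
// TARGET_PARAM is only an example flag combination):
//
//   @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 8]
//   @.offload_maptypes = private unnamed_addr constant [2 x i64]
//                        [i64 35, i64 35]
//   %.offload_baseptrs = alloca [2 x ptr]
//   %.offload_ptrs     = alloca [2 x ptr]
//   %.offload_mappers  = alloca [2 x ptr]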
void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
  BasicBlock *CurBB = Builder.GetInsertBlock();

  if (!CurBB || CurBB->getTerminator()) {
    // If there is no insert point or the previous block is already
    // terminated, don't touch it.
  } else {
    // Otherwise, create a fall-through branch.
    Builder.CreateBr(Target);
  }

  Builder.ClearInsertionPoint();
}
void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
                                bool IsFinished) {
  BasicBlock *CurBB = Builder.GetInsertBlock();

  // Fall out of the current block (if necessary).
  emitBranch(BB);

  if (IsFinished && BB->use_empty()) {
    BB->eraseFromParent();
    return;
  }

  // Place the block after the current block, if possible, or else at
  // the end of the function.
  if (CurBB && CurBB->getParent())
    CurFn->insert(std::next(CurBB->getIterator()), BB);
  else
    CurFn->insert(CurFn->end(), BB);
  Builder.SetInsertPoint(BB);
}
void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
                                   BodyGenCallbackTy ElseGen,
                                   InsertPointTy AllocaIP) {
  // If the condition constant folds and can be elided, try to avoid emitting
  // the condition and the dead arm of the if/else.
  if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
    auto CondConstant = CI->getSExtValue();
    if (CondConstant)
      ThenGen(AllocaIP, Builder.saveIP());
    else
      ElseGen(AllocaIP, Builder.saveIP());
    return;
  }

  Function *CurFn = Builder.GetInsertBlock()->getParent();

  // Otherwise, the condition did not fold, or we couldn't elide it. Just
  // emit the conditional branch.
  BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
  BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
  BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
  Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
  // Emit the 'then' code.
  emitBlock(ThenBlock, CurFn);
  ThenGen(AllocaIP, Builder.saveIP());
  emitBranch(ContBlock);
  // Emit the 'else' code if present.
  // There is no need to emit line number for unconditional branch.
  emitBlock(ElseBlock, CurFn);
  ElseGen(AllocaIP, Builder.saveIP());
  // There is no need to emit line number for unconditional branch.
  emitBranch(ContBlock);
  // Emit the continuation block for code after the if.
  emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
}
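// The control flow produced for a non-constant condition is (sketch):
//
//   br i1 %cond, label %omp_if.then, label %omp_if.else
//   omp_if.then:   ; ThenGen output, then br label %omp_if.end
//   omp_if.else:   ; ElseGen output, then br label %omp_if.end
//   omp_if.end:    ; code after the if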
bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
  assert(!(AO == AtomicOrdering::NotAtomic ||
           AO == llvm::AtomicOrdering::Unordered) &&
         "Unexpected Atomic Ordering.");

  bool Flush = false;
  llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
  switch (AK) {
  case Read:
    if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
    }
    break;
  case Write:
  case Compare:
  case Update:
    if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
        AO == AtomicOrdering::SequentiallyConsistent) {
      FlushAO = AtomicOrdering::Release;
      Flush = true;
    }
    break;
  case Capture:
    switch (AO) {
    case AtomicOrdering::Acquire:
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
      break;
    case AtomicOrdering::Release:
      FlushAO = AtomicOrdering::Release;
      Flush = true;
      break;
    case AtomicOrdering::AcquireRelease:
    case AtomicOrdering::SequentiallyConsistent:
      FlushAO = AtomicOrdering::AcquireRelease;
      Flush = true;
      break;
    default:
      // do nothing - leave silently.
      break;
    }
  }

  if (Flush) {
    // The flush runtime call does not take a memory ordering yet, so this
    // only resolves which ordering would apply and then issues the plain
    // flush call.
    // TODO: pass `FlushAO` after memory ordering support is added.
    (void)FlushAO;
    emitFlush(Loc);
  }

  // For AO == AtomicOrdering::Monotonic and all other case combinations, do
  // nothing.
  return Flush;
}
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
                                  AtomicOpValue &X, AtomicOpValue &V,
                                  AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic read expected a scalar type");

  Value *XRead = nullptr;

  if (XElemTy->isIntegerTy()) {
    LoadInst *XLD =
        Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
    XLD->setAtomic(AO);
    XRead = cast<Value>(XLD);
  } else {
    // We need to perform atomic op as integer
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    LoadInst *XLoad =
        Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
    XLoad->setAtomic(AO);
    if (XElemTy->isFloatingPointTy()) {
      XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
    } else {
      XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
    }
  }
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
  Builder.CreateStore(XRead, V.Var, V.IsVolatile);
  return Builder.saveIP();
}
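// Usage sketch (hypothetical frontend code; the field order of AtomicOpValue
// is assumed to be Var, ElemTy, IsSigned, IsVolatile): atomically reading `x`
// into `v` for two i32 locations XAddr and VAddr, with Loc already set up:
//
//   OpenMPIRBuilder::AtomicOpValue X{XAddr, Builder.getInt32Ty(),
//                                    /*IsSigned=*/true, /*IsVolatile=*/false};
//   OpenMPIRBuilder::AtomicOpValue V{VAddr, Builder.getInt32Ty(),
//                                    /*IsSigned=*/true, /*IsVolatile=*/false};
//   Builder.restoreIP(
//       OMPBuilder.createAtomicRead(Loc, X, V, AtomicOrdering::Monotonic));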
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
                                   AtomicOpValue &X, Value *Expr,
                                   AtomicOrdering AO) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Type *XTy = X.Var->getType();
  assert(XTy->isPointerTy() && "OMP Atomic expects a pointer to target memory");
  Type *XElemTy = X.ElemTy;
  assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
          XElemTy->isPointerTy()) &&
         "OMP atomic write expected a scalar type");

  if (XElemTy->isIntegerTy()) {
    StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
    XSt->setAtomic(AO);
  } else {
    // We need to bitcast and perform atomic op as integers
    unsigned Addrspace = cast<PointerType>(XTy)->getAddressSpace();
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    Value *XBCast = Builder.CreateBitCast(
        X.Var, IntCastTy->getPointerTo(Addrspace), "atomic.dst.int.cast");
    Value *ExprCast =
        Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
    StoreInst *XSt = Builder.CreateStore(ExprCast, XBCast, X.IsVolatile);
    XSt->setAtomic(AO);
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
  assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
  if (!updateToLocation(Loc))
    return Loc.IP;

  LLVM_DEBUG({
    Type *XTy = X.Var->getType();
    assert(XTy->isPointerTy() &&
           "OMP Atomic expects a pointer to target memory");
    Type *XElemTy = X.ElemTy;
    assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
            XElemTy->isPointerTy()) &&
           "OMP atomic update expected a scalar type");
    assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
           (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
           "OpenMP atomic does not support LT or GT operations");
  });

  emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
                   X.IsVolatile, IsXBinopExpr);
  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
  return Builder.saveIP();
}
// FIXME: Duplicating AtomicExpand
Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                               AtomicRMWInst::BinOp RMWOp) {
  switch (RMWOp) {
  case AtomicRMWInst::Add:
    return Builder.CreateAdd(Src1, Src2);
  case AtomicRMWInst::Sub:
    return Builder.CreateSub(Src1, Src2);
  case AtomicRMWInst::And:
    return Builder.CreateAnd(Src1, Src2);
  case AtomicRMWInst::Nand:
    return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
  case AtomicRMWInst::Or:
    return Builder.CreateOr(Src1, Src2);
  case AtomicRMWInst::Xor:
    return Builder.CreateXor(Src1, Src2);
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::FAdd:
  case AtomicRMWInst::FSub:
  case AtomicRMWInst::BAD_BINOP:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
  case AtomicRMWInst::FMax:
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    llvm_unreachable("Unsupported atomic update operation");
  }
  llvm_unreachable("Unsupported atomic update operation");
}
std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
    InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
  // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
  // or a complex datatype.
  bool emitRMWOp = false;
  switch (RMWOp) {
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor:
  case AtomicRMWInst::Xchg:
    emitRMWOp = XElemTy;
    break;
  case AtomicRMWInst::Sub:
    emitRMWOp = (IsXBinopExpr && XElemTy);
    break;
  default:
    emitRMWOp = false;
  }
  emitRMWOp &= XElemTy->isIntegerTy();

  std::pair<Value *, Value *> Res;
  if (emitRMWOp) {
    Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
    // not needed except in case of postfix captures. Generate anyway for
    // consistency with the else part. Will be removed with any DCE pass.
    // AtomicRMWInst::Xchg does not have a corresponding instruction.
    if (RMWOp == AtomicRMWInst::Xchg)
      Res.second = Res.first;
    else
      Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
  } else {
    IntegerType *IntCastTy =
        IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
    LoadInst *OldVal =
        Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
    OldVal->setAtomic(AO);
    // CurBB
    // |     /---\
    // ContBB    |
    // |     \---/
    // ExitBB
    BasicBlock *CurBB = Builder.GetInsertBlock();
    Instruction *CurBBTI = CurBB->getTerminator();
    CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
    BasicBlock *ExitBB =
        CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
    BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
                                                X->getName() + ".atomic.cont");
    ContBB->getTerminator()->eraseFromParent();
    Builder.restoreIP(AllocaIP);
    AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
    NewAtomicAddr->setName(X->getName() + "x.new.val");
    Builder.SetInsertPoint(ContBB);
    llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
    PHI->addIncoming(OldVal, CurBB);
    bool IsIntTy = XElemTy->isIntegerTy();
    Value *OldExprVal = PHI;
    if (!IsIntTy) {
      if (XElemTy->isFloatingPointTy()) {
        OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
                                           X->getName() + ".atomic.fltCast");
      } else {
        OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
                                            X->getName() + ".atomic.ptrCast");
      }
    }

    Value *Upd = UpdateOp(OldExprVal, Builder);
    Builder.CreateStore(Upd, NewAtomicAddr);
    LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
    AtomicOrdering Failure =
        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
    AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
        X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
    Result->setVolatile(VolatileX);
    Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
    Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
    PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
    Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);

    Res.first = OldExprVal;
    Res.second = Upd;

    // set Insertion point in exit block
    if (UnreachableInst *ExitTI =
            dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
      CurBBTI->eraseFromParent();
      Builder.SetInsertPoint(ExitBB);
    } else {
      Builder.SetInsertPoint(ExitTI);
    }
  }

  return Res;
}
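// When the atomicrmw fast path does not apply, the loop built above has this
// shape (sketch for an i32 element):
//
//   %old = load atomic i32, ptr %x monotonic      ; .atomic.load
//   br label %cont
//   cont:
//     %phi  = phi i32 [ %old, %entry ], [ %prev, %cont ]
//     ; UpdateOp(%phi) is stored to a staging alloca and reloaded as %desired
//     %pair = cmpxchg ptr %x, i32 %phi, i32 %desired monotonic monotonic
//     %prev = extractvalue { i32, i1 } %pair, 0
//     %ok   = extractvalue { i32, i1 } %pair, 1
//     br i1 %ok, label %exit, label %cont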
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
    const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
    AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
    AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
    bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  LLVM_DEBUG({
    Type *XTy = X.Var->getType();
    assert(XTy->isPointerTy() &&
           "OMP Atomic expects a pointer to target memory");
    Type *XElemTy = X.ElemTy;
    assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
            XElemTy->isPointerTy()) &&
           "OMP atomic capture expected a scalar type");
    assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
           "OpenMP atomic does not support LT or GT operations");
  });

  // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
  // 'x' is simply atomically rewritten with 'expr'.
  AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
  std::pair<Value *, Value *> Result =
      emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
                       X.IsVolatile, IsXBinopExpr);

  Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
  Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
  return Builder.saveIP();
}
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
    const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
    AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
    omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
    bool IsFailOnly) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  assert(X.Var->getType()->isPointerTy() &&
         "OMP atomic expects a pointer to target memory");
  // compare capture
  if (V.Var) {
    assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
    assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
  }

  bool IsInteger = E->getType()->isIntegerTy();

  if (Op == OMPAtomicCompareOp::EQ) {
    AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
    AtomicCmpXchgInst *Result = nullptr;
    if (!IsInteger) {
      IntegerType *IntCastTy =
          IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
      Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
      Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
      Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
                                           AO, Failure);
    } else {
      Result =
          Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
    }

    if (V.Var) {
      Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
      if (!IsInteger)
        OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
      assert(OldValue->getType() == V.ElemTy &&
             "OldValue and V must be of same type");
      if (IsPostfixUpdate) {
        Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
      } else {
        Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
        if (IsFailOnly) {
          // CurBB----
          //   |     |
          //   v     |
          // ContBB  |
          //   |     |
          //   v     |
          // ExitBB <-
          //
          // where ContBB only contains the store of old value to 'v'.
          BasicBlock *CurBB = Builder.GetInsertBlock();
          Instruction *CurBBTI = CurBB->getTerminator();
          CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
          BasicBlock *ExitBB = CurBB->splitBasicBlock(
              CurBBTI, X.Var->getName() + ".atomic.exit");
          BasicBlock *ContBB = CurBB->splitBasicBlock(
              CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
          ContBB->getTerminator()->eraseFromParent();
          CurBB->getTerminator()->eraseFromParent();

          Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);

          Builder.SetInsertPoint(ContBB);
          Builder.CreateStore(OldValue, V.Var);
          Builder.CreateBr(ExitBB);

          if (UnreachableInst *ExitTI =
                  dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
            CurBBTI->eraseFromParent();
            Builder.SetInsertPoint(ExitBB);
          } else {
            Builder.SetInsertPoint(ExitTI);
          }
        } else {
          Value *CapturedValue =
              Builder.CreateSelect(SuccessOrFail, E, OldValue);
          Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
        }
      }
    }
    // The comparison result has to be stored.
    if (R.Var) {
      assert(R.Var->getType()->isPointerTy() &&
             "r.var must be of pointer type");
      assert(R.ElemTy->isIntegerTy() && "r must be of integral type");

      Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
      Value *ResultCast = R.IsSigned
                              ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
                              : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
      Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
    }
  } else {
    assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
           "Op should be either max or min at this point");
    assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");

    // Reverse the ordop as the OpenMP forms are different from LLVM forms.
    // Let's take max as example.
    // OpenMP form:
    // x = x > expr ? expr : x;
    // LLVM form:
    // *ptr = *ptr > val ? *ptr : val;
    // We need to transform to LLVM form.
    // x = x <= expr ? x : expr;
    AtomicRMWInst::BinOp NewOp;
    if (IsXBinopExpr) {
      if (IsInteger) {
        if (X.IsSigned)
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
                                                : AtomicRMWInst::Max;
        else
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
                                                : AtomicRMWInst::UMax;
      } else {
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
                                              : AtomicRMWInst::FMax;
      }
    } else {
      if (IsInteger) {
        if (X.IsSigned)
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
                                                : AtomicRMWInst::Min;
        else
          NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
                                                : AtomicRMWInst::UMin;
      } else {
        NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
                                              : AtomicRMWInst::FMin;
      }
    }

    AtomicRMWInst *OldValue =
        Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
    if (V.Var) {
      Value *CapturedValue = nullptr;
      if (IsPostfixUpdate) {
        CapturedValue = OldValue;
      } else {
        CmpInst::Predicate Pred;
        switch (NewOp) {
        case AtomicRMWInst::Max:
          Pred = CmpInst::ICMP_SGT;
          break;
        case AtomicRMWInst::UMax:
          Pred = CmpInst::ICMP_UGT;
          break;
        case AtomicRMWInst::FMax:
          Pred = CmpInst::FCMP_OGT;
          break;
        case AtomicRMWInst::Min:
          Pred = CmpInst::ICMP_SLT;
          break;
        case AtomicRMWInst::UMin:
          Pred = CmpInst::ICMP_ULT;
          break;
        case AtomicRMWInst::FMin:
          Pred = CmpInst::FCMP_OLT;
          break;
        default:
          llvm_unreachable("unexpected comparison op");
        }
        Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
        CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
      }
      Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
    }
  }

  checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);

  return Builder.saveIP();
}
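// Worked example of the reversal above: for a signed integer and the OpenMP
// form `x = x > e ? e : x` (Op == MAX with IsXBinopExpr == true), the update
// stores the smaller value, so the selected instruction is
// AtomicRMWInst::Min, not Max.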
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
                             Value *NumTeamsUpper, Value *ThreadLimit,
                             Value *IfExpr) {
  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Function *CurrentFunction = Builder.GetInsertBlock()->getParent();

  // Outer allocation basicblock is the entry block of the current function.
  BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
  if (&OuterAllocaBB == Builder.GetInsertBlock()) {
    BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
    Builder.SetInsertPoint(BodyBB, BodyBB->begin());
  }

  // The current basic block is split into four basic blocks. After outlining,
  // they will be mapped as follows:
  // ```
  // def current_fn() {
  //   current_basic_block:
  //     br label %teams.exit
  //   teams.exit:
  //     ; instructions after teams
  // }
  //
  // def outlined_fn() {
  //   teams.alloca:
  //     br label %teams.body
  //   teams.body:
  //     ; instructions within teams body
  // }
  // ```
  BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
  BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
  BasicBlock *AllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");

  if (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr) {
    assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
           "if lowerbound is non-null, then upperbound must also be non-null "
           "for bounds on num_teams");

    if (NumTeamsUpper == nullptr)
      NumTeamsUpper = Builder.getInt32(0);

    if (NumTeamsLower == nullptr)
      NumTeamsLower = NumTeamsUpper;

    if (IfExpr) {
      assert(IfExpr->getType()->isIntegerTy() &&
             "argument to if clause must be an integer value");

      // upper = ifexpr ? upper : 1
      if (IfExpr->getType() != Int1)
        IfExpr = Builder.CreateICmpNE(IfExpr,
                                      ConstantInt::get(IfExpr->getType(), 0));
      NumTeamsUpper = Builder.CreateSelect(
          IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");

      // lower = ifexpr ? lower : 1
      NumTeamsLower = Builder.CreateSelect(
          IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
    }

    if (ThreadLimit == nullptr)
      ThreadLimit = Builder.getInt32(0);

    Value *ThreadNum = getOrCreateThreadID(Ident);
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
        {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
  }
  // Generate the body of teams.
  InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
  InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
  BodyGenCB(AllocaIP, CodeGenIP);

  OutlineInfo OI;
  OI.EntryBB = AllocaBB;
  OI.ExitBB = ExitBB;
  OI.OuterAllocaBB = &OuterAllocaBB;

  // Insert fake values for global tid and bound tid.
  std::stack<Instruction *> ToBeDeleted;
  InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));

  OI.PostOutlineCB = [this, Ident, ToBeDeleted](Function &OutlinedFn) mutable {
    // The stale call instruction will be replaced with a new call instruction
    // for runtime call with the outlined function.

    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
    ToBeDeleted.push(StaleCI);

    assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
           "Outlined function must have two or three arguments only");

    bool HasShared = OutlinedFn.arg_size() == 3;

    OutlinedFn.getArg(0)->setName("global.tid.ptr");
    OutlinedFn.getArg(1)->setName("bound.tid.ptr");
    if (HasShared)
      OutlinedFn.getArg(2)->setName("data");

    // Call to the runtime function for teams in the current function.
    assert(StaleCI && "Error while outlining - no CallInst user found for the "
                      "outlined function.");
    Builder.SetInsertPoint(StaleCI);
    SmallVector<Value *> Args = {
        Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
    if (HasShared)
      Args.push_back(StaleCI->getArgOperand(2));
    Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                           omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
                       Args);

    while (!ToBeDeleted.empty()) {
      ToBeDeleted.top()->eraseFromParent();
      ToBeDeleted.pop();
    }
  };

  addOutlineInfo(std::move(OI));

  Builder.SetInsertPoint(ExitBB, ExitBB->begin());

  return Builder.saveIP();
}
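// After outlining, the region runs through a fork call of this form
// (sketch; the shared-data argument is only present when the outlined
// function takes three parameters):
//
//   call void @__kmpc_fork_teams(ptr %ident, i32 <n shared args>,
//                                ptr @outlined_fn, ptr %data)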
GlobalVariable *
OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                                       std::string VarName) {
  llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
      llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
                           Names.size()),
      Names);
  auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
      M, MapNamesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
      VarName);
  return MapNamesArrayGlobal;
}
// Create all simple and struct types exposed by the runtime and remember
// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  StructType *T;
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                            \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize);                            \
  VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                 \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg);           \
  VarName##Ptr = PointerType::getUnqual(VarName);
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...)                     \
  T = StructType::getTypeByName(Ctx, StructName);                             \
  if (!T)                                                                     \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed);           \
  VarName = T;                                                                \
  VarName##Ptr = PointerType::getUnqual(T);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
void OpenMPIRBuilder::OutlineInfo::collectBlocks(
    SmallPtrSetImpl<BasicBlock *> &BlockSet,
    SmallVectorImpl<BasicBlock *> &BlockVector) {
  SmallVector<BasicBlock *, 32> Worklist;
  BlockSet.insert(EntryBB);
  BlockSet.insert(ExitBB);

  Worklist.push_back(EntryBB);
  while (!Worklist.empty()) {
    BasicBlock *BB = Worklist.pop_back_val();
    BlockVector.push_back(BB);
    for (BasicBlock *SuccBB : successors(BB))
      if (BlockSet.insert(SuccBB).second)
        Worklist.push_back(SuccBB);
  }
}
void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
                                         uint64_t Size, int32_t Flags,
                                         GlobalValue::LinkageTypes,
                                         StringRef Name) {
  if (!Config.isGPU()) {
    llvm::offloading::emitOffloadingEntry(
        M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags,
        "omp_offloading_entries");
    return;
  }
  // TODO: Add support for global variables on the device after declare target
  // support.
  Function *Fn = dyn_cast<Function>(Addr);
  if (!Fn)
    return;

  Module &M = *(Fn->getParent());
  LLVMContext &Ctx = M.getContext();

  // Get "nvvm.annotations" metadata node.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  Metadata *MDVals[] = {
      ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
      ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(MDNode::get(Ctx, MDVals));

  // Add a function attribute for the kernel.
  Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
  if (T.isAMDGCN())
    Fn->addFnAttr("uniform-work-group-size", "true");
  Fn->addFnAttr(Attribute::MustProgress);
}
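// On a CUDA target the annotation added above takes this form (sketch; the
// kernel name is illustrative):
//
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @__omp_offloading_kernel, !"kernel", i32 1}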
6022 // We only generate metadata for function that contain target regions.
6023 void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
6024 EmitMetadataErrorReportFunctionTy
&ErrorFn
) {
6026 // If there are no entries, we don't need to do anything.
6027 if (OffloadInfoManager
.empty())
6030 LLVMContext
&C
= M
.getContext();
6031 SmallVector
<std::pair
<const OffloadEntriesInfoManager::OffloadEntryInfo
*,
6032 TargetRegionEntryInfo
>,
6034 OrderedEntries(OffloadInfoManager
.size());

  // Auxiliary methods to create metadata values and strings.
  auto &&GetMDInt = [this](unsigned V) {
    return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
  };

  auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };

  // Create the offloading info metadata node.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
  auto &&TargetRegionMetadataEmitter =
      [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
          const TargetRegionEntryInfo &EntryInfo,
          const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
        // Generate metadata for target regions. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (0).
        // - Entry 1 -> Device ID of the file where the entry was identified.
        // - Entry 2 -> File ID of the file where the entry was identified.
        // - Entry 3 -> Mangled name of the function where the entry was
        //   identified.
        // - Entry 4 -> Line in the file where the entry was identified.
        // - Entry 5 -> Count of regions at this DeviceID/FileID/Line.
        // - Entry 6 -> Order the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {
            GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
            GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
            GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
            GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
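
  // For illustration, a single target-region operand of !omp_offload.info
  // therefore looks like (values hypothetical):
  //   !{i32 0, i32 <DeviceID>, i32 <FileID>, !"<ParentName>", i32 <Line>,
  //     i32 <Count>, i32 <Order>}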

  // Create a function that emits metadata for each device global variable
  // entry.
  auto &&DeviceGlobalVarMetadataEmitter =
      [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
          StringRef MangledName,
          const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar
              &E) {
        // Generate metadata for global variables. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (1).
        // - Entry 1 -> Mangled name of the variable.
        // - Entry 2 -> Declare target kind.
        // - Entry 3 -> Order the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
                           GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
        OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
      DeviceGlobalVarMetadataEmitter);
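
  // For illustration, a device-global-variable operand looks like (values
  // hypothetical):
  //   !{i32 1, !"<MangledName>", i32 <DeclareTargetKind>, i32 <Order>}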

  for (const auto &E : OrderedEntries) {
    assert(E.first && "All ordered entries must exist!");
    if (const auto *CE =
            dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
                E.first)) {
      if (!CE->getID() || !CE->getAddress()) {
        // Do not blame the entry if the parent function is not emitted.
        TargetRegionEntryInfo EntryInfo = E.second;
        StringRef FnName = EntryInfo.ParentName;
        if (!M.getNamedValue(FnName))
          continue;
        ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
        continue;
      }
      createOffloadEntry(CE->getID(), CE->getAddress(),
                         /*Size=*/0, CE->getFlags(),
                         GlobalValue::WeakAnyLinkage);
    } else if (const auto *CE = dyn_cast<
                   OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
                   E.first)) {
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              CE->getFlags());
      switch (Flags) {
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
        if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
          continue;
        }
        // The variable has no definition - no need to add the entry.
        if (CE->getVarSize() == 0)
          continue;
        break;
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
        assert(((Config.isTargetDevice() && !CE->getAddress()) ||
                (!Config.isTargetDevice() && CE->getAddress())) &&
               "Declare target link address is set.");
        if (Config.isTargetDevice())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
          continue;
        }
        break;
      default:
        break;
      }

      // Hidden or internal symbols on the device are not externally visible.
      // We should not attempt to register them by creating an offloading
      // entry. Indirect variables are handled separately on the device.
      if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
        if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
            Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
          continue;

      // Indirect globals need to use a special name that doesn't match the
      // name of the associated host global.
      if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
        createOffloadEntry(CE->getAddress(), CE->getAddress(),
                           CE->getVarSize(), Flags, CE->getLinkage(),
                           CE->getVarName());
      else
        createOffloadEntry(CE->getAddress(), CE->getAddress(),
                           CE->getVarSize(), Flags, CE->getLinkage());
    } else {
      llvm_unreachable("Unsupported entry kind.");
    }
  }
}

void TargetRegionEntryInfo::getTargetRegionEntryFnName(
    SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
    unsigned FileID, unsigned Line, unsigned Count) {
  raw_svector_ostream OS(Name);
  OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
     << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
  if (Count)
    OS << "_" << Count;
}

void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
    SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
  unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
  TargetRegionEntryInfo::getTargetRegionEntryFnName(
      Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
      EntryInfo.Line, NewCount);
}

TargetRegionEntryInfo
OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                                          StringRef ParentName) {
  sys::fs::UniqueID ID;
  auto FileIDInfo = CallBack();
  if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
    report_fatal_error(("Unable to get unique ID for file, during "
                        "getTargetEntryUniqueInfo, error message: " +
                        EC.message())
                           .c_str());
  }

  return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
                               std::get<1>(FileIDInfo));
}

unsigned OpenMPIRBuilder::getFlagMemberOffset() {
  unsigned Offset = 0;
  // Count the trailing zero bits of the MEMBER_OF mask, i.e. the bit
  // position where the MEMBER_OF bitfield starts.
  for (uint64_t Remain =
           static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
               omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
       !(Remain & 1); Remain = Remain >> 1)
    Offset++;
  return Offset;
}
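
// For illustration (assuming the usual OMP_MAP_MEMBER_OF mask of
// 0xFFFF000000000000): the lowest set bit of that mask is bit 48, so the
// loop above returns 48.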

omp::OpenMPOffloadMappingFlags
OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
  // Rotate by getFlagMemberOffset() bits.
  return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
                                                     << getFlagMemberOffset());
}
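
// For illustration: with a member offset of 48 (see above), Position 0
// encodes the value 1 shifted into the MEMBER_OF bitfield (1ULL << 48),
// Position 1 encodes 2, and so on; the field stores Position + 1.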

void OpenMPIRBuilder::setCorrectMemberOfFlag(
    omp::OpenMPOffloadMappingFlags &Flags,
    omp::OpenMPOffloadMappingFlags MemberOfFlag) {
  // If the entry is PTR_AND_OBJ but has not been marked with the special
  // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
  // marked as MEMBER_OF.
  if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
          Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
      static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
          (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
          omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
    return;

  // Reset the placeholder value to prepare the flag for the assignment of the
  // proper MEMBER_OF value.
  Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
  Flags |= MemberOfFlag;
}
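
// A minimal usage sketch (names hypothetical): combining the two helpers
// above to mark a PTR_AND_OBJ map entry carrying the 0xFFFF placeholder as
// the second member of its parent struct:
//   omp::OpenMPOffloadMappingFlags F = ...;
//   OMPBuilder.setCorrectMemberOfFlag(F, OMPBuilder.getMemberOfFlag(1));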

Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
  // TODO: convert this to utilise the IRBuilder Config rather than
  // a passed down argument.
  if (OpenMPSIMD)
    return nullptr;

  if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
      ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
        CaptureClause ==
            OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
       Config.hasRequiresUnifiedSharedMemory())) {
    SmallString<64> PtrName;
    {
      raw_svector_ostream OS(PtrName);
      OS << MangledName;
      if (!IsExternallyVisible)
        OS << format("_%x", EntryInfo.FileID);
      OS << "_decl_tgt_ref_ptr";
    }

    Value *Ptr = M.getNamedValue(PtrName);

    if (!Ptr) {
      GlobalValue *GlobalValue = M.getNamedValue(MangledName);
      Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);

      auto *GV = cast<GlobalVariable>(Ptr);
      GV->setLinkage(GlobalValue::WeakAnyLinkage);

      if (!Config.isTargetDevice()) {
        if (GlobalInitializer)
          GV->setInitializer(GlobalInitializer());
        else
          GV->setInitializer(GlobalValue);
      }

      registerTargetGlobalVariable(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
    }

    return cast<Constant>(Ptr);
  }

  return nullptr;
}

void OpenMPIRBuilder::registerTargetGlobalVariable(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
    Constant *Addr) {
  if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
      (TargetTriple.empty() && !Config.isTargetDevice()))
    return;

  OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
  StringRef VarName;
  int64_t VarSize;
  GlobalValue::LinkageTypes Linkage;

  if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
       CaptureClause ==
           OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
      !Config.hasRequiresUnifiedSharedMemory()) {
    Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
    VarName = MangledName;
    GlobalValue *LlvmVal = M.getNamedValue(VarName);

    if (!IsDeclaration)
      VarSize = divideCeil(
          M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
    else
      VarSize = 0;
    Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();

    // This is a workaround carried over from Clang which prevents undesired
    // optimisation of internal variables.
    if (Config.isTargetDevice() &&
        (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
      // Do not create a "ref-variable" if the original is not also available
      // on the host.
      if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
        return;

      std::string RefName = createPlatformSpecificName({VarName, "ref"});

      if (!M.getNamedValue(RefName)) {
        Constant *AddrRef =
            getOrCreateInternalVariable(Addr->getType(), RefName);
        auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
        GvAddrRef->setConstant(true);
        GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
        GvAddrRef->setInitializer(Addr);
        GeneratedRefs.push_back(GvAddrRef);
      }
    }
  } else {
    if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
    else
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;

    if (Config.isTargetDevice()) {
      VarName = (Addr) ? Addr->getName() : "";
    } else {
      Addr = getAddrOfDeclareTargetVar(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          LlvmPtrTy, GlobalInitializer, VariableLinkage);
      VarName = (Addr) ? Addr->getName() : "";
    }
    VarSize = M.getDataLayout().getPointerSize();
    Linkage = GlobalValue::WeakAnyLinkage;
  }

  OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
                                                      Flags, Linkage);
}

/// Loads all the offload entries information from the host IR
/// metadata.
void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
  // If we are in target mode, load the metadata from the host IR. This code
  // has to match the metadata creation in
  // createOffloadEntriesAndInfoMetadata().

  NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
  if (!MD)
    return;

  for (MDNode *MN : MD->operands()) {
    auto &&GetMDInt = [MN](unsigned Idx) {
      auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
      return cast<ConstantInt>(V->getValue())->getZExtValue();
    };

    auto &&GetMDString = [MN](unsigned Idx) {
      auto *V = cast<MDString>(MN->getOperand(Idx));
      return V->getString();
    };

    switch (GetMDInt(0)) {
    default:
      llvm_unreachable("Unexpected metadata!");
      break;
    case OffloadEntriesInfoManager::OffloadEntryInfo::
        OffloadingEntryInfoTargetRegion: {
      TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
                                      /*DeviceID=*/GetMDInt(1),
                                      /*FileID=*/GetMDInt(2),
                                      /*Line=*/GetMDInt(4),
                                      /*Count=*/GetMDInt(5));
      OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
                                                         /*Order=*/GetMDInt(6));
      break;
    }
    case OffloadEntriesInfoManager::OffloadEntryInfo::
        OffloadingEntryInfoDeviceGlobalVar:
      OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
          /*MangledName=*/GetMDString(1),
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              /*Flags=*/GetMDInt(2)),
          /*Order=*/GetMDInt(3));
      break;
    }
  }
}

void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
  if (HostFilePath.empty())
    return;

  auto Buf = MemoryBuffer::getFile(HostFilePath);
  if (std::error_code Err = Buf.getError()) {
    report_fatal_error(("error opening host file from host file path inside of "
                        "OpenMPIRBuilder: " +
                        Err.message())
                           .c_str());
  }

  LLVMContext Ctx;
  auto M = expectedToErrorOrAndEmitErrors(
      Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
  if (std::error_code Err = M.getError()) {
    report_fatal_error(
        ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
            .c_str());
  }

  loadOffloadInfoMetadata(*M.get());
}

Function *OpenMPIRBuilder::createRegisterRequires(StringRef Name) {
  // Skip the creation of the registration function if this is device codegen
  if (Config.isTargetDevice())
    return nullptr;

  Builder.ClearInsertionPoint();

  // Create registration function prototype
  auto *RegFnTy = FunctionType::get(Builder.getVoidTy(), {});
  auto *RegFn = Function::Create(
      RegFnTy, GlobalVariable::LinkageTypes::InternalLinkage, Name, M);
  RegFn->setSection(".text.startup");
  RegFn->addFnAttr(Attribute::NoInline);
  RegFn->addFnAttr(Attribute::NoUnwind);

  // Create registration function body
  auto *BB = BasicBlock::Create(M.getContext(), "entry", RegFn);
  ConstantInt *FlagsVal =
      ConstantInt::getSigned(Builder.getInt64Ty(), Config.getRequiresFlags());
  Function *RTLRegFn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___tgt_register_requires);

  Builder.SetInsertPoint(BB);
  Builder.CreateCall(RTLRegFn, {FlagsVal});
  Builder.CreateRetVoid();

  return RegFn;
}
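
// For illustration, the emitted registration function resembles (sketch;
// name and flag value hypothetical):
//   define internal void @.omp.requires() section ".text.startup" {
//   entry:
//     call void @__tgt_register_requires(i64 1)
//     ret void
//   }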

//===----------------------------------------------------------------------===//
// OffloadEntriesInfoManager
//===----------------------------------------------------------------------===//

bool OffloadEntriesInfoManager::empty() const {
  return OffloadEntriesTargetRegion.empty() &&
         OffloadEntriesDeviceGlobalVar.empty();
}

unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
    const TargetRegionEntryInfo &EntryInfo) const {
  auto It = OffloadEntriesTargetRegionCount.find(
      getTargetRegionEntryCountKey(EntryInfo));
  if (It == OffloadEntriesTargetRegionCount.end())
    return 0;
  return It->second;
}

void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
    const TargetRegionEntryInfo &EntryInfo) {
  OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
      EntryInfo.Count + 1;
}

/// Initialize target region entry.
void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
    const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
  OffloadEntriesTargetRegion[EntryInfo] =
      OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
                                   OMPTargetRegionEntryTargetRegion);
  ++OffloadingEntriesNum;
}

void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
    TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
    OMPTargetRegionEntryKind Flags) {
  assert(EntryInfo.Count == 0 && "expected default EntryInfo");

  // Update the EntryInfo with the next available count for this location.
  EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);

  // If we are emitting code for a target, the entry is already initialized,
  // only has to be registered.
  if (OMPBuilder->Config.isTargetDevice()) {
    // This could happen if the device compilation is invoked standalone.
    if (!hasTargetRegionEntryInfo(EntryInfo)) {
      return;
    }
    auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
    Entry.setAddress(Addr);
    Entry.setID(ID);
    Entry.setFlags(Flags);
  } else {
    if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
        hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
      return;
    assert(!hasTargetRegionEntryInfo(EntryInfo) &&
           "Target region entry already registered!");
    OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
    OffloadEntriesTargetRegion[EntryInfo] = Entry;
    ++OffloadingEntriesNum;
  }
  incrementTargetRegionEntryInfoCount(EntryInfo);
}

bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
    TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {

  // Update the EntryInfo with the next available count for this location.
  EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);

  auto It = OffloadEntriesTargetRegion.find(EntryInfo);
  if (It == OffloadEntriesTargetRegion.end()) {
    return false;
  }
  // Fail if this entry is already registered.
  if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
    return false;
  return true;
}

void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
    const OffloadTargetRegionEntryInfoActTy &Action) {
  // Scan all target region entries and perform the provided action.
  for (const auto &It : OffloadEntriesTargetRegion) {
    Action(It.first, It.second);
  }
}

void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
    StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
  OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
  ++OffloadingEntriesNum;
}

void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
    StringRef VarName, Constant *Addr, int64_t VarSize,
    OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
  if (OMPBuilder->Config.isTargetDevice()) {
    // This could happen if the device compilation is invoked standalone.
    if (!hasDeviceGlobalVarEntryInfo(VarName))
      return;
    auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
    if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    Entry.setVarSize(VarSize);
    Entry.setLinkage(Linkage);
    Entry.setAddress(Addr);
  } else {
    if (hasDeviceGlobalVarEntryInfo(VarName)) {
      auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
      assert(Entry.isValid() && Entry.getFlags() == Flags &&
             "Entry not initialized!");
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
      OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
                                                Addr, VarSize, Flags, Linkage,
                                                VarName.str());
    else
      OffloadEntriesDeviceGlobalVar.try_emplace(
          VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
    ++OffloadingEntriesNum;
  }
}

void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
    const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
  // Scan all device global variable entries and perform the provided action.
  for (const auto &E : OffloadEntriesDeviceGlobalVar)
    Action(E.getKey(), E.getValue());
}

//===----------------------------------------------------------------------===//
// CanonicalLoopInfo
//===----------------------------------------------------------------------===//

void CanonicalLoopInfo::collectControlBlocks(
    SmallVectorImpl<BasicBlock *> &BBs) {
  // We only count those BBs as control block for which we do not need to
  // reverse the CFG, i.e. not the loop body which can contain arbitrary
  // control flow. For consistency, this also means we do not add the Body
  // block, which is just the entry to the body code.
  BBs.reserve(BBs.size() + 6);
  BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
}
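
// For reference, a sketch of the canonical loop control flow maintained by
// this class (the Body region may itself contain arbitrary control flow):
//
//   Preheader --> Header --> Cond --true--> Body ... --> Latch
//                   ^          |                           |
//                   |          +--false--> Exit --> After  |
//                   +---------------------------------------+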

BasicBlock *CanonicalLoopInfo::getPreheader() const {
  assert(isValid() && "Requires a valid canonical loop");
  for (BasicBlock *Pred : predecessors(Header)) {
    if (Pred != Latch)
      return Pred;
  }
  llvm_unreachable("Missing preheader");
}

void CanonicalLoopInfo::setTripCount(Value *TripCount) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *CmpI = &getCond()->front();
  assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
  CmpI->setOperand(1, TripCount);

#ifndef NDEBUG
  assertOK();
#endif
}

void CanonicalLoopInfo::mapIndVar(
    llvm::function_ref<Value *(Instruction *)> Updater) {
  assert(isValid() && "Requires a valid canonical loop");

  Instruction *OldIV = getIndVar();

  // Record all uses excluding those introduced by the updater. Uses by the
  // CanonicalLoopInfo itself to keep track of the number of iterations are
  // excluded.
  SmallVector<Use *> ReplacableUses;
  for (Use &U : OldIV->uses()) {
    auto *User = dyn_cast<Instruction>(U.getUser());
    if (!User)
      continue;
    if (User->getParent() == getCond())
      continue;
    if (User->getParent() == getLatch())
      continue;
    ReplacableUses.push_back(&U);
  }

  // Run the updater that may introduce new uses
  Value *NewIV = Updater(OldIV);

  // Replace the old uses with the value returned by the updater.
  for (Use *U : ReplacableUses)
    U->set(NewIV);

#ifndef NDEBUG
  assertOK();
#endif
}
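
// A minimal usage sketch (names hypothetical): scaling every use of the
// canonical induction variable by a loop-invariant Stride:
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateMul(OldIV, Stride, "scaled.iv");
//   });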

void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
  // No constraints if this object currently does not describe a loop.
  if (!isValid())
    return;

  BasicBlock *Preheader = getPreheader();
  BasicBlock *Body = getBody();
  BasicBlock *After = getAfter();

  // Verify standard control-flow we use for OpenMP loops.
  assert(Preheader);
  assert(isa<BranchInst>(Preheader->getTerminator()) &&
         "Preheader must terminate with unconditional branch");
  assert(Preheader->getSingleSuccessor() == Header &&
         "Preheader must jump to header");

  assert(Header);
  assert(isa<BranchInst>(Header->getTerminator()) &&
         "Header must terminate with unconditional branch");
  assert(Header->getSingleSuccessor() == Cond &&
         "Header must jump to exiting block");

  assert(Cond);
  assert(Cond->getSinglePredecessor() == Header &&
         "Exiting block only reachable from header");

  assert(isa<BranchInst>(Cond->getTerminator()) &&
         "Exiting block must terminate with conditional branch");
  assert(size(successors(Cond)) == 2 &&
         "Exiting block must have two successors");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
         "Exiting block's first successor must jump to the body");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
         "Exiting block's second successor must exit the loop");

  assert(Body);
  assert(Body->getSinglePredecessor() == Cond &&
         "Body only reachable from exiting block");
  assert(!isa<PHINode>(Body->front()));

  assert(Latch);
  assert(isa<BranchInst>(Latch->getTerminator()) &&
         "Latch must terminate with unconditional branch");
  assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
  // TODO: To support simple redirecting of the end of the body code that has
  // multiple exits; introduce another auxiliary basic block like preheader
  // and after.
  assert(Latch->getSinglePredecessor() != nullptr);
  assert(!isa<PHINode>(Latch->front()));

  assert(Exit);
  assert(isa<BranchInst>(Exit->getTerminator()) &&
         "Exit block must terminate with unconditional branch");
  assert(Exit->getSingleSuccessor() == After &&
         "Exit block must jump to after block");

  assert(After);
  assert(After->getSinglePredecessor() == Exit &&
         "After block only reachable from exit block");
  assert(After->empty() || !isa<PHINode>(After->front()));

  Instruction *IndVar = getIndVar();
  assert(IndVar && "Canonical induction variable not found?");
  assert(isa<IntegerType>(IndVar->getType()) &&
         "Induction variable must be an integer");
  assert(cast<PHINode>(IndVar)->getParent() == Header &&
         "Induction variable must be a PHI in the loop header");
  assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
  assert(
      cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
  assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);

  auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
  assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
  assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
  assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
  assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
             ->isOne());

  Value *TripCount = getTripCount();
  assert(TripCount && "Loop trip count not found?");
  assert(IndVar->getType() == TripCount->getType() &&
         "Trip count and induction variable must have the same type");

  auto *CmpI = cast<CmpInst>(&Cond->front());
  assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
         "Exit condition must be an unsigned less-than comparison");
  assert(CmpI->getOperand(0) == IndVar &&
         "Exit condition must compare the induction variable");
  assert(CmpI->getOperand(1) == TripCount &&
         "Exit condition must compare with the trip count");
#endif
}

void CanonicalLoopInfo::invalidate() {