1 //===-------- interface.cpp - Target independent OpenMP target RTL --------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // Implementation of the interface to be used by Clang during the codegen of a
12 //===----------------------------------------------------------------------===//
14 #include "OmptCallback.h"
15 #include "OmptInterface.h"
17 #include "omptarget.h"
21 #include "Utilities.h"
27 #include <type_traits>
30 using namespace llvm::omp::target::ompt
;
33 ////////////////////////////////////////////////////////////////////////////////
34 /// adds requires flags
35 EXTERN
void __tgt_register_requires(int64_t Flags
) {
37 PM
->RTLs
.registerRequires(Flags
);
40 ////////////////////////////////////////////////////////////////////////////////
41 /// adds a target shared library to the target execution image
42 EXTERN
void __tgt_register_lib(__tgt_bin_desc
*Desc
) {
44 if (PM
->maybeDelayRegisterLib(Desc
))
47 for (auto &RTL
: PM
->RTLs
.AllRTLs
) {
48 if (RTL
.register_lib
) {
49 if ((*RTL
.register_lib
)(Desc
) != OFFLOAD_SUCCESS
) {
50 DP("Could not register library with %s", RTL
.RTLName
.c_str());
54 PM
->RTLs
.registerLib(Desc
);
57 ////////////////////////////////////////////////////////////////////////////////
58 /// Initialize all available devices without registering any image
59 EXTERN
void __tgt_init_all_rtls() { PM
->RTLs
.initAllRTLs(); }
61 ////////////////////////////////////////////////////////////////////////////////
62 /// unloads a target shared library
63 EXTERN
void __tgt_unregister_lib(__tgt_bin_desc
*Desc
) {
65 PM
->RTLs
.unregisterLib(Desc
);
66 for (auto &RTL
: PM
->RTLs
.UsedRTLs
) {
67 if (RTL
->unregister_lib
) {
68 if ((*RTL
->unregister_lib
)(Desc
) != OFFLOAD_SUCCESS
) {
69 DP("Could not register library with %s", RTL
->RTLName
.c_str());
75 template <typename TargetAsyncInfoTy
>
77 targetData(ident_t
*Loc
, int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
78 void **Args
, int64_t *ArgSizes
, int64_t *ArgTypes
,
79 map_var_info_t
*ArgNames
, void **ArgMappers
,
80 TargetDataFuncPtrTy TargetDataFunction
, const char *RegionTypeMsg
,
81 const char *RegionName
) {
82 static_assert(std::is_convertible_v
<TargetAsyncInfoTy
, AsyncInfoTy
>,
83 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
85 TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg
, Loc
);
87 DP("Entering data %s region for device %" PRId64
" with %d mappings\n",
88 RegionName
, DeviceId
, ArgNum
);
90 if (checkDeviceAndCtors(DeviceId
, Loc
)) {
91 DP("Not offloading to device %" PRId64
"\n", DeviceId
);
95 if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS
)
96 printKernelArguments(Loc
, DeviceId
, ArgNum
, ArgSizes
, ArgTypes
, ArgNames
,
98 #ifdef OMPTARGET_DEBUG
99 for (int I
= 0; I
< ArgNum
; ++I
) {
100 DP("Entry %2d: Base=" DPxMOD
", Begin=" DPxMOD
", Size=%" PRId64
101 ", Type=0x%" PRIx64
", Name=%s\n",
102 I
, DPxPTR(ArgsBase
[I
]), DPxPTR(Args
[I
]), ArgSizes
[I
], ArgTypes
[I
],
103 (ArgNames
) ? getNameFromMapping(ArgNames
[I
]).c_str() : "unknown");
107 DeviceTy
&Device
= *PM
->Devices
[DeviceId
];
108 TargetAsyncInfoTy
TargetAsyncInfo(Device
);
109 AsyncInfoTy
&AsyncInfo
= TargetAsyncInfo
;
111 /// RAII to establish tool anchors before and after data begin / end / update
112 OMPT_IF_BUILT(assert((TargetDataFunction
== targetDataBegin
||
113 TargetDataFunction
== targetDataEnd
||
114 TargetDataFunction
== targetDataUpdate
) &&
115 "Encountered unexpected TargetDataFunction during "
116 "execution of targetData");
117 auto CallbackFunctions
=
118 (TargetDataFunction
== targetDataBegin
)
119 ? RegionInterface
.getCallbacks
<ompt_target_enter_data
>()
120 : (TargetDataFunction
== targetDataEnd
)
121 ? RegionInterface
.getCallbacks
<ompt_target_exit_data
>()
122 : RegionInterface
.getCallbacks
<ompt_target_update
>();
123 InterfaceRAII
TargetDataRAII(CallbackFunctions
, DeviceId
,
124 OMPT_GET_RETURN_ADDRESS(0));)
126 int Rc
= OFFLOAD_SUCCESS
;
127 Rc
= TargetDataFunction(Loc
, Device
, ArgNum
, ArgsBase
, Args
, ArgSizes
,
128 ArgTypes
, ArgNames
, ArgMappers
, AsyncInfo
,
129 false /* FromMapper */);
131 if (Rc
== OFFLOAD_SUCCESS
)
132 Rc
= AsyncInfo
.synchronize();
134 handleTargetOutcome(Rc
== OFFLOAD_SUCCESS
, Loc
);
137 /// creates host-to-target data mapping, stores it in the
138 /// libomptarget.so internal structure (an entry in a stack of data maps)
139 /// and passes the data to the device.
140 EXTERN
void __tgt_target_data_begin_mapper(ident_t
*Loc
, int64_t DeviceId
,
141 int32_t ArgNum
, void **ArgsBase
,
142 void **Args
, int64_t *ArgSizes
,
144 map_var_info_t
*ArgNames
,
147 targetData
<AsyncInfoTy
>(Loc
, DeviceId
, ArgNum
, ArgsBase
, Args
, ArgSizes
,
148 ArgTypes
, ArgNames
, ArgMappers
, targetDataBegin
,
149 "Entering OpenMP data region with being_mapper",
153 EXTERN
void __tgt_target_data_begin_nowait_mapper(
154 ident_t
*Loc
, int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
155 void **Args
, int64_t *ArgSizes
, int64_t *ArgTypes
, map_var_info_t
*ArgNames
,
156 void **ArgMappers
, int32_t DepNum
, void *DepList
, int32_t NoAliasDepNum
,
157 void *NoAliasDepList
) {
159 targetData
<TaskAsyncInfoWrapperTy
>(
160 Loc
, DeviceId
, ArgNum
, ArgsBase
, Args
, ArgSizes
, ArgTypes
, ArgNames
,
161 ArgMappers
, targetDataBegin
,
162 "Entering OpenMP data region with being_nowait_mapper", "begin");
165 /// passes data from the target, releases target memory and destroys
166 /// the host-target mapping (top entry from the stack of data maps)
167 /// created by the last __tgt_target_data_begin.
168 EXTERN
void __tgt_target_data_end_mapper(ident_t
*Loc
, int64_t DeviceId
,
169 int32_t ArgNum
, void **ArgsBase
,
170 void **Args
, int64_t *ArgSizes
,
172 map_var_info_t
*ArgNames
,
175 targetData
<AsyncInfoTy
>(Loc
, DeviceId
, ArgNum
, ArgsBase
, Args
, ArgSizes
,
176 ArgTypes
, ArgNames
, ArgMappers
, targetDataEnd
,
177 "Exiting OpenMP data region with end_mapper", "end");
180 EXTERN
void __tgt_target_data_end_nowait_mapper(
181 ident_t
*Loc
, int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
182 void **Args
, int64_t *ArgSizes
, int64_t *ArgTypes
, map_var_info_t
*ArgNames
,
183 void **ArgMappers
, int32_t DepNum
, void *DepList
, int32_t NoAliasDepNum
,
184 void *NoAliasDepList
) {
186 targetData
<TaskAsyncInfoWrapperTy
>(
187 Loc
, DeviceId
, ArgNum
, ArgsBase
, Args
, ArgSizes
, ArgTypes
, ArgNames
,
188 ArgMappers
, targetDataEnd
,
189 "Exiting OpenMP data region with end_nowait_mapper", "end");
192 EXTERN
void __tgt_target_data_update_mapper(ident_t
*Loc
, int64_t DeviceId
,
193 int32_t ArgNum
, void **ArgsBase
,
194 void **Args
, int64_t *ArgSizes
,
196 map_var_info_t
*ArgNames
,
199 targetData
<AsyncInfoTy
>(
200 Loc
, DeviceId
, ArgNum
, ArgsBase
, Args
, ArgSizes
, ArgTypes
, ArgNames
,
201 ArgMappers
, targetDataUpdate
,
202 "Updating data within the OpenMP data region with update_mapper",
206 EXTERN
void __tgt_target_data_update_nowait_mapper(
207 ident_t
*Loc
, int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
208 void **Args
, int64_t *ArgSizes
, int64_t *ArgTypes
, map_var_info_t
*ArgNames
,
209 void **ArgMappers
, int32_t DepNum
, void *DepList
, int32_t NoAliasDepNum
,
210 void *NoAliasDepList
) {
211 targetData
<TaskAsyncInfoWrapperTy
>(
212 Loc
, DeviceId
, ArgNum
, ArgsBase
, Args
, ArgSizes
, ArgTypes
, ArgNames
,
213 ArgMappers
, targetDataUpdate
,
214 "Updating data within the OpenMP data region with update_nowait_mapper",
218 static KernelArgsTy
*upgradeKernelArgs(KernelArgsTy
*KernelArgs
,
219 KernelArgsTy
&LocalKernelArgs
,
220 int32_t NumTeams
, int32_t ThreadLimit
) {
221 if (KernelArgs
->Version
> 2)
222 DP("Unexpected ABI version: %u\n", KernelArgs
->Version
);
224 if (KernelArgs
->Version
== 1) {
225 LocalKernelArgs
.Version
= 2;
226 LocalKernelArgs
.NumArgs
= KernelArgs
->NumArgs
;
227 LocalKernelArgs
.ArgBasePtrs
= KernelArgs
->ArgBasePtrs
;
228 LocalKernelArgs
.ArgPtrs
= KernelArgs
->ArgPtrs
;
229 LocalKernelArgs
.ArgSizes
= KernelArgs
->ArgSizes
;
230 LocalKernelArgs
.ArgTypes
= KernelArgs
->ArgTypes
;
231 LocalKernelArgs
.ArgNames
= KernelArgs
->ArgNames
;
232 LocalKernelArgs
.ArgMappers
= KernelArgs
->ArgMappers
;
233 LocalKernelArgs
.Tripcount
= KernelArgs
->Tripcount
;
234 LocalKernelArgs
.Flags
= KernelArgs
->Flags
;
235 LocalKernelArgs
.DynCGroupMem
= 0;
236 LocalKernelArgs
.NumTeams
[0] = NumTeams
;
237 LocalKernelArgs
.NumTeams
[1] = 0;
238 LocalKernelArgs
.NumTeams
[2] = 0;
239 LocalKernelArgs
.ThreadLimit
[0] = ThreadLimit
;
240 LocalKernelArgs
.ThreadLimit
[1] = 0;
241 LocalKernelArgs
.ThreadLimit
[2] = 0;
242 return &LocalKernelArgs
;
248 template <typename TargetAsyncInfoTy
>
249 static inline int targetKernel(ident_t
*Loc
, int64_t DeviceId
, int32_t NumTeams
,
250 int32_t ThreadLimit
, void *HostPtr
,
251 KernelArgsTy
*KernelArgs
) {
252 static_assert(std::is_convertible_v
<TargetAsyncInfoTy
, AsyncInfoTy
>,
253 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
255 TIMESCOPE_WITH_IDENT(Loc
);
257 DP("Entering target region for device %" PRId64
" with entry point " DPxMOD
259 DeviceId
, DPxPTR(HostPtr
));
261 if (checkDeviceAndCtors(DeviceId
, Loc
)) {
262 DP("Not offloading to device %" PRId64
"\n", DeviceId
);
266 bool IsTeams
= NumTeams
!= -1;
268 KernelArgs
->NumTeams
[0] = NumTeams
= 1;
270 // Auto-upgrade kernel args version 1 to 2.
271 KernelArgsTy LocalKernelArgs
;
273 upgradeKernelArgs(KernelArgs
, LocalKernelArgs
, NumTeams
, ThreadLimit
);
275 assert(KernelArgs
->NumTeams
[0] == static_cast<uint32_t>(NumTeams
) &&
276 !KernelArgs
->NumTeams
[1] && !KernelArgs
->NumTeams
[2] &&
277 "OpenMP interface should not use multiple dimensions");
278 assert(KernelArgs
->ThreadLimit
[0] == static_cast<uint32_t>(ThreadLimit
) &&
279 !KernelArgs
->ThreadLimit
[1] && !KernelArgs
->ThreadLimit
[2] &&
280 "OpenMP interface should not use multiple dimensions");
282 if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS
)
283 printKernelArguments(Loc
, DeviceId
, KernelArgs
->NumArgs
,
284 KernelArgs
->ArgSizes
, KernelArgs
->ArgTypes
,
285 KernelArgs
->ArgNames
, "Entering OpenMP kernel");
286 #ifdef OMPTARGET_DEBUG
287 for (uint32_t I
= 0; I
< KernelArgs
->NumArgs
; ++I
) {
288 DP("Entry %2d: Base=" DPxMOD
", Begin=" DPxMOD
", Size=%" PRId64
289 ", Type=0x%" PRIx64
", Name=%s\n",
290 I
, DPxPTR(KernelArgs
->ArgBasePtrs
[I
]), DPxPTR(KernelArgs
->ArgPtrs
[I
]),
291 KernelArgs
->ArgSizes
[I
], KernelArgs
->ArgTypes
[I
],
292 (KernelArgs
->ArgNames
)
293 ? getNameFromMapping(KernelArgs
->ArgNames
[I
]).c_str()
298 DeviceTy
&Device
= *PM
->Devices
[DeviceId
];
299 TargetAsyncInfoTy
TargetAsyncInfo(Device
);
300 AsyncInfoTy
&AsyncInfo
= TargetAsyncInfo
;
301 /// RAII to establish tool anchors before and after target region
302 OMPT_IF_BUILT(InterfaceRAII
TargetRAII(
303 RegionInterface
.getCallbacks
<ompt_target
>(), DeviceId
,
304 /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
306 int Rc
= OFFLOAD_SUCCESS
;
307 Rc
= target(Loc
, Device
, HostPtr
, *KernelArgs
, AsyncInfo
);
309 if (Rc
== OFFLOAD_SUCCESS
)
310 Rc
= AsyncInfo
.synchronize();
312 handleTargetOutcome(Rc
== OFFLOAD_SUCCESS
, Loc
);
313 assert(Rc
== OFFLOAD_SUCCESS
&& "__tgt_target_kernel unexpected failure!");
315 return OMP_TGT_SUCCESS
;
318 /// Implements a kernel entry that executes the target region on the specified
321 /// \param Loc Source location associated with this target region.
322 /// \param DeviceId The device to execute this region, -1 indicated the default.
323 /// \param NumTeams Number of teams to launch the region with, -1 indicates a
324 /// non-teams region and 0 indicates it was unspecified.
325 /// \param ThreadLimit Limit to the number of threads to use in the kernel
326 /// launch, 0 indicates it was unspecified.
327 /// \param HostPtr The pointer to the host function registered with the kernel.
328 /// \param Args All arguments to this kernel launch (see struct definition).
329 EXTERN
int __tgt_target_kernel(ident_t
*Loc
, int64_t DeviceId
, int32_t NumTeams
,
330 int32_t ThreadLimit
, void *HostPtr
,
331 KernelArgsTy
*KernelArgs
) {
332 if (KernelArgs
->Flags
.NoWait
)
333 return targetKernel
<TaskAsyncInfoWrapperTy
>(
334 Loc
, DeviceId
, NumTeams
, ThreadLimit
, HostPtr
, KernelArgs
);
336 return targetKernel
<AsyncInfoTy
>(Loc
, DeviceId
, NumTeams
, ThreadLimit
,
337 HostPtr
, KernelArgs
);
340 /// Activates the record replay mechanism.
341 /// \param DeviceId The device identifier to execute the target region.
342 /// \param MemorySize The number of bytes to be (pre-)allocated
343 /// by the bump allocator
344 /// /param IsRecord Activates the record replay mechanism in
345 /// 'record' mode or 'replay' mode.
346 /// /param SaveOutput Store the device memory after kernel
347 /// execution on persistent storage
348 EXTERN
int __tgt_activate_record_replay(int64_t DeviceId
, uint64_t MemorySize
,
349 void *VAddr
, bool IsRecord
,
351 if (!deviceIsReady(DeviceId
)) {
352 DP("Device %" PRId64
" is not ready\n", DeviceId
);
356 DeviceTy
&Device
= *PM
->Devices
[DeviceId
];
357 [[maybe_unused
]] int Rc
=
358 target_activate_rr(Device
, MemorySize
, VAddr
, IsRecord
, SaveOutput
);
359 assert(Rc
== OFFLOAD_SUCCESS
&&
360 "__tgt_activate_record_replay unexpected failure!");
361 return OMP_TGT_SUCCESS
;
364 /// Implements a target kernel entry that replays a pre-recorded kernel.
365 /// \param Loc Source location associated with this target region (unused).
366 /// \param DeviceId The device identifier to execute the target region.
367 /// \param HostPtr A pointer to an address that uniquely identifies the kernel.
368 /// \param DeviceMemory A pointer to an array storing device memory data to move
369 /// prior to kernel execution.
370 /// \param DeviceMemorySize The size of the above device memory data in bytes.
371 /// \param TgtArgs An array of pointers of the pre-recorded target kernel
373 /// \param TgtOffsets An array of pointers of the pre-recorded target kernel
374 /// argument offsets.
375 /// \param NumArgs The number of kernel arguments.
376 /// \param NumTeams Number of teams to launch the target region with.
377 /// \param ThreadLimit Limit to the number of threads to use in kernel
379 /// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
380 /// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
381 EXTERN
int __tgt_target_kernel_replay(ident_t
*Loc
, int64_t DeviceId
,
382 void *HostPtr
, void *DeviceMemory
,
383 int64_t DeviceMemorySize
, void **TgtArgs
,
384 ptrdiff_t *TgtOffsets
, int32_t NumArgs
,
385 int32_t NumTeams
, int32_t ThreadLimit
,
386 uint64_t LoopTripCount
) {
388 if (checkDeviceAndCtors(DeviceId
, Loc
)) {
389 DP("Not offloading to device %" PRId64
"\n", DeviceId
);
392 DeviceTy
&Device
= *PM
->Devices
[DeviceId
];
393 /// RAII to establish tool anchors before and after target region
394 OMPT_IF_BUILT(InterfaceRAII
TargetRAII(
395 RegionInterface
.getCallbacks
<ompt_target
>(), DeviceId
,
396 /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
398 AsyncInfoTy
AsyncInfo(Device
);
399 int Rc
= target_replay(Loc
, Device
, HostPtr
, DeviceMemory
, DeviceMemorySize
,
400 TgtArgs
, TgtOffsets
, NumArgs
, NumTeams
, ThreadLimit
,
401 LoopTripCount
, AsyncInfo
);
402 if (Rc
== OFFLOAD_SUCCESS
)
403 Rc
= AsyncInfo
.synchronize();
404 handleTargetOutcome(Rc
== OFFLOAD_SUCCESS
, Loc
);
405 assert(Rc
== OFFLOAD_SUCCESS
&&
406 "__tgt_target_kernel_replay unexpected failure!");
407 return OMP_TGT_SUCCESS
;
410 // Get the current number of components for a user-defined mapper.
411 EXTERN
int64_t __tgt_mapper_num_components(void *RtMapperHandle
) {
413 auto *MapperComponentsPtr
= (struct MapperComponentsTy
*)RtMapperHandle
;
414 int64_t Size
= MapperComponentsPtr
->Components
.size();
415 DP("__tgt_mapper_num_components(Handle=" DPxMOD
") returns %" PRId64
"\n",
416 DPxPTR(RtMapperHandle
), Size
);
420 // Push back one component for a user-defined mapper.
421 EXTERN
void __tgt_push_mapper_component(void *RtMapperHandle
, void *Base
,
422 void *Begin
, int64_t Size
, int64_t Type
,
425 DP("__tgt_push_mapper_component(Handle=" DPxMOD
426 ") adds an entry (Base=" DPxMOD
", Begin=" DPxMOD
", Size=%" PRId64
427 ", Type=0x%" PRIx64
", Name=%s).\n",
428 DPxPTR(RtMapperHandle
), DPxPTR(Base
), DPxPTR(Begin
), Size
, Type
,
429 (Name
) ? getNameFromMapping(Name
).c_str() : "unknown");
430 auto *MapperComponentsPtr
= (struct MapperComponentsTy
*)RtMapperHandle
;
431 MapperComponentsPtr
->Components
.push_back(
432 MapComponentInfoTy(Base
, Begin
, Size
, Type
, Name
));
435 EXTERN
void __tgt_set_info_flag(uint32_t NewInfoLevel
) {
436 std::atomic
<uint32_t> &InfoLevel
= getInfoLevelInternal();
437 InfoLevel
.store(NewInfoLevel
);
438 for (auto &R
: PM
->RTLs
.AllRTLs
) {
440 R
.set_info_flag(NewInfoLevel
);
444 EXTERN
int __tgt_print_device_info(int64_t DeviceId
) {
445 // Make sure the device is ready.
446 if (!deviceIsReady(DeviceId
)) {
447 DP("Device %" PRId64
" is not ready\n", DeviceId
);
451 return PM
->Devices
[DeviceId
]->printDeviceInfo(
452 PM
->Devices
[DeviceId
]->RTLDeviceID
);
455 EXTERN
void __tgt_target_nowait_query(void **AsyncHandle
) {
456 if (!AsyncHandle
|| !*AsyncHandle
) {
458 1, "Receive an invalid async handle from the current OpenMP task. Is "
459 "this a target nowait region?\n");
462 // Exponential backoff tries to optimally decide if a thread should just query
463 // for the device operations (work/spin wait on them) or block until they are
464 // completed (use device side blocking mechanism). This allows the runtime to
465 // adapt itself when there are a lot of long-running target regions in-flight.
466 using namespace llvm::omp::target
;
467 static thread_local ExponentialBackoff
QueryCounter(
468 Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10),
469 Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5),
470 Envar
<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR", 0.5f
));
472 auto *AsyncInfo
= (AsyncInfoTy
*)*AsyncHandle
;
474 // If the thread is actively waiting on too many target nowait regions, we
475 // should use the blocking sync type.
476 if (QueryCounter
.isAboveThreshold())
477 AsyncInfo
->SyncType
= AsyncInfoTy::SyncTy::BLOCKING
;
479 if (const int Rc
= AsyncInfo
->synchronize())
480 FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
481 // If there are device operations still pending, return immediately without
482 // deallocating the handle and increase the current thread query count.
483 if (!AsyncInfo
->isDone()) {
484 QueryCounter
.increment();
488 // When a thread successfully completes a target nowait region, we
489 // exponentially backoff its query counter by the query factor.
490 QueryCounter
.decrement();
492 // Delete the handle and unset it from the OpenMP task data.
494 *AsyncHandle
= nullptr;