Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / openmp / libomptarget / src / interface.cpp
blobe9ab7f05c7a0a76ce5a29b4950856ff2756d5632
1 //===-------- interface.cpp - Target independent OpenMP target RTL --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Implementation of the interface to be used by Clang during the codegen of a
10 // target region.
12 //===----------------------------------------------------------------------===//
14 #include "OmptCallback.h"
15 #include "OmptInterface.h"
16 #include "device.h"
17 #include "omptarget.h"
18 #include "private.h"
19 #include "rtl.h"
21 #include "Utilities.h"
23 #include <cassert>
24 #include <cstdio>
25 #include <cstdlib>
26 #include <mutex>
27 #include <type_traits>
29 #ifdef OMPT_SUPPORT
30 using namespace llvm::omp::target::ompt;
31 #endif
33 ////////////////////////////////////////////////////////////////////////////////
34 /// adds requires flags
35 EXTERN void __tgt_register_requires(int64_t Flags) {
36 TIMESCOPE();
37 PM->RTLs.registerRequires(Flags);
40 ////////////////////////////////////////////////////////////////////////////////
41 /// adds a target shared library to the target execution image
42 EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) {
43 TIMESCOPE();
44 if (PM->maybeDelayRegisterLib(Desc))
45 return;
47 for (auto &RTL : PM->RTLs.AllRTLs) {
48 if (RTL.register_lib) {
49 if ((*RTL.register_lib)(Desc) != OFFLOAD_SUCCESS) {
50 DP("Could not register library with %s", RTL.RTLName.c_str());
54 PM->RTLs.registerLib(Desc);
57 ////////////////////////////////////////////////////////////////////////////////
58 /// Initialize all available devices without registering any image
59 EXTERN void __tgt_init_all_rtls() { PM->RTLs.initAllRTLs(); }
61 ////////////////////////////////////////////////////////////////////////////////
62 /// unloads a target shared library
63 EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) {
64 TIMESCOPE();
65 PM->RTLs.unregisterLib(Desc);
66 for (auto &RTL : PM->RTLs.UsedRTLs) {
67 if (RTL->unregister_lib) {
68 if ((*RTL->unregister_lib)(Desc) != OFFLOAD_SUCCESS) {
69 DP("Could not register library with %s", RTL->RTLName.c_str());
75 template <typename TargetAsyncInfoTy>
76 static inline void
77 targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
78 void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
79 map_var_info_t *ArgNames, void **ArgMappers,
80 TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
81 const char *RegionName) {
82 static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
83 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
85 TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc);
87 DP("Entering data %s region for device %" PRId64 " with %d mappings\n",
88 RegionName, DeviceId, ArgNum);
90 if (checkDeviceAndCtors(DeviceId, Loc)) {
91 DP("Not offloading to device %" PRId64 "\n", DeviceId);
92 return;
95 if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
96 printKernelArguments(Loc, DeviceId, ArgNum, ArgSizes, ArgTypes, ArgNames,
97 RegionTypeMsg);
98 #ifdef OMPTARGET_DEBUG
99 for (int I = 0; I < ArgNum; ++I) {
100 DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
101 ", Type=0x%" PRIx64 ", Name=%s\n",
102 I, DPxPTR(ArgsBase[I]), DPxPTR(Args[I]), ArgSizes[I], ArgTypes[I],
103 (ArgNames) ? getNameFromMapping(ArgNames[I]).c_str() : "unknown");
105 #endif
107 DeviceTy &Device = *PM->Devices[DeviceId];
108 TargetAsyncInfoTy TargetAsyncInfo(Device);
109 AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
111 /// RAII to establish tool anchors before and after data begin / end / update
112 OMPT_IF_BUILT(assert((TargetDataFunction == targetDataBegin ||
113 TargetDataFunction == targetDataEnd ||
114 TargetDataFunction == targetDataUpdate) &&
115 "Encountered unexpected TargetDataFunction during "
116 "execution of targetData");
117 auto CallbackFunctions =
118 (TargetDataFunction == targetDataBegin)
119 ? RegionInterface.getCallbacks<ompt_target_enter_data>()
120 : (TargetDataFunction == targetDataEnd)
121 ? RegionInterface.getCallbacks<ompt_target_exit_data>()
122 : RegionInterface.getCallbacks<ompt_target_update>();
123 InterfaceRAII TargetDataRAII(CallbackFunctions, DeviceId,
124 OMPT_GET_RETURN_ADDRESS(0));)
126 int Rc = OFFLOAD_SUCCESS;
127 Rc = TargetDataFunction(Loc, Device, ArgNum, ArgsBase, Args, ArgSizes,
128 ArgTypes, ArgNames, ArgMappers, AsyncInfo,
129 false /* FromMapper */);
131 if (Rc == OFFLOAD_SUCCESS)
132 Rc = AsyncInfo.synchronize();
134 handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
137 /// creates host-to-target data mapping, stores it in the
138 /// libomptarget.so internal structure (an entry in a stack of data maps)
139 /// and passes the data to the device.
140 EXTERN void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
141 int32_t ArgNum, void **ArgsBase,
142 void **Args, int64_t *ArgSizes,
143 int64_t *ArgTypes,
144 map_var_info_t *ArgNames,
145 void **ArgMappers) {
147 targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
148 ArgTypes, ArgNames, ArgMappers, targetDataBegin,
149 "Entering OpenMP data region with being_mapper",
150 "begin");
153 EXTERN void __tgt_target_data_begin_nowait_mapper(
154 ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
155 void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
156 void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
157 void *NoAliasDepList) {
159 targetData<TaskAsyncInfoWrapperTy>(
160 Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
161 ArgMappers, targetDataBegin,
162 "Entering OpenMP data region with being_nowait_mapper", "begin");
165 /// passes data from the target, releases target memory and destroys
166 /// the host-target mapping (top entry from the stack of data maps)
167 /// created by the last __tgt_target_data_begin.
168 EXTERN void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
169 int32_t ArgNum, void **ArgsBase,
170 void **Args, int64_t *ArgSizes,
171 int64_t *ArgTypes,
172 map_var_info_t *ArgNames,
173 void **ArgMappers) {
175 targetData<AsyncInfoTy>(Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes,
176 ArgTypes, ArgNames, ArgMappers, targetDataEnd,
177 "Exiting OpenMP data region with end_mapper", "end");
180 EXTERN void __tgt_target_data_end_nowait_mapper(
181 ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
182 void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
183 void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
184 void *NoAliasDepList) {
186 targetData<TaskAsyncInfoWrapperTy>(
187 Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
188 ArgMappers, targetDataEnd,
189 "Exiting OpenMP data region with end_nowait_mapper", "end");
192 EXTERN void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
193 int32_t ArgNum, void **ArgsBase,
194 void **Args, int64_t *ArgSizes,
195 int64_t *ArgTypes,
196 map_var_info_t *ArgNames,
197 void **ArgMappers) {
199 targetData<AsyncInfoTy>(
200 Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
201 ArgMappers, targetDataUpdate,
202 "Updating data within the OpenMP data region with update_mapper",
203 "update");
206 EXTERN void __tgt_target_data_update_nowait_mapper(
207 ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
208 void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
209 void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
210 void *NoAliasDepList) {
211 targetData<TaskAsyncInfoWrapperTy>(
212 Loc, DeviceId, ArgNum, ArgsBase, Args, ArgSizes, ArgTypes, ArgNames,
213 ArgMappers, targetDataUpdate,
214 "Updating data within the OpenMP data region with update_nowait_mapper",
215 "update");
218 static KernelArgsTy *upgradeKernelArgs(KernelArgsTy *KernelArgs,
219 KernelArgsTy &LocalKernelArgs,
220 int32_t NumTeams, int32_t ThreadLimit) {
221 if (KernelArgs->Version > 2)
222 DP("Unexpected ABI version: %u\n", KernelArgs->Version);
224 if (KernelArgs->Version == 1) {
225 LocalKernelArgs.Version = 2;
226 LocalKernelArgs.NumArgs = KernelArgs->NumArgs;
227 LocalKernelArgs.ArgBasePtrs = KernelArgs->ArgBasePtrs;
228 LocalKernelArgs.ArgPtrs = KernelArgs->ArgPtrs;
229 LocalKernelArgs.ArgSizes = KernelArgs->ArgSizes;
230 LocalKernelArgs.ArgTypes = KernelArgs->ArgTypes;
231 LocalKernelArgs.ArgNames = KernelArgs->ArgNames;
232 LocalKernelArgs.ArgMappers = KernelArgs->ArgMappers;
233 LocalKernelArgs.Tripcount = KernelArgs->Tripcount;
234 LocalKernelArgs.Flags = KernelArgs->Flags;
235 LocalKernelArgs.DynCGroupMem = 0;
236 LocalKernelArgs.NumTeams[0] = NumTeams;
237 LocalKernelArgs.NumTeams[1] = 0;
238 LocalKernelArgs.NumTeams[2] = 0;
239 LocalKernelArgs.ThreadLimit[0] = ThreadLimit;
240 LocalKernelArgs.ThreadLimit[1] = 0;
241 LocalKernelArgs.ThreadLimit[2] = 0;
242 return &LocalKernelArgs;
245 return KernelArgs;
248 template <typename TargetAsyncInfoTy>
249 static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
250 int32_t ThreadLimit, void *HostPtr,
251 KernelArgsTy *KernelArgs) {
252 static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
253 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
255 TIMESCOPE_WITH_IDENT(Loc);
257 DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
258 "\n",
259 DeviceId, DPxPTR(HostPtr));
261 if (checkDeviceAndCtors(DeviceId, Loc)) {
262 DP("Not offloading to device %" PRId64 "\n", DeviceId);
263 return OMP_TGT_FAIL;
266 bool IsTeams = NumTeams != -1;
267 if (!IsTeams)
268 KernelArgs->NumTeams[0] = NumTeams = 1;
270 // Auto-upgrade kernel args version 1 to 2.
271 KernelArgsTy LocalKernelArgs;
272 KernelArgs =
273 upgradeKernelArgs(KernelArgs, LocalKernelArgs, NumTeams, ThreadLimit);
275 assert(KernelArgs->NumTeams[0] == static_cast<uint32_t>(NumTeams) &&
276 !KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
277 "OpenMP interface should not use multiple dimensions");
278 assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) &&
279 !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
280 "OpenMP interface should not use multiple dimensions");
282 if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS)
283 printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs,
284 KernelArgs->ArgSizes, KernelArgs->ArgTypes,
285 KernelArgs->ArgNames, "Entering OpenMP kernel");
286 #ifdef OMPTARGET_DEBUG
287 for (uint32_t I = 0; I < KernelArgs->NumArgs; ++I) {
288 DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
289 ", Type=0x%" PRIx64 ", Name=%s\n",
290 I, DPxPTR(KernelArgs->ArgBasePtrs[I]), DPxPTR(KernelArgs->ArgPtrs[I]),
291 KernelArgs->ArgSizes[I], KernelArgs->ArgTypes[I],
292 (KernelArgs->ArgNames)
293 ? getNameFromMapping(KernelArgs->ArgNames[I]).c_str()
294 : "unknown");
296 #endif
298 DeviceTy &Device = *PM->Devices[DeviceId];
299 TargetAsyncInfoTy TargetAsyncInfo(Device);
300 AsyncInfoTy &AsyncInfo = TargetAsyncInfo;
301 /// RAII to establish tool anchors before and after target region
302 OMPT_IF_BUILT(InterfaceRAII TargetRAII(
303 RegionInterface.getCallbacks<ompt_target>(), DeviceId,
304 /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
306 int Rc = OFFLOAD_SUCCESS;
307 Rc = target(Loc, Device, HostPtr, *KernelArgs, AsyncInfo);
309 if (Rc == OFFLOAD_SUCCESS)
310 Rc = AsyncInfo.synchronize();
312 handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
313 assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!");
315 return OMP_TGT_SUCCESS;
318 /// Implements a kernel entry that executes the target region on the specified
319 /// device.
321 /// \param Loc Source location associated with this target region.
322 /// \param DeviceId The device to execute this region, -1 indicated the default.
323 /// \param NumTeams Number of teams to launch the region with, -1 indicates a
324 /// non-teams region and 0 indicates it was unspecified.
325 /// \param ThreadLimit Limit to the number of threads to use in the kernel
326 /// launch, 0 indicates it was unspecified.
327 /// \param HostPtr The pointer to the host function registered with the kernel.
328 /// \param Args All arguments to this kernel launch (see struct definition).
329 EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
330 int32_t ThreadLimit, void *HostPtr,
331 KernelArgsTy *KernelArgs) {
332 if (KernelArgs->Flags.NoWait)
333 return targetKernel<TaskAsyncInfoWrapperTy>(
334 Loc, DeviceId, NumTeams, ThreadLimit, HostPtr, KernelArgs);
335 else
336 return targetKernel<AsyncInfoTy>(Loc, DeviceId, NumTeams, ThreadLimit,
337 HostPtr, KernelArgs);
340 /// Activates the record replay mechanism.
341 /// \param DeviceId The device identifier to execute the target region.
342 /// \param MemorySize The number of bytes to be (pre-)allocated
343 /// by the bump allocator
344 /// /param IsRecord Activates the record replay mechanism in
345 /// 'record' mode or 'replay' mode.
346 /// /param SaveOutput Store the device memory after kernel
347 /// execution on persistent storage
348 EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
349 void *VAddr, bool IsRecord,
350 bool SaveOutput) {
351 if (!deviceIsReady(DeviceId)) {
352 DP("Device %" PRId64 " is not ready\n", DeviceId);
353 return OMP_TGT_FAIL;
356 DeviceTy &Device = *PM->Devices[DeviceId];
357 [[maybe_unused]] int Rc =
358 target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput);
359 assert(Rc == OFFLOAD_SUCCESS &&
360 "__tgt_activate_record_replay unexpected failure!");
361 return OMP_TGT_SUCCESS;
364 /// Implements a target kernel entry that replays a pre-recorded kernel.
365 /// \param Loc Source location associated with this target region (unused).
366 /// \param DeviceId The device identifier to execute the target region.
367 /// \param HostPtr A pointer to an address that uniquely identifies the kernel.
368 /// \param DeviceMemory A pointer to an array storing device memory data to move
369 /// prior to kernel execution.
370 /// \param DeviceMemorySize The size of the above device memory data in bytes.
371 /// \param TgtArgs An array of pointers of the pre-recorded target kernel
372 /// arguments.
373 /// \param TgtOffsets An array of pointers of the pre-recorded target kernel
374 /// argument offsets.
375 /// \param NumArgs The number of kernel arguments.
376 /// \param NumTeams Number of teams to launch the target region with.
377 /// \param ThreadLimit Limit to the number of threads to use in kernel
378 /// execution.
379 /// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
380 /// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
381 EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
382 void *HostPtr, void *DeviceMemory,
383 int64_t DeviceMemorySize, void **TgtArgs,
384 ptrdiff_t *TgtOffsets, int32_t NumArgs,
385 int32_t NumTeams, int32_t ThreadLimit,
386 uint64_t LoopTripCount) {
388 if (checkDeviceAndCtors(DeviceId, Loc)) {
389 DP("Not offloading to device %" PRId64 "\n", DeviceId);
390 return OMP_TGT_FAIL;
392 DeviceTy &Device = *PM->Devices[DeviceId];
393 /// RAII to establish tool anchors before and after target region
394 OMPT_IF_BUILT(InterfaceRAII TargetRAII(
395 RegionInterface.getCallbacks<ompt_target>(), DeviceId,
396 /* CodePtr */ OMPT_GET_RETURN_ADDRESS(0));)
398 AsyncInfoTy AsyncInfo(Device);
399 int Rc = target_replay(Loc, Device, HostPtr, DeviceMemory, DeviceMemorySize,
400 TgtArgs, TgtOffsets, NumArgs, NumTeams, ThreadLimit,
401 LoopTripCount, AsyncInfo);
402 if (Rc == OFFLOAD_SUCCESS)
403 Rc = AsyncInfo.synchronize();
404 handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
405 assert(Rc == OFFLOAD_SUCCESS &&
406 "__tgt_target_kernel_replay unexpected failure!");
407 return OMP_TGT_SUCCESS;
410 // Get the current number of components for a user-defined mapper.
411 EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) {
412 TIMESCOPE();
413 auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
414 int64_t Size = MapperComponentsPtr->Components.size();
415 DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n",
416 DPxPTR(RtMapperHandle), Size);
417 return Size;
420 // Push back one component for a user-defined mapper.
421 EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base,
422 void *Begin, int64_t Size, int64_t Type,
423 void *Name) {
424 TIMESCOPE();
425 DP("__tgt_push_mapper_component(Handle=" DPxMOD
426 ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64
427 ", Type=0x%" PRIx64 ", Name=%s).\n",
428 DPxPTR(RtMapperHandle), DPxPTR(Base), DPxPTR(Begin), Size, Type,
429 (Name) ? getNameFromMapping(Name).c_str() : "unknown");
430 auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle;
431 MapperComponentsPtr->Components.push_back(
432 MapComponentInfoTy(Base, Begin, Size, Type, Name));
435 EXTERN void __tgt_set_info_flag(uint32_t NewInfoLevel) {
436 std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
437 InfoLevel.store(NewInfoLevel);
438 for (auto &R : PM->RTLs.AllRTLs) {
439 if (R.set_info_flag)
440 R.set_info_flag(NewInfoLevel);
444 EXTERN int __tgt_print_device_info(int64_t DeviceId) {
445 // Make sure the device is ready.
446 if (!deviceIsReady(DeviceId)) {
447 DP("Device %" PRId64 " is not ready\n", DeviceId);
448 return OMP_TGT_FAIL;
451 return PM->Devices[DeviceId]->printDeviceInfo(
452 PM->Devices[DeviceId]->RTLDeviceID);
455 EXTERN void __tgt_target_nowait_query(void **AsyncHandle) {
456 if (!AsyncHandle || !*AsyncHandle) {
457 FATAL_MESSAGE0(
458 1, "Receive an invalid async handle from the current OpenMP task. Is "
459 "this a target nowait region?\n");
462 // Exponential backoff tries to optimally decide if a thread should just query
463 // for the device operations (work/spin wait on them) or block until they are
464 // completed (use device side blocking mechanism). This allows the runtime to
465 // adapt itself when there are a lot of long-running target regions in-flight.
466 using namespace llvm::omp::target;
467 static thread_local ExponentialBackoff QueryCounter(
468 Int64Envar("OMPTARGET_QUERY_COUNT_MAX", 10),
469 Int64Envar("OMPTARGET_QUERY_COUNT_THRESHOLD", 5),
470 Envar<float>("OMPTARGET_QUERY_COUNT_BACKOFF_FACTOR", 0.5f));
472 auto *AsyncInfo = (AsyncInfoTy *)*AsyncHandle;
474 // If the thread is actively waiting on too many target nowait regions, we
475 // should use the blocking sync type.
476 if (QueryCounter.isAboveThreshold())
477 AsyncInfo->SyncType = AsyncInfoTy::SyncTy::BLOCKING;
479 if (const int Rc = AsyncInfo->synchronize())
480 FATAL_MESSAGE0(1, "Error while querying the async queue for completion.\n");
481 // If there are device operations still pending, return immediately without
482 // deallocating the handle and increase the current thread query count.
483 if (!AsyncInfo->isDone()) {
484 QueryCounter.increment();
485 return;
488 // When a thread successfully completes a target nowait region, we
489 // exponentially backoff its query counter by the query factor.
490 QueryCounter.decrement();
492 // Delete the handle and unset it from the OpenMP task data.
493 delete AsyncInfo;
494 *AsyncHandle = nullptr;