1 //===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // Interface to be used by Clang during the codegen of a
12 //===----------------------------------------------------------------------===//
17 #include "Shared/APITypes.h"
18 #include "Shared/Environment.h"
19 #include "Shared/SourceInfo.h"
21 #include "OpenMP/InternalTypes.h"
27 #include <type_traits>
29 #include "llvm/ADT/SmallVector.h"
/// Status codes reported by offloading operations: zero on success, all bits
/// set on failure.
#define OFFLOAD_SUCCESS (0)
#define OFFLOAD_FAIL (~0)

/// Device number that, going by the macro's name, selects the default device.
/// The replacement list is parenthesized like the two macros above so the
/// negative literal cannot combine with neighboring tokens where it expands.
#define OFFLOAD_DEVICE_DEFAULT (-1)
36 // Don't format out enums and structs.
39 /// return flags of __tgt_target_XXX public APIs
40 enum __tgt_target_return_t
: int {
41 /// successful offload executed on a target device
43 /// offload may not execute on the requested target device
44 /// this scenario can be caused by the device not available or unsupported
45 /// as described in the Execution Model in the specification
46 /// this status may not be used for target device execution failure
47 /// which should be handled internally in libomptarget
51 /// Data attributes for each data reference used in an OpenMP target region.
54 OMP_TGT_MAPTYPE_NONE
= 0x000,
55 // copy data from host to device
56 OMP_TGT_MAPTYPE_TO
= 0x001,
57 // copy data from device to host
58 OMP_TGT_MAPTYPE_FROM
= 0x002,
59 // copy regardless of the reference count
60 OMP_TGT_MAPTYPE_ALWAYS
= 0x004,
61 // force unmapping of data
62 OMP_TGT_MAPTYPE_DELETE
= 0x008,
63 // map the pointer as well as the pointee
64 OMP_TGT_MAPTYPE_PTR_AND_OBJ
= 0x010,
65 // pass device base address to kernel
66 OMP_TGT_MAPTYPE_TARGET_PARAM
= 0x020,
67 // return base device address of mapped data
68 OMP_TGT_MAPTYPE_RETURN_PARAM
= 0x040,
69 // private variable - not mapped
70 OMP_TGT_MAPTYPE_PRIVATE
= 0x080,
71 // copy by value - not mapped
72 OMP_TGT_MAPTYPE_LITERAL
= 0x100,
73 // mapping is implicit
74 OMP_TGT_MAPTYPE_IMPLICIT
= 0x200,
75 // allocate the data close to the target device ('close' map-type modifier)
76 OMP_TGT_MAPTYPE_CLOSE
= 0x400,
77 // runtime error if not already allocated
78 OMP_TGT_MAPTYPE_PRESENT
= 0x1000,
79 // use a separate reference counter so that the data cannot be unmapped within
80 // the structured region
81 // This is an OpenMP extension for the sake of OpenACC support.
82 OMP_TGT_MAPTYPE_OMPX_HOLD
= 0x2000,
83 // descriptor for non-contiguous target-update
84 OMP_TGT_MAPTYPE_NON_CONTIG
= 0x100000000000,
85 // member of struct, member given by [16 MSBs] - 1
86 OMP_TGT_MAPTYPE_MEMBER_OF
= 0xffff000000000000
89 /// Flags for offload entries.
90 enum OpenMPOffloadingDeclareTargetFlags
{
91 /// Mark the entry global as having a 'link' attribute.
92 OMP_DECLARE_TARGET_LINK
= 0x01,
93 /// Mark the entry global as being an indirectly callable function.
94 OMP_DECLARE_TARGET_INDIRECT
= 0x08,
95 /// This is an entry corresponding to a requirement to be registered.
96 OMP_REGISTER_REQUIRES
= 0x10,
99 enum TargetAllocTy
: int32_t {
100 TARGET_ALLOC_DEVICE
= 0,
103 TARGET_ALLOC_DEFAULT
,
104 /// The allocation will not block on other streams.
105 TARGET_ALLOC_DEVICE_NON_BLOCKING
,
108 inline KernelArgsTy CTorDTorKernelArgs
= {1, 0, nullptr, nullptr,
109 nullptr, nullptr, nullptr, nullptr,
110 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
114 /// The libomptarget wrapper around a __tgt_async_info object directly
115 /// associated with a libomptarget layer device. RAII semantics to avoid
/// Selects whether synchronization is carried out in a blocking or a
/// non-blocking manner.
enum class SyncTy {
  BLOCKING,
  NON_BLOCKING,
};
122 /// Locations we used in (potentially) asynchronous calls which should live
123 /// as long as this AsyncInfoTy object.
124 std::deque
<void *> BufferLocations
;
126 /// Post-processing operations executed after a successful synchronization.
127 /// \note the post-processing function should return OFFLOAD_SUCCESS or
128 /// OFFLOAD_FAIL appropriately.
129 using PostProcFuncTy
= std::function
<int()>;
130 llvm::SmallVector
<PostProcFuncTy
> PostProcessingFunctions
;
132 __tgt_async_info AsyncInfo
;
136 /// Synchronization method to be used.
139 AsyncInfoTy(DeviceTy
&Device
, SyncTy SyncType
= SyncTy::BLOCKING
)
140 : Device(Device
), SyncType(SyncType
) {}
141 ~AsyncInfoTy() { synchronize(); }
143 /// Implicit conversion to the __tgt_async_info which is used in the
144 /// plugin interface.
145 operator __tgt_async_info
*() { return &AsyncInfo
; }
147 /// Synchronize all pending actions.
149 /// \note synchronization will be performed in a blocking or non-blocking
150 /// manner, depending on the SyncType.
152 /// \note if the operations are completed, the registered post-processing
153 /// functions will be executed once and unregistered afterwards.
155 /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
/// Hands out a reference to a void* slot that lives at least as long as this
/// AsyncInfoTy object. The slot may serve as an intermediate buffer.
///
/// \returns a reference to the slot.
void *&getVoidPtrLocation();
162 /// Check if all asynchronous operations are completed.
164 /// \note only a lightweight check. If needed, use synchronize() to query the
165 /// status of AsyncInfo before checking.
167 /// \returns true if there are no pending asynchronous operations, false
171 /// Add a new post-processing function to be executed after synchronization.
173 /// \param[in] Function is a templated function (e.g., function pointers,
174 /// lambdas, std::function) that can be convertible to a PostProcFuncTy (i.e.,
175 /// it must have int() as its function signature).
176 template <typename FuncTy
> void addPostProcessingFunction(FuncTy
&&Function
) {
177 static_assert(std::is_convertible_v
<FuncTy
, PostProcFuncTy
>,
178 "Invalid post-processing function type. Please check "
179 "function signature!");
180 PostProcessingFunctions
.emplace_back(Function
);
/// Executes every registered post-processing function, one after another.
///
/// \note once they have all run successfully, the previously registered
/// functions are unregistered.
///
/// \returns OFFLOAD_FAIL if any post-processing function failed,
/// OFFLOAD_SUCCESS otherwise.
int32_t runPostProcessing();
193 /// Check if the internal asynchronous info queue is empty or not.
195 /// \returns true if empty, false otherwise.
196 bool isQueueEmpty() const;
199 // Wrapper for task stored async info objects.
200 class TaskAsyncInfoWrapperTy
{
201 // Invalid GTID as defined by libomp; keep in sync
202 static constexpr int KMP_GTID_DNE
= -2;
204 const int ExecThreadID
= KMP_GTID_DNE
;
205 AsyncInfoTy LocalAsyncInfo
;
206 AsyncInfoTy
*AsyncInfo
= &LocalAsyncInfo
;
207 void **TaskAsyncInfoPtr
= nullptr;
210 TaskAsyncInfoWrapperTy(DeviceTy
&Device
)
211 : ExecThreadID(__kmpc_global_thread_num(NULL
)), LocalAsyncInfo(Device
) {
212 // If we failed to acquire the current global thread id, we cannot
213 // re-enqueue the current task. Thus we should use the local blocking async
215 if (ExecThreadID
== KMP_GTID_DNE
)
218 // Only tasks with an assigned task team can be re-enqueued and thus can
219 // use the non-blocking synchronization scheme. Thus we should use the local
220 // blocking async info, if we don't have one.
221 if (!__kmpc_omp_has_task_team(ExecThreadID
))
224 // Acquire a pointer to the AsyncInfo stored inside the current task being
226 TaskAsyncInfoPtr
= __kmpc_omp_get_target_async_handle_ptr(ExecThreadID
);
228 // If we cannot acquire such pointer, fallback to using the local blocking
230 if (!TaskAsyncInfoPtr
)
233 // When creating a new task async info, the task handle must always be
234 // invalid. We must never overwrite any task async handle and there should
235 // never be any valid handle stored inside the task at this point.
236 assert((*TaskAsyncInfoPtr
) == nullptr &&
237 "Task async handle is not empty when dispatching new device "
238 "operations. The handle was not cleared properly or "
239 "__tgt_target_nowait_query should have been called!");
241 // If no valid async handle is present, a new AsyncInfo will be allocated
242 // and stored in the current task.
243 AsyncInfo
= new AsyncInfoTy(Device
, AsyncInfoTy::SyncTy::NON_BLOCKING
);
244 *TaskAsyncInfoPtr
= (void *)AsyncInfo
;
247 ~TaskAsyncInfoWrapperTy() {
248 // Local async info destruction is automatically handled by ~AsyncInfoTy.
249 if (AsyncInfo
== &LocalAsyncInfo
)
252 // If there are device operations still pending, return immediately without
253 // deallocating the handle.
254 if (!AsyncInfo
->isDone())
257 // Delete the handle and unset it from the OpenMP task data.
259 *TaskAsyncInfoPtr
= nullptr;
262 operator AsyncInfoTy
&() { return *AsyncInfo
; }
265 /// This struct is a record of non-contiguous information
266 struct __tgt_target_non_contig
{
/// NOTE(review): presumably prints the runtime's mapping tables, judging by
/// the name -- confirm against the definition.
void ompx_dump_mapping_tables(void);

/// Device query entry points. Their names match the OpenMP API runtime
/// library routines; the contracts are defined by the OpenMP specification.
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
/// Device memory entry points. Names and signatures match the OpenMP API
/// device memory routines; the contracts are defined by the OpenMP
/// specification.
void *omp_target_alloc(size_t Size,
                       int DeviceNum);

void omp_target_free(void *DevicePtr,
                     int DeviceNum);

int omp_target_is_present(const void *Ptr,
                          int DeviceNum);
283 int omp_target_memcpy(void *Dst
, const void *Src
, size_t Length
,
284 size_t DstOffset
, size_t SrcOffset
, int DstDevice
,
286 int omp_target_memcpy_rect(void *Dst
, const void *Src
, size_t ElementSize
,
287 int NumDims
, const size_t *Volume
,
288 const size_t *DstOffsets
, const size_t *SrcOffsets
,
289 const size_t *DstDimensions
,
290 const size_t *SrcDimensions
, int DstDevice
,
/// Memory-fill and host/device pointer association entry points. Names and
/// signatures match the OpenMP API device memory routines; the contracts are
/// defined by the OpenMP specification.
void *omp_target_memset(void *Ptr,
                        int C,
                        size_t N,
                        int DeviceNum);

int omp_target_associate_ptr(const void *HostPtr,
                             const void *DevicePtr,
                             size_t Size,
                             size_t DeviceOffset,
                             int DeviceNum);

int omp_target_disassociate_ptr(const void *HostPtr,
                                int DeviceNum);
/// Explicit target memory allocators, one per kind of memory named in the
/// suffix (device, host, shared). They carry the llvm_ prefix until they
/// become part of the OpenMP standard.
void *llvm_omp_target_alloc_device(size_t Size,
                                   int DeviceNum);

void *llvm_omp_target_alloc_host(size_t Size,
                                 int DeviceNum);

void *llvm_omp_target_alloc_shared(size_t Size,
                                   int DeviceNum);
/// Explicit target memory deallocators, one per kind of memory named in the
/// suffix (device, host, shared). They carry the llvm_ prefix until they
/// become part of the OpenMP standard.
void llvm_omp_target_free_device(void *DevicePtr,
                                 int DeviceNum);

void llvm_omp_target_free_host(void *DevicePtr,
                               int DeviceNum);

void llvm_omp_target_free_shared(void *DevicePtr,
                                 int DeviceNum);
/// Placeholder entry point: it exists so that a symbol is available when the
/// host fallback is generated.
void *llvm_omp_target_dynamic_shared_alloc();
/// Adds the clauses of the `requires` directives found in a given file.
void __tgt_register_requires(int64_t Flags);

/// Initializes the runtime library.
void __tgt_rtl_init();

/// Deinitializes the runtime library.
void __tgt_rtl_deinit();
321 /// adds a target shared library to the target execution image
322 void __tgt_register_lib(__tgt_bin_desc
*Desc
);
324 /// Initialize all RTLs at once
325 void __tgt_init_all_rtls();
327 /// removes a target shared library from the target execution image
328 void __tgt_unregister_lib(__tgt_bin_desc
*Desc
);
/// Creates the host-to-target data mapping, stores it in the libomptarget.so
/// internal structure (an entry in a stack of data maps) and passes the data
/// to the device.
void __tgt_target_data_begin(int64_t DeviceId,
                             int32_t ArgNum,
                             void **ArgsBase,
                             void **Args,
                             int64_t *ArgSizes,
                             int64_t *ArgTypes);

/// `nowait` flavor of __tgt_target_data_begin. It takes the same mapping
/// arguments plus two lists (DepList, NoAliasDepList) with their element
/// counts.
void __tgt_target_data_begin_nowait(int64_t DeviceId,
                                    int32_t ArgNum,
                                    void **ArgsBase,
                                    void **Args,
                                    int64_t *ArgSizes,
                                    int64_t *ArgTypes,
                                    int32_t DepNum,
                                    void *DepList,
                                    int32_t NoAliasDepNum,
                                    void *NoAliasDepList);
341 void __tgt_target_data_begin_mapper(ident_t
*Loc
, int64_t DeviceId
,
342 int32_t ArgNum
, void **ArgsBase
,
343 void **Args
, int64_t *ArgSizes
,
344 int64_t *ArgTypes
, map_var_info_t
*ArgNames
,
346 void __tgt_target_data_begin_nowait_mapper(
347 ident_t
*Loc
, int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
348 void **Args
, int64_t *ArgSizes
, int64_t *ArgTypes
, map_var_info_t
*ArgNames
,
349 void **ArgMappers
, int32_t DepNum
, void *DepList
, int32_t NoAliasDepNum
,
350 void *NoAliasDepList
);
/// Passes data back from the target, releases target memory and destroys the
/// host-target mapping (the top entry of the stack of data maps) that the
/// last __tgt_target_data_begin created.
void __tgt_target_data_end(int64_t DeviceId,
                           int32_t ArgNum,
                           void **ArgsBase,
                           void **Args,
                           int64_t *ArgSizes,
                           int64_t *ArgTypes);

/// `nowait` flavor of __tgt_target_data_end. It takes the same mapping
/// arguments plus two lists (DepList, NoAliasDepList) with their element
/// counts.
void __tgt_target_data_end_nowait(int64_t DeviceId,
                                  int32_t ArgNum,
                                  void **ArgsBase,
                                  void **Args,
                                  int64_t *ArgSizes,
                                  int64_t *ArgTypes,
                                  int32_t DepNum,
                                  void *DepList,
                                  int32_t NoAliasDepNum,
                                  void *NoAliasDepList);
362 void __tgt_target_data_end_mapper(ident_t
*Loc
, int64_t DeviceId
,
363 int32_t ArgNum
, void **ArgsBase
, void **Args
,
364 int64_t *ArgSizes
, int64_t *ArgTypes
,
365 map_var_info_t
*ArgNames
, void **ArgMappers
);
366 void __tgt_target_data_end_nowait_mapper(
367 ident_t
*Loc
, int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
368 void **Args
, int64_t *ArgSizes
, int64_t *ArgTypes
, map_var_info_t
*ArgNames
,
369 void **ArgMappers
, int32_t depNum
, void *depList
, int32_t NoAliasDepNum
,
370 void *NoAliasDepList
);
372 /// passes data to/from the target
373 void __tgt_target_data_update(int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
374 void **Args
, int64_t *ArgSizes
,
/// `nowait` flavor of the target data update entry point, which passes data
/// to/from the target. It takes the mapping arguments plus two lists
/// (DepList, NoAliasDepList) with their element counts.
void __tgt_target_data_update_nowait(int64_t DeviceId,
                                     int32_t ArgNum,
                                     void **ArgsBase,
                                     void **Args,
                                     int64_t *ArgSizes,
                                     int64_t *ArgTypes,
                                     int32_t DepNum,
                                     void *DepList,
                                     int32_t NoAliasDepNum,
                                     void *NoAliasDepList);
382 void __tgt_target_data_update_mapper(ident_t
*Loc
, int64_t DeviceId
,
383 int32_t ArgNum
, void **ArgsBase
,
384 void **Args
, int64_t *ArgSizes
,
386 map_var_info_t
*ArgNames
,
388 void __tgt_target_data_update_nowait_mapper(
389 ident_t
*Loc
, int64_t DeviceId
, int32_t ArgNum
, void **ArgsBase
,
390 void **Args
, int64_t *ArgSizes
, int64_t *ArgTypes
, map_var_info_t
*ArgNames
,
391 void **ArgMappers
, int32_t DepNum
, void *DepList
, int32_t NoAliasDepNum
,
392 void *NoAliasDepList
);
394 // Performs the same actions as data_begin in case ArgNum is non-zero
395 // and initiates run of offloaded region on target platform; if ArgNum
396 // is non-zero after the region execution is done it also performs the
397 // same action as data_end above. The following types are used; this
398 // function returns 0 if it was able to transfer the execution to a
399 // target and an int different from zero otherwise.
400 int __tgt_target_kernel(ident_t
*Loc
, int64_t DeviceId
, int32_t NumTeams
,
401 int32_t ThreadLimit
, void *HostPtr
, KernelArgsTy
*Args
);
/// Non-blocking synchronization for target nowait regions.
///
/// The asynchronous context is taken from the task data of the task that is
/// currently executing and queried for completion of its operations. While
/// operations are still pending the function returns immediately. Once they
/// have completed, every post-processing procedure stored in the asynchronous
/// context is executed and the context is removed from the task.
void __tgt_target_nowait_query(void **AsyncHandle);
412 /// Executes a target kernel by replaying recorded kernel arguments and
414 int __tgt_target_kernel_replay(ident_t
*Loc
, int64_t DeviceId
, void *HostPtr
,
415 void *DeviceMemory
, int64_t DeviceMemorySize
,
416 void **TgtArgs
, ptrdiff_t *TgtOffsets
,
417 int32_t NumArgs
, int32_t NumTeams
,
418 int32_t ThreadLimit
, uint64_t LoopTripCount
);
/// Sets the runtime's info flag.
/// NOTE(review): the argument is presumably the new information level or
/// bitmask -- confirm against the definition. The parameter is named so this
/// prototype documents itself like the other declarations in this header.
void __tgt_set_info_flag(uint32_t NewInfoLevel);
/// NOTE(review): presumably prints information about device \p DeviceId,
/// judging by the name; the meaning of the int result is not visible here --
/// confirm against the definition.
int __tgt_print_device_info(
    int64_t DeviceId);
/// NOTE(review): presumably switches on kernel record/replay for device
/// \p DeviceId, judging by the name -- confirm against the definition.
/// \p ReqPtrArgOffset is taken by non-const reference, so the callee is able
/// to write a value back through it.
int __tgt_activate_record_replay(int64_t DeviceId,
                                 uint64_t MemorySize,
                                 void *VAddr,
                                 bool IsRecord,
                                 bool SaveOutput,
                                 uint64_t &ReqPtrArgOffset);
433 #define EXTERN extern "C"
435 #define EXTERN extern
438 #endif // _OMPTARGET_H_