//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#ifndef _OMPTARGET_H_
#define _OMPTARGET_H_

#include "Shared/APITypes.h"
#include "Shared/Environment.h"
#include "Shared/SourceInfo.h"

#include "OpenMP/InternalTypes.h"

#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <type_traits>

#include "llvm/ADT/SmallVector.h"

#define OFFLOAD_SUCCESS (0)
#define OFFLOAD_FAIL (~0)

#define OFFLOAD_DEVICE_DEFAULT -1

// Don't format out enums and structs.
// clang-format off

/// Return flags of the __tgt_target_XXX public APIs.
enum __tgt_target_return_t : int {
  /// successful offload executed on a target device
  OMP_TGT_SUCCESS = 0,
  /// offload may not execute on the requested target device
  /// this scenario can be caused by the device not being available or
  /// unsupported as described in the Execution Model in the specification
  /// this status may not be used for target device execution failure
  /// which should be handled internally in libomptarget
  OMP_TGT_FAIL = ~0
};

/// Data attributes for each data reference used in an OpenMP target region.
enum tgt_map_type {
  // No flags
  OMP_TGT_MAPTYPE_NONE = 0x000,
  // copy data from host to device
  OMP_TGT_MAPTYPE_TO = 0x001,
  // copy data from device to host
  OMP_TGT_MAPTYPE_FROM = 0x002,
  // copy regardless of the reference count
  OMP_TGT_MAPTYPE_ALWAYS = 0x004,
  // force unmapping of data
  OMP_TGT_MAPTYPE_DELETE = 0x008,
  // map the pointer as well as the pointee
  OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010,
  // pass device base address to kernel
  OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020,
  // return base device address of mapped data
  OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040,
  // private variable - not mapped
  OMP_TGT_MAPTYPE_PRIVATE = 0x080,
  // copy by value - not mapped
  OMP_TGT_MAPTYPE_LITERAL = 0x100,
  // mapping is implicit
  OMP_TGT_MAPTYPE_IMPLICIT = 0x200,
  // hint to allocate the data close to the target device ('close' modifier)
  OMP_TGT_MAPTYPE_CLOSE = 0x400,
  // runtime error if not already allocated
  OMP_TGT_MAPTYPE_PRESENT = 0x1000,
  // use a separate reference counter so that the data cannot be unmapped within
  // the structured region
  // This is an OpenMP extension for the sake of OpenACC support.
  OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000,
  // descriptor for non-contiguous target-update
  OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000,
  // member of struct, member given by [16 MSBs] - 1
  OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000
};

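// Example (illustrative, not part of this interface): map-type bits are ORed
// together per mapped argument. A `map(tofrom: ...)` entry that is also passed
// to the kernel as a parameter would typically combine the flags below; the
// exact combination the compiler emits depends on the map clause and context.
//
//   int64_t MapType = OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM |
//                     OMP_TGT_MAPTYPE_TARGET_PARAM;
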
/// Flags for offload entries.
enum OpenMPOffloadingDeclareTargetFlags {
  /// Mark the entry global as having a 'link' attribute.
  OMP_DECLARE_TARGET_LINK = 0x01,
  /// Mark the entry global as being an indirectly callable function.
  OMP_DECLARE_TARGET_INDIRECT = 0x08,
  /// This is an entry corresponding to a requirement to be registered.
  OMP_REGISTER_REQUIRES = 0x10,
};

enum TargetAllocTy : int32_t {
  TARGET_ALLOC_DEVICE = 0,
  TARGET_ALLOC_HOST,
  TARGET_ALLOC_SHARED,
  TARGET_ALLOC_DEFAULT,
  /// The allocation will not block on other streams.
  TARGET_ALLOC_DEVICE_NON_BLOCKING,
};

inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr, nullptr,
                                          nullptr, nullptr, nullptr, nullptr,
                                          0, {0, 0, 0}, {1, 0, 0}, {1, 0, 0}, 0};

struct DeviceTy;

/// The libomptarget wrapper around a __tgt_async_info object directly
/// associated with a libomptarget layer device. RAII semantics to avoid
/// mistakes.
class AsyncInfoTy {
public:
  enum class SyncTy { BLOCKING, NON_BLOCKING };

private:
  /// Locations we used in (potentially) asynchronous calls which should live
  /// as long as this AsyncInfoTy object.
  std::deque<void *> BufferLocations;

  /// Post-processing operations executed after a successful synchronization.
  /// \note the post-processing function should return OFFLOAD_SUCCESS or
  /// OFFLOAD_FAIL appropriately.
  using PostProcFuncTy = std::function<int()>;
  llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;

  __tgt_async_info AsyncInfo;
  DeviceTy &Device;

public:
  /// Synchronization method to be used.
  SyncTy SyncType;

  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
      : Device(Device), SyncType(SyncType) {}
  ~AsyncInfoTy() { synchronize(); }

  /// Implicit conversion to the __tgt_async_info which is used in the
  /// plugin interface.
  operator __tgt_async_info *() { return &AsyncInfo; }

  /// Synchronize all pending actions.
  ///
  /// \note synchronization will be performed in a blocking or non-blocking
  /// manner, depending on the SyncType.
  ///
  /// \note if the operations are completed, the registered post-processing
  /// functions will be executed once and unregistered afterwards.
  ///
  /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
  int synchronize();

  /// Return a void* reference with a lifetime that is at least as long as this
  /// AsyncInfoTy object. The location can be used as an intermediate buffer.
  void *&getVoidPtrLocation();

  /// Check if all asynchronous operations are completed.
  ///
  /// \note only a lightweight check. If needed, use synchronize() to query the
  /// status of AsyncInfo before checking.
  ///
  /// \returns true if there are no pending asynchronous operations, false
  /// otherwise.
  bool isDone() const;

  /// Add a new post-processing function to be executed after synchronization.
  ///
  /// \param[in] Function is a templated function (e.g., function pointers,
  /// lambdas, std::function) that is convertible to a PostProcFuncTy (i.e.,
  /// it must have int() as its function signature).
  template <typename FuncTy> void addPostProcessingFunction(FuncTy &&Function) {
    static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
                  "Invalid post-processing function type. Please check "
                  "function signature!");
    PostProcessingFunctions.emplace_back(Function);
  }

private:
  /// Run all the post-processing functions sequentially.
  ///
  /// \note after a successful execution, all previously registered functions
  /// are unregistered.
  ///
  /// \returns OFFLOAD_FAIL if any post-processing function failed,
  /// OFFLOAD_SUCCESS otherwise.
  int32_t runPostProcessing();

  /// Check if the internal asynchronous info queue is empty or not.
  ///
  /// \returns true if empty, false otherwise.
  bool isQueueEmpty() const;
};

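// Example (illustrative sketch; assumes a valid `DeviceTy &Device` is already
// in hand inside libomptarget): typical RAII usage with a post-processing
// callback.
//
//   {
//     AsyncInfoTy AsyncInfo(Device, AsyncInfoTy::SyncTy::BLOCKING);
//     // ... issue plugin operations via the implicit __tgt_async_info * ...
//     AsyncInfo.addPostProcessingFunction([]() { return OFFLOAD_SUCCESS; });
//     if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) {
//       // handle the failure
//     }
//   } // ~AsyncInfoTy() synchronizes any work still pending.
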
// Wrapper for task stored async info objects.
class TaskAsyncInfoWrapperTy {
  // Invalid GTID as defined by libomp; keep in sync.
  static constexpr int KMP_GTID_DNE = -2;

  const int ExecThreadID = KMP_GTID_DNE;
  AsyncInfoTy LocalAsyncInfo;
  AsyncInfoTy *AsyncInfo = &LocalAsyncInfo;
  void **TaskAsyncInfoPtr = nullptr;

public:
  TaskAsyncInfoWrapperTy(DeviceTy &Device)
      : ExecThreadID(__kmpc_global_thread_num(NULL)), LocalAsyncInfo(Device) {
    // If we failed to acquire the current global thread id, we cannot
    // re-enqueue the current task. Thus we should use the local blocking async
    // info.
    if (ExecThreadID == KMP_GTID_DNE)
      return;

    // Only tasks with an assigned task team can be re-enqueued and thus can
    // use the non-blocking synchronization scheme. Thus we should use the
    // local blocking async info if we don't have one.
    if (!__kmpc_omp_has_task_team(ExecThreadID))
      return;

    // Acquire a pointer to the AsyncInfo stored inside the current task being
    // executed.
    TaskAsyncInfoPtr = __kmpc_omp_get_target_async_handle_ptr(ExecThreadID);

    // If we cannot acquire such a pointer, fall back to using the local
    // blocking async info.
    if (!TaskAsyncInfoPtr)
      return;

    // When creating a new task async info, the task handle must always be
    // invalid. We must never overwrite any task async handle and there should
    // never be any valid handle stored inside the task at this point.
    assert((*TaskAsyncInfoPtr) == nullptr &&
           "Task async handle is not empty when dispatching new device "
           "operations. The handle was not cleared properly or "
           "__tgt_target_nowait_query should have been called!");

    // If no valid async handle is present, a new AsyncInfo will be allocated
    // and stored in the current task.
    AsyncInfo = new AsyncInfoTy(Device, AsyncInfoTy::SyncTy::NON_BLOCKING);
    *TaskAsyncInfoPtr = (void *)AsyncInfo;
  }

  ~TaskAsyncInfoWrapperTy() {
    // Local async info destruction is automatically handled by ~AsyncInfoTy.
    if (AsyncInfo == &LocalAsyncInfo)
      return;

    // If there are device operations still pending, return immediately without
    // deallocating the handle.
    if (!AsyncInfo->isDone())
      return;

    // Delete the handle and unset it from the OpenMP task data.
    delete AsyncInfo;
    *TaskAsyncInfoPtr = nullptr;
  }

  operator AsyncInfoTy &() { return *AsyncInfo; }
};

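// Example (illustrative sketch; assumes it runs inside a libomptarget entry
// point that has resolved `DeviceTy &Device`): the wrapper prefers the
// task-stored, non-blocking async info when available and otherwise falls back
// to the local blocking one; the implicit conversion hands it to code that
// expects an AsyncInfoTy &.
//
//   TaskAsyncInfoWrapperTy TaskAsyncInfo(Device);
//   AsyncInfoTy &AsyncInfo = TaskAsyncInfo;
//   // ... dispatch data movement and kernels using AsyncInfo ...
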
/// This struct is a record of non-contiguous information.
struct __tgt_target_non_contig {
  uint64_t Offset;
  uint64_t Count;
  uint64_t Stride;
};

#ifdef __cplusplus
extern "C" {
#endif

void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                      size_t DstOffset, size_t SrcOffset, int DstDevice,
                      int SrcDevice);
int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                           int NumDims, const size_t *Volume,
                           const size_t *DstOffsets, const size_t *SrcOffsets,
                           const size_t *DstDimensions,
                           const size_t *SrcDimensions, int DstDevice,
                           int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                             size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);

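// Example (illustrative sketch; device number 0 and the buffer size are
// assumptions): allocate device memory and copy a host buffer to it with the
// standard OpenMP device memory routines declared above.
//
//   double HostBuf[256] = {0};
//   int Dev = 0;
//   void *DevPtr = omp_target_alloc(sizeof(HostBuf), Dev);
//   if (DevPtr) {
//     omp_target_memcpy(DevPtr, HostBuf, sizeof(HostBuf), /*DstOffset=*/0,
//                       /*SrcOffset=*/0, Dev, omp_get_initial_device());
//     omp_target_free(DevPtr, Dev);
//   }
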
/// Explicit target memory allocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);

/// Explicit target memory deallocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);

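// Example (illustrative sketch; device number 0 and the sizes are
// assumptions): the explicit allocators select the memory kind directly, and
// each pointer must be released with the matching free routine.
//
//   void *PinnedBuf = llvm_omp_target_alloc_host(4096, /*DeviceNum=*/0);
//   void *DeviceBuf = llvm_omp_target_alloc_device(4096, /*DeviceNum=*/0);
//   // ... use the buffers ...
//   llvm_omp_target_free_host(PinnedBuf, /*DeviceNum=*/0);
//   llvm_omp_target_free_device(DeviceBuf, /*DeviceNum=*/0);
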
/// Dummy target so we have a symbol for generating host fallback.
void *llvm_omp_target_dynamic_shared_alloc();

/// Adds the clauses of the requires directives in a given file.
void __tgt_register_requires(int64_t Flags);

/// Initializes the runtime library.
void __tgt_rtl_init();

/// Deinitializes the runtime library.
void __tgt_rtl_deinit();

/// Adds a target shared library to the target execution image.
void __tgt_register_lib(__tgt_bin_desc *Desc);

/// Initializes all RTLs at once.
void __tgt_init_all_rtls();

/// Removes a target shared library from the target execution image.
void __tgt_unregister_lib(__tgt_bin_desc *Desc);

// Creates the host-to-target data mapping, stores it in the
// libomptarget.so internal structure (an entry in a stack of data maps) and
// passes the data to the device.
void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                             void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum,
                                    void **ArgsBase, void **Args,
                                    int64_t *ArgSizes, int64_t *ArgTypes,
                                    int32_t DepNum, void *DepList,
                                    int32_t NoAliasDepNum,
                                    void *NoAliasDepList);
void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
                                    int32_t ArgNum, void **ArgsBase,
                                    void **Args, int64_t *ArgSizes,
                                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                                    void **ArgMappers);
void __tgt_target_data_begin_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

// Passes data back from the target, releases target memory, and destroys the
// host-target mapping (top entry from the stack of data maps) created by
// the last __tgt_target_data_begin.
void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                           void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum,
                                  void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  int32_t DepNum, void *DepList,
                                  int32_t NoAliasDepNum, void *NoAliasDepList);
void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
                                  int32_t ArgNum, void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  map_var_info_t *ArgNames, void **ArgMappers);
void __tgt_target_data_end_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

/// Passes data to/from the target.
void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                              void **Args, int64_t *ArgSizes,
                              int64_t *ArgTypes);
void __tgt_target_data_update_nowait(int64_t DeviceId, int32_t ArgNum,
                                     void **ArgsBase, void **Args,
                                     int64_t *ArgSizes, int64_t *ArgTypes,
                                     int32_t DepNum, void *DepList,
                                     int32_t NoAliasDepNum,
                                     void *NoAliasDepList);
void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
                                     int32_t ArgNum, void **ArgsBase,
                                     void **Args, int64_t *ArgSizes,
                                     int64_t *ArgTypes,
                                     map_var_info_t *ArgNames,
                                     void **ArgMappers);
void __tgt_target_data_update_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

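// Example (illustrative, hand-written sketch of the begin/update/end pairing
// for a single `tofrom` scalar on the default device; compilers normally emit
// the *_mapper variants with an ident_t location instead):
//
//   double X = 0.0;
//   void *ArgsBase[] = {&X};
//   void *Args[] = {&X};
//   int64_t ArgSizes[] = {sizeof(X)};
//   int64_t ArgTypes[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM};
//
//   __tgt_target_data_begin(OFFLOAD_DEVICE_DEFAULT, 1, ArgsBase, Args,
//                           ArgSizes, ArgTypes);
//   // ... run target regions that use the mapped data ...
//   __tgt_target_data_update(OFFLOAD_DEVICE_DEFAULT, 1, ArgsBase, Args,
//                            ArgSizes, ArgTypes);
//   __tgt_target_data_end(OFFLOAD_DEVICE_DEFAULT, 1, ArgsBase, Args,
//                         ArgSizes, ArgTypes);
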
// Performs the same actions as data_begin in case ArgNum is non-zero and
// initiates the run of the offloaded region on the target platform; if ArgNum
// is non-zero, after the region execution is done it also performs the same
// action as data_end above. This function returns 0 if it was able to transfer
// the execution to a target and a non-zero value otherwise.
int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                        int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);

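// Example (illustrative sketch; `HostPtr` stands for the host entry address of
// the outlined target region, and the KernelArgsTy fields are left abstract;
// both are assumptions for this sketch):
//
//   KernelArgsTy KernelArgs = CTorDTorKernelArgs; // start from the defaults
//   // ... fill in the argument arrays, counts, and trip count ...
//   int RC = __tgt_target_kernel(/*Loc=*/nullptr, OFFLOAD_DEVICE_DEFAULT,
//                                /*NumTeams=*/1, /*ThreadLimit=*/0, HostPtr,
//                                &KernelArgs);
//   if (RC != OMP_TGT_SUCCESS) {
//     // fall back to executing the region on the host
//   }
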
// Non-blocking synchronization for target nowait regions. This function
// acquires the asynchronous context from task data of the current task being
// executed and tries to query for the completion of its operations. If the
// operations are still pending, the function returns immediately. If the
// operations are completed, all the post-processing procedures stored in the
// asynchronous context are executed and the context is removed from the task
// data.
void __tgt_target_nowait_query(void **AsyncHandle);

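// Example (illustrative sketch; in practice libomp drives this from the task
// completion path, and `getStoredHandleFromTask` is a hypothetical accessor
// standing in for the handle stored in the task data):
//
//   void *TaskHandle = getStoredHandleFromTask(); // hypothetical accessor
//   __tgt_target_nowait_query(&TaskHandle);
//   if (TaskHandle == nullptr) {
//     // device operations and post-processing have completed
//   }
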
/// Executes a target kernel by replaying recorded kernel arguments and
/// device memory.
int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                               void *DeviceMemory, int64_t DeviceMemorySize,
                               void **TgtArgs, ptrdiff_t *TgtOffsets,
                               int32_t NumArgs, int32_t NumTeams,
                               int32_t ThreadLimit, uint64_t LoopTripCount);

void __tgt_set_info_flag(uint32_t);

int __tgt_print_device_info(int64_t DeviceId);

int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                 void *VAddr, bool IsRecord, bool SaveOutput,
                                 uint64_t &ReqPtrArgOffset);

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
#define EXTERN extern "C"
#else
#define EXTERN extern
#endif

#endif // _OMPTARGET_H_