//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#ifndef _OMPTARGET_H_
#define _OMPTARGET_H_

17 #include "Environment.h"
19 #include <cstdint>
20 #include <deque>
21 #include <functional>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <type_traits>
26 #include <SourceInfo.h>
28 #include "llvm/ADT/SmallVector.h"
#define OFFLOAD_SUCCESS (0)
#define OFFLOAD_FAIL (~0)

#define OFFLOAD_DEVICE_DEFAULT -1

// Don't format out enums and structs.
// clang-format off

/// Return flags of the __tgt_target_XXX public APIs.
enum __tgt_target_return_t : int {
  /// successful offload executed on a target device
  OMP_TGT_SUCCESS = 0,
  /// offload may not execute on the requested target device;
  /// this scenario can be caused by the device not being available or
  /// supported, as described in the Execution Model in the specification.
  /// This status is not used for target device execution failure,
  /// which should be handled internally in libomptarget.
  OMP_TGT_FAIL = ~0
};

/// Data attributes for each data reference used in an OpenMP target region.
enum tgt_map_type {
  // No flags
  OMP_TGT_MAPTYPE_NONE            = 0x000,
  // copy data from host to device
  OMP_TGT_MAPTYPE_TO              = 0x001,
  // copy data from device to host
  OMP_TGT_MAPTYPE_FROM            = 0x002,
  // copy regardless of the reference count
  OMP_TGT_MAPTYPE_ALWAYS          = 0x004,
  // force unmapping of data
  OMP_TGT_MAPTYPE_DELETE          = 0x008,
  // map the pointer as well as the pointee
  OMP_TGT_MAPTYPE_PTR_AND_OBJ     = 0x010,
  // pass device base address to kernel
  OMP_TGT_MAPTYPE_TARGET_PARAM    = 0x020,
  // return base device address of mapped data
  OMP_TGT_MAPTYPE_RETURN_PARAM    = 0x040,
  // private variable - not mapped
  OMP_TGT_MAPTYPE_PRIVATE         = 0x080,
  // copy by value - not mapped
  OMP_TGT_MAPTYPE_LITERAL         = 0x100,
  // mapping is implicit
  OMP_TGT_MAPTYPE_IMPLICIT        = 0x200,
  // copy data to device
  OMP_TGT_MAPTYPE_CLOSE           = 0x400,
  // runtime error if not already allocated
  OMP_TGT_MAPTYPE_PRESENT         = 0x1000,
  // use a separate reference counter so that the data cannot be unmapped within
  // the structured region
  // This is an OpenMP extension for the sake of OpenACC support.
  OMP_TGT_MAPTYPE_OMPX_HOLD       = 0x2000,
  // descriptor for non-contiguous target-update
  OMP_TGT_MAPTYPE_NON_CONTIG      = 0x100000000000,
  // member of struct, member given by [16 MSBs] - 1
  OMP_TGT_MAPTYPE_MEMBER_OF       = 0xffff000000000000
};

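// Illustrative sketch (not part of the API): map-type entries are bit masks,
// so one argument usually carries several of the flags above OR'ed together,
// and the MEMBER_OF bits hold a 16-bit member index biased by one. The decode
// below is hypothetical and is shown only to clarify the encoding.
//
//   int64_t MapType = OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM |
//                     OMP_TGT_MAPTYPE_TARGET_PARAM; // e.g. map(tofrom: x)
//   bool CopyToDevice = MapType & OMP_TGT_MAPTYPE_TO;
//   // 0-based index of the parent-struct member, or -1 if not a member entry:
//   int64_t MemberIdx =
//       (int64_t)(((uint64_t)MapType & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
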
/// Flags for offload entries.
enum OpenMPOffloadingDeclareTargetFlags {
  /// Mark the entry global as having a 'link' attribute.
  OMP_DECLARE_TARGET_LINK = 0x01,
  /// Mark the entry kernel as being a global constructor.
  OMP_DECLARE_TARGET_CTOR = 0x02,
  /// Mark the entry kernel as being a global destructor.
  OMP_DECLARE_TARGET_DTOR = 0x04,
  /// Mark the entry global as being an indirectly callable function.
  OMP_DECLARE_TARGET_INDIRECT = 0x08
};

enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED               = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE                    = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD         = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS         = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY   = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS      = 0x010
};

enum TargetAllocTy : int32_t {
  TARGET_ALLOC_DEVICE = 0,
  TARGET_ALLOC_HOST,
  TARGET_ALLOC_SHARED,
  TARGET_ALLOC_DEFAULT
};

/// This struct contains all of the arguments to a target kernel region launch.
struct KernelArgsTy {
  uint32_t Version;   // Version of this struct for ABI compatibility.
  uint32_t NumArgs;   // Number of arguments in each input pointer.
  void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct).
  void **ArgPtrs;     // Pointer to the argument data.
  int64_t *ArgSizes;  // Size of the argument data in bytes.
  int64_t *ArgTypes;  // Type of the data (e.g. to / from).
  void **ArgNames;    // Name of the data for debugging, possibly null.
  void **ArgMappers;  // User-defined mappers, possibly null.
  uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise.
  struct {
    uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
    uint64_t Unused : 63;
  } Flags;
  uint32_t NumTeams[3];    // The number of teams (for x,y,z dimension).
  uint32_t ThreadLimit[3]; // The number of threads (for x,y,z dimension).
  uint32_t DynCGroupMem;   // Amount of dynamic cgroup memory requested.
};
static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
              "Invalid struct size");
static_assert(sizeof(KernelArgsTy) ==
                  (8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
                   4 * sizeof(void **) + 2 * sizeof(int64_t *)),
              "Invalid struct size");
inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr, nullptr,
                                          nullptr, nullptr, nullptr, nullptr,
                                          0,       {0, 0},  {1, 0, 0},
                                          {1, 0, 0}, 0};

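// Illustrative sketch (not part of this header): a caller would populate
// KernelArgsTy field by field before handing it to __tgt_target_kernel, in the
// same order as the aggregate above. The argument arrays and values below are
// hypothetical.
//
//   int A = 0;
//   void *BasePtrs[] = {&A};
//   void *Ptrs[] = {&A};
//   int64_t Sizes[] = {sizeof(A)};
//   int64_t Types[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM |
//                      OMP_TGT_MAPTYPE_TARGET_PARAM};
//   KernelArgsTy Args{/*Version=*/1, /*NumArgs=*/1, BasePtrs, Ptrs,
//                     Sizes, Types, /*ArgNames=*/nullptr,
//                     /*ArgMappers=*/nullptr, /*Tripcount=*/0,
//                     {/*NoWait=*/0, /*Unused=*/0}, /*NumTeams=*/{0, 0, 0},
//                     /*ThreadLimit=*/{0, 0, 0}, /*DynCGroupMem=*/0};
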
/// This struct is a record of an entry point or global. For a function
/// entry point the size is expected to be zero.
struct __tgt_offload_entry {
  void *addr;       // Pointer to the offload entry info (function or global)
  char *name;       // Name of the function or global
  size_t size;      // Size of the entry info (0 if it is a function)
  int32_t flags;    // Flags associated with the entry, e.g. 'link'.
  int32_t reserved; // Reserved, to be used by the runtime library.
};

/// This struct is a record of the device image information.
struct __tgt_device_image {
  void *ImageStart;                  // Pointer to the target code start
  void *ImageEnd;                    // Pointer to the target code end
  __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries
  __tgt_offload_entry *EntriesEnd;   // End of table (non inclusive)
};

/// This struct contains information about a given image.
struct __tgt_image_info {
  const char *Arch;
};

/// This struct is a record of all the host code that may be offloaded to a
/// target.
struct __tgt_bin_desc {
  int32_t NumDeviceImages;          // Number of device types supported
  __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type)
  __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries
  __tgt_offload_entry *HostEntriesEnd;   // End of table (non inclusive)
};

/// This struct contains the offload entries identified by the target runtime.
struct __tgt_target_table {
  __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries
  __tgt_offload_entry
      *EntriesEnd; // End of the table with all the entries (non inclusive)
};

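// Illustrative sketch (not part of this header): the entry tables above are
// half-open [begin, end) pointer ranges, so consumers simply walk them. The
// loop below is hypothetical and only shows the intended access pattern.
//
//   for (__tgt_offload_entry *E = Desc->HostEntriesBegin;
//        E != Desc->HostEntriesEnd; ++E) {
//     bool IsFunction = (E->size == 0); // function entries have size 0
//     // ... register or look up E->name / E->addr ...
//   }
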
// clang-format on

/// This struct contains information exchanged between different asynchronous
/// operations for device-dependent optimization and potential synchronization.
struct __tgt_async_info {
  // A pointer to a queue-like structure where offloading operations are
  // issued. We assume this structure is also used for synchronization; in the
  // CUDA backend it is a CUstream.
  void *Queue = nullptr;

  /// A collection of allocations that are associated with this stream and that
  /// should be freed after finalization.
  llvm::SmallVector<void *, 2> AssociatedAllocations;

  /// The kernel launch environment used to issue a kernel. Stored here to
  /// ensure it is a valid location while the transfer to the device is
  /// happening.
  KernelLaunchEnvironmentTy KernelLaunchEnvironment;
};

struct DeviceTy;

/// The libomptarget wrapper around a __tgt_async_info object directly
/// associated with a libomptarget layer device. RAII semantics to avoid
/// mistakes.
class AsyncInfoTy {
public:
  enum class SyncTy { BLOCKING, NON_BLOCKING };

private:
  /// Locations we used in (potentially) asynchronous calls which should live
  /// as long as this AsyncInfoTy object.
  std::deque<void *> BufferLocations;

  /// Post-processing operations executed after a successful synchronization.
  /// \note the post-processing function should return OFFLOAD_SUCCESS or
  /// OFFLOAD_FAIL appropriately.
  using PostProcFuncTy = std::function<int()>;
  llvm::SmallVector<PostProcFuncTy> PostProcessingFunctions;

  __tgt_async_info AsyncInfo;
  DeviceTy &Device;

public:
  /// Synchronization method to be used.
  SyncTy SyncType;

  AsyncInfoTy(DeviceTy &Device, SyncTy SyncType = SyncTy::BLOCKING)
      : Device(Device), SyncType(SyncType) {}
  ~AsyncInfoTy() { synchronize(); }

  /// Implicit conversion to the __tgt_async_info which is used in the
  /// plugin interface.
  operator __tgt_async_info *() { return &AsyncInfo; }

  /// Synchronize all pending actions.
  ///
  /// \note synchronization will be performed in a blocking or non-blocking
  /// manner, depending on the SyncType.
  ///
  /// \note if the operations are completed, the registered post-processing
  /// functions will be executed once and unregistered afterwards.
  ///
  /// \returns OFFLOAD_FAIL or OFFLOAD_SUCCESS appropriately.
  int synchronize();

  /// Return a void* reference with a lifetime that is at least as long as this
  /// AsyncInfoTy object. The location can be used as an intermediate buffer.
  void *&getVoidPtrLocation();

  /// Check if all asynchronous operations are completed.
  ///
  /// \note this is only a lightweight check. If needed, use synchronize() to
  /// query the status of AsyncInfo before checking.
  ///
  /// \returns true if there are no pending asynchronous operations, false
  /// otherwise.
  bool isDone() const;

  /// Add a new post-processing function to be executed after synchronization.
  ///
  /// \param[in] Function is a templated function (e.g., function pointers,
  /// lambdas, std::function) that can be convertible to a PostProcFuncTy (i.e.,
  /// it must have int() as its function signature).
  template <typename FuncTy> void addPostProcessingFunction(FuncTy &&Function) {
    static_assert(std::is_convertible_v<FuncTy, PostProcFuncTy>,
                  "Invalid post-processing function type. Please check "
                  "function signature!");
    PostProcessingFunctions.emplace_back(Function);
  }

private:
  /// Run all the post-processing functions sequentially.
  ///
  /// \note after a successful execution, all previously registered functions
  /// are unregistered.
  ///
  /// \returns OFFLOAD_FAIL if any post-processing function failed,
  /// OFFLOAD_SUCCESS otherwise.
  int32_t runPostProcessing();

  /// Check if the internal asynchronous info queue is empty or not.
  ///
  /// \returns true if empty, false otherwise.
  bool isQueueEmpty() const;
};

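// Illustrative sketch (not part of this header): a post-processing callback
// must be convertible to int() and report OFFLOAD_SUCCESS / OFFLOAD_FAIL. The
// DeviceTy instance below is hypothetical; inside libomptarget it comes from
// the device table.
//
//   AsyncInfoTy AsyncInfo(Device, AsyncInfoTy::SyncTy::NON_BLOCKING);
//   AsyncInfo.addPostProcessingFunction([]() {
//     // e.g. release temporary host buffers once the transfers are done
//     return OFFLOAD_SUCCESS;
//   });
//   if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
//     /* handle failure */;
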
/// This struct is a record of non-contiguous information.
struct __tgt_target_non_contig {
  uint64_t Offset;
  uint64_t Count;
  uint64_t Stride;
};

struct __tgt_device_info {
  void *Context = nullptr;
  void *Device = nullptr;
};

#ifdef __cplusplus
extern "C" {
#endif

int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
                      size_t DstOffset, size_t SrcOffset, int DstDevice,
                      int SrcDevice);
int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                           int NumDims, const size_t *Volume,
                           const size_t *DstOffsets, const size_t *SrcOffsets,
                           const size_t *DstDimensions,
                           const size_t *SrcDimensions, int DstDevice,
                           int SrcDevice);
void *omp_target_memset(void *Ptr, int C, size_t N, int DeviceNum);
int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                             size_t Size, size_t DeviceOffset, int DeviceNum);
int omp_target_disassociate_ptr(const void *HostPtr, int DeviceNum);

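// Illustrative sketch (not part of this header): typical host-side use of the
// device memory routines declared above. Error handling is abbreviated and
// the device number is a hypothetical choice.
//
//   int HostDev = omp_get_initial_device();
//   int Dev = 0; // first offload device (hypothetical)
//   size_t Bytes = 1024 * sizeof(double);
//   double *HostBuf = (double *)malloc(Bytes);
//   void *DevPtr = omp_target_alloc(Bytes, Dev);
//   if (DevPtr && HostBuf) {
//     omp_target_memcpy(DevPtr, HostBuf, Bytes, /*DstOffset=*/0,
//                       /*SrcOffset=*/0, Dev, HostDev); // host -> device
//     omp_target_memcpy(HostBuf, DevPtr, Bytes, 0, 0, HostDev, Dev); // back
//   }
//   omp_target_free(DevPtr, Dev);
//   free(HostBuf);
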
/// Explicit target memory allocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_host(size_t Size, int DeviceNum);
void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);

/// Explicit target memory deallocators
/// Using the llvm_ prefix until they become part of the OpenMP standard.
void llvm_omp_target_free_device(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_host(void *DevicePtr, int DeviceNum);
void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);

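// Illustrative sketch (not part of this header): each allocator pairs with the
// deallocator of the same kind; the kind determines where the memory lives
// (device-only, pinned host, or host/device shared). Bytes and Dev are
// hypothetical.
//
//   void *Shared = llvm_omp_target_alloc_shared(Bytes, Dev);
//   // ... accessible from both host and device code where supported ...
//   llvm_omp_target_free_shared(Shared, Dev);
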
/// Dummy target so we have a symbol for generating host fallback.
void *llvm_omp_target_dynamic_shared_alloc();

/// add the clauses of the requires directives in a given file
void __tgt_register_requires(int64_t Flags);

/// adds a target shared library to the target execution image
void __tgt_register_lib(__tgt_bin_desc *Desc);

/// Initialize all RTLs at once
void __tgt_init_all_rtls();

/// removes a target shared library from the target execution image
void __tgt_unregister_lib(__tgt_bin_desc *Desc);

// Creates the host-to-target data mapping, stores it in the libomptarget.so
// internal structure (an entry in a stack of data maps) and passes the data to
// the device.
void __tgt_target_data_begin(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                             void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_begin_nowait(int64_t DeviceId, int32_t ArgNum,
                                    void **ArgsBase, void **Args,
                                    int64_t *ArgSizes, int64_t *ArgTypes,
                                    int32_t DepNum, void *DepList,
                                    int32_t NoAliasDepNum,
                                    void *NoAliasDepList);
void __tgt_target_data_begin_mapper(ident_t *Loc, int64_t DeviceId,
                                    int32_t ArgNum, void **ArgsBase,
                                    void **Args, int64_t *ArgSizes,
                                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                                    void **ArgMappers);
void __tgt_target_data_begin_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

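// Illustrative sketch (not part of this header): conceptually, a directive
// such as `#pragma omp target data map(tofrom: A[0:N])` brackets its region
// with a begin/end pair over parallel arrays describing each mapped item. The
// exact arguments are produced by Clang codegen; the values below are
// hypothetical.
//
//   void *Bases[] = {A};
//   void *Ptrs[] = {A};
//   int64_t Sizes[] = {N * (int64_t)sizeof(*A)};
//   int64_t Types[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM};
//   __tgt_target_data_begin(DeviceId, 1, Bases, Ptrs, Sizes, Types);
//   // ... code inside the target data region ...
//   __tgt_target_data_end(DeviceId, 1, Bases, Ptrs, Sizes, Types);
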
// Passes data from the target, releases target memory and destroys the
// host-target mapping (top entry from the stack of data maps) created by
// the last __tgt_target_data_begin.
void __tgt_target_data_end(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                           void **Args, int64_t *ArgSizes, int64_t *ArgTypes);
void __tgt_target_data_end_nowait(int64_t DeviceId, int32_t ArgNum,
                                  void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  int32_t DepNum, void *DepList,
                                  int32_t NoAliasDepNum, void *NoAliasDepList);
void __tgt_target_data_end_mapper(ident_t *Loc, int64_t DeviceId,
                                  int32_t ArgNum, void **ArgsBase, void **Args,
                                  int64_t *ArgSizes, int64_t *ArgTypes,
                                  map_var_info_t *ArgNames, void **ArgMappers);
void __tgt_target_data_end_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

/// passes data to/from the target
void __tgt_target_data_update(int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
                              void **Args, int64_t *ArgSizes,
                              int64_t *ArgTypes);
void __tgt_target_data_update_nowait(int64_t DeviceId, int32_t ArgNum,
                                     void **ArgsBase, void **Args,
                                     int64_t *ArgSizes, int64_t *ArgTypes,
                                     int32_t DepNum, void *DepList,
                                     int32_t NoAliasDepNum,
                                     void *NoAliasDepList);
void __tgt_target_data_update_mapper(ident_t *Loc, int64_t DeviceId,
                                     int32_t ArgNum, void **ArgsBase,
                                     void **Args, int64_t *ArgSizes,
                                     int64_t *ArgTypes,
                                     map_var_info_t *ArgNames,
                                     void **ArgMappers);
void __tgt_target_data_update_nowait_mapper(
    ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
    void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames,
    void **ArgMappers, int32_t DepNum, void *DepList, int32_t NoAliasDepNum,
    void *NoAliasDepList);

// If ArgNum is non-zero, performs the same actions as data_begin above and
// then initiates the run of the offloaded region on the target platform; once
// the region has finished executing, and again only if ArgNum is non-zero, it
// also performs the same actions as data_end above. Returns 0 if it was able
// to transfer execution to a target, and a non-zero int otherwise.
int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                        int32_t ThreadLimit, void *HostPtr, KernelArgsTy *Args);

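// Illustrative sketch (not part of this header): callers (normally
// compiler-generated code) use the return value to decide whether to fall back
// to the host version of the region. Loc, DeviceId, HostPtr, Args and the team
// and thread values are hypothetical here.
//
//   if (__tgt_target_kernel(Loc, DeviceId, NumTeams, ThreadLimit, HostPtr,
//                           &Args) != OMP_TGT_SUCCESS) {
//     // offload failed or no device available: run the host fallback region
//   }
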
// Non-blocking synchronization for target nowait regions. This function
// acquires the asynchronous context from task data of the current task being
// executed and tries to query for the completion of its operations. If the
// operations are still pending, the function returns immediately. If the
// operations are completed, all the post-processing procedures stored in the
// asynchronous context are executed and the context is removed from the task
// data.
void __tgt_target_nowait_query(void **AsyncHandle);

/// Executes a target kernel by replaying recorded kernel arguments and
/// device memory.
int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                               void *DeviceMemory, int64_t DeviceMemorySize,
                               void **TgtArgs, ptrdiff_t *TgtOffsets,
                               int32_t NumArgs, int32_t NumTeams,
                               int32_t ThreadLimit, uint64_t LoopTripCount);

void __tgt_set_info_flag(uint32_t);

int __tgt_print_device_info(int64_t DeviceId);

int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
                                 void *VAddr, bool IsRecord, bool SaveOutput);

#ifdef __cplusplus
} // extern "C"
#endif

#ifdef __cplusplus
#define EXTERN extern "C"
#else
#define EXTERN extern
#endif

#endif // _OMPTARGET_H_