//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//
#include "omptarget.h"
#include "OmptCallback.h"
#include "OmptInterface.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"

#include <cassert>
#include <cstdint>
#include <vector>

using llvm::SmallVector;
#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif
int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (!isQueueEmpty()) {
    switch (SyncType) {
    case SyncTy::BLOCKING:
      // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
      assert(AsyncInfo.Queue == nullptr &&
             "The device plugin should have nulled the queue to indicate there "
             "are no outstanding actions!");
      break;
    case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
      break;
    }
  }

  // Run any pending post-processing function registered on this async object.
  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
    Result = runPostProcessing();

  return Result;
}
void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }

int32_t AsyncInfoTy::runPostProcessing() {
  size_t Size = PostProcessingFunctions.size();
  for (size_t I = 0; I < Size; ++I) {
    const int Result = PostProcessingFunctions[I]();
    if (Result != OFFLOAD_SUCCESS)
      return Result;
  }

  // Clear the vector up until the last known function, since post-processing
  // procedures might add new procedures themselves.
  const auto PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);

  return OFFLOAD_SUCCESS;
}

bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }
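
// Typical usage in this file: device operations queue work on an AsyncInfoTy
// and register cleanup work via addPostProcessingFunction (see targetDataEnd
// below); a later synchronize() drains the queue and, once it is empty, runs
// those functions through runPostProcessing().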
/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
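
// Worked example: for HstPtrBase == 0x104 the lowest set bit is bit 2, so
// __builtin_ffsl returns 3, BaseAlignment == 4, and the result is
// min(16, 4) == 4. For a 256-aligned base such as 0x200, BaseAlignment is 512
// and the result is capped at MaxAlignment == 16.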
/// Map global data and execute pending ctors
static int initLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t DeviceId = Device.DeviceID;
  int Rc = OFFLOAD_SUCCESS;
  bool SupportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;
  {
    std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
    for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
      TranslationTable *TransTable =
          &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
      if (TransTable->HostTable.EntriesBegin ==
              TransTable->HostTable.EntriesEnd &&
          !SupportsEmptyImages) {
        // No host entry so no need to proceed
        continue;
      }

      if (TransTable->TargetsTable[DeviceId] != 0) {
        // Library entries have already been processed
        continue;
      }

      // 1) get image.
      assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
             "Not expecting a device ID outside the table's bounds!");
      __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
      if (!Img) {
        REPORT("No image loaded for device id %d.\n", DeviceId);
        Rc = OFFLOAD_FAIL;
        break;
      }

      // 2) load image into the target table.
      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
          Device.loadBinary(Img);
      // Unable to get table for this image: invalidate image and fail.
      if (!TargetTable) {
        REPORT("Unable to generate entries table for device id %d.\n",
               DeviceId);
        TransTable->TargetsImages[DeviceId] = 0;
        Rc = OFFLOAD_FAIL;
        break;
      }

      // Verify whether the two table sizes match.
      size_t Hsize =
          TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
      size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

      // Invalid image for these host entries!
      if (Hsize != Tsize) {
        REPORT(
            "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
            DeviceId, Hsize, Tsize);
        TransTable->TargetsImages[DeviceId] = 0;
        TransTable->TargetsTable[DeviceId] = 0;
        Rc = OFFLOAD_FAIL;
        break;
      }

      DeviceTy::HDTTMapAccessorTy HDTTMap =
          Device.HostDataToTargetMap.getExclusiveAccessor();

      __tgt_target_table *HostTable = &TransTable->HostTable;
      for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                               *CurrHostEntry = HostTable->EntriesBegin,
                               *EntryDeviceEnd = TargetTable->EntriesEnd;
           CurrDeviceEntry != EntryDeviceEnd;
           CurrDeviceEntry++, CurrHostEntry++) {
        if (CurrDeviceEntry->size != 0) {
          // has data.
          assert(CurrDeviceEntry->size == CurrHostEntry->size &&
                 "data size mismatch");

          // Fortran may use multiple weak declarations for the same symbol,
          // therefore we must allow for multiple weak symbols to be loaded from
          // the fat binary. Treat these mappings as any other "regular"
          // mapping. Add entry to map.
          if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
                                    CurrHostEntry->size))
            continue;

          DP("Add mapping from host " DPxMOD " to device " DPxMOD
             " with size %zu"
             "\n",
             DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
             CurrDeviceEntry->size);
          HDTTMap->emplace(new HostDataToTargetTy(
              (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
              (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
              (uintptr_t)CurrHostEntry->addr +
                  CurrHostEntry->size /*HstPtrEnd*/,
              (uintptr_t)CurrDeviceEntry->addr /*TgtAllocBegin*/,
              (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
              false /*UseHoldRefCount*/, CurrHostEntry->name,
              true /*IsRefCountINF*/));

          // Notify about the new mapping.
          if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
            return OFFLOAD_FAIL;
        }
      }
    }
  }

  if (Rc != OFFLOAD_SUCCESS) {
    return Rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &Lib : Device.PendingCtorsDtors) {
      if (!Lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &Entry : Lib.second.PendingCtors) {
          void *Ctor = Entry;
          int Rc = target(nullptr, Device, Ctor, CTorDTorKernelArgs, AsyncInfo);
          if (Rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(Ctor));
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        Lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(Lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;

  return OFFLOAD_SUCCESS;
}
void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      if (PM->RTLs.UsedRTLs.empty()) {
        llvm::SmallVector<llvm::StringRef> Archs;
        llvm::transform(PM->Images, std::back_inserter(Archs),
                        [](const auto &x) {
                          return !x.second.Arch ? "empty" : x.second.Arch;
                        });
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");
        fprintf(stderr, "Found (%s)\n", llvm::join(Archs, ",").c_str());
      }

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}
static void handleDefaultTargetOffload() {
  std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}
// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The return bool indicates if the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!deviceIsReady(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  {
    std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
        Device.PendingGlobalsMtx);
    if (Device.HasPendingGlobals && initLibrary(Device) != OFFLOAD_SUCCESS) {
      REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
      handleTargetOutcome(false, Loc);
      return true;
    }
  }

  return false;
}
static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
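
// The MEMBER_OF field occupies the upper 16 bits of the map type and stores
// the 1-based position of the parent argument: a map type of 1 << 48 decodes
// to parent index 0, while a zero field (no MEMBER_OF bits set) yields -1,
// which callers treat as "this argument has no parent".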
void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns NULL ptr\n", Name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[DeviceNum];
  Rc = Device.allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}
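
// Note: this is the shared implementation behind the explicit allocation
// entry points; for instance, omp_target_alloc(Size, DeviceNum) is expected
// to forward here (via api.cpp) with Kind == TARGET_ALLOC_DEFAULT.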
void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
     DPxPTR(DevicePtr));

  if (!DevicePtr) {
    DP("Call to %s with NULL ptr\n", Name);
    return;
  }

  if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
    DP("%s deallocated host ptr\n", Name);
    return;
  }

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns, nothing to do\n", Name);
    return;
  }

  PM->Devices[DeviceNum]->deleteData(DevicePtr, Kind);
  DP("omp_target_free deallocated device ptr\n");
}
void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *rc = NULL;

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns NULL ptr\n", Name);
    return NULL;
  }

  DeviceTy *DevicePtr = nullptr;
  {
    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);

    if (!PM->Devices[DeviceNum]) {
      DP("%s returns, device %d not available\n", Name, DeviceNum);
      return nullptr;
    }

    DevicePtr = PM->Devices[DeviceNum].get();
  }

  int32_t err = 0;
  if (DevicePtr->RTL->data_lock) {
    err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc);
    if (err) {
      DP("Could not lock ptr %p\n", HostPtr);
      return nullptr;
    }
  }
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc));
  return rc;
}
void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  DeviceTy *DevicePtr = nullptr;
  {
    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);

    // Don't check deviceIsReady as it can initialize the device if needed.
    // Just check if DeviceNum exists as targetUnlockExplicit can be called
    // during process exit/free (and it may have been already destroyed) and
    // targetAllocExplicit will have already checked deviceIsReady anyway.
    size_t DevicesSize = PM->Devices.size();

    if (DevicesSize <= (size_t)DeviceNum) {
      DP("Device ID %d does not have a matching RTL\n", DeviceNum);
      return;
    }

    if (!PM->Devices[DeviceNum]) {
      DP("%s returns, device %d not available\n", Name, DeviceNum);
      return;
    }

    DevicePtr = PM->Devices[DeviceNum].get();
  } // unlock RTLsMtx

  if (DevicePtr->RTL->data_unlock)
    DevicePtr->RTL->data_unlock(DeviceNum, HostPtr);

  DP("%s returns\n", Name);
}
/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}
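
// Illustrative trigger for the path above: a user-defined mapper such as
//
//   #pragma omp declare mapper(struct S s) map(s.len, s.buf[0 : s.len])
//
// makes Clang emit a mapper function whose address arrives here as ArgMapper;
// invoking it expands the single struct argument into one component per
// listed member before the targetData* function is re-entered.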
/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  TIMESCOPE_WITH_IDENT(Loc);
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t TgtPadding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      TgtPadding = (int64_t)HstPtrBegin % Alignment;
      if (TgtPadding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           TgtPadding, DPxPTR(HstPtrBegin));
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters).
    // This may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device.HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /* OwnedTPR */ nullptr, /* ReleaseHDTTMap */ false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || I != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}
namespace {
/// This structure contains information to deallocate a target pointer, i.e.
/// it is used to fix up the shadow map and potentially delete the entry from
/// the mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace
/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// according to the successfulness of the operations.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    bool DelEntry = !TPR.isHostPointer();

    // If this is the last element from the mapper (for end transfers the args
    // come in reverse order), do not remove the partial entry, the parent
    // struct still exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device->HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put in the
    // EntriesInfo so it can be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->eraseMapEntry(HDTTMap, Entry, DataSize);
    // Entry is already removed from the map, we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}
/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR =
        Device.getTgtPtrBegin(HstPtrBegin, DataSize, UpdateRef, HasHoldModifier,
                              !IsImplicit, ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
        !TPR.Flags.IsHostPointer && DataSize != 0) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));

      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.getEntry()->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
                                TPR.getEntry());
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data from device failed.\n");
        return OFFLOAD_FAIL;
      }

      // As we are expecting to delete the entry the d2h copy might race
      // with another one that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy-back a value we would race.
      if (TPR.Flags.IsLast) {
        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                     std::move(TPR));
    PostProcessingPtrs->back().TPR.getEntry()->unlock();
  }

  // Add post-processing functions
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
  });

  return Ret;
}
static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TargetPointerResultTy TPR =
      Device.getTgtPtrBegin(HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
                            /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }
    if (TPR.getEntry()) {
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
                                    (void *)&ShadowPtr.TgtPtrVal,
                                    sizeof(void *), AsyncInfo);
            if (Ret != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n");
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n");
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // Wait for device-to-host memcopies for whole struct to complete,
    // before restoring the correct host pointer.
    if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n",
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        Entry->unlock();
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n");
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}
static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}
static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
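
// Example: a row-contiguous 2-D section whose inner dimension is
// {Count = 8, Stride = 4} under an outer dimension with Stride = 32 satisfies
// 8 * 4 == 32, so the two dimensions merge and whole rows can be transferred
// as a single contiguous piece.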
/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}
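
// For reference, "#pragma omp target update to(x)" reaches targetDataUpdate
// with OMP_TGT_MAPTYPE_TO set for x, and "update from(x)" with
// OMP_TGT_MAPTYPE_FROM; the dispatch above forwards each argument to
// targetDataContiguous or, for strided sections, targetDataNonContiguous.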
static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
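
// With the map-type bits defined in omptarget.h (PTR_AND_OBJ == 0x010,
// LITERAL == 0x100, IMPLICIT == 0x200), LambdaMapping is the combined value
// 0x310; an argument counts as a lambda mapping only when all three bits are
// set at once.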
namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}
/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// Host pointer begin
    char *HstPtrBegin;
    /// Host pointer end
    char *HstPtrEnd;
    /// The index of the element in \p TgtArgs corresponding to the argument
    int Index;
    /// Alignment of the entry (base of the entry, not after the entry).
    uint32_t Alignment;
    /// Size (without alignment, see padding)
    uint32_t Size;
    /// Padding used to align this argument entry, if necessary.
    uint32_t Padding;
    /// Host pointer name
    map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                          uint32_t Alignment, uint32_t Padding,
                          const map_var_info_t HstPtrName = nullptr)
        : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  SmallVector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  SmallVector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer
  // it immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}
  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      // FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (needs to be mapped to target device).
      // We will pack all arguments of this kind and transfer them all at once
      // to reduce the number of data transfers. We will not take
      // non-first-private arguments, i.e. private arguments that don't need
      // to be mapped to the target device, into account because data
      // allocation can be very efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      auto *LastFPArgInfo =
          FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();

      // Compute the start alignment of this entry, add padding if necessary.
      // TODO: Consider sorting instead.
      uint32_t Padding = 0;
      uint32_t StartAlignment =
          LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
      if (LastFPArgInfo) {
        // Check if we keep the start alignment or if it is shrunk due to the
        // size of the last element.
        uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
        if (Offset)
          StartAlignment = Offset;
        // We only need as much alignment as the host pointer had (since we
        // don't know the alignment information from the source we might end up
        // overaligning accesses but not too much).
        uint32_t RequiredAlignment =
            llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
        if (RequiredAlignment > StartAlignment) {
          Padding = RequiredAlignment - StartAlignment;
          StartAlignment = RequiredAlignment;
        }
      }

      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       StartAlignment, Padding, HstPtrName);
      FirstPrivateArgSize += Padding + ArgSize;
    }

    return OFFLOAD_SUCCESS;
  }
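
  // Worked example of the packing bookkeeping above, assuming a 4-byte int
  // packed first and then a double whose host address is 8- but not 16-aligned
  // (both below the threshold): the int is recorded with StartAlignment == 16
  // and no padding; for the double the start alignment shrinks to 4 % 16 == 4,
  // the required alignment is 8, so Padding == 4 and the packed buffer ends up
  // as [int:4][pad:4][double:8], i.e. FirstPrivateArgSize == 16.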
  /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(SmallVector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is not empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        // First pad the pointer as we (have to) pad it on the device too.
        Itr = std::next(Itr, Info.Padding);
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.Size);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        // Pad the device pointer to get the right alignment.
        TP += Info.Padding;
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.Size;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }
  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};
1436 /// Process data before launching the kernel, including calling targetDataBegin
1437 /// to map and transfer data to target device, transferring (first-)private
1438 /// variables.
1439 static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1440 int32_t ArgNum, void **ArgBases, void **Args,
1441 int64_t *ArgSizes, int64_t *ArgTypes,
1442 map_var_info_t *ArgNames, void **ArgMappers,
1443 SmallVector<void *> &TgtArgs,
1444 SmallVector<ptrdiff_t> &TgtOffsets,
1445 PrivateArgumentManagerTy &PrivateArgumentManager,
1446 AsyncInfoTy &AsyncInfo) {
1447 TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
1448 DeviceTy &Device = *PM->Devices[DeviceId];
1449 int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
1450 ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1451 if (Ret != OFFLOAD_SUCCESS) {
1452 REPORT("Call to targetDataBegin failed, abort target.\n");
1453 return OFFLOAD_FAIL;
1456 // List of (first-)private arrays allocated for this target region
1457 SmallVector<int> TgtArgsPositions(ArgNum, -1);
1459 for (int32_t I = 0; I < ArgNum; ++I) {
1460 if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1461 // This is not a target parameter, do not push it into TgtArgs.
1462 // Check for lambda mapping.
1463 if (isLambdaMapping(ArgTypes[I])) {
1464 assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1465 "PTR_AND_OBJ must be also MEMBER_OF.");
1466 unsigned Idx = getParentIndex(ArgTypes[I]);
1467 int TgtIdx = TgtArgsPositions[Idx];
1468 assert(TgtIdx != -1 && "Base address must be translated already.");
1469 // The parent lambda must be processed already and it must be the last
1470 // in TgtArgs and TgtOffsets arrays.
1471 void *HstPtrVal = Args[I];
1472 void *HstPtrBegin = ArgBases[I];
1473 void *HstPtrBase = Args[Idx];
1474 void *TgtPtrBase =
1475 (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1476 DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1477 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1478 void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1479 void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1480 TargetPointerResultTy TPR = Device.getTgtPtrBegin(
1481 HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false,
1482 /*UseHoldRefCount=*/false);
1483 PointerTgtPtrBegin = TPR.TargetPointer;
1484 if (!TPR.isPresent()) {
1485 DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1486 DPxPTR(HstPtrVal));
1487 continue;
1489 if (TPR.Flags.IsHostPointer) {
1490 DP("Unified memory is active, no need to map lambda captured"
1491 "variable (" DPxMOD ")\n",
1492 DPxPTR(HstPtrVal));
1493 continue;
1495 DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1496 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1497 Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
1498 sizeof(void *), AsyncInfo, TPR.getEntry());
1499 if (Ret != OFFLOAD_SUCCESS) {
1500 REPORT("Copying data to device failed.\n");
1501 return OFFLOAD_FAIL;
1504 continue;
1506 void *HstPtrBegin = Args[I];
1507 void *HstPtrBase = ArgBases[I];
1508 void *TgtPtrBegin;
1509 map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1510 ptrdiff_t TgtBaseOffset;
1511 TargetPointerResultTy TPR;
1512 if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1513 DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1514 DPxPTR(HstPtrBase));
1515 TgtPtrBegin = HstPtrBase;
1516 TgtBaseOffset = 0;
1517 } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1518 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1519 const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1520 // If there is a next argument and it depends on the current one, we need
1521 // to allocate the private memory immediately. If this is not the case,
1522 // then the argument can be marked for optimization and packed with the
1523 // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I],
                                  /*UpdateRefCount=*/false,
                                  /*UseHoldRefCount=*/false);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments.
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to the host if needed and deallocating target memory of (first-)private
/// variables.
static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from the device.
  int Ret = targetDataEnd(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments after synchronization.
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction(
      [PrivateArgumentManager =
           std::move(PrivateArgumentManager)]() mutable -> int {
        int Ret = PrivateArgumentManager.free();
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Failed to deallocate target memory for private args\n");
          return OFFLOAD_FAIL;
        }
        return Ret;
      });

  return OFFLOAD_SUCCESS;
}
} // namespace

/// Performs the same actions as data_begin in case arg_num is non-zero and
/// initiates a run of the offloaded region on the target platform. If arg_num
/// is non-zero, it also performs the same actions as data_update and data_end
/// above once the region execution is done. This function returns 0 if it was
/// able to transfer the execution to a target and a non-zero integer
/// otherwise.
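/// E.g., a construct such as
/// \code
///   #pragma omp target map(tofrom : A[0:N])
///   { /* ... */ }
/// \endcode
/// reaches this function via the __tgt_target_kernel entry point emitted by
/// Clang (assuming the KernelArgsTy-based interface used here).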
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Get the target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we
  // have mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
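  // For illustration (made-up numbers): for map(tofrom : A[2:8]) with 4-byte
  // elements, HstPtrBegin is &A[2] and HstPtrBase is &A[0], so
  // processDataBefore records the device address of &A[2] in TgtArgs together
  // with TgtBaseOffset = -8 in TgtOffsets; a plugin that needs the base simply
  // adds the offset to the begin address.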
  SmallVector<void *> TgtArgs;
  SmallVector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int NumClangLaunchArgs = KernelArgs.NumArgs;
  int Ret = OFFLOAD_SUCCESS;
  if (NumClangLaunchArgs) {
    // Process data, such as data mapping, before launching the kernel.
    Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                            KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                            KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                            KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }

    // Clang might pass more values via ArgPtrs to the runtime than we pass on
    // to the kernel.
    // TODO: Next time we adjust KernelArgsTy we should introduce a new
    // NumKernelArgs field.
    KernelArgs.NumArgs = TgtArgs.size();
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
    TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);

#ifdef OMPT_SUPPORT
    assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
           "Multi-dimensional launch not supported yet.");
    // RAII to establish tool anchors before and after kernel launch.
    int32_t NumTeams = KernelArgs.NumTeams[0];
    // No need to guard this with OMPT_IF_BUILT.
    InterfaceRAII TargetSubmitRAII(
        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
#endif

    Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                              KernelArgs, AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (NumClangLaunchArgs) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables.
    Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                           KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                           KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                           KernelArgs.ArgNames, KernelArgs.ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

/// Enables the record-replay mechanism by pre-allocating MemorySize bytes and
/// informing the record-replayer of whether to store the output in some file.
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
                       bool isRecord, bool SaveOutput) {
  return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
                                            isRecord, SaveOutput);
}

/// Executes a kernel using pre-recorded information to load the recorded
/// image into device memory and launch the target kernel with the
/// pre-recorded configuration.
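/// This path is driven, e.g., by the llvm-omp-kernel-replay tool, which
/// re-runs a previously recorded kernel from its serialized image.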
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
                  int32_t ThreadLimit, uint64_t LoopTripCount,
                  AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // Fail if the table map fails to find the target kernel pointer for the
  // provided host pointer.
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Retrieve the target table of offloading entries.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // Retrieve the target kernel pointer, allocate and store the recorded device
  // memory data, and launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
                                  TARGET_ALLOC_DEFAULT);
  if (!TgtPtr) {
    REPORT("Failed to allocate device memory for replay.\n");
    return OFFLOAD_FAIL;
  }
  if (Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo) !=
      OFFLOAD_SUCCESS) {
    REPORT("Copying recorded device memory to the device failed.\n");
    return OFFLOAD_FAIL;
  }

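  // Reconstruct a minimal KernelArgsTy from the recorded launch configuration;
  // any field not set below stays zero-initialized.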
  KernelArgsTy KernelArgs = {0};
  KernelArgs.Version = 2;
  KernelArgs.NumArgs = NumArgs;
  KernelArgs.Tripcount = LoopTripCount;
  KernelArgs.NumTeams[0] = NumTeams;
  KernelArgs.ThreadLimit[0] = ThreadLimit;

  int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
                                AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}