//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "OmptCallback.h"
#include "OmptInterface.h"

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/bit.h"

using llvm::SmallVector;

using namespace llvm::omp::target::ompt;

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (!isQueueEmpty()) {
    switch (SyncType) {
    case SyncTy::BLOCKING:
      // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
      assert(AsyncInfo.Queue == nullptr &&
             "The device plugin should have nulled the queue to indicate there "
             "are no outstanding actions!");
      break;
    case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
      break;
    }
  }

  // Run any pending post-processing function registered on this async object.
  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
    Result = runPostProcessing();

  return Result;
}
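
// Usage sketch (illustrative; issueAndWait is hypothetical): a caller creates
// one AsyncInfoTy per set of target operations, enqueues asynchronous work on
// it, and finally synchronizes, which also drains the post-processing
// functions registered on the object.
//
//   int issueAndWait(DeviceTy &Device) {
//     AsyncInfoTy AsyncInfo(Device); // owns the plugin queue
//     // ... enqueue data transfers / kernel launches on AsyncInfo ...
//     AsyncInfo.addPostProcessingFunction(
//         []() -> int { return OFFLOAD_SUCCESS; }); // runs only after the
//                                                   // queue drains cleanly
//     return AsyncInfo.synchronize();
//   }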

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }

int32_t AsyncInfoTy::runPostProcessing() {
  size_t Size = PostProcessingFunctions.size();
  for (size_t I = 0; I < Size; ++I) {
    const int Result = PostProcessingFunctions[I]();
    if (Result != OFFLOAD_SUCCESS)
      return Result;
  }

  // Clear the vector up until the last known function, since post-processing
  // procedures might add new procedures themselves.
  const auto PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);

  return OFFLOAD_SUCCESS;
}

bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }

/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
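
// Worked example (illustrative): for HstPtrBase = 0x104, __builtin_ffsl
// returns 3 (the lowest set bit is bit 2), so BaseAlignment = 1 << 2 = 4 and
// the required alignment is 4. For a base such as 0x120, BaseAlignment would
// be 32 and the result is capped at MaxAlignment = 16.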

/// Map global data and execute pending ctors
static int initLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t DeviceId = Device.DeviceID;
  int Rc = OFFLOAD_SUCCESS;
  bool SupportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
  for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
    TranslationTable *TransTable =
        &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
    if (TransTable->HostTable.EntriesBegin ==
            TransTable->HostTable.EntriesEnd &&
        !SupportsEmptyImages) {
      // No host entry so no need to proceed
      continue;
    }

    if (TransTable->TargetsTable[DeviceId] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
    if (!Img) {
      REPORT("No image loaded for device id %d.\n", DeviceId);
      Rc = OFFLOAD_FAIL;
      break;
    }

    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
        Device.loadBinary(Img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n",
             DeviceId);
      TransTable->TargetsImages[DeviceId] = 0;
      Rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t Hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (Hsize != Tsize) {
      REPORT(
          "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
          DeviceId, Hsize, Tsize);
      TransTable->TargetsImages[DeviceId] = 0;
      TransTable->TargetsTable[DeviceId] = 0;
      Rc = OFFLOAD_FAIL;
      break;
    }

    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device.HostDataToTargetMap.getExclusiveAccessor();

    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular"
        // mapping. Add entry to map.
        if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
                                  CurrHostEntry->size))
          continue;

        DP("Add mapping from host " DPxMOD " to device " DPxMOD
           " with size %zu\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        HDTTMap->emplace(new HostDataToTargetTy(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr +
                CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtAllocBegin*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, CurrHostEntry->name,
            true /*IsRefCountINF*/));

        // Notify about the new mapping.
        if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
          return OFFLOAD_FAIL;
      }
    }
  }

  if (Rc != OFFLOAD_SUCCESS) {
    return Rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &Lib : Device.PendingCtorsDtors) {
      if (!Lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &Entry : Lib.second.PendingCtors) {
          void *Ctor = Entry;
          int Rc = target(nullptr, Device, Ctor, CTorDTorKernelArgs, AsyncInfo);
          if (Rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(Ctor));
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        Lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(Lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      if (PM->RTLs.UsedRTLs.empty()) {
        llvm::SmallVector<llvm::StringRef> Archs;
        llvm::transform(PM->Images, std::back_inserter(Archs),
                        [](const auto &x) {
                          return !x.second.Arch ? "empty" : x.second.Arch;
                        });
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");
        fprintf(stderr, "Found (%s)\n", llvm::join(Archs, ",").c_str());
      }

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}
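
// Example (illustrative): the policy above implements the standard
// OMP_TARGET_OFFLOAD environment variable. With no devices present,
//
//   OMP_TARGET_OFFLOAD=DEFAULT    -> policy becomes tgt_disabled and target
//                                    regions silently run on the host;
//   OMP_TARGET_OFFLOAD=MANDATORY  -> handleTargetOutcome() aborts with
//                                    "failure of target construct ...";
//   OMP_TARGET_OFFLOAD=DISABLED   -> offload is skipped unconditionally.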

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The returned bool indicates whether the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!deviceIsReady(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  {
    std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
        Device.PendingGlobalsMtx);
    if (Device.HasPendingGlobals && initLibrary(Device) != OFFLOAD_SUCCESS) {
      REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
      handleTargetOutcome(false, Loc);
      return true;
    }
  }

  return false;
}

static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
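
// Worked example (illustrative): the MEMBER_OF field occupies the high 16
// bits of the map-type bitfield and stores the 1-based position of the parent
// argument. For a member whose parent is argument 0, the compiler encodes
// MEMBER_OF(1), i.e. Type = ... | (1LL << 48); then
// ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1 == 0 recovers the parent's
// index, and a negative result means "no parent".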

void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns NULL ptr\n", Name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[DeviceNum];
  Rc = Device.allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}

void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
     DPxPTR(DevicePtr));

  if (!DevicePtr) {
    DP("Call to %s with NULL ptr\n", Name);
    return;
  }

  if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
    DP("%s deallocated host ptr\n", Name);
    return;
  }

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns, nothing to do\n", Name);
    return;
  }

  PM->Devices[DeviceNum]->deleteData(DevicePtr, Kind);
  DP("omp_target_free deallocated device ptr\n");
}

void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *rc = NULL;

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns NULL ptr\n", Name);
    return NULL;
  }

  DeviceTy *DevicePtr = nullptr;
  {
    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);

    if (!PM->Devices[DeviceNum]) {
      DP("%s returns, device %d not available\n", Name, DeviceNum);
      return nullptr;
    }

    DevicePtr = PM->Devices[DeviceNum].get();
  }

  int32_t err = 0;
  if (DevicePtr->RTL->data_lock) {
    err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc);
    if (err) {
      DP("Could not lock ptr %p\n", HostPtr);
      return nullptr;
    }
  }
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc));
  return rc;
}

void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  DeviceTy *DevicePtr = nullptr;
  {
    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);

    // Don't check deviceIsReady as it can initialize the device if needed.
    // Just check if DeviceNum exists as targetUnlockExplicit can be called
    // during process exit/free (and it may have been already destroyed) and
    // targetAllocExplicit will have already checked deviceIsReady anyway.
    size_t DevicesSize = PM->Devices.size();

    if (DevicesSize <= (size_t)DeviceNum) {
      DP("Device ID %d does not have a matching RTL\n", DeviceNum);
      return;
    }

    if (!PM->Devices[DeviceNum]) {
      DP("%s returns, device %d not available\n", Name, DeviceNum);
      return;
    }

    DevicePtr = PM->Devices[DeviceNum].get();
  }

  if (DevicePtr->RTL->data_unlock)
    DevicePtr->RTL->data_unlock(DeviceNum, HostPtr);

  DP("%s returns\n", Name);
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}
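
// Sketch (illustrative, not the exact code Clang emits): a compiler-generated
// mapper for a hypothetical "struct Vec { int Len; double *Data; }"
// conceptually pushes one component per mapped piece via
// __tgt_push_mapper_component, and this function then forwards the collected
// components to the targetData* callback. Real emitted mappers also add
// MEMBER_OF bits, which are omitted here for brevity.
//
//   void omp_mapper_Vec(void *Handle, void *Base, void *Begin, int64_t Size,
//                       int64_t Type, void *Name) {
//     Vec *V = (Vec *)Begin;
//     // component 1: the struct itself
//     __tgt_push_mapper_component(Handle, Base, Begin, sizeof(Vec), Type,
//                                 Name);
//     // component 2: the array V->Data points to, attached to the struct
//     __tgt_push_mapper_component(Handle, &V->Data, V->Data,
//                                 V->Len * sizeof(double),
//                                 Type | OMP_TGT_MAPTYPE_PTR_AND_OBJ, Name);
//   }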

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  TIMESCOPE_WITH_IDENT(Loc);
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t TgtPadding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      TgtPadding = (int64_t)HstPtrBegin % Alignment;
      if (TgtPadding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           TgtPadding, DPxPTR(HstPtrBegin));
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters). This
    // may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device.HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || I != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}
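
// Example (illustrative): for
//
//   #pragma omp target map(tofrom: s.p[0:N])
//
// the compiler passes a PTR_AND_OBJ entry whose base is &s.p and whose begin
// is &s.p[0]. The loop above then (1) maps or looks up the pointer slot &s.p
// on the device, (2) maps the pointee array, and (3) writes the device
// address of the pointee into the device copy of s.p, registering a shadow
// pointer so the original host value can be restored at targetDataEnd.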

/// This structure contains information to deallocate a target pointer, i.e.
/// it is used to fix up the shadow map and potentially delete the entry from
/// the mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// according to the successfulness of the operations.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    bool DelEntry = !TPR.isHostPointer();

    // If the last element from the mapper (for end transfer args comes in
    // reverse order), do not remove the partial entry, the parent struct still
    // exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device->HostDataToTargetMap.getExclusiveAccessor(!DelEntry);

    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put in the EntriesInfo
    // so it can be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n",
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->eraseMapEntry(HDTTMap, Entry, DataSize);
    // Entry is already removed from the map, we can unlock it now.
    Entry->unlock();
    Ret |= Device->deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR =
        Device.getTgtPtrBegin(HstPtrBegin, DataSize, UpdateRef, HasHoldModifier,
                              !IsImplicit, ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
        !TPR.Flags.IsHostPointer && DataSize != 0) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));

      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.getEntry()->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
                                TPR.getEntry());
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data from device failed.\n");
        return OFFLOAD_FAIL;
      }

      // As we are expecting to delete the entry the d2h copy might race
      // with another one that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy-back a value we would race.
      if (TPR.Flags.IsLast) {
        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                     std::move(TPR));
    PostProcessingPtrs->back().TPR.getEntry()->unlock();
  }

  // Add post-processing functions
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
  });

  return Ret;
}

static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TargetPointerResultTy TPR =
      Device.getTgtPtrBegin(HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
                            /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }
    if (TPR.getEntry()) {
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
                                    (void *)&ShadowPtr.TgtPtrVal,
                                    sizeof(void *), AsyncInfo);
            if (Ret != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n");
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n");
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    // Wait for device-to-host memcopies for whole struct to complete,
    // before restoring the correct host pointer.
    if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n",
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n");
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
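
// Worked example (illustrative): a 10x10 array of 1-byte elements where only
// an 8-element slice of each row is updated is described as
//   NonContig[0] (outer): Count = 10, Stride = 10
//   NonContig[1] (inner): Count = 8,  Stride = 1
// Since Count * Stride of the inner dimension (8) differs from the outer
// Stride (10), nothing merges and each 8-byte row piece is transferred
// separately. With full rows (inner Count = 10), 10 * 1 == 10 would merge
// both dimensions into one contiguous 100-byte transfer.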

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
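
// Example (illustrative): a mapping qualifies as a lambda mapping only when
// all three bits are set at once. With PTR_AND_OBJ = 0x10, LITERAL = 0x100,
// and IMPLICIT = 0x200 (values from omptarget.h), a captured variable of a
// mapped lambda arrives as Mapping = 0x310 | MEMBER_OF(n), so
// (Mapping & LambdaMapping) == LambdaMapping holds; a plain LITERAL
// firstprivate scalar (0x100) does not match.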

/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // HostPtr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// A class manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// Host pointer begin
    char *HstPtrBegin;
    /// Host pointer end
    char *HstPtrEnd;
    /// The index of the element in \p TgtArgs corresponding to the argument
    int Index;
    /// Alignment of the entry (base of the entry, not after the entry).
    uint32_t Alignment;
    /// Size (without alignment, see padding)
    uint32_t Size;
    /// Padding used to align this argument entry, if necessary.
    uint32_t Padding;
    /// Host pointer name
    map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
                          uint32_t Alignment, uint32_t Padding,
                          const map_var_info_t HstPtrName = nullptr)
        : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
          Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  SmallVector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  SmallVector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (needs to be mapped to target device).
      // We will pack all such arguments to transfer them all at once to
      // reduce the number of data transfers. We will not take
      // non-first-private arguments, i.e. private arguments that do not need
      // to be mapped to the target device, into account because data
      // allocation can be very efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      auto *LastFPArgInfo =
          FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();

      // Compute the start alignment of this entry, add padding if necessary.
      // TODO: Consider sorting instead.
      uint32_t Padding = 0;
      uint32_t StartAlignment =
          LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
      if (LastFPArgInfo) {
        // Check if we keep the start alignment or if it is shrunk due to the
        // size of the last element.
        uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
        if (Offset)
          StartAlignment = Offset;
        // We only need as much alignment as the host pointer had (since we
        // don't know the alignment information from the source we might end up
        // overaligning accesses but not too much).
        uint32_t RequiredAlignment =
            llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
        if (RequiredAlignment > StartAlignment) {
          Padding = RequiredAlignment - StartAlignment;
          StartAlignment = RequiredAlignment;
        }
      }

      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       StartAlignment, Padding, HstPtrName);
      FirstPrivateArgSize += Padding + ArgSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(SmallVector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        // First pad the pointer as we (have to) pad it on the device too.
        Itr = std::next(Itr, Info.Padding);
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.Size);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        // Pad the device pointer to get the right alignment.
        TP += Info.Padding;
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.Size;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};
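
// Worked example (illustrative): three small firstprivate arguments arrive in
// order: 24 bytes (8-aligned host pointer), 4 bytes (4-aligned), 8 bytes
// (8-aligned). The first entry starts at buffer offset 0. The second starts
// at offset 24, which already satisfies its 4-byte alignment, so Padding = 0.
// The third would start at offset 28, which is only 4-aligned, so Padding = 4
// is inserted and it lands at offset 32. packAndTransfer then performs one
// allocData plus one submitData for all three arguments instead of three
// separate transfers.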

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to target device, transferring (first-)private
/// variables.
static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             SmallVector<void *> &TgtArgs,
                             SmallVector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  SmallVector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        TargetPointerResultTy TPR = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!TPR.isPresent()) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (TPR.Flags.IsHostPointer) {
          DP("Unified memory is active, no need to map lambda captured"
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo, TPR.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I],
                                  /*UpdateRefCount=*/false,
                                  /*UseHoldRefCount=*/false);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back to
/// host if needed and deallocating target memory of (first-)private variables.
static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(Loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments after synchronization.
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction(
      [PrivateArgumentManager =
           std::move(PrivateArgumentManager)]() mutable -> int {
        int Ret = PrivateArgumentManager.free();
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Failed to deallocate target memory for private args\n");
          return OFFLOAD_FAIL;
        }
        return Ret;
      });

  return OFFLOAD_SUCCESS;
}

/// performs the same actions as data_begin in case arg_num is
/// non-zero and initiates run of the offloaded region on the target platform;
/// if arg_num is non-zero after the region execution is done it also
/// performs the same action as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  SmallVector<void *> TgtArgs;
  SmallVector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int NumClangLaunchArgs = KernelArgs.NumArgs;
  int Ret = OFFLOAD_SUCCESS;
  if (NumClangLaunchArgs) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                            KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                            KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                            KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }

    // Clang might pass more values via the ArgPtrs to the runtime than we pass
    // on to the kernel.
    // TODO: Next time we adjust the KernelArgsTy we should introduce a new
    // NumKernelArgs field.
    KernelArgs.NumArgs = TgtArgs.size();
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
    TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc);

    assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
           "Multi dimensional launch not supported yet.");
    /// RAII to establish tool anchors before and after kernel launch
    int32_t NumTeams = KernelArgs.NumTeams[0];
    // No need to guard this with OMPT_IF_BUILT
    InterfaceRAII TargetSubmitRAII(
        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);

    Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(),
                              KernelArgs, AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (NumClangLaunchArgs) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs,
                           KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs,
                           KernelArgs.ArgSizes, KernelArgs.ArgTypes,
                           KernelArgs.ArgNames, KernelArgs.ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}
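
// Flow summary (illustrative): for a typical "#pragma omp target" region the
// compiler-generated entry point drives roughly this sequence; the wrapper
// shown here is hypothetical, only target() and synchronize() are real.
//
//   AsyncInfoTy AsyncInfo(Device);
//   if (target(Loc, Device, HostFnPtr, KernelArgs, AsyncInfo) ==
//       OFFLOAD_SUCCESS)
//     AsyncInfo.synchronize(); // drain queue, then run post-processing
//
// i.e. processDataBefore (mapping + host-to-device copies), launchKernel, and
// processDataAfter (device-to-host copies + deallocation) are all enqueued on
// one AsyncInfoTy and completed by the final synchronize().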

/// Enables the record replay mechanism by pre-allocating MemorySize
/// and informing the record-replayer of whether to store the output
/// in some file.
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
                       bool isRecord, bool SaveOutput) {
  return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
                                            isRecord, SaveOutput);
}

/// Executes a kernel using pre-recorded information for loading to
/// device memory to launch the target kernel with the pre-recorded
/// configuration.
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                  void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
                  ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
                  int32_t ThreadLimit, uint64_t LoopTripCount,
                  AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  TableMap *TM = getTableMap(HostPtr);
  // Fail if the table map fails to find the target kernel pointer for the
  // provided host pointer.
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // Retrieve the target table of offloading entries.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // Retrieve the target kernel pointer, allocate and store the recorded device
  // memory data, and launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr,
                                  TARGET_ALLOC_DEFAULT);
  Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);

  KernelArgsTy KernelArgs = {0};
  KernelArgs.Version = 2;
  KernelArgs.NumArgs = NumArgs;
  KernelArgs.Tripcount = LoopTripCount;
  KernelArgs.NumTeams[0] = NumTeams;
  KernelArgs.ThreadLimit[0] = ThreadLimit;

  int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
                                AsyncInfo);

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region abort target.\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
;