//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <vector>

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (AsyncInfo.Queue) {
    // If we have a queue we need to synchronize it now.
    Result = Device.synchronize(*this);
    assert(AsyncInfo.Queue == nullptr &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
  }
  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

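// Why a reference into BufferLocations: an asynchronous submitData needs a
// host location that stays valid until synchronization. A sketch of the
// pattern (as used in targetDataBegin below):
//
//   void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
//   TgtPtrBase = ExpectedTgtPtrBase;
//   Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
//                     AsyncInfo);
//
// This assumes BufferLocations is a container (e.g. a std::deque) whose
// elements are not relocated by push_back.
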
/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   ... = s1.b;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static const int64_t Alignment = 8;

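// Worked example of the padding computation applied in targetDataBegin and
// targetDataEnd below (numbers taken from the comment above):
//
//   HstPtrBegin = &s1.b;                              // 0x104
//   padding     = (int64_t)HstPtrBegin % Alignment;   // 0x104 % 8 == 4
//   HstPtrBegin = (char *)HstPtrBegin - padding;      // back to 0x100
//   data_size  += padding;                            // map 4 extra bytes
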
/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;
  bool supportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
    TranslationTable *TransTable =
        &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
    if (TransTable->HostTable.EntriesBegin ==
            TransTable->HostTable.EntriesEnd &&
        !supportsEmptyImages) {
      // No host entry so no need to proceed
      continue;
    }

    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }

    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n",
             device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
           "\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, nullptr /*Name*/,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc =
              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                        "pointer mappings.\n",
                        OMP_INFOTYPE_DUMP_TABLE);

      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  PM->TargetOffloadMtx.lock();
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
  PM->TargetOffloadMtx.unlock();
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The return bool indicates if the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!device_is_ready(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

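// Typical call site (a sketch; the real entry points live elsewhere in the
// runtime, e.g. the __tgt_* interface functions):
//
//   if (checkDeviceAndCtors(DeviceId, Loc)) {
//     DP("Not offloading to device %" PRId64 "\n", DeviceId);
//     return /* treat as host execution or failure, per entry point */;
//   }
//
// i.e. a `true` return means "do not offload", for any of the three reasons
// documented above.
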
static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

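// Example (assuming the usual encoding, where OMP_TGT_MAPTYPE_MEMBER_OF masks
// a 16-bit "member of" field stored in bits 48-63 of the map type): a map
// type whose MEMBER_OF field holds 1 refers to parent argument 0, i.e.
//
//   int64_t type = /* other flags */ | ((int64_t)1 << 48);
//   getParentIndex(type);  // == 0
//
// A cleared field yields -1, meaning "no parent".
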
void *targetAllocExplicit(size_t size, int device_num, int kind,
                          const char *name) {
  TIMESCOPE();
  void *rc = NULL;
  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);

  if (size <= 0) {
    DP("Call to %s with non-positive length\n", name);
    return NULL;
  }

  if (device_num == omp_get_initial_device()) {
    rc = malloc(size);
    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
    return rc;
  }

  if (!device_is_ready(device_num)) {
    DP("%s returns NULL ptr\n", name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[device_num];
  rc = Device.allocData(size, nullptr, kind);
  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
  return rc;
}

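// Sketch of how a user-facing allocator is expected to funnel into this
// helper (the wrapper shown here is an assumption for illustration; `kind`
// takes the TARGET_ALLOC_* values from omptarget.h):
//
//   void *omp_target_alloc(size_t size, int device_num) {
//     return targetAllocExplicit(size, device_num, TARGET_ALLOC_DEFAULT,
//                                __func__);
//   }
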
/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy target_data_function) {
  TIMESCOPE_WITH_IDENT(loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers*/ nullptr,
                                AsyncInfo, /*FromMapper=*/true);

  return rc;
}

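// For orientation (user-level view, not code from this file): the mapper
// function invoked above is what the compiler synthesizes for a directive
// such as
//
//   #pragma omp declare mapper(struct S s) map(to : s.len) \
//                                          map(tofrom : s.data[0 : s.len])
//
// Each component the mapper records becomes one (base, begin, size, type,
// name) tuple in the arrays built above, and the whole set is then replayed
// through targetDataBegin/End/Update.
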
/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                    void **args_base, void **args, int64_t *arg_sizes,
                    int64_t *arg_types, map_var_info_t *arg_names,
                    void **arg_mappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                arg_sizes[i], arg_types[i], arg_name,
                                arg_mappers[i], AsyncInfo, targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy Pointer_TPR;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters). This
    // may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      // #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      Pointer_TPR = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
      IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || i != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(HstPtrBegin, HstPtrBase, data_size,
                                       HstPtrName, HasFlagTo, HasFlagAlways,
                                       IsImplicit, UpdateRef, HasCloseModifier,
                                       HasPresentModifier, HasHoldModifier,
                                       AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Check whether we need to update the pointer on the device
      bool UpdateDevPtr = false;

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      Device.ShadowMtx.lock();
      auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
      // If this pointer is not in the map we need to insert it. If the map
      // contains a stale entry, we need to update it (e.g. if the pointee was
      // deallocated and later on is reallocated at another device address). The
      // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An
      // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ
      // pair is deallocated, not when the OBJ is deallocated. In
      // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer,
      // so it stays in the map for the lifetime of the application. When the
      // OBJ is deallocated and later on allocated again (at a different device
      // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin
      // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ.
      if (Entry == Device.ShadowPtrMap.end() ||
          Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
        // create or update shadow pointers for this entry
        Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
            HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
        UpdateDevPtr = true;
      }

      if (UpdateDevPtr) {
        Pointer_TPR.MapTableEntry->lock();
        Device.ShadowMtx.unlock();

        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }

        void *Event = Pointer_TPR.MapTableEntry->getEvent();
        bool NeedNewEvent = Event == nullptr;
        if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Failed to create event.\n");
          return OFFLOAD_FAIL;
        }
        // We cannot assume the event should not be nullptr because we don't
        // know if the target support event. But if a target doesn't,
        // recordEvent should always return success.
        Ret = Device.recordEvent(Event, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Failed to set dependence on event " DPxMOD "\n",
                 DPxPTR(Event));
          return OFFLOAD_FAIL;
        }

        if (NeedNewEvent)
          Pointer_TPR.MapTableEntry->setEvent(Event);
        Pointer_TPR.MapTableEntry->unlock();
      } else {
        Device.ShadowMtx.unlock();
      }
    }
  }

  return OFFLOAD_SUCCESS;
}

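// Worked example of the PTR_AND_OBJ path above (addresses illustrative): for
// `#pragma omp target map(tofrom: s.p[0:N])`, the PTR_AND_OBJ entry's base is
// &s.p (the pointer) and its begin is &s.p[0] (the pointee). Once both sides
// are mapped, the device copy of s.p still holds a host address, so the
// shadow-pointer bookkeeping above patches it to
//
//   ExpectedTgtPtrBase = TgtPtrBegin - (HstPtrBegin - HstPtrBase)
//
// making the device-side pointer point at the device copy of the array.
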
/// This structure contains information to deallocate a target pointer, i.e.
/// used to call the function \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it has \p ompx_hold modifier
  bool HasHoldModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
};

/// Internal function to undo the mapping and retrieve the data from the device.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    void *TgtPtrBegin = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete);
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If the last element from the mapper (for end transfer args comes in
    // reverse order), do not remove the partial entry, the parent struct still
    // exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        if ((Always || IsLast) && !IsHostPtr) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      uintptr_t LB = (uintptr_t)HstPtrBegin;
      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
           Itr != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void **)Itr->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t)ShadowHstPtrAddr < LB) {
          ++Itr;
          continue;
        }
        if ((uintptr_t)ShadowHstPtrAddr >= UB)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
      }
      Device.ShadowMtx.unlock();

      // Add pointer to the buffer for later deallocation
      if (DelEntry && !IsHostPtr)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  //       allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointer
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
      continue;
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.HasHoldModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  void *TgtPtrBegin = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
      TgtPtrBegin == HstPtrBegin) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      *ShadowHstPtrAddr = IT->second.HstPtrVal;
    }
    Device.ShadowMtx.unlock();
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        Device.ShadowMtx.unlock();
        return OFFLOAD_FAIL;
      }
    }
    Device.ShadowMtx.unlock();
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}

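// Worked example of the merge condition (values are illustrative; strides in
// bytes, dimensions ordered outermost first): for a 10x20 slice of doubles
// whose rows happen to be adjacent in memory,
//
//   NonContig[0] = {/*Offset=*/0, /*Count=*/10, /*Stride=*/160}; // rows
//   NonContig[1] = {/*Offset=*/0, /*Count=*/20, /*Stride=*/8};   // elements
//
// the inner dimension satisfies 20 * 8 == 160 == NonContig[0].Stride, so it
// merges with its parent and one dimension is removed, reducing the number of
// individual copies issued by targetDataNonContiguous above.
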
/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}

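// Background (informal): when a lambda is mapped to the device, the entries
// for its captured pointers carry the full PTR_AND_OBJ | LITERAL | IMPLICIT
// combination, so
//
//   isLambdaMapping(OMP_TGT_MAPTYPE_PTR_AND_OBJ | OMP_TGT_MAPTYPE_LITERAL |
//                   OMP_TGT_MAPTYPE_IMPLICIT);    // true
//   isLambdaMapping(OMP_TGT_MAPTYPE_PTR_AND_OBJ); // false
//
// processDataBefore below relies on this to patch captured pointers inside
// the device copy of the lambda.
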
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // we got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}

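// How the count gets here (informal sketch): for a construct such as
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < N; ++i) ...
//
// the compiler emits, roughly, a call recording N for the current thread
//
//   __kmpc_push_target_tripcount_mapper(Loc, DeviceId, N);
//
// immediately before the target launch, and the value is consumed (and
// erased) here when launching the team region.
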
/// A class manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transfer them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A pointer to a \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer it
  // immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (needs to be mapped to target device).
      // We will pack all such arguments to transfer them all at once to reduce
      // the number of data transfers. We will not take non-first-private
      // arguments, i.e. private arguments that don't need to be mapped to the
      // target device, into account because data allocation can be very
      // efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace placeholder pointers in \p TgtArgs,
  /// and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
           "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to target device, transferring (first-)private
/// variables.
static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // IsLast is unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        PointerTgtPtrBegin = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false, IsHostPtr);
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
            TgtPtrBegin == HstPtrBegin) {
          DP("Unified memory is active, no need to map lambda captured"
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // others.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                          /*UpdateRefCount=*/false,
                                          /*UseHoldRefCount=*/false, IsHostPtr);
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back to
/// host if needed and deallocating target memory of (first-)private variables.
static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Performs the same actions as data_begin in case arg_num is
/// non-zero and initiates run of the offloaded region on the target platform;
/// if arg_num is non-zero after the region execution is done it also
/// performs the same action as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
           void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;

  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int Ret;
  if (ArgNum) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                 TgtArgs.size(), TeamNum, ThreadLimit,
                                 getLoopTripCount(DeviceId), AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                             TgtArgs.size(), AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (ArgNum) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}