//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include <cassert>
#include <vector>
int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (AsyncInfo.Queue) {
    // If we have a queue we need to synchronize it now.
    Result = Device.synchronize(*this);
    assert(AsyncInfo.Queue == nullptr &&
           "The device plugin should have nulled the queue to indicate there "
           "are no outstanding actions!");
  }
  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}
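
// Note that the reference returned by getVoidPtrLocation() must remain valid
// until AsyncInfoTy::synchronize() has run: targetDataBegin and
// processDataBefore store a device pointer value in this slot and then submit
// that storage to the device asynchronously (see the PTR_AND_OBJ and lambda
// handling below).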

/* All begin addresses for partially mapped structs must be 8-aligned in order
 * to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200
 * and the address of p will be a misaligned 0x204 (on the host there was no
 * need to add padding between b and p, so p comes exactly 4 bytes after b).
 * If the device kernel tries to access s1.p, a misaligned address error
 * occurs (as reported by the CUDA plugin). By rounding the begin address down
 * to a multiple of 8 and extending the size of the allocated chunk
 * accordingly, the chunk on the device will start at 0x200 with the padding
 * (4 bytes); then &s1.b=0x204 and &s1.p=0x208, as they should be to satisfy
 * the alignment requirements.
 */
static const int64_t Alignment = 8;
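
// Illustrative sketch (not part of the runtime logic): with the example above,
// a combined entry starting at HstPtrBegin = &s1.b = 0x104 is adjusted as
//   int64_t padding = (int64_t)HstPtrBegin % Alignment; // 0x104 % 8 == 4
//   HstPtrBegin = (char *)HstPtrBegin - padding;        // back to 0x100
//   data_size += padding;                               // cover the pad bytes
// which is exactly what targetDataBegin/targetDataEnd do below.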

/// Map global data and execute pending ctors
static int InitLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;
  bool supportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;

  Device.PendingGlobalsMtx.lock();
  PM->TrlTblMtx.lock();
  for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
    TranslationTable *TransTable =
        &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
    if (TransTable->HostTable.EntriesBegin ==
            TransTable->HostTable.EntriesEnd &&
        !supportsEmptyImages) {
      // No host entry so no need to proceed
      continue;
    }

    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      REPORT("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }

    // 2) load image into the target table.
    __tgt_target_table *TargetTable = TransTable->TargetsTable[device_id] =
        Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      REPORT("Unable to generate entries table for device id %d.\n",
             device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      REPORT("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
             device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                             *CurrHostEntry = HostTable->EntriesBegin,
                             *EntryDeviceEnd = TargetTable->EntriesEnd;
         CurrDeviceEntry != EntryDeviceEnd;
         CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD
           " with size %zu\n",
           DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
           CurrDeviceEntry->size);
        Device.HostDataToTargetMap.emplace(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            false /*UseHoldRefCount*/, nullptr /*Name*/,
            true /*IsRefCountINF*/);
      }
    }
    Device.DataMapMtx.unlock();
  }
  PM->TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc =
              target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
                     nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
          if (rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) {
      Device.PendingGlobalsMtx.unlock();
      return OFFLOAD_FAIL;
    }
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Run with LIBOMPTARGET_INFO=%d to dump host-target "
                        "pointer mappings.\n",
                        OMP_INFOTYPE_DUMP_TABLE);

      SourceInfo info(Loc);
      if (info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", info.getFilename(), info.getLine(),
                info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  PM->TargetOffloadMtx.lock();
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
  PM->TargetOffloadMtx.unlock();
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The returned bool indicates whether the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!device_is_ready(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  Device.PendingGlobalsMtx.lock();
  bool hasPendingGlobals = Device.HasPendingGlobals;
  Device.PendingGlobalsMtx.unlock();
  if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  return false;
}

static int32_t getParentIndex(int64_t type) {
  return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}
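
// For illustration: the MEMBER_OF field occupies the upper 16 bits of the
// map-type word and stores the 1-based position of the parent argument.
// A map type of (3LL << 48) therefore yields getParentIndex(...) == 2, and a
// map type with no MEMBER_OF bits set yields -1 (no parent).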

void *targetAllocExplicit(size_t size, int device_num, int kind,
                          const char *name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", name, device_num, size);

  // size is unsigned, so "non-positive" can only mean zero.
  if (size == 0) {
    DP("Call to %s with zero length\n", name);
    return NULL;
  }

  void *rc = NULL;

  if (device_num == omp_get_initial_device()) {
    rc = malloc(size);
    DP("%s returns host ptr " DPxMOD "\n", name, DPxPTR(rc));
    return rc;
  }

  if (!device_is_ready(device_num)) {
    DP("%s returns NULL ptr\n", name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[device_num];
  rc = Device.allocData(size, nullptr, kind);
  DP("%s returns device ptr " DPxMOD "\n", name, DPxPTR(rc));
  return rc;
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
                     int64_t arg_size, int64_t arg_type,
                     map_var_info_t arg_names, void *arg_mapper,
                     AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy target_data_function) {
  TIMESCOPE_WITH_IDENT(loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(arg_mapper);
  (*MapperFuncPtr)((void *)&MapperComponents, arg_base, arg, arg_size, arg_type,
                   arg_names);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  std::vector<void *> MapperArgsBase(MapperComponents.Components.size());
  std::vector<void *> MapperArgs(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  std::vector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  std::vector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
                                MapperArgsBase.data(), MapperArgs.data(),
                                MapperArgSizes.data(), MapperArgTypes.data(),
                                MapperArgNames.data(), /*arg_mappers=*/nullptr,
                                AsyncInfo, /*FromMapper=*/true);

  return rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
                    void **args_base, void **args, int64_t *arg_sizes,
                    int64_t *arg_types, map_var_info_t *arg_names,
                    void **arg_mappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (arg_mappers && arg_mappers[i]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", i);

      map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
      int rc = targetDataMapper(loc, Device, args_base[i], args[i],
                                arg_sizes[i], arg_types[i], arg_name,
                                arg_mappers[i], AsyncInfo, targetDataBegin);

      if (rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    int64_t data_size = arg_sizes[i];
    map_var_info_t HstPtrName = (!arg_names) ? nullptr : arg_names[i];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t padding = 0;
    const int next_i = i + 1;
    if (getParentIndex(arg_types[i]) < 0 && next_i < arg_num &&
        getParentIndex(arg_types[next_i]) == i) {
      padding = (int64_t)HstPtrBegin % Alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - padding;
        data_size += padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy Pointer_TPR;
    bool IsHostPtr = false;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = arg_types[i] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = arg_types[i] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters);
    // this may be considered a hack, and we could revise the scheme in the
    // future.
    bool UpdateRef =
        !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && i == 0);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry:\n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      //   #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      Pointer_TPR = Device.getTargetPointer(
          HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo);
      PointerTgtPtrBegin = Pointer_TPR.TargetPointer;
      IsHostPtr = Pointer_TPR.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD
         " - is%s new\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (Pointer_TPR.Flags.IsNewEntry ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update the pointee ref count for the first element of the
      // subelement that comes from a mapper.
      UpdateRef =
          (!FromMapper || i != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = arg_types[i] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
    auto TPR = Device.getTargetPointer(HstPtrBegin, HstPtrBase, data_size,
                                       HstPtrName, HasFlagTo, HasFlagAlways,
                                       IsImplicit, UpdateRef, HasCloseModifier,
                                       HasPresentModifier, HasHoldModifier,
                                       AsyncInfo);
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (data_size || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       data_size, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      args_base[i] = TgtPtrBase;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
      // Check whether we need to update the pointer on the device
      bool UpdateDevPtr = false;

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      Device.ShadowMtx.lock();
      auto Entry = Device.ShadowPtrMap.find(Pointer_HstPtrBegin);
      // If this pointer is not in the map we need to insert it. If the map
      // contains a stale entry, we need to update it (e.g. if the pointee was
      // deallocated and later on is reallocated at another device address).
      // The latter scenario is the subject of LIT test
      // env/base_ptr_ref_count.c. An entry is removed from ShadowPtrMap only
      // when the PTR of a PTR_AND_OBJ pair is deallocated, not when the OBJ is
      // deallocated. In env/base_ptr_ref_count.c the PTR is a global "declare
      // target" pointer, so it stays in the map for the lifetime of the
      // application. When the OBJ is deallocated and later on allocated again
      // (at a different device address), ShadowPtrMap still contains an entry
      // for Pointer_HstPtrBegin which is stale, pointing to the old
      // ExpectedTgtPtrBase of the OBJ.
      if (Entry == Device.ShadowPtrMap.end() ||
          Entry->second.TgtPtrVal != ExpectedTgtPtrBase) {
        // create or update shadow pointers for this entry
        Device.ShadowPtrMap[Pointer_HstPtrBegin] = {
            HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase};
        UpdateDevPtr = true;
      }

      if (UpdateDevPtr) {
        Pointer_TPR.MapTableEntry->lock();
        Device.ShadowMtx.unlock();

        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
                                    sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }

        void *Event = Pointer_TPR.MapTableEntry->getEvent();
        bool NeedNewEvent = Event == nullptr;
        if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Failed to create event.\n");
          return OFFLOAD_FAIL;
        }

        // We cannot assume the event is non-null, because we don't know
        // whether the target supports events. But if a target doesn't,
        // recordEvent should always return success.
        Ret = Device.recordEvent(Event, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          Pointer_TPR.MapTableEntry->unlock();
          REPORT("Failed to set dependence on event " DPxMOD "\n",
                 DPxPTR(Event));
          return OFFLOAD_FAIL;
        }

        if (NeedNewEvent)
          Pointer_TPR.MapTableEntry->setEvent(Event);
        Pointer_TPR.MapTableEntry->unlock();
      } else {
        Device.ShadowMtx.unlock();
      }
    }
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains the information needed to deallocate a target
/// pointer, i.e., the arguments for \p DeviceTy::deallocTgtPtr.
struct DeallocTgtPtrInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;
  /// Size of the data
  int64_t DataSize;
  /// Whether it has \p ompx_hold modifier
  bool HasHoldModifier;

  DeallocTgtPtrInfo(void *HstPtr, int64_t Size, bool HasHoldModifier)
      : HstPtrBegin(HstPtr), DataSize(Size), HasHoldModifier(HasHoldModifier) {}
};
} // namespace

/// Internal function to undo the mapping and retrieve the data from the
/// device.
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret;
  std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
  void *FromMapperBase = nullptr;
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    void *TgtPtrBegin = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete);
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",
         (HasPresentModifier ? "'present' map type modifier" : "ignored"));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not
        // appear in the device data environment then an error occurs and the
        // program terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is
        // also guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n",
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not"));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TgtPtrBegin)
      continue;

    bool DelEntry = IsLast;

    // If this is the last element from the mapper (for end transfers, args
    // come in reverse order), do not remove the partial entry: the parent
    // struct still exists.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
      // Move data back to the host
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
        bool Always = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
        if ((Always || IsLast) && !IsHostPtr) {
          DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
             DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
          Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
                                    AsyncInfo);
          if (Ret != OFFLOAD_SUCCESS) {
            REPORT("Copying data from device failed.\n");
            return OFFLOAD_FAIL;
          }
        }
      }
      if (DelEntry && FromMapper && I == 0) {
        DelEntry = false;
        FromMapperBase = HstPtrBegin;
      }

      // If we copied back to the host a struct/array containing pointers, we
      // need to restore the original host pointer values from their shadow
      // copies. If the struct is going to be deallocated, remove any remaining
      // shadow pointer entries for this struct.
      uintptr_t LB = (uintptr_t)HstPtrBegin;
      uintptr_t UB = (uintptr_t)HstPtrBegin + DataSize;
      Device.ShadowMtx.lock();
      for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
           Itr != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void **)Itr->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t)ShadowHstPtrAddr < LB) {
          ++Itr;
          continue;
        }
        if ((uintptr_t)ShadowHstPtrAddr >= UB)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (ArgTypes[I] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
             "pointer " DPxMOD "\n",
             DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          Itr = Device.ShadowPtrMap.erase(Itr);
        } else {
          ++Itr;
        }
      }
      Device.ShadowMtx.unlock();

      // Add pointer to the buffer for later deallocation
      if (DelEntry && !IsHostPtr)
        DeallocTgtPtrs.emplace_back(HstPtrBegin, DataSize, HasHoldModifier);
    }
  }

  // TODO: We should not synchronize here but pass the AsyncInfo object to the
  //       allocate/deallocate device APIs.
  //
  // We need to synchronize before deallocating data.
  Ret = AsyncInfo.synchronize();
  if (Ret != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Deallocate target pointers
  for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
    if (FromMapperBase && FromMapperBase == Info.HstPtrBegin)
      continue;
    Ret = Device.deallocTgtPtr(Info.HstPtrBegin, Info.DataSize,
                               Info.HasHoldModifier);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}

static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  bool IsLast, IsHostPtr;
  void *TgtPtrBegin = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  if (!TgtPtrBegin) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (IsHostPtr) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original host pointer value " DPxMOD
         " for host pointer " DPxMOD "\n",
         DPxPTR(IT->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr));
      *ShadowHstPtrAddr = IT->second.HstPtrVal;
    }
    Device.ShadowMtx.unlock();
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n");
      return OFFLOAD_FAIL;
    }

    uintptr_t LB = (uintptr_t)HstPtrBegin;
    uintptr_t UB = (uintptr_t)HstPtrBegin + ArgSize;
    Device.ShadowMtx.lock();
    for (ShadowPtrListTy::iterator IT = Device.ShadowPtrMap.begin();
         IT != Device.ShadowPtrMap.end(); ++IT) {
      void **ShadowHstPtrAddr = (void **)IT->first;
      if ((uintptr_t)ShadowHstPtrAddr < LB)
        continue;
      if ((uintptr_t)ShadowHstPtrAddr >= UB)
        break;
      DP("Restoring original target pointer value " DPxMOD " for target "
         "pointer " DPxMOD "\n",
         DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
      Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data to device failed.\n");
        Device.ShadowMtx.unlock();
        return OFFLOAD_FAIL;
      }
    }
    Device.ShadowMtx.unlock();
  }
  return OFFLOAD_SUCCESS;
}

static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(loc);
  int Ret = OFFLOAD_SUCCESS;
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS)
          return Ret;
      }
    }
  } else {
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
       " len %" PRIu64 "\n",
       DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}

static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
                                       int32_t DimSize) {
  int RemovedDim = 0;
  for (int I = DimSize - 1; I > 0; --I) {
    if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
      RemovedDim++;
  }
  return RemovedDim;
}
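
// Illustrative example (values assumed, not taken from a particular test):
// for a section double A[10][8] mapped as A[2:4][0:8], the innermost dimension
// has Count == 8 and Stride == sizeof(double) == 8 bytes, so Count * Stride
// == 64 equals the outer dimension's Stride (one full 64-byte row). The two
// dimensions then describe a single contiguous run per row block and can be
// merged, letting targetDataUpdate issue fewer, larger transfers.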

/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                 targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*CurrentDim=*/0, DimSize - MergedDim, /*Offset=*/0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                 ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
                                      OMP_TGT_MAPTYPE_LITERAL |
                                      OMP_TGT_MAPTYPE_IMPLICIT;
static bool isLambdaMapping(int64_t Mapping) {
  return (Mapping & LambdaMapping) == LambdaMapping;
}
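
// A map entry matches only if all three bits are set at once, which is how the
// compiler tags variables captured by a mapped lambda; such entries also carry
// MEMBER_OF bits pointing at the lambda itself (see the assertion in
// processDataBefore below).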

namespace {
/// Find the table information in the map or look it up in the translation
/// tables.
TableMap *getTableMap(void *HostPtr) {
  std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
  HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);

  if (TableMapIt != PM->HostPtrToTableMap.end())
    return &TableMapIt->second;

  // We don't have a map. So search all the registered libraries.
  TableMap *TM = nullptr;
  std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
  for (HostEntriesBeginToTransTableTy::iterator Itr =
           PM->HostEntriesBeginToTransTable.begin();
       Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
    // get the translation table (which contains all the good info).
    TranslationTable *TransTable = &Itr->second;
    // iterate over all the host table entries to see if we can locate the
    // host_ptr.
    __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
    for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
      if (Cur->addr != HostPtr)
        continue;
      // We got a match, now fill the HostPtrToTableMap so that we
      // may avoid this search next time.
      TM = &(PM->HostPtrToTableMap)[HostPtr];
      TM->Table = TransTable;
      TM->Index = I;
      return TM;
    }
  }

  return nullptr;
}

/// Get loop trip count
/// FIXME: This function will not work right if calling
/// __kmpc_push_target_tripcount_mapper in one thread but doing offloading in
/// another thread, which might occur when we call task yield.
uint64_t getLoopTripCount(int64_t DeviceId) {
  DeviceTy &Device = *PM->Devices[DeviceId];
  uint64_t LoopTripCount = 0;

  {
    std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
    auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL));
    if (I != Device.LoopTripCnt.end()) {
      LoopTripCount = I->second;
      Device.LoopTripCnt.erase(I);
      DP("loop trip count is %" PRIu64 ".\n", LoopTripCount);
    }
  }

  return LoopTripCount;
}
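
// The trip count is staged by __kmpc_push_target_tripcount_mapper, keyed by
// the calling thread's gtid, and is consumed exactly once here; hence the
// erase() above.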

/// A class that manages private arguments in a target region.
class PrivateArgumentManagerTy {
  /// A data structure for the information of first-private arguments. We can
  /// use this information to optimize data transfer by packing all
  /// first-private arguments and transferring them all at once.
  struct FirstPrivateArgInfoTy {
    /// The index of the element in \p TgtArgs corresponding to the argument
    const int Index;
    /// Host pointer begin
    const char *HstPtrBegin;
    /// Host pointer end
    const char *HstPtrEnd;
    /// Aligned size
    const int64_t AlignedSize;
    /// Host pointer name
    const map_var_info_t HstPtrName = nullptr;

    FirstPrivateArgInfoTy(int Index, const void *HstPtr, int64_t Size,
                          const map_var_info_t HstPtrName = nullptr)
        : Index(Index), HstPtrBegin(reinterpret_cast<const char *>(HstPtr)),
          HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment),
          HstPtrName(HstPtrName) {}
  };

  /// A vector of target pointers for all private arguments
  std::vector<void *> TgtPtrs;

  /// A vector of information of all first-private arguments to be packed
  std::vector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
  /// Host buffer for all arguments to be packed
  std::vector<char> FirstPrivateArgBuffer;
  /// The total size of all arguments to be packed
  int64_t FirstPrivateArgSize = 0;

  /// A reference to the \p DeviceTy object
  DeviceTy &Device;
  /// A reference to the \p AsyncInfoTy object
  AsyncInfoTy &AsyncInfo;

  // TODO: What would be the best value here? Should we make it configurable?
  // If the size is larger than this threshold, we will allocate and transfer
  // it immediately instead of packing it.
  static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;

public:
  /// Constructor
  PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
      : Device(Dev), AsyncInfo(AsyncInfo) {}

  /// Add a private argument
  int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
             bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
             const map_var_info_t HstPtrName = nullptr,
             const bool AllocImmediately = false) {
    // If the argument is not first-private, or its size is greater than a
    // predefined threshold, we will allocate memory and issue the transfer
    // immediately.
    if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
        AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
      if (!TgtPtr) {
        DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
           (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
        return OFFLOAD_FAIL;
      }
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
         " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
         "\n",
         ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
         DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
#endif
      // If first-private, copy data from host
      if (IsFirstPrivate) {
        DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      TgtPtrs.push_back(TgtPtr);
    } else {
      DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
         DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It must be first-private (i.e., it needs to be mapped to the
      //    target device).
      // We will pack all such arguments and transfer them all at once to
      // reduce the number of data transfers. We do not take
      // non-first-private arguments, i.e. private arguments that do not need
      // to be mapped to the target device, into account because data
      // allocation can be very efficient with the memory manager.

      // Placeholder value
      TgtPtr = nullptr;
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       HstPtrName);
      FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize;
    }

    return OFFLOAD_SUCCESS;
  }

  /// Pack first-private arguments, replace the placeholder pointers in
  /// \p TgtArgs, and start the transfer.
  int packAndTransfer(std::vector<void *> &TgtArgs) {
    if (!FirstPrivateArgInfo.empty()) {
      assert(FirstPrivateArgSize != 0 &&
             "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
      auto Itr = FirstPrivateArgBuffer.begin();
      // Copy all host data to this buffer
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.AlignedSize);
      }
      // Allocate target memory
      void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
      if (TgtPtr == nullptr) {
        DP("Failed to allocate target memory for private arguments.\n");
        return OFFLOAD_FAIL;
      }
      TgtPtrs.push_back(TgtPtr);
      DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
         FirstPrivateArgSize, DPxPTR(TgtPtr));
      // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Failed to submit data of private arguments.\n");
        return OFFLOAD_FAIL;
      }
      // Fill in all placeholder pointers
      auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
      for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
        void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
        Ptr = reinterpret_cast<void *>(TP);
        TP += Info.AlignedSize;
        DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to "
           DPxMOD "\n",
           DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
           DPxPTR(Ptr));
      }
    }

    return OFFLOAD_SUCCESS;
  }

  /// Free all target memory allocated for private arguments
  int free() {
    for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Deallocation of (first-)private arrays failed.\n");
        return OFFLOAD_FAIL;
      }
    }

    TgtPtrs.clear();

    return OFFLOAD_SUCCESS;
  }
};
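
// Typical flow (see processDataBefore/processDataAfter below): call addArg()
// for each private argument while TgtArgs is being built, packAndTransfer()
// once all arguments are collected, and free() after the kernel has finished.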

/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to the target device and transferring
/// (first-)private variables.
static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             std::vector<void *> &TgtArgs,
                             std::vector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];
  int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // List of (first-)private arrays allocated for this target region
  std::vector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF.");
        unsigned Idx = getParentIndex(ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already.");
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        bool IsLast, IsHostPtr; // IsLast is unused.
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        PointerTgtPtrBegin = Device.getTgtPtrBegin(
            HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false,
            /*UseHoldRefCount=*/false, IsHostPtr);
        if (!PointerTgtPtrBegin) {
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        if (IsHostPtr) {
          DP("Unified memory is active, no need to map lambda captured "
             "variable (" DPxMOD ")\n",
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
                                sizeof(void *), AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    bool IsLast, IsHostPtr; // unused.
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin,
          TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n",
               (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast,
                                          /*UpdateRefCount=*/false,
                                          /*UseHoldRefCount=*/false, IsHostPtr);
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(TgtPtrBegin);
    TgtOffsets.push_back(TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets");

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

/// Process data after launching the kernel, including transferring data back
/// to the host if needed and deallocating target memory of (first-)private
/// variables.
static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
                            int32_t ArgNum, void **ArgBases, void **Args,
                            int64_t *ArgSizes, int64_t *ArgTypes,
                            map_var_info_t *ArgNames, void **ArgMappers,
                            PrivateArgumentManagerTy &PrivateArgumentManager,
                            AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
  DeviceTy &Device = *PM->Devices[DeviceId];

  // Move data from device.
  int Ret = targetDataEnd(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
                          ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataEnd failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  // Free target memory for private arguments
  Ret = PrivateArgumentManager.free();
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Failed to deallocate target memory for private args\n");
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
} // namespace

/// Performs the same actions as data_begin when arg_num is non-zero and
/// initiates the run of the offloaded region on the target platform; if
/// arg_num is non-zero, after the region's execution is done it also performs
/// the same actions as data_update and data_end above. This function returns
/// 0 if it was able to transfer the execution to a target; otherwise it
/// returns an integer different from zero.
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
           void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
           map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
           int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;

  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!");
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n");

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we
  // have mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  std::vector<void *> TgtArgs;
  std::vector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int Ret;
  if (ArgNum) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                            ArgSizes, ArgTypes, ArgNames, ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    TIMESCOPE_WITH_NAME_AND_IDENT(
        IsTeamConstruct ? "runTargetTeamRegion" : "runTargetRegion", loc);
    if (IsTeamConstruct)
      Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                                 TgtArgs.size(), TeamNum, ThreadLimit,
                                 getLoopTripCount(DeviceId), AsyncInfo);
    else
      Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
                             TgtArgs.size(), AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region failed, abort target.\n");
    return OFFLOAD_FAIL;
  }

  if (ArgNum) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(loc, DeviceId, HostPtr, ArgNum, ArgBases, Args,
                           ArgSizes, ArgTypes, ArgNames, ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n");
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}