1 //===- PluginInterface.cpp - Target independent plugin device interface ---===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 //===----------------------------------------------------------------------===//
11 #include "PluginInterface.h"
13 #include "Environment.h"
14 #include "GlobalHandler.h"
16 #include "elf_common.h"
17 #include "omptarget.h"
18 #include "omptargetplugin.h"
21 #include "OmptCallback.h"
22 #include "omp-tools.h"
25 #include "llvm/Frontend/OpenMP/OMPConstants.h"
26 #include "llvm/Support/Error.h"
27 #include "llvm/Support/JSON.h"
28 #include "llvm/Support/MathExtras.h"
29 #include "llvm/Support/MemoryBuffer.h"
36 using namespace target
;
37 using namespace plugin
;
39 GenericPluginTy
*Plugin::SpecificPlugin
= nullptr;
41 // TODO: Fix any thread safety issues for multi-threaded kernel recording.
42 struct RecordReplayTy
{
44 // Describes the state of the record replay mechanism.
45 enum RRStatusTy
{ RRDeactivated
= 0, RRRecording
, RRReplaying
};
48 // Memory pointers for recording, replaying memory.
53 GenericDeviceTy
*Device
;
54 std::mutex AllocationLock
;
57 bool ReplaySaveOutput
;
59 void *suggestAddress(uint64_t MaxMemoryAllocation
) {
60 // Get a valid pointer address for this system
62 Device
->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT
);
64 // Align Address to MaxMemoryAllocation
65 Addr
= (void *)alignPtr((Addr
), MaxMemoryAllocation
);
69 Error
preAllocateVAMemory(uint64_t MaxMemoryAllocation
, void *VAddr
) {
70 size_t ASize
= MaxMemoryAllocation
;
72 if (!VAddr
&& isRecording())
73 VAddr
= suggestAddress(MaxMemoryAllocation
);
75 DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation
, VAddr
);
77 if (auto Err
= Device
->memoryVAMap(&MemoryStart
, VAddr
, &ASize
))
80 if (isReplaying() && VAddr
!= MemoryStart
) {
81 return Plugin::error("Record-Replay cannot assign the"
82 "requested recorded address (%p, %p)",
86 INFO(OMP_INFOTYPE_PLUGIN_KERNEL
, Device
->getDeviceId(),
87 "Allocated %" PRIu64
" bytes at %p for replay.\n", ASize
, MemoryStart
);
89 MemoryPtr
= MemoryStart
;
92 return Plugin::success();
95 Error
preAllocateHeuristic(uint64_t MaxMemoryAllocation
, void *VAddr
) {
96 const size_t MAX_MEMORY_ALLOCATION
= MaxMemoryAllocation
;
97 constexpr size_t STEP
= 1024 * 1024 * 1024ULL;
98 MemoryStart
= nullptr;
99 for (TotalSize
= MAX_MEMORY_ALLOCATION
; TotalSize
> 0; TotalSize
-= STEP
) {
100 MemoryStart
= Device
->allocate(TotalSize
, /* HstPtr */ nullptr,
101 TARGET_ALLOC_DEFAULT
);
106 INFO(OMP_INFOTYPE_PLUGIN_KERNEL
, Device
->getDeviceId(),
107 "Allocated %" PRIu64
" bytes at %p for replay.\n", TotalSize
,
111 return Plugin::error("Allocating record/replay memory");
113 if (VAddr
&& VAddr
!= MemoryStart
)
114 return Plugin::error("Cannot allocate recorded address");
116 MemoryPtr
= MemoryStart
;
119 return Plugin::success();
122 Error
preallocateDeviceMemory(uint64_t DeviceMemorySize
, void *ReqVAddr
) {
123 if (Device
->supportVAManagement())
124 return preAllocateVAMemory(DeviceMemorySize
, ReqVAddr
);
127 if (Device
->getDeviceMemorySize(DevMemSize
))
128 return Plugin::error("Cannot determine Device Memory Size");
130 return preAllocateHeuristic(DevMemSize
, ReqVAddr
);
133 void dumpDeviceMemory(StringRef Filename
) {
134 ErrorOr
<std::unique_ptr
<WritableMemoryBuffer
>> DeviceMemoryMB
=
135 WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize
);
137 report_fatal_error("Error creating MemoryBuffer for device memory");
139 auto Err
= Device
->dataRetrieve(DeviceMemoryMB
.get()->getBufferStart(),
140 MemoryStart
, MemorySize
, nullptr);
142 report_fatal_error("Error retrieving data for target pointer");
144 StringRef
DeviceMemory(DeviceMemoryMB
.get()->getBufferStart(), MemorySize
);
146 raw_fd_ostream
OS(Filename
, EC
);
148 report_fatal_error("Error dumping memory to file " + Filename
+ " :" +
155 bool isRecording() const { return Status
== RRStatusTy::RRRecording
; }
156 bool isReplaying() const { return Status
== RRStatusTy::RRReplaying
; }
157 bool isRecordingOrReplaying() const {
158 return (Status
!= RRStatusTy::RRDeactivated
);
160 void setStatus(RRStatusTy Status
) { this->Status
= Status
; }
161 bool isSaveOutputEnabled() const { return ReplaySaveOutput
; }
164 : Status(RRStatusTy::RRDeactivated
), ReplaySaveOutput(false) {}
166 void saveImage(const char *Name
, const DeviceImageTy
&Image
) {
167 SmallString
<128> ImageName
= {Name
, ".image"};
169 raw_fd_ostream
OS(ImageName
, EC
);
171 report_fatal_error("Error saving image : " + StringRef(EC
.message()));
172 if (const auto *TgtImageBitcode
= Image
.getTgtImageBitcode()) {
174 getPtrDiff(TgtImageBitcode
->ImageEnd
, TgtImageBitcode
->ImageStart
);
175 MemoryBufferRef MBR
= MemoryBufferRef(
176 StringRef((const char *)TgtImageBitcode
->ImageStart
, Size
), "");
177 OS
<< MBR
.getBuffer();
179 OS
<< Image
.getMemoryBuffer().getBuffer();
184 void dumpGlobals(StringRef Filename
, DeviceImageTy
&Image
) {
187 for (auto &OffloadEntry
: Image
.getOffloadEntryTable()) {
188 if (!OffloadEntry
.size
)
190 Size
+= std::strlen(OffloadEntry
.name
) + /* '\0' */ 1 +
191 /* OffloadEntry.size value */ sizeof(uint32_t) +
195 ErrorOr
<std::unique_ptr
<WritableMemoryBuffer
>> GlobalsMB
=
196 WritableMemoryBuffer::getNewUninitMemBuffer(Size
);
198 report_fatal_error("Error creating MemoryBuffer for globals memory");
200 void *BufferPtr
= GlobalsMB
.get()->getBufferStart();
201 for (auto &OffloadEntry
: Image
.getOffloadEntryTable()) {
202 if (!OffloadEntry
.size
)
205 int32_t NameLength
= std::strlen(OffloadEntry
.name
) + 1;
206 memcpy(BufferPtr
, OffloadEntry
.name
, NameLength
);
207 BufferPtr
= advanceVoidPtr(BufferPtr
, NameLength
);
209 *((uint32_t *)(BufferPtr
)) = OffloadEntry
.size
;
210 BufferPtr
= advanceVoidPtr(BufferPtr
, sizeof(uint32_t));
212 auto Err
= Plugin::success();
214 if (auto Err
= Device
->dataRetrieve(BufferPtr
, OffloadEntry
.addr
,
215 OffloadEntry
.size
, nullptr))
216 report_fatal_error("Error retrieving data for global");
219 report_fatal_error("Error retrieving data for global");
220 BufferPtr
= advanceVoidPtr(BufferPtr
, OffloadEntry
.size
);
222 assert(BufferPtr
== GlobalsMB
->get()->getBufferEnd() &&
223 "Buffer over/under-filled.");
224 assert(Size
== getPtrDiff(BufferPtr
, GlobalsMB
->get()->getBufferStart()) &&
225 "Buffer size mismatch");
227 StringRef
GlobalsMemory(GlobalsMB
.get()->getBufferStart(), Size
);
229 raw_fd_ostream
OS(Filename
, EC
);
234 void saveKernelInputInfo(const char *Name
, DeviceImageTy
&Image
,
235 void **ArgPtrs
, ptrdiff_t *ArgOffsets
,
236 int32_t NumArgs
, uint64_t NumTeamsClause
,
237 uint32_t ThreadLimitClause
, uint64_t LoopTripCount
) {
238 json::Object JsonKernelInfo
;
239 JsonKernelInfo
["Name"] = Name
;
240 JsonKernelInfo
["NumArgs"] = NumArgs
;
241 JsonKernelInfo
["NumTeamsClause"] = NumTeamsClause
;
242 JsonKernelInfo
["ThreadLimitClause"] = ThreadLimitClause
;
243 JsonKernelInfo
["LoopTripCount"] = LoopTripCount
;
244 JsonKernelInfo
["DeviceMemorySize"] = MemorySize
;
245 JsonKernelInfo
["DeviceId"] = Device
->getDeviceId();
246 JsonKernelInfo
["BumpAllocVAStart"] = (intptr_t)MemoryStart
;
248 json::Array JsonArgPtrs
;
249 for (int I
= 0; I
< NumArgs
; ++I
)
250 JsonArgPtrs
.push_back((intptr_t)ArgPtrs
[I
]);
251 JsonKernelInfo
["ArgPtrs"] = json::Value(std::move(JsonArgPtrs
));
253 json::Array JsonArgOffsets
;
254 for (int I
= 0; I
< NumArgs
; ++I
)
255 JsonArgOffsets
.push_back(ArgOffsets
[I
]);
256 JsonKernelInfo
["ArgOffsets"] = json::Value(std::move(JsonArgOffsets
));
258 SmallString
<128> MemoryFilename
= {Name
, ".memory"};
259 dumpDeviceMemory(MemoryFilename
);
261 SmallString
<128> GlobalsFilename
= {Name
, ".globals"};
262 dumpGlobals(GlobalsFilename
, Image
);
264 SmallString
<128> JsonFilename
= {Name
, ".json"};
266 raw_fd_ostream
JsonOS(JsonFilename
.str(), EC
);
268 report_fatal_error("Error saving kernel json file : " +
269 StringRef(EC
.message()));
270 JsonOS
<< json::Value(std::move(JsonKernelInfo
));
274 void saveKernelOutputInfo(const char *Name
) {
275 SmallString
<128> OutputFilename
= {
276 Name
, (isRecording() ? ".original.output" : ".replay.output")};
277 dumpDeviceMemory(OutputFilename
);
280 void *alloc(uint64_t Size
) {
281 assert(MemoryStart
&& "Expected memory has been pre-allocated");
282 void *Alloc
= nullptr;
283 constexpr int Alignment
= 16;
284 // Assumes alignment is a power of 2.
285 int64_t AlignedSize
= (Size
+ (Alignment
- 1)) & (~(Alignment
- 1));
286 std::lock_guard
<std::mutex
> LG(AllocationLock
);
288 MemoryPtr
= (char *)MemoryPtr
+ AlignedSize
;
289 MemorySize
+= AlignedSize
;
290 DP("Memory Allocator return " DPxMOD
"\n", DPxPTR(Alloc
));
294 Error
init(GenericDeviceTy
*Device
, uint64_t MemSize
, void *VAddr
,
295 RRStatusTy Status
, bool SaveOutput
) {
296 this->Device
= Device
;
297 this->Status
= Status
;
298 this->ReplaySaveOutput
= SaveOutput
;
300 if (auto Err
= preallocateDeviceMemory(MemSize
, VAddr
))
303 INFO(OMP_INFOTYPE_PLUGIN_KERNEL
, Device
->getDeviceId(),
304 "Record Replay Initialized (%p)"
305 " as starting address, %lu Memory Size"
306 " and set on status %s\n",
307 MemoryStart
, TotalSize
,
308 Status
== RRStatusTy::RRRecording
? "Recording" : "Replaying");
310 return Plugin::success();
314 if (Device
->supportVAManagement()) {
315 if (auto Err
= Device
->memoryVAUnMap(MemoryStart
, TotalSize
))
316 report_fatal_error("Error on releasing virtual memory space");
318 Device
->free(MemoryStart
);
324 // Extract the mapping of host function pointers to device function pointers
325 // from the entry table. Functions marked as 'indirect' in OpenMP will have
326 // offloading entries generated for them which map the host's function pointer
327 // to a global containing the corresponding function pointer on the device.
328 static Expected
<std::pair
<void *, uint64_t>>
329 setupIndirectCallTable(GenericPluginTy
&Plugin
, GenericDeviceTy
&Device
,
330 DeviceImageTy
&Image
) {
331 GenericGlobalHandlerTy
&Handler
= Plugin
.getGlobalHandler();
333 llvm::ArrayRef
<__tgt_offload_entry
> Entries(Image
.getTgtImage()->EntriesBegin
,
334 Image
.getTgtImage()->EntriesEnd
);
335 llvm::SmallVector
<std::pair
<void *, void *>> IndirectCallTable
;
336 for (const auto &Entry
: Entries
) {
337 if (Entry
.size
== 0 || !(Entry
.flags
& OMP_DECLARE_TARGET_INDIRECT
))
340 assert(Entry
.size
== sizeof(void *) && "Global not a function pointer?");
341 auto &[HstPtr
, DevPtr
] = IndirectCallTable
.emplace_back();
343 GlobalTy
DeviceGlobal(Entry
.name
, Entry
.size
);
345 Handler
.getGlobalMetadataFromDevice(Device
, Image
, DeviceGlobal
))
346 return std::move(Err
);
349 if (auto Err
= Device
.dataRetrieve(&DevPtr
, DeviceGlobal
.getPtr(),
350 Entry
.size
, nullptr))
351 return std::move(Err
);
354 // If we do not have any indirect globals we exit early.
355 if (IndirectCallTable
.empty())
356 return std::pair
{nullptr, 0};
358 // Sort the array to allow for more efficient lookup of device pointers.
359 llvm::sort(IndirectCallTable
,
360 [](const auto &x
, const auto &y
) { return x
.first
< y
.first
; });
363 IndirectCallTable
.size() * sizeof(std::pair
<void *, void *>);
364 void *DevicePtr
= Device
.allocate(TableSize
, nullptr, TARGET_ALLOC_DEVICE
);
365 if (auto Err
= Device
.dataSubmit(DevicePtr
, IndirectCallTable
.data(),
367 return std::move(Err
);
368 return std::pair
<void *, uint64_t>(DevicePtr
, IndirectCallTable
.size());
371 AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy
&Device
,
372 __tgt_async_info
*AsyncInfoPtr
)
374 AsyncInfoPtr(AsyncInfoPtr
? AsyncInfoPtr
: &LocalAsyncInfo
) {}
376 void AsyncInfoWrapperTy::finalize(Error
&Err
) {
377 assert(AsyncInfoPtr
&& "AsyncInfoWrapperTy already finalized");
379 // If we used a local async info object we want synchronous behavior. In that
380 // case, and assuming the current status code is correct, we will synchronize
381 // explicitly when the object is deleted. Update the error with the result of
382 // the synchronize operation.
383 if (AsyncInfoPtr
== &LocalAsyncInfo
&& LocalAsyncInfo
.Queue
&& !Err
)
384 Err
= Device
.synchronize(&LocalAsyncInfo
);
386 // Invalidate the wrapper object.
387 AsyncInfoPtr
= nullptr;
390 Error
GenericKernelTy::init(GenericDeviceTy
&GenericDevice
,
391 DeviceImageTy
&Image
) {
395 // Retrieve kernel environment object for the kernel.
396 GlobalTy
KernelEnv(std::string(Name
) + "_kernel_environment",
397 sizeof(KernelEnvironment
), &KernelEnvironment
);
398 GenericGlobalHandlerTy
&GHandler
= Plugin::get().getGlobalHandler();
400 GHandler
.readGlobalFromImage(GenericDevice
, *ImagePtr
, KernelEnv
)) {
401 [[maybe_unused
]] std::string ErrStr
= toString(std::move(Err
));
402 DP("Failed to read kernel environment for '%s': %s\n"
403 "Using default SPMD (2) execution mode\n",
404 Name
, ErrStr
.data());
405 assert(KernelEnvironment
.Configuration
.ReductionBufferSize
== 0 &&
406 "Default initialization failed.");
409 // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
410 MaxNumThreads
= KernelEnvironment
.Configuration
.MaxThreads
> 0
411 ? std::min(KernelEnvironment
.Configuration
.MaxThreads
,
412 int32_t(GenericDevice
.getThreadLimit()))
413 : GenericDevice
.getThreadLimit();
415 // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
416 PreferredNumThreads
=
417 KernelEnvironment
.Configuration
.MinThreads
> 0
418 ? std::max(KernelEnvironment
.Configuration
.MinThreads
,
419 int32_t(GenericDevice
.getDefaultNumThreads()))
420 : GenericDevice
.getDefaultNumThreads();
422 return initImpl(GenericDevice
, Image
);
425 Expected
<KernelLaunchEnvironmentTy
*>
426 GenericKernelTy::getKernelLaunchEnvironment(
427 GenericDeviceTy
&GenericDevice
,
428 AsyncInfoWrapperTy
&AsyncInfoWrapper
) const {
429 // TODO: Check if the kernel needs a launch environment.
430 auto AllocOrErr
= GenericDevice
.dataAlloc(sizeof(KernelLaunchEnvironmentTy
),
432 TargetAllocTy::TARGET_ALLOC_DEVICE
);
434 return AllocOrErr
.takeError();
436 // Remember to free the memory later.
437 AsyncInfoWrapper
.freeAllocationAfterSynchronization(*AllocOrErr
);
439 /// Use the KLE in the __tgt_async_info to ensure a stable address for the
440 /// async data transfer.
441 auto &LocalKLE
= (*AsyncInfoWrapper
).KernelLaunchEnvironment
;
442 LocalKLE
= KernelLaunchEnvironment
;
443 if (KernelEnvironment
.Configuration
.ReductionBufferSize
) {
444 auto AllocOrErr
= GenericDevice
.dataAlloc(
445 KernelEnvironment
.Configuration
.ReductionBufferSize
,
446 /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE
);
448 return AllocOrErr
.takeError();
449 LocalKLE
.ReductionBuffer
= *AllocOrErr
;
450 // Remember to free the memory later.
451 AsyncInfoWrapper
.freeAllocationAfterSynchronization(*AllocOrErr
);
454 auto Err
= GenericDevice
.dataSubmit(*AllocOrErr
, &LocalKLE
,
455 sizeof(KernelLaunchEnvironmentTy
),
459 return static_cast<KernelLaunchEnvironmentTy
*>(*AllocOrErr
);
462 Error
GenericKernelTy::printLaunchInfo(GenericDeviceTy
&GenericDevice
,
463 KernelArgsTy
&KernelArgs
,
465 uint64_t NumBlocks
) const {
466 INFO(OMP_INFOTYPE_PLUGIN_KERNEL
, GenericDevice
.getDeviceId(),
467 "Launching kernel %s with %" PRIu64
468 " blocks and %d threads in %s mode\n",
469 getName(), NumBlocks
, NumThreads
, getExecutionModeName());
470 return printLaunchInfoDetails(GenericDevice
, KernelArgs
, NumThreads
,
474 Error
GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy
&GenericDevice
,
475 KernelArgsTy
&KernelArgs
,
477 uint64_t NumBlocks
) const {
478 return Plugin::success();
481 Error
GenericKernelTy::launch(GenericDeviceTy
&GenericDevice
, void **ArgPtrs
,
482 ptrdiff_t *ArgOffsets
, KernelArgsTy
&KernelArgs
,
483 AsyncInfoWrapperTy
&AsyncInfoWrapper
) const {
484 llvm::SmallVector
<void *, 16> Args
;
485 llvm::SmallVector
<void *, 16> Ptrs
;
487 auto KernelLaunchEnvOrErr
=
488 getKernelLaunchEnvironment(GenericDevice
, AsyncInfoWrapper
);
489 if (!KernelLaunchEnvOrErr
)
490 return KernelLaunchEnvOrErr
.takeError();
492 void *KernelArgsPtr
=
493 prepareArgs(GenericDevice
, ArgPtrs
, ArgOffsets
, KernelArgs
.NumArgs
, Args
,
494 Ptrs
, *KernelLaunchEnvOrErr
);
496 uint32_t NumThreads
= getNumThreads(GenericDevice
, KernelArgs
.ThreadLimit
);
498 getNumBlocks(GenericDevice
, KernelArgs
.NumTeams
, KernelArgs
.Tripcount
,
499 NumThreads
, KernelArgs
.ThreadLimit
[0] > 0);
502 printLaunchInfo(GenericDevice
, KernelArgs
, NumThreads
, NumBlocks
))
505 return launchImpl(GenericDevice
, NumThreads
, NumBlocks
, KernelArgs
,
506 KernelArgsPtr
, AsyncInfoWrapper
);
509 void *GenericKernelTy::prepareArgs(
510 GenericDeviceTy
&GenericDevice
, void **ArgPtrs
, ptrdiff_t *ArgOffsets
,
511 uint32_t &NumArgs
, llvm::SmallVectorImpl
<void *> &Args
,
512 llvm::SmallVectorImpl
<void *> &Ptrs
,
513 KernelLaunchEnvironmentTy
*KernelLaunchEnvironment
) const {
519 Args
.resize(NumArgs
);
520 Ptrs
.resize(NumArgs
);
522 Ptrs
[0] = KernelLaunchEnvironment
;
525 for (int I
= 1; I
< NumArgs
; ++I
) {
526 Ptrs
[I
] = (void *)((intptr_t)ArgPtrs
[I
- 1] + ArgOffsets
[I
- 1]);
532 uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy
&GenericDevice
,
533 uint32_t ThreadLimitClause
[3]) const {
534 assert(ThreadLimitClause
[1] == 0 && ThreadLimitClause
[2] == 0 &&
535 "Multi dimensional launch not supported yet.");
536 if (ThreadLimitClause
[0] > 0 && isGenericMode())
537 ThreadLimitClause
[0] += GenericDevice
.getWarpSize();
539 return std::min(MaxNumThreads
, (ThreadLimitClause
[0] > 0)
540 ? ThreadLimitClause
[0]
541 : PreferredNumThreads
);
544 uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy
&GenericDevice
,
545 uint32_t NumTeamsClause
[3],
546 uint64_t LoopTripCount
,
547 uint32_t &NumThreads
,
548 bool IsNumThreadsFromUser
) const {
549 assert(NumTeamsClause
[1] == 0 && NumTeamsClause
[2] == 0 &&
550 "Multi dimensional launch not supported yet.");
552 if (NumTeamsClause
[0] > 0) {
553 // TODO: We need to honor any value and consequently allow more than the
554 // block limit. For this we might need to start multiple kernels or let the
555 // blocks start again until the requested number has been started.
556 return std::min(NumTeamsClause
[0], GenericDevice
.getBlockLimit());
559 uint64_t DefaultNumBlocks
= GenericDevice
.getDefaultNumBlocks();
560 uint64_t TripCountNumBlocks
= std::numeric_limits
<uint64_t>::max();
561 if (LoopTripCount
> 0) {
563 // We have a combined construct, i.e. `target teams distribute
564 // parallel for [simd]`. We launch so many teams so that each thread
565 // will execute one iteration of the loop; rounded up to the nearest
566 // integer. However, if that results in too few teams, we artificially
567 // reduce the thread count per team to increase the outer parallelism.
568 auto MinThreads
= GenericDevice
.getMinThreadsForLowTripCountLoop();
569 MinThreads
= std::min(MinThreads
, NumThreads
);
571 // Honor the thread_limit clause; only lower the number of threads.
572 [[maybe_unused
]] auto OldNumThreads
= NumThreads
;
573 if (LoopTripCount
>= DefaultNumBlocks
* NumThreads
||
574 IsNumThreadsFromUser
) {
575 // Enough parallelism for teams and threads.
576 TripCountNumBlocks
= ((LoopTripCount
- 1) / NumThreads
) + 1;
577 assert(IsNumThreadsFromUser
||
578 TripCountNumBlocks
>= DefaultNumBlocks
&&
579 "Expected sufficient outer parallelism.");
580 } else if (LoopTripCount
>= DefaultNumBlocks
* MinThreads
) {
581 // Enough parallelism for teams, limit threads.
583 // This case is hard; for now, we force "full warps":
584 // First, compute a thread count assuming DefaultNumBlocks.
585 auto NumThreadsDefaultBlocks
=
586 (LoopTripCount
+ DefaultNumBlocks
- 1) / DefaultNumBlocks
;
587 // Now get a power of two that is larger or equal.
588 auto NumThreadsDefaultBlocksP2
=
589 llvm::PowerOf2Ceil(NumThreadsDefaultBlocks
);
590 // Do not increase a thread limit given be the user.
591 NumThreads
= std::min(NumThreads
, uint32_t(NumThreadsDefaultBlocksP2
));
592 assert(NumThreads
>= MinThreads
&&
593 "Expected sufficient inner parallelism.");
594 TripCountNumBlocks
= ((LoopTripCount
- 1) / NumThreads
) + 1;
596 // Not enough parallelism for teams and threads, limit both.
597 NumThreads
= std::min(NumThreads
, MinThreads
);
598 TripCountNumBlocks
= ((LoopTripCount
- 1) / NumThreads
) + 1;
601 assert(NumThreads
* TripCountNumBlocks
>= LoopTripCount
&&
602 "Expected sufficient parallelism");
603 assert(OldNumThreads
>= NumThreads
&&
604 "Number of threads cannot be increased!");
606 assert((isGenericMode() || isGenericSPMDMode()) &&
607 "Unexpected execution mode!");
608 // If we reach this point, then we have a non-combined construct, i.e.
609 // `teams distribute` with a nested `parallel for` and each team is
610 // assigned one iteration of the `distribute` loop. E.g.:
612 // #pragma omp target teams distribute
613 // for(...loop_tripcount...) {
614 // #pragma omp parallel for
618 // Threads within a team will execute the iterations of the `parallel`
620 TripCountNumBlocks
= LoopTripCount
;
623 // If the loops are long running we rather reuse blocks than spawn too many.
624 uint32_t PreferredNumBlocks
= std::min(TripCountNumBlocks
, DefaultNumBlocks
);
625 return std::min(PreferredNumBlocks
, GenericDevice
.getBlockLimit());
628 GenericDeviceTy::GenericDeviceTy(int32_t DeviceId
, int32_t NumDevices
,
629 const llvm::omp::GV
&OMPGridValues
)
630 : MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
631 OMP_NumTeams("OMP_NUM_TEAMS"),
632 OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
633 OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
634 OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
635 // Do not initialize the following two envars since they depend on the
636 // device initialization. These cannot be consulted until the device is
637 // initialized correctly. We intialize them in GenericDeviceTy::init().
638 OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
639 // By default, the initial number of streams and events is 1.
640 OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
641 OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
642 DeviceId(DeviceId
), GridValues(OMPGridValues
),
643 PeerAccesses(NumDevices
, PeerAccessState::PENDING
), PeerAccessesLock(),
644 PinnedAllocs(*this), RPCServer(nullptr) {
646 OmptInitialized
.store(false);
647 // Bind the callbacks to this device's member functions
648 #define bindOmptCallback(Name, Type, Code) \
649 if (ompt::Initialized && ompt::lookupCallbackByCode) { \
650 ompt::lookupCallbackByCode((ompt_callbacks_t)(Code), \
651 ((ompt_callback_t *)&(Name##_fn))); \
652 DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn)); \
655 FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback
);
656 #undef bindOmptCallback
661 Error
GenericDeviceTy::init(GenericPluginTy
&Plugin
) {
662 if (auto Err
= initImpl(Plugin
))
666 if (ompt::Initialized
) {
667 bool ExpectedStatus
= false;
668 if (OmptInitialized
.compare_exchange_strong(ExpectedStatus
, true))
669 performOmptCallback(device_initialize
,
670 /* device_num */ DeviceId
+
671 Plugin
.getDeviceIdStartIndex(),
672 /* type */ getComputeUnitKind().c_str(),
673 /* device */ reinterpret_cast<ompt_device_t
*>(this),
674 /* lookup */ ompt::lookupCallbackByName
,
675 /* documentation */ nullptr);
679 // Read and reinitialize the envars that depend on the device initialization.
680 // Notice these two envars may change the stack size and heap size of the
681 // device, so they need the device properly initialized.
682 auto StackSizeEnvarOrErr
= UInt64Envar::create(
683 "LIBOMPTARGET_STACK_SIZE",
684 [this](uint64_t &V
) -> Error
{ return getDeviceStackSize(V
); },
685 [this](uint64_t V
) -> Error
{ return setDeviceStackSize(V
); });
686 if (!StackSizeEnvarOrErr
)
687 return StackSizeEnvarOrErr
.takeError();
688 OMPX_TargetStackSize
= std::move(*StackSizeEnvarOrErr
);
690 auto HeapSizeEnvarOrErr
= UInt64Envar::create(
691 "LIBOMPTARGET_HEAP_SIZE",
692 [this](uint64_t &V
) -> Error
{ return getDeviceHeapSize(V
); },
693 [this](uint64_t V
) -> Error
{ return setDeviceHeapSize(V
); });
694 if (!HeapSizeEnvarOrErr
)
695 return HeapSizeEnvarOrErr
.takeError();
696 OMPX_TargetHeapSize
= std::move(*HeapSizeEnvarOrErr
);
698 // Update the maximum number of teams and threads after the device
699 // initialization sets the corresponding hardware limit.
700 if (OMP_NumTeams
> 0)
701 GridValues
.GV_Max_Teams
=
702 std::min(GridValues
.GV_Max_Teams
, uint32_t(OMP_NumTeams
));
704 if (OMP_TeamsThreadLimit
> 0)
705 GridValues
.GV_Max_WG_Size
=
706 std::min(GridValues
.GV_Max_WG_Size
, uint32_t(OMP_TeamsThreadLimit
));
708 // Enable the memory manager if required.
709 auto [ThresholdMM
, EnableMM
] = MemoryManagerTy::getSizeThresholdFromEnv();
711 MemoryManager
= new MemoryManagerTy(*this, ThresholdMM
);
713 return Plugin::success();
716 Error
GenericDeviceTy::deinit(GenericPluginTy
&Plugin
) {
718 if (OMPX_DebugKind
.get() & uint32_t(DeviceDebugKind::AllocationTracker
)) {
719 GenericGlobalHandlerTy
&GHandler
= Plugin
.getGlobalHandler();
720 for (auto *Image
: LoadedImages
) {
721 DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking
= {0, 0, ~0U, 0};
722 GlobalTy
TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
723 sizeof(DeviceMemoryPoolTrackingTy
),
724 &ImageDeviceMemoryPoolTracking
);
726 GHandler
.readGlobalFromDevice(*this, *Image
, TrackerGlobal
))
728 DeviceMemoryPoolTracking
.combine(ImageDeviceMemoryPoolTracking
);
731 // TODO: Write this by default into a file.
732 printf("\n\n|-----------------------\n"
733 "| Device memory tracker:\n"
734 "|-----------------------\n"
735 "| #Allocations: %lu\n"
736 "| Byes allocated: %lu\n"
737 "| Minimal allocation: %lu\n"
738 "| Maximal allocation: %lu\n"
739 "|-----------------------\n\n\n",
740 DeviceMemoryPoolTracking
.NumAllocations
,
741 DeviceMemoryPoolTracking
.AllocationTotal
,
742 DeviceMemoryPoolTracking
.AllocationMin
,
743 DeviceMemoryPoolTracking
.AllocationMax
);
746 // Delete the memory manager before deinitializing the device. Otherwise,
747 // we may delete device allocations after the device is deinitialized.
749 delete MemoryManager
;
750 MemoryManager
= nullptr;
752 if (RecordReplay
.isRecordingOrReplaying())
753 RecordReplay
.deinit();
756 if (auto Err
= RPCServer
->deinitDevice(*this))
760 if (ompt::Initialized
) {
761 bool ExpectedStatus
= true;
762 if (OmptInitialized
.compare_exchange_strong(ExpectedStatus
, false))
763 performOmptCallback(device_finalize
,
764 /* device_num */ DeviceId
+
765 Plugin
.getDeviceIdStartIndex());
771 Expected
<__tgt_target_table
*>
772 GenericDeviceTy::loadBinary(GenericPluginTy
&Plugin
,
773 const __tgt_device_image
*InputTgtImage
) {
774 assert(InputTgtImage
&& "Expected non-null target image");
775 DP("Load data from image " DPxMOD
"\n", DPxPTR(InputTgtImage
->ImageStart
));
777 auto PostJITImageOrErr
= Plugin
.getJIT().process(*InputTgtImage
, *this);
778 if (!PostJITImageOrErr
) {
779 auto Err
= PostJITImageOrErr
.takeError();
780 REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage
,
781 DeviceId
, toString(std::move(Err
)).data());
785 // Load the binary and allocate the image object. Use the next available id
786 // for the image id, which is the number of previously loaded images.
788 loadBinaryImpl(PostJITImageOrErr
.get(), LoadedImages
.size());
790 return ImageOrErr
.takeError();
792 DeviceImageTy
*Image
= *ImageOrErr
;
793 assert(Image
!= nullptr && "Invalid image");
794 if (InputTgtImage
!= PostJITImageOrErr
.get())
795 Image
->setTgtImageBitcode(InputTgtImage
);
797 // Add the image to list.
798 LoadedImages
.push_back(Image
);
800 // Setup the device environment if needed.
801 if (auto Err
= setupDeviceEnvironment(Plugin
, *Image
))
802 return std::move(Err
);
804 // Setup the global device memory pool if needed.
805 if (shouldSetupDeviceMemoryPool()) {
807 auto SizeOrErr
= getDeviceHeapSize(HeapSize
);
809 REPORT("No global device memory pool due to error: %s\n",
810 toString(std::move(SizeOrErr
)).data());
811 } else if (auto Err
= setupDeviceMemoryPool(Plugin
, *Image
, HeapSize
))
812 return std::move(Err
);
815 // Register all offload entries of the image.
816 if (auto Err
= registerOffloadEntries(*Image
))
817 return std::move(Err
);
819 if (auto Err
= setupRPCServer(Plugin
, *Image
))
820 return std::move(Err
);
823 if (ompt::Initialized
) {
825 getPtrDiff(InputTgtImage
->ImageEnd
, InputTgtImage
->ImageStart
);
826 performOmptCallback(device_load
,
827 /* device_num */ DeviceId
+
828 Plugin
.getDeviceIdStartIndex(),
829 /* FileName */ nullptr,
831 /* VmaInFile */ nullptr,
833 /* HostAddr */ InputTgtImage
->ImageStart
,
834 /* DeviceAddr */ nullptr,
835 /* FIXME: ModuleId */ 0);
839 // Return the pointer to the table of entries.
840 return Image
->getOffloadEntryTable();
843 Error
GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy
&Plugin
,
844 DeviceImageTy
&Image
) {
845 // There are some plugins that do not need this step.
846 if (!shouldSetupDeviceEnvironment())
847 return Plugin::success();
849 // Obtain a table mapping host function pointers to device function pointers.
850 auto CallTablePairOrErr
= setupIndirectCallTable(Plugin
, *this, Image
);
851 if (!CallTablePairOrErr
)
852 return CallTablePairOrErr
.takeError();
854 DeviceEnvironmentTy DeviceEnvironment
;
855 DeviceEnvironment
.DeviceDebugKind
= OMPX_DebugKind
;
856 DeviceEnvironment
.NumDevices
= Plugin
.getNumDevices();
857 // TODO: The device ID used here is not the real device ID used by OpenMP.
858 DeviceEnvironment
.DeviceNum
= DeviceId
;
859 DeviceEnvironment
.DynamicMemSize
= OMPX_SharedMemorySize
;
860 DeviceEnvironment
.ClockFrequency
= getClockFrequency();
861 DeviceEnvironment
.IndirectCallTable
=
862 reinterpret_cast<uintptr_t>(CallTablePairOrErr
->first
);
863 DeviceEnvironment
.IndirectCallTableSize
= CallTablePairOrErr
->second
;
864 DeviceEnvironment
.HardwareParallelism
= getHardwareParallelism();
866 // Create the metainfo of the device environment global.
867 GlobalTy
DevEnvGlobal("__omp_rtl_device_environment",
868 sizeof(DeviceEnvironmentTy
), &DeviceEnvironment
);
870 // Write device environment values to the device.
871 GenericGlobalHandlerTy
&GHandler
= Plugin
.getGlobalHandler();
872 if (auto Err
= GHandler
.writeGlobalToDevice(*this, Image
, DevEnvGlobal
)) {
873 DP("Missing symbol %s, continue execution anyway.\n",
874 DevEnvGlobal
.getName().data());
875 consumeError(std::move(Err
));
877 return Plugin::success();
880 Error
GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy
&Plugin
,
881 DeviceImageTy
&Image
,
883 // Free the old pool, if any.
884 if (DeviceMemoryPool
.Ptr
) {
885 if (auto Err
= dataDelete(DeviceMemoryPool
.Ptr
,
886 TargetAllocTy::TARGET_ALLOC_DEVICE
))
890 DeviceMemoryPool
.Size
= PoolSize
;
891 auto AllocOrErr
= dataAlloc(PoolSize
, /*HostPtr=*/nullptr,
892 TargetAllocTy::TARGET_ALLOC_DEVICE
);
894 DeviceMemoryPool
.Ptr
= *AllocOrErr
;
896 auto Err
= AllocOrErr
.takeError();
897 REPORT("Failure to allocate device memory for global memory pool: %s\n",
898 toString(std::move(Err
)).data());
899 DeviceMemoryPool
.Ptr
= nullptr;
900 DeviceMemoryPool
.Size
= 0;
903 // Create the metainfo of the device environment global.
904 GlobalTy
TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
905 sizeof(DeviceMemoryPoolTrackingTy
),
906 &DeviceMemoryPoolTracking
);
907 GenericGlobalHandlerTy
&GHandler
= Plugin
.getGlobalHandler();
908 if (auto Err
= GHandler
.writeGlobalToDevice(*this, Image
, TrackerGlobal
))
911 // Create the metainfo of the device environment global.
912 GlobalTy
DevEnvGlobal("__omp_rtl_device_memory_pool",
913 sizeof(DeviceMemoryPoolTy
), &DeviceMemoryPool
);
915 // Write device environment values to the device.
916 return GHandler
.writeGlobalToDevice(*this, Image
, DevEnvGlobal
);
919 Error
GenericDeviceTy::setupRPCServer(GenericPluginTy
&Plugin
,
920 DeviceImageTy
&Image
) {
921 // The plugin either does not need an RPC server or it is unavailible.
922 if (!shouldSetupRPCServer())
923 return Plugin::success();
925 // Check if this device needs to run an RPC server.
926 RPCServerTy
&Server
= Plugin
.getRPCServer();
928 Server
.isDeviceUsingRPC(*this, Plugin
.getGlobalHandler(), Image
);
930 return UsingOrErr
.takeError();
932 if (!UsingOrErr
.get())
933 return Plugin::success();
935 if (auto Err
= Server
.initDevice(*this, Plugin
.getGlobalHandler(), Image
))
939 DP("Running an RPC server on device %d\n", getDeviceId());
940 return Plugin::success();
943 Error
GenericDeviceTy::registerOffloadEntries(DeviceImageTy
&Image
) {
944 const __tgt_offload_entry
*Begin
= Image
.getTgtImage()->EntriesBegin
;
945 const __tgt_offload_entry
*End
= Image
.getTgtImage()->EntriesEnd
;
946 for (const __tgt_offload_entry
*Entry
= Begin
; Entry
!= End
; ++Entry
) {
947 // The host should have always something in the address to uniquely
948 // identify the entry.
950 return Plugin::error("Failure to register entry without address");
952 __tgt_offload_entry DeviceEntry
= {0};
955 if (auto Err
= registerGlobalOffloadEntry(Image
, *Entry
, DeviceEntry
))
958 if (auto Err
= registerKernelOffloadEntry(Image
, *Entry
, DeviceEntry
))
962 assert(DeviceEntry
.addr
&& "Device addr of offload entry cannot be null");
964 DP("Entry point " DPxMOD
" maps to%s %s (" DPxMOD
")\n",
965 DPxPTR(Entry
- Begin
), (Entry
->size
) ? " global" : "", Entry
->name
,
966 DPxPTR(DeviceEntry
.addr
));
968 return Plugin::success();
971 Error
GenericDeviceTy::registerGlobalOffloadEntry(
972 DeviceImageTy
&Image
, const __tgt_offload_entry
&GlobalEntry
,
973 __tgt_offload_entry
&DeviceEntry
) {
975 GenericPluginTy
&Plugin
= Plugin::get();
977 DeviceEntry
= GlobalEntry
;
979 // Create a metadata object for the device global.
980 GlobalTy
DeviceGlobal(GlobalEntry
.name
, GlobalEntry
.size
);
982 // Get the address of the device of the global.
983 GenericGlobalHandlerTy
&GHandler
= Plugin
.getGlobalHandler();
985 GHandler
.getGlobalMetadataFromDevice(*this, Image
, DeviceGlobal
))
988 // Store the device address on the device entry.
989 DeviceEntry
.addr
= DeviceGlobal
.getPtr();
990 assert(DeviceEntry
.addr
&& "Invalid device global's address");
992 // Note: In the current implementation declare target variables
993 // can either be link or to. This means that once unified
994 // memory is activated via the requires directive, the variable
995 // can be used directly from the host in both cases.
996 if (Plugin
.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY
) {
997 // If unified memory is present any target link or to variables
998 // can access host addresses directly. There is no longer a
999 // need for device copies.
1000 GlobalTy
HostGlobal(GlobalEntry
);
1002 GHandler
.writeGlobalToDevice(*this, HostGlobal
, DeviceGlobal
))
1006 // Add the device entry on the entry table.
1007 Image
.getOffloadEntryTable().addEntry(DeviceEntry
);
1009 return Plugin::success();
1012 Error
GenericDeviceTy::registerKernelOffloadEntry(
1013 DeviceImageTy
&Image
, const __tgt_offload_entry
&KernelEntry
,
1014 __tgt_offload_entry
&DeviceEntry
) {
1015 DeviceEntry
= KernelEntry
;
1017 // Create a kernel object.
1018 auto KernelOrErr
= constructKernel(KernelEntry
);
1020 return KernelOrErr
.takeError();
1022 GenericKernelTy
&Kernel
= *KernelOrErr
;
1024 // Initialize the kernel.
1025 if (auto Err
= Kernel
.init(*this, Image
))
1028 // Set the device entry address to the kernel address and store the entry on
1030 DeviceEntry
.addr
= (void *)&Kernel
;
1031 Image
.getOffloadEntryTable().addEntry(DeviceEntry
);
1033 return Plugin::success();
1036 Error
PinnedAllocationMapTy::insertEntry(void *HstPtr
, void *DevAccessiblePtr
,
1037 size_t Size
, bool ExternallyLocked
) {
1038 // Insert the new entry into the map.
1039 auto Res
= Allocs
.insert({HstPtr
, DevAccessiblePtr
, Size
, ExternallyLocked
});
1041 return Plugin::error("Cannot insert locked buffer entry");
1043 // Check whether the next entry overlaps with the inserted entry.
1044 auto It
= std::next(Res
.first
);
1045 if (It
== Allocs
.end())
1046 return Plugin::success();
1048 const EntryTy
*NextEntry
= &(*It
);
1049 if (intersects(NextEntry
->HstPtr
, NextEntry
->Size
, HstPtr
, Size
))
1050 return Plugin::error("Partial overlapping not allowed in locked buffers");
1052 return Plugin::success();
1055 Error
PinnedAllocationMapTy::eraseEntry(const EntryTy
&Entry
) {
1056 // Erase the existing entry. Notice this requires an additional map lookup,
1057 // but this should not be a performance issue. Using iterators would make
1058 // the code more difficult to read.
1059 size_t Erased
= Allocs
.erase({Entry
.HstPtr
});
1061 return Plugin::error("Cannot erase locked buffer entry");
1062 return Plugin::success();
1065 Error
PinnedAllocationMapTy::registerEntryUse(const EntryTy
&Entry
,
1066 void *HstPtr
, size_t Size
) {
1067 if (!contains(Entry
.HstPtr
, Entry
.Size
, HstPtr
, Size
))
1068 return Plugin::error("Partial overlapping not allowed in locked buffers");
1071 return Plugin::success();
1074 Expected
<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy
&Entry
) {
1075 if (Entry
.References
== 0)
1076 return Plugin::error("Invalid number of references");
1078 // Return whether this was the last user.
1079 return (--Entry
.References
== 0);
1082 Error
PinnedAllocationMapTy::registerHostBuffer(void *HstPtr
,
1083 void *DevAccessiblePtr
,
1085 assert(HstPtr
&& "Invalid pointer");
1086 assert(DevAccessiblePtr
&& "Invalid pointer");
1087 assert(Size
&& "Invalid size");
1089 std::lock_guard
<std::shared_mutex
> Lock(Mutex
);
1091 // No pinned allocation should intersect.
1092 const EntryTy
*Entry
= findIntersecting(HstPtr
);
1094 return Plugin::error("Cannot insert entry due to an existing one");
1096 // Now insert the new entry.
1097 return insertEntry(HstPtr
, DevAccessiblePtr
, Size
);
1100 Error
PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr
) {
1101 assert(HstPtr
&& "Invalid pointer");
1103 std::lock_guard
<std::shared_mutex
> Lock(Mutex
);
1105 const EntryTy
*Entry
= findIntersecting(HstPtr
);
1107 return Plugin::error("Cannot find locked buffer");
1109 // The address in the entry should be the same we are unregistering.
1110 if (Entry
->HstPtr
!= HstPtr
)
1111 return Plugin::error("Unexpected host pointer in locked buffer entry");
1113 // Unregister from the entry.
1114 auto LastUseOrErr
= unregisterEntryUse(*Entry
);
1116 return LastUseOrErr
.takeError();
1118 // There should be no other references to the pinned allocation.
1119 if (!(*LastUseOrErr
))
1120 return Plugin::error("The locked buffer is still being used");
1122 // Erase the entry from the map.
1123 return eraseEntry(*Entry
);
1126 Expected
<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr
,
1128 assert(HstPtr
&& "Invalid pointer");
1129 assert(Size
&& "Invalid size");
1131 std::lock_guard
<std::shared_mutex
> Lock(Mutex
);
1133 const EntryTy
*Entry
= findIntersecting(HstPtr
);
1136 // An already registered intersecting buffer was found. Register a new use.
1137 if (auto Err
= registerEntryUse(*Entry
, HstPtr
, Size
))
1138 return std::move(Err
);
1140 // Return the device accessible pointer with the correct offset.
1141 return advanceVoidPtr(Entry
->DevAccessiblePtr
,
1142 getPtrDiff(HstPtr
, Entry
->HstPtr
));
1145 // No intersecting registered allocation found in the map. First, lock the
1146 // host buffer and retrieve the device accessible pointer.
1147 auto DevAccessiblePtrOrErr
= Device
.dataLockImpl(HstPtr
, Size
);
1148 if (!DevAccessiblePtrOrErr
)
1149 return DevAccessiblePtrOrErr
.takeError();
1151 // Now insert the new entry into the map.
1152 if (auto Err
= insertEntry(HstPtr
, *DevAccessiblePtrOrErr
, Size
))
1153 return std::move(Err
);
1155 // Return the device accessible pointer.
1156 return *DevAccessiblePtrOrErr
;
1159 Error
PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr
) {
1160 assert(HstPtr
&& "Invalid pointer");
1162 std::lock_guard
<std::shared_mutex
> Lock(Mutex
);
1164 const EntryTy
*Entry
= findIntersecting(HstPtr
);
1166 return Plugin::error("Cannot find locked buffer");
1168 // Unregister from the locked buffer. No need to do anything if there are
1169 // others using the allocation.
1170 auto LastUseOrErr
= unregisterEntryUse(*Entry
);
1172 return LastUseOrErr
.takeError();
1174 // No need to do anything if there are others using the allocation.
1175 if (!(*LastUseOrErr
))
1176 return Plugin::success();
1178 // This was the last user of the allocation. Unlock the original locked buffer
1179 // if it was locked by the plugin. Do not unlock it if it was locked by an
1180 // external entity. Unlock the buffer using the host pointer of the entry.
1181 if (!Entry
->ExternallyLocked
)
1182 if (auto Err
= Device
.dataUnlockImpl(Entry
->HstPtr
))
1185 // Erase the entry from the map.
1186 return eraseEntry(*Entry
);
1189 Error
PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr
, size_t Size
) {
1190 assert(HstPtr
&& "Invalid pointer");
1191 assert(Size
&& "Invalid size");
1193 std::lock_guard
<std::shared_mutex
> Lock(Mutex
);
1195 // If previously registered, just register a new user on the entry.
1196 const EntryTy
*Entry
= findIntersecting(HstPtr
);
1198 return registerEntryUse(*Entry
, HstPtr
, Size
);
1201 void *BaseHstPtr
, *BaseDevAccessiblePtr
;
1203 // Check if it was externally pinned by a vendor-specific API.
1204 auto IsPinnedOrErr
= Device
.isPinnedPtrImpl(HstPtr
, BaseHstPtr
,
1205 BaseDevAccessiblePtr
, BaseSize
);
1207 return IsPinnedOrErr
.takeError();
1209 // If pinned, just insert the entry representing the whole pinned buffer.
1211 return insertEntry(BaseHstPtr
, BaseDevAccessiblePtr
, BaseSize
,
1212 /* Externally locked */ true);
1214 // Not externally pinned. Do nothing if locking of mapped buffers is disabled.
1215 if (!LockMappedBuffers
)
1216 return Plugin::success();
1218 // Otherwise, lock the buffer and insert the new entry.
1219 auto DevAccessiblePtrOrErr
= Device
.dataLockImpl(HstPtr
, Size
);
1220 if (!DevAccessiblePtrOrErr
) {
1221 // Errors may be tolerated.
1222 if (!IgnoreLockMappedFailures
)
1223 return DevAccessiblePtrOrErr
.takeError();
1225 consumeError(DevAccessiblePtrOrErr
.takeError());
1226 return Plugin::success();
1229 return insertEntry(HstPtr
, *DevAccessiblePtrOrErr
, Size
);
1232 Error
PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr
) {
1233 assert(HstPtr
&& "Invalid pointer");
1235 std::lock_guard
<std::shared_mutex
> Lock(Mutex
);
1237 // Check whether there is any intersecting entry.
1238 const EntryTy
*Entry
= findIntersecting(HstPtr
);
1240 // No entry but automatic locking of mapped buffers is disabled, so
1242 if (!Entry
&& !LockMappedBuffers
)
1243 return Plugin::success();
1245 // No entry, automatic locking is enabled, but the locking may have failed, so
1247 if (!Entry
&& IgnoreLockMappedFailures
)
1248 return Plugin::success();
1250 // No entry, but the automatic locking is enabled, so this is an error.
1252 return Plugin::error("Locked buffer not found");
1254 // There is entry, so unregister a user and check whether it was the last one.
1255 auto LastUseOrErr
= unregisterEntryUse(*Entry
);
1257 return LastUseOrErr
.takeError();
1259 // If it is not the last one, there is nothing to do.
1260 if (!(*LastUseOrErr
))
1261 return Plugin::success();
1263 // Otherwise, if it was the last and the buffer was locked by the plugin,
1265 if (!Entry
->ExternallyLocked
)
1266 if (auto Err
= Device
.dataUnlockImpl(Entry
->HstPtr
))
1269 // Finally erase the entry from the map.
1270 return eraseEntry(*Entry
);
1273 Error
GenericDeviceTy::synchronize(__tgt_async_info
*AsyncInfo
) {
1274 if (!AsyncInfo
|| !AsyncInfo
->Queue
)
1275 return Plugin::error("Invalid async info queue");
1277 if (auto Err
= synchronizeImpl(*AsyncInfo
))
1280 for (auto *Ptr
: AsyncInfo
->AssociatedAllocations
)
1281 if (auto Err
= dataDelete(Ptr
, TargetAllocTy::TARGET_ALLOC_DEVICE
))
1283 AsyncInfo
->AssociatedAllocations
.clear();
1285 return Plugin::success();
1288 Error
GenericDeviceTy::queryAsync(__tgt_async_info
*AsyncInfo
) {
1289 if (!AsyncInfo
|| !AsyncInfo
->Queue
)
1290 return Plugin::error("Invalid async info queue");
1292 return queryAsyncImpl(*AsyncInfo
);
1295 Error
GenericDeviceTy::memoryVAMap(void **Addr
, void *VAddr
, size_t *RSize
) {
1296 return Plugin::error("Device does not suppport VA Management");
1299 Error
GenericDeviceTy::memoryVAUnMap(void *VAddr
, size_t Size
) {
1300 return Plugin::error("Device does not suppport VA Management");
1303 Error
GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize
) {
1304 return Plugin::error(
1305 "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
1308 Expected
<void *> GenericDeviceTy::dataAlloc(int64_t Size
, void *HostPtr
,
1309 TargetAllocTy Kind
) {
1310 void *Alloc
= nullptr;
1312 if (RecordReplay
.isRecordingOrReplaying())
1313 return RecordReplay
.alloc(Size
);
1316 case TARGET_ALLOC_DEFAULT
:
1317 case TARGET_ALLOC_DEVICE
:
1318 if (MemoryManager
) {
1319 Alloc
= MemoryManager
->allocate(Size
, HostPtr
);
1321 return Plugin::error("Failed to allocate from memory manager");
1325 case TARGET_ALLOC_HOST
:
1326 case TARGET_ALLOC_SHARED
:
1327 Alloc
= allocate(Size
, HostPtr
, Kind
);
1329 return Plugin::error("Failed to allocate from device allocator");
1332 // Report error if the memory manager or the device allocator did not return
1333 // any memory buffer.
1335 return Plugin::error("Invalid target data allocation kind or requested "
1336 "allocator not implemented yet");
1338 // Register allocated buffer as pinned memory if the type is host memory.
1339 if (Kind
== TARGET_ALLOC_HOST
)
1340 if (auto Err
= PinnedAllocs
.registerHostBuffer(Alloc
, Alloc
, Size
))
1341 return std::move(Err
);
1346 Error
GenericDeviceTy::dataDelete(void *TgtPtr
, TargetAllocTy Kind
) {
1347 // Free is a noop when recording or replaying.
1348 if (RecordReplay
.isRecordingOrReplaying())
1349 return Plugin::success();
1353 Res
= MemoryManager
->free(TgtPtr
);
1355 Res
= free(TgtPtr
, Kind
);
1358 return Plugin::error("Failure to deallocate device pointer %p", TgtPtr
);
1360 // Unregister deallocated pinned memory buffer if the type is host memory.
1361 if (Kind
== TARGET_ALLOC_HOST
)
1362 if (auto Err
= PinnedAllocs
.unregisterHostBuffer(TgtPtr
))
1365 return Plugin::success();
1368 Error
GenericDeviceTy::dataSubmit(void *TgtPtr
, const void *HstPtr
,
1369 int64_t Size
, __tgt_async_info
*AsyncInfo
) {
1370 AsyncInfoWrapperTy
AsyncInfoWrapper(*this, AsyncInfo
);
1372 auto Err
= dataSubmitImpl(TgtPtr
, HstPtr
, Size
, AsyncInfoWrapper
);
1373 AsyncInfoWrapper
.finalize(Err
);
1377 Error
GenericDeviceTy::dataRetrieve(void *HstPtr
, const void *TgtPtr
,
1378 int64_t Size
, __tgt_async_info
*AsyncInfo
) {
1379 AsyncInfoWrapperTy
AsyncInfoWrapper(*this, AsyncInfo
);
1381 auto Err
= dataRetrieveImpl(HstPtr
, TgtPtr
, Size
, AsyncInfoWrapper
);
1382 AsyncInfoWrapper
.finalize(Err
);
1386 Error
GenericDeviceTy::dataExchange(const void *SrcPtr
, GenericDeviceTy
&DstDev
,
1387 void *DstPtr
, int64_t Size
,
1388 __tgt_async_info
*AsyncInfo
) {
1389 AsyncInfoWrapperTy
AsyncInfoWrapper(*this, AsyncInfo
);
1391 auto Err
= dataExchangeImpl(SrcPtr
, DstDev
, DstPtr
, Size
, AsyncInfoWrapper
);
1392 AsyncInfoWrapper
.finalize(Err
);
1396 Error
GenericDeviceTy::launchKernel(void *EntryPtr
, void **ArgPtrs
,
1397 ptrdiff_t *ArgOffsets
,
1398 KernelArgsTy
&KernelArgs
,
1399 __tgt_async_info
*AsyncInfo
) {
1400 AsyncInfoWrapperTy
AsyncInfoWrapper(
1401 *this, RecordReplay
.isRecordingOrReplaying() ? nullptr : AsyncInfo
);
1403 GenericKernelTy
&GenericKernel
=
1404 *reinterpret_cast<GenericKernelTy
*>(EntryPtr
);
1406 if (RecordReplay
.isRecording())
1407 RecordReplay
.saveKernelInputInfo(
1408 GenericKernel
.getName(), GenericKernel
.getImage(), ArgPtrs
, ArgOffsets
,
1409 KernelArgs
.NumArgs
, KernelArgs
.NumTeams
[0], KernelArgs
.ThreadLimit
[0],
1410 KernelArgs
.Tripcount
);
1412 if (RecordReplay
.isRecording())
1413 RecordReplay
.saveImage(GenericKernel
.getName(), GenericKernel
.getImage());
1415 auto Err
= GenericKernel
.launch(*this, ArgPtrs
, ArgOffsets
, KernelArgs
,
1418 // 'finalize' here to guarantee next record-replay actions are in-sync
1419 AsyncInfoWrapper
.finalize(Err
);
1421 if (RecordReplay
.isRecordingOrReplaying() &&
1422 RecordReplay
.isSaveOutputEnabled())
1423 RecordReplay
.saveKernelOutputInfo(GenericKernel
.getName());
1428 Error
GenericDeviceTy::initAsyncInfo(__tgt_async_info
**AsyncInfoPtr
) {
1429 assert(AsyncInfoPtr
&& "Invalid async info");
1431 *AsyncInfoPtr
= new __tgt_async_info();
1433 AsyncInfoWrapperTy
AsyncInfoWrapper(*this, *AsyncInfoPtr
);
1435 auto Err
= initAsyncInfoImpl(AsyncInfoWrapper
);
1436 AsyncInfoWrapper
.finalize(Err
);
1440 Error
GenericDeviceTy::initDeviceInfo(__tgt_device_info
*DeviceInfo
) {
1441 assert(DeviceInfo
&& "Invalid device info");
1443 return initDeviceInfoImpl(DeviceInfo
);
1446 Error
GenericDeviceTy::printInfo() {
1447 InfoQueueTy InfoQueue
;
1449 // Get the vendor-specific info entries describing the device properties.
1450 if (auto Err
= obtainInfoImpl(InfoQueue
))
1453 // Print all info entries.
1456 return Plugin::success();
1459 Error
GenericDeviceTy::createEvent(void **EventPtrStorage
) {
1460 return createEventImpl(EventPtrStorage
);
1463 Error
GenericDeviceTy::destroyEvent(void *EventPtr
) {
1464 return destroyEventImpl(EventPtr
);
1467 Error
GenericDeviceTy::recordEvent(void *EventPtr
,
1468 __tgt_async_info
*AsyncInfo
) {
1469 AsyncInfoWrapperTy
AsyncInfoWrapper(*this, AsyncInfo
);
1471 auto Err
= recordEventImpl(EventPtr
, AsyncInfoWrapper
);
1472 AsyncInfoWrapper
.finalize(Err
);
1476 Error
GenericDeviceTy::waitEvent(void *EventPtr
, __tgt_async_info
*AsyncInfo
) {
1477 AsyncInfoWrapperTy
AsyncInfoWrapper(*this, AsyncInfo
);
1479 auto Err
= waitEventImpl(EventPtr
, AsyncInfoWrapper
);
1480 AsyncInfoWrapper
.finalize(Err
);
1484 Error
GenericDeviceTy::syncEvent(void *EventPtr
) {
1485 return syncEventImpl(EventPtr
);
1488 Error
GenericPluginTy::init() {
1489 auto NumDevicesOrErr
= initImpl();
1490 if (!NumDevicesOrErr
)
1491 return NumDevicesOrErr
.takeError();
1493 NumDevices
= *NumDevicesOrErr
;
1494 if (NumDevices
== 0)
1495 return Plugin::success();
1497 assert(Devices
.size() == 0 && "Plugin already initialized");
1498 Devices
.resize(NumDevices
, nullptr);
1500 GlobalHandler
= Plugin::createGlobalHandler();
1501 assert(GlobalHandler
&& "Invalid global handler");
1503 RPCServer
= new RPCServerTy(NumDevices
);
1504 assert(RPCServer
&& "Invalid RPC server");
1506 return Plugin::success();
1509 Error
GenericPluginTy::deinit() {
1510 // Deinitialize all active devices.
1511 for (int32_t DeviceId
= 0; DeviceId
< NumDevices
; ++DeviceId
) {
1512 if (Devices
[DeviceId
]) {
1513 if (auto Err
= deinitDevice(DeviceId
))
1516 assert(!Devices
[DeviceId
] && "Device was not deinitialized");
1519 // There is no global handler if no device is available.
1521 delete GlobalHandler
;
1526 // Perform last deinitializations on the plugin.
1527 return deinitImpl();
1530 Error
GenericPluginTy::initDevice(int32_t DeviceId
) {
1531 assert(!Devices
[DeviceId
] && "Device already initialized");
1533 // Create the device and save the reference.
1534 GenericDeviceTy
*Device
= Plugin::createDevice(DeviceId
, NumDevices
);
1535 assert(Device
&& "Invalid device");
1537 // Save the device reference into the list.
1538 Devices
[DeviceId
] = Device
;
1540 // Initialize the device and its resources.
1541 return Device
->init(*this);
1544 Error
GenericPluginTy::deinitDevice(int32_t DeviceId
) {
1545 // The device may be already deinitialized.
1546 if (Devices
[DeviceId
] == nullptr)
1547 return Plugin::success();
1549 // Deinitialize the device and release its resources.
1550 if (auto Err
= Devices
[DeviceId
]->deinit(*this))
1553 // Delete the device and invalidate its reference.
1554 delete Devices
[DeviceId
];
1555 Devices
[DeviceId
] = nullptr;
1557 return Plugin::success();
1560 const bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
1561 #ifdef LIBOMPTARGET_RPC_SUPPORT
1568 /// Exposed library API function, basically wrappers around the GenericDeviceTy
1569 /// functionality with the same name. All non-async functions are redirected
1570 /// to the async versions right away with a NULL AsyncInfoPtr.
1575 int32_t __tgt_rtl_init_plugin() {
1576 auto Err
= Plugin::initIfNeeded();
1578 REPORT("Failure to initialize plugin " GETNAME(TARGET_NAME
) ": %s\n",
1579 toString(std::move(Err
)).data());
1580 return OFFLOAD_FAIL
;
1583 return OFFLOAD_SUCCESS
;
1586 int32_t __tgt_rtl_deinit_plugin() {
1587 auto Err
= Plugin::deinitIfNeeded();
1589 REPORT("Failure to deinitialize plugin " GETNAME(TARGET_NAME
) ": %s\n",
1590 toString(std::move(Err
)).data());
1591 return OFFLOAD_FAIL
;
1594 return OFFLOAD_SUCCESS
;
1597 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image
*TgtImage
) {
1598 if (!Plugin::isActive())
1601 if (elf_check_machine(TgtImage
, Plugin::get().getMagicElfBits()))
1604 return Plugin::get().getJIT().checkBitcodeImage(*TgtImage
);
1607 int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image
*TgtImage
,
1608 __tgt_image_info
*Info
) {
1609 if (!Plugin::isActive())
1612 if (!__tgt_rtl_is_valid_binary(TgtImage
))
1615 // A subarchitecture was not specified. Assume it is compatible.
1619 // Check the compatibility with all the available devices. Notice the
1620 // devices may not be initialized yet.
1621 auto CompatibleOrErr
= Plugin::get().isImageCompatible(Info
);
1622 if (!CompatibleOrErr
) {
1623 // This error should not abort the execution, so we just inform the user
1624 // through the debug system.
1625 std::string ErrString
= toString(CompatibleOrErr
.takeError());
1626 DP("Failure to check whether image %p is valid: %s\n", TgtImage
,
1631 bool Compatible
= *CompatibleOrErr
;
1632 DP("Image is %scompatible with current environment: %s\n",
1633 (Compatible
) ? "" : "not", Info
->Arch
);
1638 int32_t __tgt_rtl_supports_empty_images() {
1639 return Plugin::get().supportsEmptyImages();
1642 int32_t __tgt_rtl_init_device(int32_t DeviceId
) {
1643 auto Err
= Plugin::get().initDevice(DeviceId
);
1645 REPORT("Failure to initialize device %d: %s\n", DeviceId
,
1646 toString(std::move(Err
)).data());
1647 return OFFLOAD_FAIL
;
1650 return OFFLOAD_SUCCESS
;
1653 int32_t __tgt_rtl_deinit_device(int32_t DeviceId
) {
1654 auto Err
= Plugin::get().deinitDevice(DeviceId
);
1656 REPORT("Failure to deinitialize device %d: %s\n", DeviceId
,
1657 toString(std::move(Err
)).data());
1658 return OFFLOAD_FAIL
;
1661 return OFFLOAD_SUCCESS
;
1664 int32_t __tgt_rtl_number_of_devices() { return Plugin::get().getNumDevices(); }
1666 int64_t __tgt_rtl_init_requires(int64_t RequiresFlags
) {
1667 Plugin::get().setRequiresFlag(RequiresFlags
);
1668 return RequiresFlags
;
1671 int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId
,
1672 int32_t DstDeviceId
) {
1673 return Plugin::get().isDataExchangable(SrcDeviceId
, DstDeviceId
);
1676 int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId
, int64_t MemorySize
,
1677 void *VAddr
, bool isRecord
,
1679 GenericPluginTy
&Plugin
= Plugin::get();
1680 GenericDeviceTy
&Device
= Plugin
.getDevice(DeviceId
);
1681 RecordReplayTy::RRStatusTy Status
=
1682 isRecord
? RecordReplayTy::RRStatusTy::RRRecording
1683 : RecordReplayTy::RRStatusTy::RRReplaying
;
1686 RecordReplay
.init(&Device
, MemorySize
, VAddr
, Status
, SaveOutput
)) {
1687 REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
1689 MemorySize
, toString(std::move(Err
)).data());
1690 RecordReplay
.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated
);
1693 return OFFLOAD_FAIL
;
1696 return OFFLOAD_SUCCESS
;
1699 __tgt_target_table
*__tgt_rtl_load_binary(int32_t DeviceId
,
1700 __tgt_device_image
*TgtImage
) {
1701 GenericPluginTy
&Plugin
= Plugin::get();
1702 GenericDeviceTy
&Device
= Plugin
.getDevice(DeviceId
);
1704 auto TableOrErr
= Device
.loadBinary(Plugin
, TgtImage
);
1706 auto Err
= TableOrErr
.takeError();
1707 REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage
,
1708 DeviceId
, toString(std::move(Err
)).data());
1712 __tgt_target_table
*Table
= *TableOrErr
;
1713 assert(Table
!= nullptr && "Invalid table");
1718 void *__tgt_rtl_data_alloc(int32_t DeviceId
, int64_t Size
, void *HostPtr
,
1720 auto AllocOrErr
= Plugin::get().getDevice(DeviceId
).dataAlloc(
1721 Size
, HostPtr
, (TargetAllocTy
)Kind
);
1723 auto Err
= AllocOrErr
.takeError();
1724 REPORT("Failure to allocate device memory: %s\n",
1725 toString(std::move(Err
)).data());
1728 assert(*AllocOrErr
&& "Null pointer upon successful allocation");
1733 int32_t __tgt_rtl_data_delete(int32_t DeviceId
, void *TgtPtr
, int32_t Kind
) {
1735 Plugin::get().getDevice(DeviceId
).dataDelete(TgtPtr
, (TargetAllocTy
)Kind
);
1737 REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr
,
1738 toString(std::move(Err
)).data());
1739 return OFFLOAD_FAIL
;
1742 return OFFLOAD_SUCCESS
;
1745 int32_t __tgt_rtl_data_lock(int32_t DeviceId
, void *Ptr
, int64_t Size
,
1747 auto LockedPtrOrErr
= Plugin::get().getDevice(DeviceId
).dataLock(Ptr
, Size
);
1748 if (!LockedPtrOrErr
) {
1749 auto Err
= LockedPtrOrErr
.takeError();
1750 REPORT("Failure to lock memory %p: %s\n", Ptr
,
1751 toString(std::move(Err
)).data());
1752 return OFFLOAD_FAIL
;
1755 if (!(*LockedPtrOrErr
)) {
1756 REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr
);
1757 return OFFLOAD_FAIL
;
1759 *LockedPtr
= *LockedPtrOrErr
;
1761 return OFFLOAD_SUCCESS
;
1764 int32_t __tgt_rtl_data_unlock(int32_t DeviceId
, void *Ptr
) {
1765 auto Err
= Plugin::get().getDevice(DeviceId
).dataUnlock(Ptr
);
1767 REPORT("Failure to unlock memory %p: %s\n", Ptr
,
1768 toString(std::move(Err
)).data());
1769 return OFFLOAD_FAIL
;
1772 return OFFLOAD_SUCCESS
;
1775 int32_t __tgt_rtl_data_notify_mapped(int32_t DeviceId
, void *HstPtr
,
1777 auto Err
= Plugin::get().getDevice(DeviceId
).notifyDataMapped(HstPtr
, Size
);
1779 REPORT("Failure to notify data mapped %p: %s\n", HstPtr
,
1780 toString(std::move(Err
)).data());
1781 return OFFLOAD_FAIL
;
1784 return OFFLOAD_SUCCESS
;
1787 int32_t __tgt_rtl_data_notify_unmapped(int32_t DeviceId
, void *HstPtr
) {
1788 auto Err
= Plugin::get().getDevice(DeviceId
).notifyDataUnmapped(HstPtr
);
1790 REPORT("Failure to notify data unmapped %p: %s\n", HstPtr
,
1791 toString(std::move(Err
)).data());
1792 return OFFLOAD_FAIL
;
1795 return OFFLOAD_SUCCESS
;
1798 int32_t __tgt_rtl_data_submit(int32_t DeviceId
, void *TgtPtr
, void *HstPtr
,
1800 return __tgt_rtl_data_submit_async(DeviceId
, TgtPtr
, HstPtr
, Size
,
1801 /* AsyncInfoPtr */ nullptr);
1804 int32_t __tgt_rtl_data_submit_async(int32_t DeviceId
, void *TgtPtr
,
1805 void *HstPtr
, int64_t Size
,
1806 __tgt_async_info
*AsyncInfoPtr
) {
1807 auto Err
= Plugin::get().getDevice(DeviceId
).dataSubmit(TgtPtr
, HstPtr
, Size
,
1810 REPORT("Failure to copy data from host to device. Pointers: host "
1811 "= " DPxMOD
", device = " DPxMOD
", size = %" PRId64
": %s\n",
1812 DPxPTR(HstPtr
), DPxPTR(TgtPtr
), Size
,
1813 toString(std::move(Err
)).data());
1814 return OFFLOAD_FAIL
;
1817 return OFFLOAD_SUCCESS
;
1820 int32_t __tgt_rtl_data_retrieve(int32_t DeviceId
, void *HstPtr
, void *TgtPtr
,
1822 return __tgt_rtl_data_retrieve_async(DeviceId
, HstPtr
, TgtPtr
, Size
,
1823 /* AsyncInfoPtr */ nullptr);
1826 int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId
, void *HstPtr
,
1827 void *TgtPtr
, int64_t Size
,
1828 __tgt_async_info
*AsyncInfoPtr
) {
1829 auto Err
= Plugin::get().getDevice(DeviceId
).dataRetrieve(HstPtr
, TgtPtr
,
1830 Size
, AsyncInfoPtr
);
1832 REPORT("Faliure to copy data from device to host. Pointers: host "
1833 "= " DPxMOD
", device = " DPxMOD
", size = %" PRId64
": %s\n",
1834 DPxPTR(HstPtr
), DPxPTR(TgtPtr
), Size
,
1835 toString(std::move(Err
)).data());
1836 return OFFLOAD_FAIL
;
1839 return OFFLOAD_SUCCESS
;
1842 int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId
, void *SrcPtr
,
1843 int32_t DstDeviceId
, void *DstPtr
,
1845 return __tgt_rtl_data_exchange_async(SrcDeviceId
, SrcPtr
, DstDeviceId
, DstPtr
,
1846 Size
, /* AsyncInfoPtr */ nullptr);
1849 int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId
, void *SrcPtr
,
1850 int DstDeviceId
, void *DstPtr
,
1852 __tgt_async_info
*AsyncInfo
) {
1853 GenericDeviceTy
&SrcDevice
= Plugin::get().getDevice(SrcDeviceId
);
1854 GenericDeviceTy
&DstDevice
= Plugin::get().getDevice(DstDeviceId
);
1855 auto Err
= SrcDevice
.dataExchange(SrcPtr
, DstDevice
, DstPtr
, Size
, AsyncInfo
);
1857 REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
1858 "host = " DPxMOD
", device = " DPxMOD
", size = %" PRId64
": %s\n",
1859 SrcDeviceId
, DstDeviceId
, DPxPTR(SrcPtr
), DPxPTR(DstPtr
), Size
,
1860 toString(std::move(Err
)).data());
1861 return OFFLOAD_FAIL
;
1864 return OFFLOAD_SUCCESS
;
1867 int32_t __tgt_rtl_launch_kernel(int32_t DeviceId
, void *TgtEntryPtr
,
1868 void **TgtArgs
, ptrdiff_t *TgtOffsets
,
1869 KernelArgsTy
*KernelArgs
,
1870 __tgt_async_info
*AsyncInfoPtr
) {
1871 auto Err
= Plugin::get().getDevice(DeviceId
).launchKernel(
1872 TgtEntryPtr
, TgtArgs
, TgtOffsets
, *KernelArgs
, AsyncInfoPtr
);
1874 REPORT("Failure to run target region " DPxMOD
" in device %d: %s\n",
1875 DPxPTR(TgtEntryPtr
), DeviceId
, toString(std::move(Err
)).data());
1876 return OFFLOAD_FAIL
;
1879 return OFFLOAD_SUCCESS
;
1882 int32_t __tgt_rtl_synchronize(int32_t DeviceId
,
1883 __tgt_async_info
*AsyncInfoPtr
) {
1884 auto Err
= Plugin::get().getDevice(DeviceId
).synchronize(AsyncInfoPtr
);
1886 REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr
->Queue
,
1887 toString(std::move(Err
)).data());
1888 return OFFLOAD_FAIL
;
1891 return OFFLOAD_SUCCESS
;
1894 int32_t __tgt_rtl_query_async(int32_t DeviceId
,
1895 __tgt_async_info
*AsyncInfoPtr
) {
1896 auto Err
= Plugin::get().getDevice(DeviceId
).queryAsync(AsyncInfoPtr
);
1898 REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr
->Queue
,
1899 toString(std::move(Err
)).data());
1900 return OFFLOAD_FAIL
;
1903 return OFFLOAD_SUCCESS
;
1906 void __tgt_rtl_print_device_info(int32_t DeviceId
) {
1907 if (auto Err
= Plugin::get().getDevice(DeviceId
).printInfo())
1908 REPORT("Failure to print device %d info: %s\n", DeviceId
,
1909 toString(std::move(Err
)).data());
1912 int32_t __tgt_rtl_create_event(int32_t DeviceId
, void **EventPtr
) {
1913 auto Err
= Plugin::get().getDevice(DeviceId
).createEvent(EventPtr
);
1915 REPORT("Failure to create event: %s\n", toString(std::move(Err
)).data());
1916 return OFFLOAD_FAIL
;
1919 return OFFLOAD_SUCCESS
;
1922 int32_t __tgt_rtl_record_event(int32_t DeviceId
, void *EventPtr
,
1923 __tgt_async_info
*AsyncInfoPtr
) {
1925 Plugin::get().getDevice(DeviceId
).recordEvent(EventPtr
, AsyncInfoPtr
);
1927 REPORT("Failure to record event %p: %s\n", EventPtr
,
1928 toString(std::move(Err
)).data());
1929 return OFFLOAD_FAIL
;
1932 return OFFLOAD_SUCCESS
;
1935 int32_t __tgt_rtl_wait_event(int32_t DeviceId
, void *EventPtr
,
1936 __tgt_async_info
*AsyncInfoPtr
) {
1938 Plugin::get().getDevice(DeviceId
).waitEvent(EventPtr
, AsyncInfoPtr
);
1940 REPORT("Failure to wait event %p: %s\n", EventPtr
,
1941 toString(std::move(Err
)).data());
1942 return OFFLOAD_FAIL
;
1945 return OFFLOAD_SUCCESS
;
1948 int32_t __tgt_rtl_sync_event(int32_t DeviceId
, void *EventPtr
) {
1949 auto Err
= Plugin::get().getDevice(DeviceId
).syncEvent(EventPtr
);
1951 REPORT("Failure to synchronize event %p: %s\n", EventPtr
,
1952 toString(std::move(Err
)).data());
1953 return OFFLOAD_FAIL
;
1956 return OFFLOAD_SUCCESS
;
1959 int32_t __tgt_rtl_destroy_event(int32_t DeviceId
, void *EventPtr
) {
1960 auto Err
= Plugin::get().getDevice(DeviceId
).destroyEvent(EventPtr
);
1962 REPORT("Failure to destroy event %p: %s\n", EventPtr
,
1963 toString(std::move(Err
)).data());
1964 return OFFLOAD_FAIL
;
1967 return OFFLOAD_SUCCESS
;
1970 void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel
) {
1971 std::atomic
<uint32_t> &InfoLevel
= getInfoLevelInternal();
1972 InfoLevel
.store(NewInfoLevel
);
1975 int32_t __tgt_rtl_init_async_info(int32_t DeviceId
,
1976 __tgt_async_info
**AsyncInfoPtr
) {
1977 assert(AsyncInfoPtr
&& "Invalid async info");
1979 auto Err
= Plugin::get().getDevice(DeviceId
).initAsyncInfo(AsyncInfoPtr
);
1981 REPORT("Failure to initialize async info at " DPxMOD
" on device %d: %s\n",
1982 DPxPTR(*AsyncInfoPtr
), DeviceId
, toString(std::move(Err
)).data());
1983 return OFFLOAD_FAIL
;
1986 return OFFLOAD_SUCCESS
;
1989 int32_t __tgt_rtl_init_device_info(int32_t DeviceId
,
1990 __tgt_device_info
*DeviceInfo
,
1991 const char **ErrStr
) {
1994 auto Err
= Plugin::get().getDevice(DeviceId
).initDeviceInfo(DeviceInfo
);
1996 REPORT("Failure to initialize device info at " DPxMOD
" on device %d: %s\n",
1997 DPxPTR(DeviceInfo
), DeviceId
, toString(std::move(Err
)).data());
1998 return OFFLOAD_FAIL
;
2001 return OFFLOAD_SUCCESS
;
2004 int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset
) {
2005 Plugin::get().setDeviceIdStartIndex(DeviceIdOffset
);
2007 return OFFLOAD_SUCCESS
;