1 //===- PluginInterface.cpp - Target independent plugin device interface ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //===----------------------------------------------------------------------===//
11 #include "PluginInterface.h"
12 #include "Debug.h"
13 #include "Environment.h"
14 #include "GlobalHandler.h"
15 #include "JIT.h"
16 #include "elf_common.h"
17 #include "omptarget.h"
18 #include "omptargetplugin.h"
20 #ifdef OMPT_SUPPORT
21 #include "OmptCallback.h"
22 #include "omp-tools.h"
23 #endif
25 #include "llvm/Frontend/OpenMP/OMPConstants.h"
26 #include "llvm/Support/Error.h"
27 #include "llvm/Support/JSON.h"
28 #include "llvm/Support/MathExtras.h"
29 #include "llvm/Support/MemoryBuffer.h"
31 #include <cstdint>
32 #include <limits>
34 using namespace llvm;
35 using namespace omp;
36 using namespace target;
37 using namespace plugin;
39 GenericPluginTy *Plugin::SpecificPlugin = nullptr;
41 // TODO: Fix any thread safety issues for multi-threaded kernel recording.
42 struct RecordReplayTy {
44 // Describes the state of the record replay mechanism.
45 enum RRStatusTy { RRDeactivated = 0, RRRecording, RRReplaying };
47 private:
48 // Memory pointers used for recording and replaying device memory.
49 void *MemoryStart;
50 void *MemoryPtr;
51 size_t MemorySize;
52 size_t TotalSize;
53 GenericDeviceTy *Device;
54 std::mutex AllocationLock;
56 RRStatusTy Status;
57 bool ReplaySaveOutput;
59 void *suggestAddress(uint64_t MaxMemoryAllocation) {
60 // Get a valid pointer address for this system
61 void *Addr =
62 Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
63 Device->free(Addr);
64 // Align Address to MaxMemoryAllocation
65 Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
66 return Addr;
69 Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
70 size_t ASize = MaxMemoryAllocation;
72 if (!VAddr && isRecording())
73 VAddr = suggestAddress(MaxMemoryAllocation);
75 DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
77 if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
78 return Err;
80 if (isReplaying() && VAddr != MemoryStart) {
81 return Plugin::error("Record-Replay cannot assign the "
82 "requested recorded address (%p, %p)",
83 VAddr, MemoryStart);
86 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
87 "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
89 MemoryPtr = MemoryStart;
90 MemorySize = 0;
91 TotalSize = ASize;
92 return Plugin::success();
95 Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
96 const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
97 constexpr size_t STEP = 1024 * 1024 * 1024ULL;
98 MemoryStart = nullptr;
99 for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
100 MemoryStart = Device->allocate(TotalSize, /* HstPtr */ nullptr,
101 TARGET_ALLOC_DEFAULT);
102 if (MemoryStart)
103 break;
106 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
107 "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
108 MemoryStart);
110 if (!MemoryStart)
111 return Plugin::error("Allocating record/replay memory");
113 if (VAddr && VAddr != MemoryStart)
114 return Plugin::error("Cannot allocate recorded address");
116 MemoryPtr = MemoryStart;
117 MemorySize = 0;
119 return Plugin::success();
122 Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
123 if (Device->supportVAManagement())
124 return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
126 uint64_t DevMemSize;
127 if (Device->getDeviceMemorySize(DevMemSize))
128 return Plugin::error("Cannot determine Device Memory Size");
130 return preAllocateHeuristic(DevMemSize, ReqVAddr);
133 void dumpDeviceMemory(StringRef Filename) {
134 ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
135 WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
136 if (!DeviceMemoryMB)
137 report_fatal_error("Error creating MemoryBuffer for device memory");
139 auto Err = Device->dataRetrieve(DeviceMemoryMB.get()->getBufferStart(),
140 MemoryStart, MemorySize, nullptr);
141 if (Err)
142 report_fatal_error("Error retrieving data for target pointer");
144 StringRef DeviceMemory(DeviceMemoryMB.get()->getBufferStart(), MemorySize);
145 std::error_code EC;
146 raw_fd_ostream OS(Filename, EC);
147 if (EC)
148 report_fatal_error("Error dumping memory to file " + Filename + ": " +
149 EC.message());
150 OS << DeviceMemory;
151 OS.close();
154 public:
155 bool isRecording() const { return Status == RRStatusTy::RRRecording; }
156 bool isReplaying() const { return Status == RRStatusTy::RRReplaying; }
157 bool isRecordingOrReplaying() const {
158 return (Status != RRStatusTy::RRDeactivated);
160 void setStatus(RRStatusTy Status) { this->Status = Status; }
161 bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
163 RecordReplayTy()
164 : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false) {}
166 void saveImage(const char *Name, const DeviceImageTy &Image) {
167 SmallString<128> ImageName = {Name, ".image"};
168 std::error_code EC;
169 raw_fd_ostream OS(ImageName, EC);
170 if (EC)
171 report_fatal_error("Error saving image: " + StringRef(EC.message()));
172 if (const auto *TgtImageBitcode = Image.getTgtImageBitcode()) {
173 size_t Size =
174 getPtrDiff(TgtImageBitcode->ImageEnd, TgtImageBitcode->ImageStart);
175 MemoryBufferRef MBR = MemoryBufferRef(
176 StringRef((const char *)TgtImageBitcode->ImageStart, Size), "");
177 OS << MBR.getBuffer();
178 } else {
179 OS << Image.getMemoryBuffer().getBuffer();
181 OS.close();
184 void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
185 int32_t Size = 0;
187 for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
188 if (!OffloadEntry.size)
189 continue;
190 Size += std::strlen(OffloadEntry.name) + /* '\0' */ 1 +
191 /* OffloadEntry.size value */ sizeof(uint32_t) +
192 OffloadEntry.size;
195 ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
196 WritableMemoryBuffer::getNewUninitMemBuffer(Size);
197 if (!GlobalsMB)
198 report_fatal_error("Error creating MemoryBuffer for globals memory");
200 void *BufferPtr = GlobalsMB.get()->getBufferStart();
201 for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
202 if (!OffloadEntry.size)
203 continue;
205 int32_t NameLength = std::strlen(OffloadEntry.name) + 1;
206 memcpy(BufferPtr, OffloadEntry.name, NameLength);
207 BufferPtr = advanceVoidPtr(BufferPtr, NameLength);
209 *((uint32_t *)(BufferPtr)) = OffloadEntry.size;
210 BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t));
214 if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.addr,
215 OffloadEntry.size, nullptr))
216 report_fatal_error("Error retrieving data for global");
220 BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size);
222 assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
223 "Buffer over/under-filled.");
224 assert(Size == getPtrDiff(BufferPtr, GlobalsMB->get()->getBufferStart()) &&
225 "Buffer size mismatch");
227 StringRef GlobalsMemory(GlobalsMB.get()->getBufferStart(), Size);
228 std::error_code EC;
229 raw_fd_ostream OS(Filename, EC);
230 OS << GlobalsMemory;
231 OS.close();
234 void saveKernelInputInfo(const char *Name, DeviceImageTy &Image,
235 void **ArgPtrs, ptrdiff_t *ArgOffsets,
236 int32_t NumArgs, uint64_t NumTeamsClause,
237 uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
238 json::Object JsonKernelInfo;
239 JsonKernelInfo["Name"] = Name;
240 JsonKernelInfo["NumArgs"] = NumArgs;
241 JsonKernelInfo["NumTeamsClause"] = NumTeamsClause;
242 JsonKernelInfo["ThreadLimitClause"] = ThreadLimitClause;
243 JsonKernelInfo["LoopTripCount"] = LoopTripCount;
244 JsonKernelInfo["DeviceMemorySize"] = MemorySize;
245 JsonKernelInfo["DeviceId"] = Device->getDeviceId();
246 JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
248 json::Array JsonArgPtrs;
249 for (int I = 0; I < NumArgs; ++I)
250 JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
251 JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
253 json::Array JsonArgOffsets;
254 for (int I = 0; I < NumArgs; ++I)
255 JsonArgOffsets.push_back(ArgOffsets[I]);
256 JsonKernelInfo["ArgOffsets"] = json::Value(std::move(JsonArgOffsets));
258 SmallString<128> MemoryFilename = {Name, ".memory"};
259 dumpDeviceMemory(MemoryFilename);
261 SmallString<128> GlobalsFilename = {Name, ".globals"};
262 dumpGlobals(GlobalsFilename, Image);
264 SmallString<128> JsonFilename = {Name, ".json"};
265 std::error_code EC;
266 raw_fd_ostream JsonOS(JsonFilename.str(), EC);
267 if (EC)
268 report_fatal_error("Error saving kernel JSON file: " +
269 StringRef(EC.message()));
270 JsonOS << json::Value(std::move(JsonKernelInfo));
271 JsonOS.close();
274 void saveKernelOutputInfo(const char *Name) {
275 SmallString<128> OutputFilename = {
276 Name, (isRecording() ? ".original.output" : ".replay.output")};
277 dumpDeviceMemory(OutputFilename);
280 void *alloc(uint64_t Size) {
281 assert(MemoryStart && "Expected memory has been pre-allocated");
282 void *Alloc = nullptr;
283 constexpr int Alignment = 16;
284 // Assumes alignment is a power of 2.
285 int64_t AlignedSize = (Size + (Alignment - 1)) & (~(Alignment - 1));
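// Worked example: Size = 10, Alignment = 16 -> AlignedSize = (10 + 15) & ~15 = 16.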
286 std::lock_guard<std::mutex> LG(AllocationLock);
287 Alloc = MemoryPtr;
288 MemoryPtr = (char *)MemoryPtr + AlignedSize;
289 MemorySize += AlignedSize;
290 DP("Memory Allocator return " DPxMOD "\n", DPxPTR(Alloc));
291 return Alloc;
294 Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
295 RRStatusTy Status, bool SaveOutput) {
296 this->Device = Device;
297 this->Status = Status;
298 this->ReplaySaveOutput = SaveOutput;
300 if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
301 return Err;
303 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
304 "Record Replay Initialized (%p)"
305 " as starting address, %lu Memory Size"
306 " and set on status %s\n",
307 MemoryStart, TotalSize,
308 Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
310 return Plugin::success();
313 void deinit() {
314 if (Device->supportVAManagement()) {
315 if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
316 report_fatal_error("Error on releasing virtual memory space");
317 } else {
318 Device->free(MemoryStart);
322 } RecordReplay;
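// A minimal sketch of how the record path is driven, assuming a single device
// (illustrative only; the actual entry point is __tgt_rtl_initialize_record_replay
// further below, and all error handling is elided):
//   GenericDeviceTy &Device = Plugin::get().getDevice(/*DeviceId=*/0);
//   (void)RecordReplay.init(&Device, /*MemSize=*/1ULL << 30, /*VAddr=*/nullptr,
//                           RecordReplayTy::RRRecording, /*SaveOutput=*/true);
//   void *Ptr = RecordReplay.alloc(/*Size=*/256); // bump-pointer allocation
//   // ... launch kernels; saveKernelInputInfo() captures their inputs ...
//   RecordReplay.deinit();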
324 // Extract the mapping of host function pointers to device function pointers
325 // from the entry table. Functions marked as 'indirect' in OpenMP will have
326 // offloading entries generated for them which map the host's function pointer
327 // to a global containing the corresponding function pointer on the device.
328 static Expected<std::pair<void *, uint64_t>>
329 setupIndirectCallTable(GenericPluginTy &Plugin, GenericDeviceTy &Device,
330 DeviceImageTy &Image) {
331 GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
333 llvm::ArrayRef<__tgt_offload_entry> Entries(Image.getTgtImage()->EntriesBegin,
334 Image.getTgtImage()->EntriesEnd);
335 llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
336 for (const auto &Entry : Entries) {
337 if (Entry.size == 0 || !(Entry.flags & OMP_DECLARE_TARGET_INDIRECT))
338 continue;
340 assert(Entry.size == sizeof(void *) && "Global not a function pointer?");
341 auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
343 GlobalTy DeviceGlobal(Entry.name, Entry.size);
344 if (auto Err =
345 Handler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal))
346 return std::move(Err);
348 HstPtr = Entry.addr;
349 if (auto Err = Device.dataRetrieve(&DevPtr, DeviceGlobal.getPtr(),
350 Entry.size, nullptr))
351 return std::move(Err);
354 // If we do not have any indirect globals we exit early.
355 if (IndirectCallTable.empty())
356 return std::pair{nullptr, 0};
358 // Sort the array to allow for more efficient lookup of device pointers.
359 llvm::sort(IndirectCallTable,
360 [](const auto &x, const auto &y) { return x.first < y.first; });
362 uint64_t TableSize =
363 IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
364 void *DevicePtr = Device.allocate(TableSize, nullptr, TARGET_ALLOC_DEVICE);
365 if (auto Err = Device.dataSubmit(DevicePtr, IndirectCallTable.data(),
366 TableSize, nullptr))
367 return std::move(Err);
368 return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
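// For reference, the entries consumed above come from user code along these
// lines (illustrative; the exact pragma spelling depends on the OpenMP
// version):
//   int callee(void);
//   #pragma omp declare target indirect to(callee)
// The host function pointer is then mapped to a device global that holds the
// corresponding device function pointer.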
371 AsyncInfoWrapperTy::AsyncInfoWrapperTy(GenericDeviceTy &Device,
372 __tgt_async_info *AsyncInfoPtr)
373 : Device(Device),
374 AsyncInfoPtr(AsyncInfoPtr ? AsyncInfoPtr : &LocalAsyncInfo) {}
376 void AsyncInfoWrapperTy::finalize(Error &Err) {
377 assert(AsyncInfoPtr && "AsyncInfoWrapperTy already finalized");
379 // If we used a local async info object we want synchronous behavior. In that
380 // case, and assuming the current status code is correct, we will synchronize
381 // explicitly when the object is deleted. Update the error with the result of
382 // the synchronize operation.
383 if (AsyncInfoPtr == &LocalAsyncInfo && LocalAsyncInfo.Queue && !Err)
384 Err = Device.synchronize(&LocalAsyncInfo);
386 // Invalidate the wrapper object.
387 AsyncInfoPtr = nullptr;
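// Typical usage pattern (see, e.g., GenericDeviceTy::dataSubmit below):
//   AsyncInfoWrapperTy Wrapper(Device, AsyncInfo);
//   auto Err = someOperationImpl(..., Wrapper);
//   Wrapper.finalize(Err); // Synchronizes if a local async info was used.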
390 Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
391 DeviceImageTy &Image) {
393 ImagePtr = &Image;
395 // Retrieve kernel environment object for the kernel.
396 GlobalTy KernelEnv(std::string(Name) + "_kernel_environment",
397 sizeof(KernelEnvironment), &KernelEnvironment);
398 GenericGlobalHandlerTy &GHandler = Plugin::get().getGlobalHandler();
399 if (auto Err =
400 GHandler.readGlobalFromImage(GenericDevice, *ImagePtr, KernelEnv)) {
401 [[maybe_unused]] std::string ErrStr = toString(std::move(Err));
402 DP("Failed to read kernel environment for '%s': %s\n"
403 "Using default SPMD (2) execution mode\n",
404 Name, ErrStr.data());
405 assert(KernelEnvironment.Configuration.ReductionBufferSize == 0 &&
406 "Default initialization failed.");
409 // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
410 MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0
411 ? std::min(KernelEnvironment.Configuration.MaxThreads,
412 int32_t(GenericDevice.getThreadLimit()))
413 : GenericDevice.getThreadLimit();
415 // Pref = Config.Pref > 0 ? max(Config.Pref, Device.Pref) : Device.Pref;
416 PreferredNumThreads =
417 KernelEnvironment.Configuration.MinThreads > 0
418 ? std::max(KernelEnvironment.Configuration.MinThreads,
419 int32_t(GenericDevice.getDefaultNumThreads()))
420 : GenericDevice.getDefaultNumThreads();
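// Worked example: Config.MaxThreads = 128 with a device limit of 1024 yields
// MaxNumThreads = 128; Config.MinThreads = 0 leaves PreferredNumThreads at
// the device default.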
422 return initImpl(GenericDevice, Image);
425 Expected<KernelLaunchEnvironmentTy *>
426 GenericKernelTy::getKernelLaunchEnvironment(
427 GenericDeviceTy &GenericDevice,
428 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
429 // TODO: Check if the kernel needs a launch environment.
430 auto AllocOrErr = GenericDevice.dataAlloc(sizeof(KernelLaunchEnvironmentTy),
431 /*HostPtr=*/nullptr,
432 TargetAllocTy::TARGET_ALLOC_DEVICE);
433 if (!AllocOrErr)
434 return AllocOrErr.takeError();
436 // Remember to free the memory later.
437 AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
439 /// Use the KLE in the __tgt_async_info to ensure a stable address for the
440 /// async data transfer.
441 auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
442 LocalKLE = KernelLaunchEnvironment;
443 if (KernelEnvironment.Configuration.ReductionBufferSize) {
444 auto AllocOrErr = GenericDevice.dataAlloc(
445 KernelEnvironment.Configuration.ReductionBufferSize,
446 /*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
447 if (!AllocOrErr)
448 return AllocOrErr.takeError();
449 LocalKLE.ReductionBuffer = *AllocOrErr;
450 // Remember to free the memory later.
451 AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
454 auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
455 sizeof(KernelLaunchEnvironmentTy),
456 AsyncInfoWrapper);
457 if (Err)
458 return Err;
459 return static_cast<KernelLaunchEnvironmentTy *>(*AllocOrErr);
462 Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
463 KernelArgsTy &KernelArgs,
464 uint32_t NumThreads,
465 uint64_t NumBlocks) const {
466 INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
467 "Launching kernel %s with %" PRIu64
468 " blocks and %d threads in %s mode\n",
469 getName(), NumBlocks, NumThreads, getExecutionModeName());
470 return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
471 NumBlocks);
474 Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
475 KernelArgsTy &KernelArgs,
476 uint32_t NumThreads,
477 uint64_t NumBlocks) const {
478 return Plugin::success();
481 Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
482 ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
483 AsyncInfoWrapperTy &AsyncInfoWrapper) const {
484 llvm::SmallVector<void *, 16> Args;
485 llvm::SmallVector<void *, 16> Ptrs;
487 auto KernelLaunchEnvOrErr =
488 getKernelLaunchEnvironment(GenericDevice, AsyncInfoWrapper);
489 if (!KernelLaunchEnvOrErr)
490 return KernelLaunchEnvOrErr.takeError();
492 void *KernelArgsPtr =
493 prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
494 Ptrs, *KernelLaunchEnvOrErr);
496 uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
497 uint64_t NumBlocks =
498 getNumBlocks(GenericDevice, KernelArgs.NumTeams, KernelArgs.Tripcount,
499 NumThreads, KernelArgs.ThreadLimit[0] > 0);
501 if (auto Err =
502 printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
503 return Err;
505 return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
506 KernelArgsPtr, AsyncInfoWrapper);
509 void *GenericKernelTy::prepareArgs(
510 GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
511 uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
512 llvm::SmallVectorImpl<void *> &Ptrs,
513 KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const {
514 if (isCtorOrDtor())
515 return nullptr;
517 NumArgs += 1;
519 Args.resize(NumArgs);
520 Ptrs.resize(NumArgs);
522 Ptrs[0] = KernelLaunchEnvironment;
523 Args[0] = &Ptrs[0];
525 for (int I = 1; I < NumArgs; ++I) {
526 Ptrs[I] = (void *)((intptr_t)ArgPtrs[I - 1] + ArgOffsets[I - 1]);
527 Args[I] = &Ptrs[I];
529 return &Args[0];
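// Resulting layout (illustrative): Args[0] points at the implicit
// KernelLaunchEnvironment pointer; for I >= 1, Args[I] points at the
// offset-adjusted user argument, i.e. ArgPtrs[I - 1] + ArgOffsets[I - 1].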
532 uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,
533 uint32_t ThreadLimitClause[3]) const {
534 assert(ThreadLimitClause[1] == 0 && ThreadLimitClause[2] == 0 &&
535 "Multi dimensional launch not supported yet.");
536 if (ThreadLimitClause[0] > 0 && isGenericMode())
537 ThreadLimitClause[0] += GenericDevice.getWarpSize();
539 return std::min(MaxNumThreads, (ThreadLimitClause[0] > 0)
540 ? ThreadLimitClause[0]
541 : PreferredNumThreads);
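// Worked example: generic mode, warp size 32, thread_limit(100) gives
// ThreadLimitClause[0] = 132; with MaxNumThreads = 128 the kernel launches
// min(128, 132) = 128 threads.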
544 uint64_t GenericKernelTy::getNumBlocks(GenericDeviceTy &GenericDevice,
545 uint32_t NumTeamsClause[3],
546 uint64_t LoopTripCount,
547 uint32_t &NumThreads,
548 bool IsNumThreadsFromUser) const {
549 assert(NumTeamsClause[1] == 0 && NumTeamsClause[2] == 0 &&
550 "Multi dimensional launch not supported yet.");
552 if (NumTeamsClause[0] > 0) {
553 // TODO: We need to honor any value and consequently allow more than the
554 // block limit. For this we might need to start multiple kernels or let the
555 // blocks start again until the requested number has been started.
556 return std::min(NumTeamsClause[0], GenericDevice.getBlockLimit());
559 uint64_t DefaultNumBlocks = GenericDevice.getDefaultNumBlocks();
560 uint64_t TripCountNumBlocks = std::numeric_limits<uint64_t>::max();
561 if (LoopTripCount > 0) {
562 if (isSPMDMode()) {
563 // We have a combined construct, i.e. `target teams distribute
564 // parallel for [simd]`. We launch enough teams so that each thread
565 // executes one iteration of the loop, rounded up to the nearest
566 // integer. However, if that results in too few teams, we artificially
567 // reduce the thread count per team to increase the outer parallelism.
568 auto MinThreads = GenericDevice.getMinThreadsForLowTripCountLoop();
569 MinThreads = std::min(MinThreads, NumThreads);
571 // Honor the thread_limit clause; only lower the number of threads.
572 [[maybe_unused]] auto OldNumThreads = NumThreads;
573 if (LoopTripCount >= DefaultNumBlocks * NumThreads ||
574 IsNumThreadsFromUser) {
575 // Enough parallelism for teams and threads.
576 TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
577 assert((IsNumThreadsFromUser ||
578 TripCountNumBlocks >= DefaultNumBlocks) &&
579 "Expected sufficient outer parallelism.");
580 } else if (LoopTripCount >= DefaultNumBlocks * MinThreads) {
581 // Enough parallelism for teams, limit threads.
583 // This case is hard; for now, we force "full warps":
584 // First, compute a thread count assuming DefaultNumBlocks.
585 auto NumThreadsDefaultBlocks =
586 (LoopTripCount + DefaultNumBlocks - 1) / DefaultNumBlocks;
587 // Now get a power of two that is larger or equal.
588 auto NumThreadsDefaultBlocksP2 =
589 llvm::PowerOf2Ceil(NumThreadsDefaultBlocks);
590 // Do not increase a thread limit given by the user.
591 NumThreads = std::min(NumThreads, uint32_t(NumThreadsDefaultBlocksP2));
592 assert(NumThreads >= MinThreads &&
593 "Expected sufficient inner parallelism.");
594 TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
595 } else {
596 // Not enough parallelism for teams and threads, limit both.
597 NumThreads = std::min(NumThreads, MinThreads);
598 TripCountNumBlocks = ((LoopTripCount - 1) / NumThreads) + 1;
601 assert(NumThreads * TripCountNumBlocks >= LoopTripCount &&
602 "Expected sufficient parallelism");
603 assert(OldNumThreads >= NumThreads &&
604 "Number of threads cannot be increased!");
605 } else {
606 assert((isGenericMode() || isGenericSPMDMode()) &&
607 "Unexpected execution mode!");
608 // If we reach this point, then we have a non-combined construct, i.e.
609 // `teams distribute` with a nested `parallel for` and each team is
610 // assigned one iteration of the `distribute` loop. E.g.:
612 // #pragma omp target teams distribute
613 // for(...loop_tripcount...) {
614 // #pragma omp parallel for
615 // for(...) {}
616 // }
618 // Threads within a team will execute the iterations of the `parallel`
619 // loop.
620 TripCountNumBlocks = LoopTripCount;
623 // If the loops are long-running, we would rather reuse blocks than spawn too many.
624 uint32_t PreferredNumBlocks = std::min(TripCountNumBlocks, DefaultNumBlocks);
625 return std::min(PreferredNumBlocks, GenericDevice.getBlockLimit());
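// Worked example (SPMD): LoopTripCount = 100000, NumThreads = 256,
// DefaultNumBlocks = 200. Since 100000 >= 200 * 256, TripCountNumBlocks =
// ceil(100000 / 256) = 391, and the result is min(391, 200) = 200 blocks,
// further capped by the device block limit.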
628 GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
629 const llvm::omp::GV &OMPGridValues)
630 : MemoryManager(nullptr), OMP_TeamLimit("OMP_TEAM_LIMIT"),
631 OMP_NumTeams("OMP_NUM_TEAMS"),
632 OMP_TeamsThreadLimit("OMP_TEAMS_THREAD_LIMIT"),
633 OMPX_DebugKind("LIBOMPTARGET_DEVICE_RTL_DEBUG"),
634 OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
635 // Do not initialize the following two envars since they depend on the
636 // device initialization. These cannot be consulted until the device is
637 // initialized correctly. We initialize them in GenericDeviceTy::init().
638 OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
639 // By default, the initial number of streams and events is 1.
640 OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
641 OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
642 DeviceId(DeviceId), GridValues(OMPGridValues),
643 PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
644 PinnedAllocs(*this), RPCServer(nullptr) {
645 #ifdef OMPT_SUPPORT
646 OmptInitialized.store(false);
647 // Bind the callbacks to this device's member functions
648 #define bindOmptCallback(Name, Type, Code) \
649 if (ompt::Initialized && ompt::lookupCallbackByCode) { \
650 ompt::lookupCallbackByCode((ompt_callbacks_t)(Code), \
651 ((ompt_callback_t *)&(Name##_fn))); \
652 DP("OMPT: class bound %s=%p\n", #Name, ((void *)(uint64_t)Name##_fn)); \
655 FOREACH_OMPT_DEVICE_EVENT(bindOmptCallback);
656 #undef bindOmptCallback
658 #endif
661 Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
662 if (auto Err = initImpl(Plugin))
663 return Err;
665 #ifdef OMPT_SUPPORT
666 if (ompt::Initialized) {
667 bool ExpectedStatus = false;
668 if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true))
669 performOmptCallback(device_initialize,
670 /* device_num */ DeviceId +
671 Plugin.getDeviceIdStartIndex(),
672 /* type */ getComputeUnitKind().c_str(),
673 /* device */ reinterpret_cast<ompt_device_t *>(this),
674 /* lookup */ ompt::lookupCallbackByName,
675 /* documentation */ nullptr);
677 #endif
679 // Read and reinitialize the envars that depend on the device initialization.
680 // Notice these two envars may change the stack size and heap size of the
681 // device, so they need the device properly initialized.
682 auto StackSizeEnvarOrErr = UInt64Envar::create(
683 "LIBOMPTARGET_STACK_SIZE",
684 [this](uint64_t &V) -> Error { return getDeviceStackSize(V); },
685 [this](uint64_t V) -> Error { return setDeviceStackSize(V); });
686 if (!StackSizeEnvarOrErr)
687 return StackSizeEnvarOrErr.takeError();
688 OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
690 auto HeapSizeEnvarOrErr = UInt64Envar::create(
691 "LIBOMPTARGET_HEAP_SIZE",
692 [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
693 [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
694 if (!HeapSizeEnvarOrErr)
695 return HeapSizeEnvarOrErr.takeError();
696 OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
698 // Update the maximum number of teams and threads after the device
699 // initialization sets the corresponding hardware limit.
700 if (OMP_NumTeams > 0)
701 GridValues.GV_Max_Teams =
702 std::min(GridValues.GV_Max_Teams, uint32_t(OMP_NumTeams));
704 if (OMP_TeamsThreadLimit > 0)
705 GridValues.GV_Max_WG_Size =
706 std::min(GridValues.GV_Max_WG_Size, uint32_t(OMP_TeamsThreadLimit));
708 // Enable the memory manager if required.
709 auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
710 if (EnableMM)
711 MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
713 return Plugin::success();
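// For example, running with LIBOMPTARGET_STACK_SIZE=8192 in the environment
// makes the UInt64Envar above call setDeviceStackSize(8192) during device
// initialization (assuming the plugin implements that hook).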
716 Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
718 if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
719 GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
720 for (auto *Image : LoadedImages) {
721 DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
722 GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
723 sizeof(DeviceMemoryPoolTrackingTy),
724 &ImageDeviceMemoryPoolTracking);
725 if (auto Err =
726 GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal))
727 return Err;
728 DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
731 // TODO: Write this by default into a file.
732 printf("\n\n|-----------------------\n"
733 "| Device memory tracker:\n"
734 "|-----------------------\n"
735 "| #Allocations: %lu\n"
736 "| Byes allocated: %lu\n"
737 "| Minimal allocation: %lu\n"
738 "| Maximal allocation: %lu\n"
739 "|-----------------------\n\n\n",
740 DeviceMemoryPoolTracking.NumAllocations,
741 DeviceMemoryPoolTracking.AllocationTotal,
742 DeviceMemoryPoolTracking.AllocationMin,
743 DeviceMemoryPoolTracking.AllocationMax);
746 // Delete the memory manager before deinitializing the device. Otherwise,
747 // we may delete device allocations after the device is deinitialized.
748 if (MemoryManager)
749 delete MemoryManager;
750 MemoryManager = nullptr;
752 if (RecordReplay.isRecordingOrReplaying())
753 RecordReplay.deinit();
755 if (RPCServer)
756 if (auto Err = RPCServer->deinitDevice(*this))
757 return Err;
759 #ifdef OMPT_SUPPORT
760 if (ompt::Initialized) {
761 bool ExpectedStatus = true;
762 if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false))
763 performOmptCallback(device_finalize,
764 /* device_num */ DeviceId +
765 Plugin.getDeviceIdStartIndex());
767 #endif
769 return deinitImpl();
771 Expected<__tgt_target_table *>
772 GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
773 const __tgt_device_image *InputTgtImage) {
774 assert(InputTgtImage && "Expected non-null target image");
775 DP("Load data from image " DPxMOD "\n", DPxPTR(InputTgtImage->ImageStart));
777 auto PostJITImageOrErr = Plugin.getJIT().process(*InputTgtImage, *this);
778 if (!PostJITImageOrErr) {
779 auto Err = PostJITImageOrErr.takeError();
780 REPORT("Failure to jit IR image %p on device %d: %s\n", InputTgtImage,
781 DeviceId, toString(std::move(Err)).data());
782 return nullptr;
785 // Load the binary and allocate the image object. Use the next available id
786 // for the image id, which is the number of previously loaded images.
787 auto ImageOrErr =
788 loadBinaryImpl(PostJITImageOrErr.get(), LoadedImages.size());
789 if (!ImageOrErr)
790 return ImageOrErr.takeError();
792 DeviceImageTy *Image = *ImageOrErr;
793 assert(Image != nullptr && "Invalid image");
794 if (InputTgtImage != PostJITImageOrErr.get())
795 Image->setTgtImageBitcode(InputTgtImage);
797 // Add the image to list.
798 LoadedImages.push_back(Image);
800 // Setup the device environment if needed.
801 if (auto Err = setupDeviceEnvironment(Plugin, *Image))
802 return std::move(Err);
804 // Setup the global device memory pool if needed.
805 if (shouldSetupDeviceMemoryPool()) {
806 uint64_t HeapSize;
807 auto SizeOrErr = getDeviceHeapSize(HeapSize);
808 if (SizeOrErr) {
809 REPORT("No global device memory pool due to error: %s\n",
810 toString(std::move(SizeOrErr)).data());
811 } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
812 return std::move(Err);
815 // Register all offload entries of the image.
816 if (auto Err = registerOffloadEntries(*Image))
817 return std::move(Err);
819 if (auto Err = setupRPCServer(Plugin, *Image))
820 return std::move(Err);
822 #ifdef OMPT_SUPPORT
823 if (ompt::Initialized) {
824 size_t Bytes =
825 getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart);
826 performOmptCallback(device_load,
827 /* device_num */ DeviceId +
828 Plugin.getDeviceIdStartIndex(),
829 /* FileName */ nullptr,
830 /* File Offset */ 0,
831 /* VmaInFile */ nullptr,
832 /* ImgSize */ Bytes,
833 /* HostAddr */ InputTgtImage->ImageStart,
834 /* DeviceAddr */ nullptr,
835 /* FIXME: ModuleId */ 0);
837 #endif
839 // Return the pointer to the table of entries.
840 return Image->getOffloadEntryTable();
843 Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
844 DeviceImageTy &Image) {
845 // There are some plugins that do not need this step.
846 if (!shouldSetupDeviceEnvironment())
847 return Plugin::success();
849 // Obtain a table mapping host function pointers to device function pointers.
850 auto CallTablePairOrErr = setupIndirectCallTable(Plugin, *this, Image);
851 if (!CallTablePairOrErr)
852 return CallTablePairOrErr.takeError();
854 DeviceEnvironmentTy DeviceEnvironment;
855 DeviceEnvironment.DeviceDebugKind = OMPX_DebugKind;
856 DeviceEnvironment.NumDevices = Plugin.getNumDevices();
857 // TODO: The device ID used here is not the real device ID used by OpenMP.
858 DeviceEnvironment.DeviceNum = DeviceId;
859 DeviceEnvironment.DynamicMemSize = OMPX_SharedMemorySize;
860 DeviceEnvironment.ClockFrequency = getClockFrequency();
861 DeviceEnvironment.IndirectCallTable =
862 reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
863 DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
864 DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
866 // Create the metainfo of the device environment global.
867 GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
868 sizeof(DeviceEnvironmentTy), &DeviceEnvironment);
870 // Write device environment values to the device.
871 GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
872 if (auto Err = GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal)) {
873 DP("Missing symbol %s, continue execution anyway.\n",
874 DevEnvGlobal.getName().data());
875 consumeError(std::move(Err));
877 return Plugin::success();
880 Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
881 DeviceImageTy &Image,
882 uint64_t PoolSize) {
883 // Free the old pool, if any.
884 if (DeviceMemoryPool.Ptr) {
885 if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
886 TargetAllocTy::TARGET_ALLOC_DEVICE))
887 return Err;
890 DeviceMemoryPool.Size = PoolSize;
891 auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
892 TargetAllocTy::TARGET_ALLOC_DEVICE);
893 if (AllocOrErr) {
894 DeviceMemoryPool.Ptr = *AllocOrErr;
895 } else {
896 auto Err = AllocOrErr.takeError();
897 REPORT("Failure to allocate device memory for global memory pool: %s\n",
898 toString(std::move(Err)).data());
899 DeviceMemoryPool.Ptr = nullptr;
900 DeviceMemoryPool.Size = 0;
903 // Create the metainfo of the device memory pool tracker global.
904 GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
905 sizeof(DeviceMemoryPoolTrackingTy),
906 &DeviceMemoryPoolTracking);
907 GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
908 if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
909 return Err;
911 // Create the metainfo of the device memory pool global.
912 GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
913 sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
915 // Write the memory pool descriptor to the device.
916 return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
919 Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
920 DeviceImageTy &Image) {
921 // The plugin either does not need an RPC server or it is unavailable.
922 if (!shouldSetupRPCServer())
923 return Plugin::success();
925 // Check if this device needs to run an RPC server.
926 RPCServerTy &Server = Plugin.getRPCServer();
927 auto UsingOrErr =
928 Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
929 if (!UsingOrErr)
930 return UsingOrErr.takeError();
932 if (!UsingOrErr.get())
933 return Plugin::success();
935 if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
936 return Err;
938 RPCServer = &Server;
939 DP("Running an RPC server on device %d\n", getDeviceId());
940 return Plugin::success();
943 Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) {
944 const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin;
945 const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd;
946 for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) {
947 // The host should always have something in the address field to uniquely
948 // identify the entry.
949 if (!Entry->addr)
950 return Plugin::error("Failure to register entry without address");
952 __tgt_offload_entry DeviceEntry = {0};
954 if (Entry->size) {
955 if (auto Err = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry))
956 return Err;
957 } else {
958 if (auto Err = registerKernelOffloadEntry(Image, *Entry, DeviceEntry))
959 return Err;
962 assert(DeviceEntry.addr && "Device addr of offload entry cannot be null");
964 DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
965 DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name,
966 DPxPTR(DeviceEntry.addr));
968 return Plugin::success();
971 Error GenericDeviceTy::registerGlobalOffloadEntry(
972 DeviceImageTy &Image, const __tgt_offload_entry &GlobalEntry,
973 __tgt_offload_entry &DeviceEntry) {
975 GenericPluginTy &Plugin = Plugin::get();
977 DeviceEntry = GlobalEntry;
979 // Create a metadata object for the device global.
980 GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size);
982 // Get the device address of the global.
983 GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
984 if (auto Err =
985 GHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal))
986 return Err;
988 // Store the device address on the device entry.
989 DeviceEntry.addr = DeviceGlobal.getPtr();
990 assert(DeviceEntry.addr && "Invalid device global's address");
992 // Note: In the current implementation declare target variables
993 // can either be link or to. This means that once unified
994 // memory is activated via the requires directive, the variable
995 // can be used directly from the host in both cases.
996 if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
997 // If unified memory is present any target link or to variables
998 // can access host addresses directly. There is no longer a
999 // need for device copies.
1000 GlobalTy HostGlobal(GlobalEntry);
1001 if (auto Err =
1002 GHandler.writeGlobalToDevice(*this, HostGlobal, DeviceGlobal))
1003 return Err;
1006 // Add the device entry on the entry table.
1007 Image.getOffloadEntryTable().addEntry(DeviceEntry);
1009 return Plugin::success();
1012 Error GenericDeviceTy::registerKernelOffloadEntry(
1013 DeviceImageTy &Image, const __tgt_offload_entry &KernelEntry,
1014 __tgt_offload_entry &DeviceEntry) {
1015 DeviceEntry = KernelEntry;
1017 // Create a kernel object.
1018 auto KernelOrErr = constructKernel(KernelEntry);
1019 if (!KernelOrErr)
1020 return KernelOrErr.takeError();
1022 GenericKernelTy &Kernel = *KernelOrErr;
1024 // Initialize the kernel.
1025 if (auto Err = Kernel.init(*this, Image))
1026 return Err;
1028 // Set the device entry address to the kernel address and store the entry on
1029 // the entry table.
1030 DeviceEntry.addr = (void *)&Kernel;
1031 Image.getOffloadEntryTable().addEntry(DeviceEntry);
1033 return Plugin::success();
1036 Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
1037 size_t Size, bool ExternallyLocked) {
1038 // Insert the new entry into the map.
1039 auto Res = Allocs.insert({HstPtr, DevAccessiblePtr, Size, ExternallyLocked});
1040 if (!Res.second)
1041 return Plugin::error("Cannot insert locked buffer entry");
1043 // Check whether the next entry overlaps with the inserted entry.
1044 auto It = std::next(Res.first);
1045 if (It == Allocs.end())
1046 return Plugin::success();
1048 const EntryTy *NextEntry = &(*It);
1049 if (intersects(NextEntry->HstPtr, NextEntry->Size, HstPtr, Size))
1050 return Plugin::error("Partial overlapping not allowed in locked buffers");
1052 return Plugin::success();
1055 Error PinnedAllocationMapTy::eraseEntry(const EntryTy &Entry) {
1056 // Erase the existing entry. Notice this requires an additional map lookup,
1057 // but this should not be a performance issue. Using iterators would make
1058 // the code more difficult to read.
1059 size_t Erased = Allocs.erase({Entry.HstPtr});
1060 if (!Erased)
1061 return Plugin::error("Cannot erase locked buffer entry");
1062 return Plugin::success();
1065 Error PinnedAllocationMapTy::registerEntryUse(const EntryTy &Entry,
1066 void *HstPtr, size_t Size) {
1067 if (!contains(Entry.HstPtr, Entry.Size, HstPtr, Size))
1068 return Plugin::error("Partial overlapping not allowed in locked buffers");
1070 ++Entry.References;
1071 return Plugin::success();
1074 Expected<bool> PinnedAllocationMapTy::unregisterEntryUse(const EntryTy &Entry) {
1075 if (Entry.References == 0)
1076 return Plugin::error("Invalid number of references");
1078 // Return whether this was the last user.
1079 return (--Entry.References == 0);
1082 Error PinnedAllocationMapTy::registerHostBuffer(void *HstPtr,
1083 void *DevAccessiblePtr,
1084 size_t Size) {
1085 assert(HstPtr && "Invalid pointer");
1086 assert(DevAccessiblePtr && "Invalid pointer");
1087 assert(Size && "Invalid size");
1089 std::lock_guard<std::shared_mutex> Lock(Mutex);
1091 // No pinned allocation should intersect.
1092 const EntryTy *Entry = findIntersecting(HstPtr);
1093 if (Entry)
1094 return Plugin::error("Cannot insert entry due to an existing one");
1096 // Now insert the new entry.
1097 return insertEntry(HstPtr, DevAccessiblePtr, Size);
1100 Error PinnedAllocationMapTy::unregisterHostBuffer(void *HstPtr) {
1101 assert(HstPtr && "Invalid pointer");
1103 std::lock_guard<std::shared_mutex> Lock(Mutex);
1105 const EntryTy *Entry = findIntersecting(HstPtr);
1106 if (!Entry)
1107 return Plugin::error("Cannot find locked buffer");
1109 // The address in the entry should be the same we are unregistering.
1110 if (Entry->HstPtr != HstPtr)
1111 return Plugin::error("Unexpected host pointer in locked buffer entry");
1113 // Unregister from the entry.
1114 auto LastUseOrErr = unregisterEntryUse(*Entry);
1115 if (!LastUseOrErr)
1116 return LastUseOrErr.takeError();
1118 // There should be no other references to the pinned allocation.
1119 if (!(*LastUseOrErr))
1120 return Plugin::error("The locked buffer is still being used");
1122 // Erase the entry from the map.
1123 return eraseEntry(*Entry);
1126 Expected<void *> PinnedAllocationMapTy::lockHostBuffer(void *HstPtr,
1127 size_t Size) {
1128 assert(HstPtr && "Invalid pointer");
1129 assert(Size && "Invalid size");
1131 std::lock_guard<std::shared_mutex> Lock(Mutex);
1133 const EntryTy *Entry = findIntersecting(HstPtr);
1135 if (Entry) {
1136 // An already registered intersecting buffer was found. Register a new use.
1137 if (auto Err = registerEntryUse(*Entry, HstPtr, Size))
1138 return std::move(Err);
1140 // Return the device accessible pointer with the correct offset.
1141 return advanceVoidPtr(Entry->DevAccessiblePtr,
1142 getPtrDiff(HstPtr, Entry->HstPtr));
1145 // No intersecting registered allocation found in the map. First, lock the
1146 // host buffer and retrieve the device accessible pointer.
1147 auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
1148 if (!DevAccessiblePtrOrErr)
1149 return DevAccessiblePtrOrErr.takeError();
1151 // Now insert the new entry into the map.
1152 if (auto Err = insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size))
1153 return std::move(Err);
1155 // Return the device accessible pointer.
1156 return *DevAccessiblePtrOrErr;
1159 Error PinnedAllocationMapTy::unlockHostBuffer(void *HstPtr) {
1160 assert(HstPtr && "Invalid pointer");
1162 std::lock_guard<std::shared_mutex> Lock(Mutex);
1164 const EntryTy *Entry = findIntersecting(HstPtr);
1165 if (!Entry)
1166 return Plugin::error("Cannot find locked buffer");
1168 // Unregister from the locked buffer. No need to do anything if there are
1169 // others using the allocation.
1170 auto LastUseOrErr = unregisterEntryUse(*Entry);
1171 if (!LastUseOrErr)
1172 return LastUseOrErr.takeError();
1174 // No need to do anything if there are others using the allocation.
1175 if (!(*LastUseOrErr))
1176 return Plugin::success();
1178 // This was the last user of the allocation. Unlock the original locked buffer
1179 // if it was locked by the plugin. Do not unlock it if it was locked by an
1180 // external entity. Unlock the buffer using the host pointer of the entry.
1181 if (!Entry->ExternallyLocked)
1182 if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
1183 return Err;
1185 // Erase the entry from the map.
1186 return eraseEntry(*Entry);
1189 Error PinnedAllocationMapTy::lockMappedHostBuffer(void *HstPtr, size_t Size) {
1190 assert(HstPtr && "Invalid pointer");
1191 assert(Size && "Invalid size");
1193 std::lock_guard<std::shared_mutex> Lock(Mutex);
1195 // If previously registered, just register a new user on the entry.
1196 const EntryTy *Entry = findIntersecting(HstPtr);
1197 if (Entry)
1198 return registerEntryUse(*Entry, HstPtr, Size);
1200 size_t BaseSize;
1201 void *BaseHstPtr, *BaseDevAccessiblePtr;
1203 // Check if it was externally pinned by a vendor-specific API.
1204 auto IsPinnedOrErr = Device.isPinnedPtrImpl(HstPtr, BaseHstPtr,
1205 BaseDevAccessiblePtr, BaseSize);
1206 if (!IsPinnedOrErr)
1207 return IsPinnedOrErr.takeError();
1209 // If pinned, just insert the entry representing the whole pinned buffer.
1210 if (*IsPinnedOrErr)
1211 return insertEntry(BaseHstPtr, BaseDevAccessiblePtr, BaseSize,
1212 /* Externally locked */ true);
1214 // Not externally pinned. Do nothing if locking of mapped buffers is disabled.
1215 if (!LockMappedBuffers)
1216 return Plugin::success();
1218 // Otherwise, lock the buffer and insert the new entry.
1219 auto DevAccessiblePtrOrErr = Device.dataLockImpl(HstPtr, Size);
1220 if (!DevAccessiblePtrOrErr) {
1221 // Errors may be tolerated.
1222 if (!IgnoreLockMappedFailures)
1223 return DevAccessiblePtrOrErr.takeError();
1225 consumeError(DevAccessiblePtrOrErr.takeError());
1226 return Plugin::success();
1229 return insertEntry(HstPtr, *DevAccessiblePtrOrErr, Size);
1232 Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
1233 assert(HstPtr && "Invalid pointer");
1235 std::lock_guard<std::shared_mutex> Lock(Mutex);
1237 // Check whether there is any intersecting entry.
1238 const EntryTy *Entry = findIntersecting(HstPtr);
1240 // No entry, and automatic locking of mapped buffers is disabled, so
1241 // there is nothing to do.
1242 if (!Entry && !LockMappedBuffers)
1243 return Plugin::success();
1245 // No entry, automatic locking is enabled, but the locking may have failed, so
1246 // do nothing.
1247 if (!Entry && IgnoreLockMappedFailures)
1248 return Plugin::success();
1250 // No entry, but the automatic locking is enabled, so this is an error.
1251 if (!Entry)
1252 return Plugin::error("Locked buffer not found");
1254 // There is an entry, so unregister a user and check whether it was the last one.
1255 auto LastUseOrErr = unregisterEntryUse(*Entry);
1256 if (!LastUseOrErr)
1257 return LastUseOrErr.takeError();
1259 // If it is not the last one, there is nothing to do.
1260 if (!(*LastUseOrErr))
1261 return Plugin::success();
1263 // Otherwise, if it was the last and the buffer was locked by the plugin,
1264 // unlock it.
1265 if (!Entry->ExternallyLocked)
1266 if (auto Err = Device.dataUnlockImpl(Entry->HstPtr))
1267 return Err;
1269 // Finally erase the entry from the map.
1270 return eraseEntry(*Entry);
1273 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
1274 if (!AsyncInfo || !AsyncInfo->Queue)
1275 return Plugin::error("Invalid async info queue");
1277 if (auto Err = synchronizeImpl(*AsyncInfo))
1278 return Err;
1280 for (auto *Ptr : AsyncInfo->AssociatedAllocations)
1281 if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
1282 return Err;
1283 AsyncInfo->AssociatedAllocations.clear();
1285 return Plugin::success();
1288 Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
1289 if (!AsyncInfo || !AsyncInfo->Queue)
1290 return Plugin::error("Invalid async info queue");
1292 return queryAsyncImpl(*AsyncInfo);
1295 Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
1296 return Plugin::error("Device does not support VA Management");
1299 Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
1300 return Plugin::error("Device does not support VA Management");
1303 Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
1304 return Plugin::error(
1305 "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
1308 Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
1309 TargetAllocTy Kind) {
1310 void *Alloc = nullptr;
1312 if (RecordReplay.isRecordingOrReplaying())
1313 return RecordReplay.alloc(Size);
1315 switch (Kind) {
1316 case TARGET_ALLOC_DEFAULT:
1317 case TARGET_ALLOC_DEVICE:
1318 if (MemoryManager) {
1319 Alloc = MemoryManager->allocate(Size, HostPtr);
1320 if (!Alloc)
1321 return Plugin::error("Failed to allocate from memory manager");
1322 break;
1324 [[fallthrough]];
1325 case TARGET_ALLOC_HOST:
1326 case TARGET_ALLOC_SHARED:
1327 Alloc = allocate(Size, HostPtr, Kind);
1328 if (!Alloc)
1329 return Plugin::error("Failed to allocate from device allocator");
1332 // Report error if the memory manager or the device allocator did not return
1333 // any memory buffer.
1334 if (!Alloc)
1335 return Plugin::error("Invalid target data allocation kind or requested "
1336 "allocator not implemented yet");
1338 // Register allocated buffer as pinned memory if the type is host memory.
1339 if (Kind == TARGET_ALLOC_HOST)
1340 if (auto Err = PinnedAllocs.registerHostBuffer(Alloc, Alloc, Size))
1341 return std::move(Err);
1343 return Alloc;
1346 Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
1347 // Free is a noop when recording or replaying.
1348 if (RecordReplay.isRecordingOrReplaying())
1349 return Plugin::success();
1351 int Res;
1352 if (MemoryManager)
1353 Res = MemoryManager->free(TgtPtr);
1354 else
1355 Res = free(TgtPtr, Kind);
1357 if (Res)
1358 return Plugin::error("Failure to deallocate device pointer %p", TgtPtr);
1360 // Unregister deallocated pinned memory buffer if the type is host memory.
1361 if (Kind == TARGET_ALLOC_HOST)
1362 if (auto Err = PinnedAllocs.unregisterHostBuffer(TgtPtr))
1363 return Err;
1365 return Plugin::success();
1368 Error GenericDeviceTy::dataSubmit(void *TgtPtr, const void *HstPtr,
1369 int64_t Size, __tgt_async_info *AsyncInfo) {
1370 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1372 auto Err = dataSubmitImpl(TgtPtr, HstPtr, Size, AsyncInfoWrapper);
1373 AsyncInfoWrapper.finalize(Err);
1374 return Err;
1377 Error GenericDeviceTy::dataRetrieve(void *HstPtr, const void *TgtPtr,
1378 int64_t Size, __tgt_async_info *AsyncInfo) {
1379 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1381 auto Err = dataRetrieveImpl(HstPtr, TgtPtr, Size, AsyncInfoWrapper);
1382 AsyncInfoWrapper.finalize(Err);
1383 return Err;
1386 Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
1387 void *DstPtr, int64_t Size,
1388 __tgt_async_info *AsyncInfo) {
1389 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1391 auto Err = dataExchangeImpl(SrcPtr, DstDev, DstPtr, Size, AsyncInfoWrapper);
1392 AsyncInfoWrapper.finalize(Err);
1393 return Err;
1396 Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
1397 ptrdiff_t *ArgOffsets,
1398 KernelArgsTy &KernelArgs,
1399 __tgt_async_info *AsyncInfo) {
1400 AsyncInfoWrapperTy AsyncInfoWrapper(
1401 *this, RecordReplay.isRecordingOrReplaying() ? nullptr : AsyncInfo);
1403 GenericKernelTy &GenericKernel =
1404 *reinterpret_cast<GenericKernelTy *>(EntryPtr);
1406 if (RecordReplay.isRecording())
1407 RecordReplay.saveKernelInputInfo(
1408 GenericKernel.getName(), GenericKernel.getImage(), ArgPtrs, ArgOffsets,
1409 KernelArgs.NumArgs, KernelArgs.NumTeams[0], KernelArgs.ThreadLimit[0],
1410 KernelArgs.Tripcount);
1412 if (RecordReplay.isRecording())
1413 RecordReplay.saveImage(GenericKernel.getName(), GenericKernel.getImage());
1415 auto Err = GenericKernel.launch(*this, ArgPtrs, ArgOffsets, KernelArgs,
1416 AsyncInfoWrapper);
1418 // Finalize here to guarantee that subsequent record-replay actions stay in sync.
1419 AsyncInfoWrapper.finalize(Err);
1421 if (RecordReplay.isRecordingOrReplaying() &&
1422 RecordReplay.isSaveOutputEnabled())
1423 RecordReplay.saveKernelOutputInfo(GenericKernel.getName());
1425 return Err;
1428 Error GenericDeviceTy::initAsyncInfo(__tgt_async_info **AsyncInfoPtr) {
1429 assert(AsyncInfoPtr && "Invalid async info");
1431 *AsyncInfoPtr = new __tgt_async_info();
1433 AsyncInfoWrapperTy AsyncInfoWrapper(*this, *AsyncInfoPtr);
1435 auto Err = initAsyncInfoImpl(AsyncInfoWrapper);
1436 AsyncInfoWrapper.finalize(Err);
1437 return Err;
1440 Error GenericDeviceTy::initDeviceInfo(__tgt_device_info *DeviceInfo) {
1441 assert(DeviceInfo && "Invalid device info");
1443 return initDeviceInfoImpl(DeviceInfo);
1446 Error GenericDeviceTy::printInfo() {
1447 InfoQueueTy InfoQueue;
1449 // Get the vendor-specific info entries describing the device properties.
1450 if (auto Err = obtainInfoImpl(InfoQueue))
1451 return Err;
1453 // Print all info entries.
1454 InfoQueue.print();
1456 return Plugin::success();
1459 Error GenericDeviceTy::createEvent(void **EventPtrStorage) {
1460 return createEventImpl(EventPtrStorage);
1463 Error GenericDeviceTy::destroyEvent(void *EventPtr) {
1464 return destroyEventImpl(EventPtr);
1467 Error GenericDeviceTy::recordEvent(void *EventPtr,
1468 __tgt_async_info *AsyncInfo) {
1469 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1471 auto Err = recordEventImpl(EventPtr, AsyncInfoWrapper);
1472 AsyncInfoWrapper.finalize(Err);
1473 return Err;
1476 Error GenericDeviceTy::waitEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
1477 AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
1479 auto Err = waitEventImpl(EventPtr, AsyncInfoWrapper);
1480 AsyncInfoWrapper.finalize(Err);
1481 return Err;
1484 Error GenericDeviceTy::syncEvent(void *EventPtr) {
1485 return syncEventImpl(EventPtr);
1488 Error GenericPluginTy::init() {
1489 auto NumDevicesOrErr = initImpl();
1490 if (!NumDevicesOrErr)
1491 return NumDevicesOrErr.takeError();
1493 NumDevices = *NumDevicesOrErr;
1494 if (NumDevices == 0)
1495 return Plugin::success();
1497 assert(Devices.size() == 0 && "Plugin already initialized");
1498 Devices.resize(NumDevices, nullptr);
1500 GlobalHandler = Plugin::createGlobalHandler();
1501 assert(GlobalHandler && "Invalid global handler");
1503 RPCServer = new RPCServerTy(NumDevices);
1504 assert(RPCServer && "Invalid RPC server");
1506 return Plugin::success();
1509 Error GenericPluginTy::deinit() {
1510 // Deinitialize all active devices.
1511 for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) {
1512 if (Devices[DeviceId]) {
1513 if (auto Err = deinitDevice(DeviceId))
1514 return Err;
1516 assert(!Devices[DeviceId] && "Device was not deinitialized");
1519 // There is no global handler if no device is available.
1520 if (GlobalHandler)
1521 delete GlobalHandler;
1523 if (RPCServer)
1524 delete RPCServer;
1526 // Perform last deinitializations on the plugin.
1527 return deinitImpl();
1530 Error GenericPluginTy::initDevice(int32_t DeviceId) {
1531 assert(!Devices[DeviceId] && "Device already initialized");
1533 // Create the device and save the reference.
1534 GenericDeviceTy *Device = Plugin::createDevice(DeviceId, NumDevices);
1535 assert(Device && "Invalid device");
1537 // Save the device reference into the list.
1538 Devices[DeviceId] = Device;
1540 // Initialize the device and its resources.
1541 return Device->init(*this);
1544 Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
1545 // The device may be already deinitialized.
1546 if (Devices[DeviceId] == nullptr)
1547 return Plugin::success();
1549 // Deinitialize the device and release its resources.
1550 if (auto Err = Devices[DeviceId]->deinit(*this))
1551 return Err;
1553 // Delete the device and invalidate its reference.
1554 delete Devices[DeviceId];
1555 Devices[DeviceId] = nullptr;
1557 return Plugin::success();
1560 const bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
1561 #ifdef LIBOMPTARGET_RPC_SUPPORT
1562 return true;
1563 #else
1564 return false;
1565 #endif
1568 /// Exposed library API functions; basically wrappers around the GenericDeviceTy
1569 /// functionality with the same name. All non-async functions are redirected
1570 /// to the async versions right away with a NULL AsyncInfoPtr.
1571 #ifdef __cplusplus
1572 extern "C" {
1573 #endif
1575 int32_t __tgt_rtl_init_plugin() {
1576 auto Err = Plugin::initIfNeeded();
1577 if (Err) {
1578 REPORT("Failure to initialize plugin " GETNAME(TARGET_NAME) ": %s\n",
1579 toString(std::move(Err)).data());
1580 return OFFLOAD_FAIL;
1583 return OFFLOAD_SUCCESS;
1586 int32_t __tgt_rtl_deinit_plugin() {
1587 auto Err = Plugin::deinitIfNeeded();
1588 if (Err) {
1589 REPORT("Failure to deinitialize plugin " GETNAME(TARGET_NAME) ": %s\n",
1590 toString(std::move(Err)).data());
1591 return OFFLOAD_FAIL;
1594 return OFFLOAD_SUCCESS;
1597 int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *TgtImage) {
1598 if (!Plugin::isActive())
1599 return false;
1601 if (elf_check_machine(TgtImage, Plugin::get().getMagicElfBits()))
1602 return true;
1604 return Plugin::get().getJIT().checkBitcodeImage(*TgtImage);
int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *TgtImage,
                                       __tgt_image_info *Info) {
  if (!Plugin::isActive())
    return false;

  if (!__tgt_rtl_is_valid_binary(TgtImage))
    return false;

  // A subarchitecture was not specified. Assume it is compatible.
  if (!Info->Arch)
    return true;

  // Check the compatibility with all the available devices. Notice the
  // devices may not be initialized yet.
  auto CompatibleOrErr = Plugin::get().isImageCompatible(Info);
  if (!CompatibleOrErr) {
    // This error should not abort the execution, so we just inform the user
    // through the debug system.
    std::string ErrString = toString(CompatibleOrErr.takeError());
    DP("Failure to check whether image %p is valid: %s\n", TgtImage,
       ErrString.data());
    return false;
  }

  bool Compatible = *CompatibleOrErr;
  DP("Image is %scompatible with current environment: %s\n",
     (Compatible) ? "" : "not ", Info->Arch);

  return Compatible;
}

int32_t __tgt_rtl_supports_empty_images() {
  return Plugin::get().supportsEmptyImages();
}

int32_t __tgt_rtl_init_device(int32_t DeviceId) {
  auto Err = Plugin::get().initDevice(DeviceId);
  if (Err) {
    REPORT("Failure to initialize device %d: %s\n", DeviceId,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
  auto Err = Plugin::get().deinitDevice(DeviceId);
  if (Err) {
    REPORT("Failure to deinitialize device %d: %s\n", DeviceId,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_number_of_devices() { return Plugin::get().getNumDevices(); }

int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
  Plugin::get().setRequiresFlag(RequiresFlags);
  return RequiresFlags;
}

int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
                                      int32_t DstDeviceId) {
  return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
}

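// Enables record-replay on the given device. Recording is best-effort: if
// initialization fails, the feature is deactivated but the run continues,
// whereas a failed replay initialization is reported as OFFLOAD_FAIL.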
int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
                                           void *VAddr, bool isRecord,
                                           bool SaveOutput) {
  GenericPluginTy &Plugin = Plugin::get();
  GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
  RecordReplayTy::RRStatusTy Status =
      isRecord ? RecordReplayTy::RRStatusTy::RRRecording
               : RecordReplayTy::RRStatusTy::RRReplaying;

  if (auto Err =
          RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
    REPORT("WARNING: record-replay failed to initialize with %" PRId64
           " bytes (Error: %s)\n",
           MemorySize, toString(std::move(Err)).data());
    RecordReplay.setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);

    if (!isRecord) {
      return OFFLOAD_FAIL;
    }
  }
  return OFFLOAD_SUCCESS;
}

__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
                                          __tgt_device_image *TgtImage) {
  GenericPluginTy &Plugin = Plugin::get();
  GenericDeviceTy &Device = Plugin.getDevice(DeviceId);

  auto TableOrErr = Device.loadBinary(Plugin, TgtImage);
  if (!TableOrErr) {
    auto Err = TableOrErr.takeError();
    REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
           DeviceId, toString(std::move(Err)).data());
    return nullptr;
  }

  __tgt_target_table *Table = *TableOrErr;
  assert(Table != nullptr && "Invalid table");

  return Table;
}

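// Allocates Size bytes of device memory. Kind is forwarded as a TargetAllocTy
// value (e.g. TARGET_ALLOC_DEFAULT) describing the kind of allocation that
// libomptarget requested.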
void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
                           int32_t Kind) {
  auto AllocOrErr = Plugin::get().getDevice(DeviceId).dataAlloc(
      Size, HostPtr, (TargetAllocTy)Kind);
  if (!AllocOrErr) {
    auto Err = AllocOrErr.takeError();
    REPORT("Failure to allocate device memory: %s\n",
           toString(std::move(Err)).data());
    return nullptr;
  }
  assert(*AllocOrErr && "Null pointer upon successful allocation");

  return *AllocOrErr;
}

int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind) {
  auto Err =
      Plugin::get().getDevice(DeviceId).dataDelete(TgtPtr, (TargetAllocTy)Kind);
  if (Err) {
    REPORT("Failure to deallocate device pointer %p: %s\n", TgtPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

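// Locks (pins) the host buffer [Ptr, Ptr + Size) so the device can access it
// directly; the locked pointer is returned through LockedPtr.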
int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
                            void **LockedPtr) {
  auto LockedPtrOrErr = Plugin::get().getDevice(DeviceId).dataLock(Ptr, Size);
  if (!LockedPtrOrErr) {
    auto Err = LockedPtrOrErr.takeError();
    REPORT("Failure to lock memory %p: %s\n", Ptr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  if (!(*LockedPtrOrErr)) {
    REPORT("Failure to lock memory %p: obtained a null locked pointer\n", Ptr);
    return OFFLOAD_FAIL;
  }
  *LockedPtr = *LockedPtrOrErr;

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_unlock(int32_t DeviceId, void *Ptr) {
  auto Err = Plugin::get().getDevice(DeviceId).dataUnlock(Ptr);
  if (Err) {
    REPORT("Failure to unlock memory %p: %s\n", Ptr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_notify_mapped(int32_t DeviceId, void *HstPtr,
                                     int64_t Size) {
  auto Err = Plugin::get().getDevice(DeviceId).notifyDataMapped(HstPtr, Size);
  if (Err) {
    REPORT("Failure to notify data mapped %p: %s\n", HstPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_notify_unmapped(int32_t DeviceId, void *HstPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).notifyDataUnmapped(HstPtr);
  if (Err) {
    REPORT("Failure to notify data unmapped %p: %s\n", HstPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

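// Host-to-device copy. As documented above, the synchronous form simply
// forwards to the async one with a null AsyncInfoPtr.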
int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
                              int64_t Size) {
  return __tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size,
                                     /* AsyncInfoPtr */ nullptr);
}

int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
                                    void *HstPtr, int64_t Size,
                                    __tgt_async_info *AsyncInfoPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).dataSubmit(TgtPtr, HstPtr, Size,
                                                          AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to copy data from host to device. Pointers: host "
           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
                                int64_t Size) {
  return __tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size,
                                       /* AsyncInfoPtr */ nullptr);
}

int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
                                      void *TgtPtr, int64_t Size,
                                      __tgt_async_info *AsyncInfoPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr,
                                                            Size, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to copy data from device to host. Pointers: host "
           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
           DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

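// Device-to-device copy, intended to avoid staging the data through the host
// whenever the two devices support a direct exchange.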
int32_t __tgt_rtl_data_exchange(int32_t SrcDeviceId, void *SrcPtr,
                                int32_t DstDeviceId, void *DstPtr,
                                int64_t Size) {
  return __tgt_rtl_data_exchange_async(SrcDeviceId, SrcPtr, DstDeviceId, DstPtr,
                                       Size, /* AsyncInfoPtr */ nullptr);
}

int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
                                      int32_t DstDeviceId, void *DstPtr,
                                      int64_t Size,
                                      __tgt_async_info *AsyncInfo) {
  GenericDeviceTy &SrcDevice = Plugin::get().getDevice(SrcDeviceId);
  GenericDeviceTy &DstDevice = Plugin::get().getDevice(DstDeviceId);
  auto Err = SrcDevice.dataExchange(SrcPtr, DstDevice, DstPtr, Size, AsyncInfo);
  if (Err) {
    REPORT("Failure to copy data from device (%d) to device (%d). Pointers: "
           "src = " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 ": %s\n",
           SrcDeviceId, DstDeviceId, DPxPTR(SrcPtr), DPxPTR(DstPtr), Size,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

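// Launches a kernel from a previously loaded binary. TgtArgs and TgtOffsets
// describe the kernel arguments; KernelArgs carries the launch configuration
// (e.g. the number of teams and threads) provided by libomptarget.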
int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
                                void **TgtArgs, ptrdiff_t *TgtOffsets,
                                KernelArgsTy *KernelArgs,
                                __tgt_async_info *AsyncInfoPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).launchKernel(
      TgtEntryPtr, TgtArgs, TgtOffsets, *KernelArgs, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to run target region " DPxMOD " on device %d: %s\n",
           DPxPTR(TgtEntryPtr), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_synchronize(int32_t DeviceId,
                              __tgt_async_info *AsyncInfoPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).synchronize(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to synchronize stream %p: %s\n", AsyncInfoPtr->Queue,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

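// Non-blocking counterpart of __tgt_rtl_synchronize: checks whether the
// operations enqueued on the stream have completed, without waiting for them.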
int32_t __tgt_rtl_query_async(int32_t DeviceId,
                              __tgt_async_info *AsyncInfoPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).queryAsync(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to query stream %p: %s\n", AsyncInfoPtr->Queue,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

void __tgt_rtl_print_device_info(int32_t DeviceId) {
  if (auto Err = Plugin::get().getDevice(DeviceId).printInfo())
    REPORT("Failure to print device %d info: %s\n", DeviceId,
           toString(std::move(Err)).data());
}

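// Event API. An event marks a point in a device queue: it can be recorded on
// one queue, waited on from another, or synchronized with from the host. A
// typical host-side sequence (a sketch; Dev, QueueA and QueueB are
// placeholders):
//
//   void *Event;
//   __tgt_rtl_create_event(Dev, &Event);
//   __tgt_rtl_record_event(Dev, Event, QueueA);
//   __tgt_rtl_wait_event(Dev, Event, QueueB); // QueueB waits for QueueA
//   __tgt_rtl_destroy_event(Dev, Event);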
int32_t __tgt_rtl_create_event(int32_t DeviceId, void **EventPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).createEvent(EventPtr);
  if (Err) {
    REPORT("Failure to create event: %s\n", toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
                               __tgt_async_info *AsyncInfoPtr) {
  auto Err =
      Plugin::get().getDevice(DeviceId).recordEvent(EventPtr, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to record event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
                             __tgt_async_info *AsyncInfoPtr) {
  auto Err =
      Plugin::get().getDevice(DeviceId).waitEvent(EventPtr, AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to wait for event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).syncEvent(EventPtr);
  if (Err) {
    REPORT("Failure to synchronize event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
  auto Err = Plugin::get().getDevice(DeviceId).destroyEvent(EventPtr);
  if (Err) {
    REPORT("Failure to destroy event %p: %s\n", EventPtr,
           toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
  std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
  InfoLevel.store(NewInfoLevel);
}

int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
                                  __tgt_async_info **AsyncInfoPtr) {
  assert(AsyncInfoPtr && "Invalid async info");

  auto Err = Plugin::get().getDevice(DeviceId).initAsyncInfo(AsyncInfoPtr);
  if (Err) {
    REPORT("Failure to initialize async info at " DPxMOD " on device %d: %s\n",
           DPxPTR(*AsyncInfoPtr), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
                                   __tgt_device_info *DeviceInfo,
                                   const char **ErrStr) {
  *ErrStr = "";

  auto Err = Plugin::get().getDevice(DeviceId).initDeviceInfo(DeviceInfo);
  if (Err) {
    REPORT("Failure to initialize device info at " DPxMOD " on device %d: %s\n",
           DPxPTR(DeviceInfo), DeviceId, toString(std::move(Err)).data());
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
  Plugin::get().setDeviceIdStartIndex(DeviceIdOffset);

  return OFFLOAD_SUCCESS;
}

#ifdef __cplusplus
}
#endif