//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "Shared/APITypes.h"

#include <cstdio>
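
// Minimal stand-in for CUDA/HIP's dim3 type describing a 3-D launch geometry.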
struct dim3 {
  unsigned x = 0, y = 0, z = 0;
};
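
// Launch configuration captured between a push/pop call-configuration pair:
// grid and block geometry, dynamic shared-memory size, and the stream handle.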
struct __omp_kernel_t {
  dim3 __grid_size;
  dim3 __block_size;
  size_t __shared_memory;

  void *__stream;
};
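
// Most recently pushed configuration; threadprivate so each host thread tracks
// its own pending kernel launch.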
static __omp_kernel_t __current_kernel = {};
#pragma omp threadprivate(__current_kernel);

extern "C" {
// TODO: There is little reason we need to keep these names or the way calls
// are issued. For now we do so to avoid modifying Clang's CUDA codegen.
// Unclear when we actually need to push/pop configurations.
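//
// For reference, a CUDA-style launch `kernel<<<grid, block, shmem, stream>>>(...)`
// is expected to reach these entry points roughly as follows (illustrative
// sketch only; the stub and variable names are made up, not taken from Clang's
// actual output):
//
//   __llvmPushCallConfiguration(grid, block, shmem, stream);
//   // ... generated stub collects the kernel argument pointers ...
//   __llvmPopCallConfiguration(&grid, &block, &shmem, &stream);
//   llvmLaunchKernel((const void *)&kernel_stub, grid, block, kernel_args,
//                    shmem, stream);

// Records the launch configuration for the calling thread so that the matching
// pop call can retrieve it. Always returns 0.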
unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size,
                                     size_t __shared_memory, void *__stream) {
  __omp_kernel_t &__kernel = __current_kernel;
  __kernel.__grid_size = __grid_size;
  __kernel.__block_size = __block_size;
  __kernel.__shared_memory = __shared_memory;
  __kernel.__stream = __stream;
  return 0;
}
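
// Returns the launch configuration most recently recorded by
// __llvmPushCallConfiguration on this thread. The last parameter arrives as
// void* and is cast back to void** before the stored stream handle is written
// through it.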
unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
                                    size_t *__shared_memory, void *__stream) {
  __omp_kernel_t &__kernel = __current_kernel;
  *__grid_size = __kernel.__grid_size;
  *__block_size = __kernel.__block_size;
  *__shared_memory = __kernel.__shared_memory;
  *((void **)__stream) = __kernel.__stream;
  return 0;
}
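
// Forward declaration of the offload runtime (libomptarget) entry point that
// performs the actual kernel launch.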
int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
                        int32_t ThreadLimit, const void *HostPtr,
                        KernelArgsTy *Args);
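
// Launches a kernel from CUDA/HIP-style parameters: the grid and block
// geometry, dynamic shared-memory size, and argument-pointer array are packed
// into a KernelArgsTy and forwarded to __tgt_target_kernel. Note that the
// stream parameter is not consumed here.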
unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
                          void *args, size_t sharedMem, void *stream) {
  KernelArgsTy Args = {};
  Args.DynCGroupMem = sharedMem;
  Args.NumTeams[0] = gridDim.x;
  Args.NumTeams[1] = gridDim.y;
  Args.NumTeams[2] = gridDim.z;
  Args.ThreadLimit[0] = blockDim.x;
  Args.ThreadLimit[1] = blockDim.y;
  Args.ThreadLimit[2] = blockDim.z;
  Args.ArgPtrs = reinterpret_cast<void **>(args);
  Args.Flags.IsCUDA = true;
  return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args);
}

} // extern "C"