[libc++][Android] Allow testing libc++ with clang-r536225 (#116149)
[llvm-project.git] / flang / runtime / CUDA / kernel.cpp
blob88cdf3cf426229993a79224a2efd9a9a5e33c5a6
1 //===-- runtime/CUDA/kernel.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
9 #include "flang/Runtime/CUDA/kernel.h"
10 #include "../terminator.h"
11 #include "flang/Runtime/CUDA/common.h"
13 #include "cuda_runtime.h"
15 extern "C" {
17 void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
18 intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
19 int32_t smem, void **params, void **extra) {
20 dim3 gridDim;
21 gridDim.x = gridX;
22 gridDim.y = gridY;
23 gridDim.z = gridZ;
24 dim3 blockDim;
25 blockDim.x = blockX;
26 blockDim.y = blockY;
27 blockDim.z = blockZ;
28 unsigned nbNegGridDim{0};
29 if (gridX < 0) {
30 ++nbNegGridDim;
32 if (gridY < 0) {
33 ++nbNegGridDim;
35 if (gridZ < 0) {
36 ++nbNegGridDim;
38 if (nbNegGridDim == 1) {
39 int maxBlocks, nbBlocks, dev, multiProcCount;
40 cudaError_t err1, err2;
41 nbBlocks = blockDim.x * blockDim.y * blockDim.z;
42 cudaGetDevice(&dev);
43 err1 = cudaDeviceGetAttribute(
44 &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
45 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
46 &maxBlocks, kernel, nbBlocks, smem);
47 if (err1 == cudaSuccess && err2 == cudaSuccess) {
48 maxBlocks = multiProcCount * maxBlocks;
50 if (maxBlocks > 0) {
51 if (gridDim.x > 0) {
52 maxBlocks = maxBlocks / gridDim.x;
54 if (gridDim.y > 0) {
55 maxBlocks = maxBlocks / gridDim.y;
57 if (gridDim.z > 0) {
58 maxBlocks = maxBlocks / gridDim.z;
60 if (maxBlocks < 1) {
61 maxBlocks = 1;
63 if (gridX < 0) {
64 gridDim.x = maxBlocks;
66 if (gridY < 0) {
67 gridDim.y = maxBlocks;
69 if (gridZ < 0) {
70 gridDim.z = maxBlocks;
73 } else if (nbNegGridDim > 1) {
74 Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
75 terminator.Crash("Too many invalid grid dimensions");
77 cudaStream_t stream = 0; // TODO stream managment
78 CUDA_REPORT_IF_ERROR(
79 cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, stream));
82 void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
83 intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
84 intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
85 int32_t smem, void **params, void **extra) {
86 cudaLaunchConfig_t config;
87 config.gridDim.x = gridX;
88 config.gridDim.y = gridY;
89 config.gridDim.z = gridZ;
90 config.blockDim.x = blockX;
91 config.blockDim.y = blockY;
92 config.blockDim.z = blockZ;
93 unsigned nbNegGridDim{0};
94 if (gridX < 0) {
95 ++nbNegGridDim;
97 if (gridY < 0) {
98 ++nbNegGridDim;
100 if (gridZ < 0) {
101 ++nbNegGridDim;
103 if (nbNegGridDim == 1) {
104 int maxBlocks, nbBlocks, dev, multiProcCount;
105 cudaError_t err1, err2;
106 nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
107 cudaGetDevice(&dev);
108 err1 = cudaDeviceGetAttribute(
109 &multiProcCount, cudaDevAttrMultiProcessorCount, dev);
110 err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
111 &maxBlocks, kernel, nbBlocks, smem);
112 if (err1 == cudaSuccess && err2 == cudaSuccess) {
113 maxBlocks = multiProcCount * maxBlocks;
115 if (maxBlocks > 0) {
116 if (config.gridDim.x > 0) {
117 maxBlocks = maxBlocks / config.gridDim.x;
119 if (config.gridDim.y > 0) {
120 maxBlocks = maxBlocks / config.gridDim.y;
122 if (config.gridDim.z > 0) {
123 maxBlocks = maxBlocks / config.gridDim.z;
125 if (maxBlocks < 1) {
126 maxBlocks = 1;
128 if (gridX < 0) {
129 config.gridDim.x = maxBlocks;
131 if (gridY < 0) {
132 config.gridDim.y = maxBlocks;
134 if (gridZ < 0) {
135 config.gridDim.z = maxBlocks;
138 } else if (nbNegGridDim > 1) {
139 Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
140 terminator.Crash("Too many invalid grid dimensions");
142 config.dynamicSmemBytes = smem;
143 config.stream = 0; // TODO stream managment
144 cudaLaunchAttribute launchAttr[1];
145 launchAttr[0].id = cudaLaunchAttributeClusterDimension;
146 launchAttr[0].val.clusterDim.x = clusterX;
147 launchAttr[0].val.clusterDim.y = clusterY;
148 launchAttr[0].val.clusterDim.z = clusterZ;
149 config.numAttrs = 1;
150 config.attrs = launchAttr;
151 CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
154 } // extern "C"