1 //===-- runtime/CUDA/kernel.cpp -------------------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 #include "flang/Runtime/CUDA/kernel.h"
10 #include "../terminator.h"
11 #include "flang/Runtime/CUDA/common.h"
13 #include "cuda_runtime.h"
17 void RTDEF(CUFLaunchKernel
)(const void *kernel
, intptr_t gridX
, intptr_t gridY
,
18 intptr_t gridZ
, intptr_t blockX
, intptr_t blockY
, intptr_t blockZ
,
19 int32_t smem
, void **params
, void **extra
) {
28 unsigned nbNegGridDim
{0};
38 if (nbNegGridDim
== 1) {
39 int maxBlocks
, nbBlocks
, dev
, multiProcCount
;
40 cudaError_t err1
, err2
;
41 nbBlocks
= blockDim
.x
* blockDim
.y
* blockDim
.z
;
43 err1
= cudaDeviceGetAttribute(
44 &multiProcCount
, cudaDevAttrMultiProcessorCount
, dev
);
45 err2
= cudaOccupancyMaxActiveBlocksPerMultiprocessor(
46 &maxBlocks
, kernel
, nbBlocks
, smem
);
47 if (err1
== cudaSuccess
&& err2
== cudaSuccess
) {
48 maxBlocks
= multiProcCount
* maxBlocks
;
52 maxBlocks
= maxBlocks
/ gridDim
.x
;
55 maxBlocks
= maxBlocks
/ gridDim
.y
;
58 maxBlocks
= maxBlocks
/ gridDim
.z
;
64 gridDim
.x
= maxBlocks
;
67 gridDim
.y
= maxBlocks
;
70 gridDim
.z
= maxBlocks
;
73 } else if (nbNegGridDim
> 1) {
74 Fortran::runtime::Terminator terminator
{__FILE__
, __LINE__
};
75 terminator
.Crash("Too many invalid grid dimensions");
77 cudaStream_t stream
= 0; // TODO stream managment
79 cudaLaunchKernel(kernel
, gridDim
, blockDim
, params
, smem
, stream
));
82 void RTDEF(CUFLaunchClusterKernel
)(const void *kernel
, intptr_t clusterX
,
83 intptr_t clusterY
, intptr_t clusterZ
, intptr_t gridX
, intptr_t gridY
,
84 intptr_t gridZ
, intptr_t blockX
, intptr_t blockY
, intptr_t blockZ
,
85 int32_t smem
, void **params
, void **extra
) {
86 cudaLaunchConfig_t config
;
87 config
.gridDim
.x
= gridX
;
88 config
.gridDim
.y
= gridY
;
89 config
.gridDim
.z
= gridZ
;
90 config
.blockDim
.x
= blockX
;
91 config
.blockDim
.y
= blockY
;
92 config
.blockDim
.z
= blockZ
;
93 unsigned nbNegGridDim
{0};
103 if (nbNegGridDim
== 1) {
104 int maxBlocks
, nbBlocks
, dev
, multiProcCount
;
105 cudaError_t err1
, err2
;
106 nbBlocks
= config
.blockDim
.x
* config
.blockDim
.y
* config
.blockDim
.z
;
108 err1
= cudaDeviceGetAttribute(
109 &multiProcCount
, cudaDevAttrMultiProcessorCount
, dev
);
110 err2
= cudaOccupancyMaxActiveBlocksPerMultiprocessor(
111 &maxBlocks
, kernel
, nbBlocks
, smem
);
112 if (err1
== cudaSuccess
&& err2
== cudaSuccess
) {
113 maxBlocks
= multiProcCount
* maxBlocks
;
116 if (config
.gridDim
.x
> 0) {
117 maxBlocks
= maxBlocks
/ config
.gridDim
.x
;
119 if (config
.gridDim
.y
> 0) {
120 maxBlocks
= maxBlocks
/ config
.gridDim
.y
;
122 if (config
.gridDim
.z
> 0) {
123 maxBlocks
= maxBlocks
/ config
.gridDim
.z
;
129 config
.gridDim
.x
= maxBlocks
;
132 config
.gridDim
.y
= maxBlocks
;
135 config
.gridDim
.z
= maxBlocks
;
138 } else if (nbNegGridDim
> 1) {
139 Fortran::runtime::Terminator terminator
{__FILE__
, __LINE__
};
140 terminator
.Crash("Too many invalid grid dimensions");
142 config
.dynamicSmemBytes
= smem
;
143 config
.stream
= 0; // TODO stream managment
144 cudaLaunchAttribute launchAttr
[1];
145 launchAttr
[0].id
= cudaLaunchAttributeClusterDimension
;
146 launchAttr
[0].val
.clusterDim
.x
= clusterX
;
147 launchAttr
[0].val
.clusterDim
.y
= clusterY
;
148 launchAttr
[0].val
.clusterDim
.z
= clusterZ
;
150 config
.attrs
= launchAttr
;
151 CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config
, kernel
, params
));