clang/test/CodeGenCUDA/launch-bounds.cu

   1 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
   2 // RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -target-cpu sm_90 -DUSE_MAX_BLOCKS -fcuda-is-device -emit-llvm -o - | FileCheck -check-prefix=CHECK_MAX_BLOCKS %s
   3
   4 #include "Inputs/cuda.h"
   5
   6 #define MAX_THREADS_PER_BLOCK 256
   7 #define MIN_BLOCKS_PER_MP     2
   8 #ifdef USE_MAX_BLOCKS
   9 #define MAX_BLOCKS_PER_MP     4
  10 #endif
  11
  12 // Test both max threads per block and Min cta per sm.
  13 extern "C" {
  14 __global__ void
  15 __launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
  16 Kernel1()
  17 {
  18 }
  19 }
  20
  21 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"maxntidx", i32 256}
  22 // CHECK: !{{[0-9]+}} = !{ptr @Kernel1, !"minctasm", i32 2}
  23
  24 #ifdef USE_MAX_BLOCKS
  25 // Test max threads per block and min/max cta per sm.
  26 extern "C" {
  27 __global__ void
  28 __launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP )
  29 Kernel1_sm_90()
  30 {
  31 }
  32 }
  33
  34 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxntidx", i32 256}
  35 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"minctasm", i32 2}
  36 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @Kernel1_sm_90, !"maxclusterrank", i32 4}
  37 #endif // USE_MAX_BLOCKS
  38
  39 // Test only max threads per block. Min cta per sm defaults to 0, and
  40 // CodeGen doesn't output a zero value for minctasm.
  41 extern "C" {
  42 __global__ void
  43 __launch_bounds__( MAX_THREADS_PER_BLOCK )
  44 Kernel2()
  45 {
  46 }
  47 }
  48
  49 // CHECK: !{{[0-9]+}} = !{ptr @Kernel2, !"maxntidx", i32 256}
  50
  51 template <int max_threads_per_block>
  52 __global__ void
  53 __launch_bounds__(max_threads_per_block)
  54 Kernel3()
  55 {
  56 }
  57
  58 template __global__ void Kernel3<MAX_THREADS_PER_BLOCK>();
  59 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel3{{.*}}, !"maxntidx", i32 256}
  60
  61 template <int max_threads_per_block, int min_blocks_per_mp>
  62 __global__ void
  63 __launch_bounds__(max_threads_per_block, min_blocks_per_mp)
  64 Kernel4()
  65 {
  66 }
  67 template __global__ void Kernel4<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
  68
  69 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"maxntidx", i32 256}
  70 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4{{.*}}, !"minctasm", i32 2}
  71
  72 #ifdef USE_MAX_BLOCKS
  73 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
  74 __global__ void
  75 __launch_bounds__(max_threads_per_block, min_blocks_per_mp, max_blocks_per_mp)
  76 Kernel4_sm_90()
  77 {
  78 }
  79 template __global__ void Kernel4_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
  80
  81 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxntidx", i32 256}
  82 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"minctasm", i32 2}
  83 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel4_sm_90{{.*}}, !"maxclusterrank", i32 4}
  84 #endif //USE_MAX_BLOCKS
  85
  86 const int constint = 100;
  87 template <int max_threads_per_block, int min_blocks_per_mp>
  88 __global__ void
  89 __launch_bounds__(max_threads_per_block + constint,
  90                   min_blocks_per_mp + max_threads_per_block)
  91 Kernel5()
  92 {
  93 }
  94 template __global__ void Kernel5<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP>();
  95
  96 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"maxntidx", i32 356}
  97 // CHECK: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5{{.*}}, !"minctasm", i32 258}
  98
  99 #ifdef USE_MAX_BLOCKS
 100
 101 template <int max_threads_per_block, int min_blocks_per_mp, int max_blocks_per_mp>
 102 __global__ void
 103 __launch_bounds__(max_threads_per_block + constint,
 104                   min_blocks_per_mp + max_threads_per_block,
 105                   max_blocks_per_mp + max_threads_per_block)
 106 Kernel5_sm_90()
 107 {
 108 }
 109 template __global__ void Kernel5_sm_90<MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP, MAX_BLOCKS_PER_MP>();
 110
 111 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxntidx", i32 356}
 112 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"minctasm", i32 258}
 113 // CHECK_MAX_BLOCKS: !{{[0-9]+}} = !{ptr @{{.*}}Kernel5_sm_90{{.*}}, !"maxclusterrank", i32 260}
 114 #endif //USE_MAX_BLOCKS
 115
 116 // Make sure we don't emit negative launch bounds values.
 117 __global__ void
 118 __launch_bounds__( -MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
 119 Kernel6()
 120 {
 121 }
 122 // CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"maxntidx",
 123 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel6{{.*}}, !"minctasm",
 124
 125 __global__ void
 126 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP )
 127 Kernel7()
 128 {
 129 }
 130 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"maxntidx",
 131 // CHECK-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7{{.*}}, !"minctasm",
 132
 133 #ifdef USE_MAX_BLOCKS
 134 __global__ void
 135 __launch_bounds__( MAX_THREADS_PER_BLOCK, -MIN_BLOCKS_PER_MP, -MAX_BLOCKS_PER_MP )
 136 Kernel7_sm_90()
 137 {
 138 }
 139 // CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxntidx",
 140 // CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"minctasm",
 141 // CHECK_MAX_BLOCKS-NOT: !{{[0-9]+}} = !{ptr @{{.*}}Kernel7_sm_90{{.*}}, !"maxclusterrank",
 142 #endif // USE_MAX_BLOCKS
 143
 144 const char constchar = 12;
 145 __global__ void __launch_bounds__(constint, constchar) Kernel8() {}
 146 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
 147 // CHECK:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
 148
 149 #ifdef USE_MAX_BLOCKS
 150 const char constchar_2 = 14;
 151 __global__ void __launch_bounds__(constint, constchar, constchar_2) Kernel8_sm_90() {}
 152 // CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxntidx", i32 100
 153 // CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"minctasm", i32 12
 154 // CHECK_MAX_BLOCKS:     !{{[0-9]+}} = !{ptr @{{.*}}Kernel8_sm_90{{.*}}, !"maxclusterrank", i32 14
 155 #endif // USE_MAX_BLOCKS