// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
// RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}

__device__ bool reflect() {

// CHECK: call i32 @llvm.nvvm.reflect(ptr {{.*}})

  unsigned x = __nvvm_reflect("__CUDA_ARCH");
  return x >= 700;
}

__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}

__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}

__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}

__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();
  int g = __nvvm_read_ptx_sreg_warpsize();

  return a + b + c + d + e + f + g;

}

__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}

__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();
  long long c = __nvvm_read_ptx_sreg_globaltimer();

  return a + b + c;
}

__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}

__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}

__device__ void activemask() {

// CHECK: call i32 @llvm.nvvm.activemask()

  __nvvm_activemask();

}

__device__ void exit() {

// CHECK: call void @llvm.nvvm.exit()

  __nvvm_exit();

}

// NVVM intrinsics

// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
  // CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
  // CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
  // CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
  // CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
  // CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

  // CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
  // CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
  // CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
  // CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

  // CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
  // CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
  // CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
  // CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}

__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
                          unsigned short *usp, unsigned short us, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
  __nvvm_atom_dec_gen_ui(uip, ui);

  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).

#if ERROR_CHECK || __CUDA_ARCH__ >= 600

  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);

  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

#if __CUDA_ARCH__ >= 700
  // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
  // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
  __nvvm_atom_cta_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
  __nvvm_atom_sys_cas_gen_us(usp, 0, us);
#endif

  // CHECK: ret
}

// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);
  __nvvm_ldg_sc((const signed char *)p);

  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_f((const float *)p);
  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types. However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."
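  //
  // For example, by that rule char2 is 2-byte aligned (2 * alignof(char)) and
  // int4 is 16-byte aligned (4 * alignof(int)), which lines up with the
  // alignments on the vector loads checked below.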
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);
  __nvvm_ldg_sc2((const schar2 *)p);

  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);
  __nvvm_ldg_sc4((const schar4 *)p);

  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_l2((const long2 *)p);
  __nvvm_ldg_ul2((const ulong2 *)p);

  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_ldu
__device__ void nvvm_ldu(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldu_c((const char *)p);
  __nvvm_ldu_uc((const unsigned char *)p);
  __nvvm_ldu_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldu_s((const short *)p);
  __nvvm_ldu_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_i((const int *)p);
  __nvvm_ldu_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_l((const long *)p);
  __nvvm_ldu_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_d((const double *)p);

  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_c2((const char2 *)p);
  __nvvm_ldu_uc2((const uchar2 *)p);
  __nvvm_ldu_sc2((const schar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_c4((const char4 *)p);
  __nvvm_ldu_uc4((const uchar4 *)p);
  __nvvm_ldu_sc4((const schar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_s2((const short2 *)p);
  __nvvm_ldu_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_s4((const short4 *)p);
  __nvvm_ldu_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_i2((const int2 *)p);
  __nvvm_ldu_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_i4((const int4 *)p);
  __nvvm_ldu_ui4((const uint4 *)p);

  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_l2((const long2 *)p);
  __nvvm_ldu_ul2((const ulong2 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_ll2((const longlong2 *)p);
  __nvvm_ldu_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldu.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldu.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
  // CHECK: ret void
}

__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_nanosleep
__device__ void nvvm_nanosleep(int d) {
#if __CUDA_ARCH__ >= 700
  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(d);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(1);
#endif
}

// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
#if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4(
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8(
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16(
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16(
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_cg_shared_global_16(dst, src, 2);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm80
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm89
__device__ void nvvm_cvt_sm89() {
#if __CUDA_ARCH__ >= 890
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);

  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});

  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
#endif
  // CHECK: ret void
}

#define NAN32 0x7FBFFFFF
#define NAN16 (__bf16)0x7FBF
#define BF16 (__bf16)0.1f
#define BF16_2 (__bf16)0.2f
#define NANBF16 (__bf16)0xFFC1
#define BF16X2 {(__bf16)0.1f, (__bf16)0.1f}
#define BF16X2_2 {(__bf16)0.2f, (__bf16)0.2f}
#define NANBF16X2 {NANBF16, NANBF16}

// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD)
  __nvvm_abs_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_abs_bf16x2(BF16X2);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD)
  __nvvm_neg_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.neg.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_neg_bf16x2(BF16X2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.bf16
  __nvvm_fmin_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.nan.bf16
  __nvvm_fmin_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.bf16x2
  __nvvm_fmin_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.nan.bf16x2
  __nvvm_fmin_ftz_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.bf16
  __nvvm_fmax_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.nan.bf16
  __nvvm_fmax_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.bf16x2
  __nvvm_fmax_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.nan.bf16x2
  __nvvm_fmax_ftz_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
  // CHECK: ret void
}