// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \
// RUN:            -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}
__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}
__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}
__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}
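
// A minimal usage sketch, not exercised by FileCheck: the usual global
// thread index, spelled with the raw sreg builtins demonstrated above
// (ctaid.x * ntid.x + tid.x). The function name is illustrative only.
__device__ int global_thread_index_example() {
  return __nvvm_read_ptx_sreg_ctaid_x() * __nvvm_read_ptx_sreg_ntid_x() +
         __nvvm_read_ptx_sreg_tid_x();
}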
__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();

  return a + b + c + d + e + f;

}
__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}
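
// A short sketch, not part of the checked test: the common use of
// lanemask.lt is computing a lane's rank among the lanes set in some mask
// (e.g. for stream compaction). `mask` is an illustrative parameter.
__device__ int lane_rank_example(unsigned mask) {
  // Count participating lanes strictly below this one.
  return __builtin_popcount(mask & __nvvm_read_ptx_sreg_lanemask_lt());
}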
__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();

  return a + b;
}
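
// Illustrative only, not checked: clock64 is a per-SM cycle counter, so the
// usual pattern is differencing two reads around the region being measured
// (elided here as a comment).
__device__ long long time_region_example() {
  long long start = __nvvm_read_ptx_sreg_clock64();
  // ... code being timed would run here ...
  return __nvvm_read_ptx_sreg_clock64() - start;
}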
__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}
__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}
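
// A minimal sketch, not checked: the classic bar.sync pattern is publish to
// shared memory, synchronize the CTA, then read another thread's slot. The
// buffer, its size, and the indexing are illustrative assumptions.
__shared__ int exchange_buf_example[32];
__device__ int exchange_example(int v) {
  int slot = __nvvm_read_ptx_sreg_tid_x() % 32;
  exchange_buf_example[slot] = v;
  __nvvm_bar_sync(0); // every thread must arrive before anyone reads
  return exchange_buf_example[(slot + 1) % 32];
}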
// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
  // CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
  // CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
  // CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
  // CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
  // CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

  // CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
  // CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
  // CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
  // CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

  // CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
  // CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
  // CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
  // CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}
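
// Illustrative sketch, not checked: composing the builtins above, e.g. a
// clamp built from the round-to-nearest float min/max.
__device__ float clamp_example(float x, float lo, float hi) {
  return __nvvm_fmin_f(__nvvm_fmax_f(x, lo), hi);
}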
__device__ long dl;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
  __nvvm_atom_dec_gen_ui(uip, ui);
  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).

#if ERROR_CHECK || __CUDA_ARCH__ >= 600
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);
  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

}
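
// Usage sketch, outside the checked test: a generic-space atomic add is the
// canonical histogram bump. Names are illustrative; `value` is assumed
// non-negative so the modulo picks a valid bin.
__device__ void histogram_bump_example(int *bins, int value, int nbins) {
  __nvvm_atom_add_gen_i(&bins[value % nbins], 1);
}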
// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);
  __nvvm_ldg_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldg_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldg_d((const double *)p);
  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types. However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."
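
  // An aside illustrating the quoted rule, not part of the checked output
  // (the typedef name is local to this illustration): a 4-element int vector
  // gets alignment 4 * alignof(int) == 16.
  typedef int int4_align_demo __attribute__((ext_vector_type(4)));
  static_assert(alignof(int4_align_demo) == 16,
                "even-element vector aligns to n*alignof(t)");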
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);
  __nvvm_ldg_sc2((const schar2 *)p);
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);
  __nvvm_ldg_sc4((const schar4 *)p);
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);
  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_l2((const long2 *)p);
  __nvvm_ldg_ul2((const ulong2 *)p);
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);
  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}
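
// Sketch, not checked: __nvvm_ldg_* loads through the read-only (non-coherent)
// cache path, so it is only valid when no thread writes the pointed-to memory
// during the kernel. Names are illustrative.
__device__ float ldg_gather_example(const float *table, int idx) {
  return __nvvm_ldg_f(&table[idx]);
}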
// CHECK-LABEL: nvvm_ldu
__device__ void nvvm_ldu(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldu_c((const char *)p);
  __nvvm_ldu_uc((const unsigned char *)p);
  __nvvm_ldu_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldu_s((const short *)p);
  __nvvm_ldu_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_i((const int *)p);
  __nvvm_ldu_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_l((const long *)p);
  __nvvm_ldu_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_d((const double *)p);
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_c2((const char2 *)p);
  __nvvm_ldu_uc2((const uchar2 *)p);
  __nvvm_ldu_sc2((const schar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_c4((const char4 *)p);
  __nvvm_ldu_uc4((const uchar4 *)p);
  __nvvm_ldu_sc4((const schar4 *)p);
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_s2((const short2 *)p);
  __nvvm_ldu_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_s4((const short4 *)p);
  __nvvm_ldu_us4((const ushort4 *)p);
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_i2((const int2 *)p);
  __nvvm_ldu_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_i4((const int4 *)p);
  __nvvm_ldu_ui4((const uint4 *)p);
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_l2((const long2 *)p);
  __nvvm_ldu_ul2((const ulong2 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_ll2((const longlong2 *)p);
  __nvvm_ldu_ull2((const ulonglong2 *)p);
  // CHECK: call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldu.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldu.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_d2((const double2 *)p);
}
// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
}
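
// Sketch, not checked: the classic warp sum on top of shfl.down. The third
// operand 0x1f is the conventional full-warp clamp; pre-sm_70 shfl without a
// member mask is assumed for brevity.
__device__ int warp_sum_example(int v) {
  for (int delta = 16; delta > 0; delta /= 2)
    v += __nvvm_shfl_down_i32(v, delta, 0x1f);
  return v;
}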
// CHECK-LABEL: nvvm_vote
__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
}
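
// Sketch, not checked: vote.ballot yields one bit per lane, so popcount of
// the ballot is the number of lanes whose predicate is true.
__device__ int count_true_example(int pred) {
  return __builtin_popcount(__nvvm_vote_ballot(pred));
}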
// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long *addr, __attribute__((address_space(3))) long long *sharedAddr, int count, long long state) {
#if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
#endif
}
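
#if __CUDA_ARCH__ >= 800
// Sketch, not checked: the mbarrier protocol is init (by one thread),
// arrive from each participant, then poll test.wait with the returned state
// token until the phase completes. Condensed for illustration only.
__device__ void mbarrier_phase_example(long long *bar, int nthreads) {
  if (__nvvm_read_ptx_sreg_tid_x() == 0)
    __nvvm_mbarrier_init(bar, nthreads);
  __nvvm_bar_sync(0); // make the initialized barrier visible CTA-wide
  long long state = __nvvm_mbarrier_arrive(bar);
  while (!__nvvm_mbarrier_test_wait(bar, state))
    ; // spin until every participant has arrived for this phase
}
#endif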
// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void *dst,
                                __attribute__((address_space(1))) const void *src,
                                long long *addr,
                                __attribute__((address_space(3))) long long *sharedAddr) {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4(
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8(
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16(
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16(
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_cg_shared_global_16(dst, src, 2);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
#endif
}
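
#if __CUDA_ARCH__ >= 800
// Sketch, not checked: the usual cp.async pattern stages shared<-global
// copies, commits them as a group, waits for the group, then synchronizes
// the CTA before the staged data is read.
__device__ void cp_async_stage_example(
    __attribute__((address_space(3))) void *dst,
    __attribute__((address_space(1))) const void *src) {
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  __nvvm_cp_async_commit_group();
  __nvvm_cp_async_wait_group(0); // 0 waits for all outstanding groups
  __nvvm_bar_sync(0);
}
#endif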
// CHECK-LABEL: nvvm_cvt_sm80
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
}
#define NAN32 0x7FBFFFFF
#define NAN16 (__bf16)0x7FBF
#define BF16 (__bf16)0.1f
#define BF16_2 (__bf16)0.2f
#define NANBF16 (__bf16)0xFFC1
#define BF16X2 {(__bf16)0.1f, (__bf16)0.1f}
#define BF16X2_2 {(__bf16)0.2f, (__bf16)0.2f}
#define NANBF16X2 {NANBF16, NANBF16}
// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD)
  __nvvm_abs_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> <bfloat 0xR3DCD, bfloat 0xR3DCD>)
  __nvvm_abs_bf16x2(BF16X2);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD)
  __nvvm_neg_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.neg.bf16x2(<2 x bfloat> <bfloat 0xR3DCD, bfloat 0xR3DCD>)
  __nvvm_neg_bf16x2(BF16X2);
#endif
}
// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.bf16
  __nvvm_fmin_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.nan.bf16
  __nvvm_fmin_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.bf16x2
  __nvvm_fmin_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.nan.bf16x2
  __nvvm_fmin_ftz_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.bf16
  __nvvm_fmax_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.nan.bf16
  __nvvm_fmax_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.bf16x2
  __nvvm_fmax_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.nan.bf16x2
  __nvvm_fmax_ftz_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
#endif
}
// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
#endif
}
// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
}