// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_70 -target-feature +ptx63 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX63_SM70 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
// RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \
// RUN:   -fcuda-is-device -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}

__device__ bool reflect() {

// CHECK: call i32 @llvm.nvvm.reflect(ptr {{.*}})

  unsigned x = __nvvm_reflect("__CUDA_ARCH");
  return x >= 700;
}

__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}

__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}

__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}

__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpsize()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();
  int g = __nvvm_read_ptx_sreg_warpsize();

  return a + b + c + d + e + f + g;

}

__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}

__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();
  long long c = __nvvm_read_ptx_sreg_globaltimer();

  return a + b + c;
}

__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}

__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}

__device__ void activemask() {

// CHECK: call i32 @llvm.nvvm.activemask()

  __nvvm_activemask();

}

__device__ void exit() {

// CHECK: call void @llvm.nvvm.exit()

  __nvvm_exit();

}

// NVVM intrinsics

// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
  // CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
  // CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
  // CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
  // CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
  // CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

  // CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
  // CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
  // CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
  // CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

  // CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
  // CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
  // CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
  // CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}

__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
                          unsigned short *usp, unsigned short us, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
  __nvvm_atom_dec_gen_ui(uip, ui);

  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).

#if ERROR_CHECK || __CUDA_ARCH__ >= 600

  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);

  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

#if __CUDA_ARCH__ >= 700
  // CHECK_PTX63_SM70: cmpxchg ptr {{.*}} seq_cst seq_cst, align 2
  // CHECK_PTX63_SM70-NEXT: extractvalue { i16, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.cta.i16.p0
  __nvvm_atom_cta_cas_gen_us(usp, 0, us);
  // CHECK_PTX63_SM70: call i16 @llvm.nvvm.atomic.cas.gen.i.sys.i16.p0
  __nvvm_atom_sys_cas_gen_us(usp, 0, us);
#endif

  // CHECK: ret
}

// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  // CHECK: load i8, ptr addrspace(1) {{%[0-9]+}}, align 1, !invariant.load
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);
  __nvvm_ldg_sc((const signed char *)p);

  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load i16, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP32: load i32, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load i64, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: load float, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  __nvvm_ldg_f((const float *)p);
  // CHECK: load double, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types. However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."
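  //
  // For example, by that rule char2 is 2-byte aligned (2 * alignof(char)) and
  // int4 is 16-byte aligned (4 * alignof(int)), which lines up with the
  // alignments on the vector loads checked below.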
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  // CHECK: load <2 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 2, !invariant.load
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);
  __nvvm_ldg_sc2((const schar2 *)p);

  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <4 x i8>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);
  __nvvm_ldg_sc4((const schar4 *)p);

  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  // CHECK: load <2 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 4, !invariant.load
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <4 x i16>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // CHECK: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <4 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP32: load <2 x i32>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // LP64: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_l2((const long2 *)p);
  __nvvm_ldg_ul2((const ulong2 *)p);

  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  // CHECK: load <2 x i64>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: load <2 x float>, ptr addrspace(1) {{%[0-9]+}}, align 8, !invariant.load
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: load <4 x float>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: load <2 x double>, ptr addrspace(1) {{%[0-9]+}}, align 16, !invariant.load
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_ldu
__device__ void nvvm_ldu(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldu_c((const char *)p);
  __nvvm_ldu_uc((const unsigned char *)p);
  __nvvm_ldu_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldu_s((const short *)p);
  __nvvm_ldu_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_i((const int *)p);
  __nvvm_ldu_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_l((const long *)p);
  __nvvm_ldu_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_d((const double *)p);

  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_c2((const char2 *)p);
  __nvvm_ldu_uc2((const uchar2 *)p);
  __nvvm_ldu_sc2((const schar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_c4((const char4 *)p);
  __nvvm_ldu_uc4((const uchar4 *)p);
  __nvvm_ldu_sc4((const schar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_s2((const short2 *)p);
  __nvvm_ldu_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_s4((const short4 *)p);
  __nvvm_ldu_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_i2((const int2 *)p);
  __nvvm_ldu_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_i4((const int4 *)p);
  __nvvm_ldu_ui4((const uint4 *)p);

  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_l2((const long2 *)p);
  __nvvm_ldu_ul2((const ulong2 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_ll2((const longlong2 *)p);
  __nvvm_ldu_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldu.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldu.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
  // CHECK: ret void
}

__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_nanosleep
__device__ void nvvm_nanosleep(int d) {
#if __CUDA_ARCH__ >= 700
  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(d);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.nanosleep
  __nvvm_nanosleep(1);
#endif
}

// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
#if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4(
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8(
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16(
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16(
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_cg_shared_global_16(dst, src, 2);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm80
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_cvt_sm89
__device__ void nvvm_cvt_sm89() {
#if __CUDA_ARCH__ >= 890
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e4m3x2_rn_relu(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn(1.0f, 1.0f);
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e5m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff_to_e5m2x2_rn_relu(1.0f, 1.0f);

  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e4m3x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e4m3x2_rn_relu({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn({1.0f16, 1.0f16});
  // CHECK_PTX81_SM89: call i16 @llvm.nvvm.f16x2.to.e5m2x2.rn.relu(<2 x half> splat (half 0xH3C00))
  __nvvm_f16x2_to_e5m2x2_rn_relu({1.0f16, 1.0f16});

  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e4m3x2.to.f16x2.rn.relu(i16 18504)
  __nvvm_e4m3x2_to_f16x2_rn_relu(0x4848);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
  // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532)
  __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
#endif
  // CHECK: ret void
}

#define NAN32 0x7FBFFFFF
#define NAN16 (__bf16)0x7FBF
#define BF16 (__bf16)0.1f
#define BF16_2 (__bf16)0.2f
#define NANBF16 (__bf16)0xFFC1
#define BF16X2 {(__bf16)0.1f, (__bf16)0.1f}
#define BF16X2_2 {(__bf16)0.2f, (__bf16)0.2f}
#define NANBF16X2 {NANBF16, NANBF16}

// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD)
  __nvvm_abs_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_abs_bf16x2(BF16X2);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD)
  __nvvm_neg_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.neg.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD))
  __nvvm_neg_bf16x2(BF16X2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.bf16
  __nvvm_fmin_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.nan.bf16
  __nvvm_fmin_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.bf16x2
  __nvvm_fmin_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.nan.bf16x2
  __nvvm_fmin_ftz_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.bf16
  __nvvm_fmax_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.nan.bf16
  __nvvm_fmax_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.bf16x2
  __nvvm_fmax_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.nan.bf16x2
  __nvvm_fmax_ftz_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
#endif
  // CHECK: ret void
}

// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
  // CHECK: ret void
}