// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \
// RUN:            -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}
__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}
__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}
__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}
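
// A minimal usage sketch, not exercised by FileCheck: the usual global
// thread index, spelled with the raw sreg builtins demonstrated above
// (ctaid.x * ntid.x + tid.x). The function name is illustrative only.
__device__ int global_thread_index_example() {
  return __nvvm_read_ptx_sreg_ctaid_x() * __nvvm_read_ptx_sreg_ntid_x() +
         __nvvm_read_ptx_sreg_tid_x();
}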
__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();

  return a + b + c + d + e + f;

}
__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}
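
// A short sketch, not part of the checked test: the common use of
// lanemask.lt is computing a lane's rank among the lanes set in some mask
// (e.g. for stream compaction). `mask` is an illustrative parameter.
__device__ int lane_rank_example(unsigned mask) {
  // Count participating lanes strictly below this one.
  return __builtin_popcount(mask & __nvvm_read_ptx_sreg_lanemask_lt());
}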
__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();

  return a + b;
}
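
// Illustrative only, not checked: clock64 is a per-SM cycle counter, so the
// usual pattern is differencing two reads around the region being measured
// (elided here as a comment).
__device__ long long time_region_example() {
  long long start = __nvvm_read_ptx_sreg_clock64();
  // ... code being timed would run here ...
  return __nvvm_read_ptx_sreg_clock64() - start;
}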
__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}
__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}
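
// A minimal sketch, not checked: the classic bar.sync pattern is publish to
// shared memory, synchronize the CTA, then read another thread's slot. The
// buffer, its size, and the indexing are illustrative assumptions.
__shared__ int exchange_buf_example[32];
__device__ int exchange_example(int v) {
  int slot = __nvvm_read_ptx_sreg_tid_x() % 32;
  exchange_buf_example[slot] = v;
  __nvvm_bar_sync(0); // every thread must arrive before anyone reads
  return exchange_buf_example[(slot + 1) % 32];
}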
// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
  // CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
  // CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
  // CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
  // CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
  // CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

  // CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
  // CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
  // CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
  // CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

  // CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
  // CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
  // CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
  // CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}
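
// Illustrative sketch, not checked: composing the builtins above, e.g. a
// clamp built from the round-to-nearest float min/max.
__device__ float clamp_example(float x, float lo, float hi) {
  return __nvvm_fmin_f(__nvvm_fmax_f(x, lo), hi);
}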
__device__ long dl;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add ptr {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub ptr {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and ptr {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or ptr {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg ptr {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax ptr {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin ptr {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg ptr {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
  __nvvm_atom_dec_gen_ui(uip, ui);
  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).

#if ERROR_CHECK || __CUDA_ARCH__ >= 600
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);
  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

}
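
// Usage sketch, outside the checked test: a generic-space atomic add is the
// canonical histogram bump. Names are illustrative; `value` is assumed
// non-negative so the modulo picks a valid bin.
__device__ void histogram_bump_example(int *bins, int value, int nbins) {
  __nvvm_atom_add_gen_i(&bins[value % nbins], 1);
}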
// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);
  __nvvm_ldg_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldg_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldg_d((const double *)p);
  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types. However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."
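
  // An aside illustrating the quoted rule, not part of the checked output
  // (the typedef name is local to this illustration): a 4-element int vector
  // gets alignment 4 * alignof(int) == 16.
  typedef int int4_align_demo __attribute__((ext_vector_type(4)));
  static_assert(alignof(int4_align_demo) == 16,
                "even-element vector aligns to n*alignof(t)");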
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);
  __nvvm_ldg_sc2((const schar2 *)p);
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);
  __nvvm_ldg_sc4((const schar4 *)p);
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);
  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_l2((const long2 *)p);
  __nvvm_ldg_ul2((const ulong2 *)p);
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);
  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}
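
// Sketch, not checked: __nvvm_ldg_* loads through the read-only (non-coherent)
// cache path, so it is only valid when no thread writes the pointed-to memory
// during the kernel. Names are illustrative.
__device__ float ldg_gather_example(const float *table, int idx) {
  return __nvvm_ldg_f(&table[idx]);
}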
// CHECK-LABEL: nvvm_ldu
__device__ void nvvm_ldu(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr {{%[0-9]+}}, i32 1)
  __nvvm_ldu_c((const char *)p);
  __nvvm_ldu_uc((const unsigned char *)p);
  __nvvm_ldu_sc((const signed char *)p);

  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldu.global.i.i16.p0(ptr {{%[0-9]+}}, i32 2)
  __nvvm_ldu_s((const short *)p);
  __nvvm_ldu_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_i((const int *)p);
  __nvvm_ldu_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldu.global.i.i64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_l((const long *)p);
  __nvvm_ldu_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldu.global.f.f32.p0(ptr {{%[0-9]+}}, i32 4)
  __nvvm_ldu_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldu.global.f.f64.p0(ptr {{%[0-9]+}}, i32 8)
  __nvvm_ldu_d((const double *)p);
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldu.global.i.v2i8.p0(ptr {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  typedef signed char schar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_c2((const char2 *)p);
  __nvvm_ldu_uc2((const uchar2 *)p);
  __nvvm_ldu_sc2((const schar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldu.global.i.v4i8.p0(ptr {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  typedef signed char schar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_c4((const char4 *)p);
  __nvvm_ldu_uc4((const uchar4 *)p);
  __nvvm_ldu_sc4((const schar4 *)p);
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldu.global.i.v2i16.p0(ptr {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_s2((const short2 *)p);
  __nvvm_ldu_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldu.global.i.v4i16.p0(ptr {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_s4((const short4 *)p);
  __nvvm_ldu_us4((const ushort4 *)p);
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_i2((const int2 *)p);
  __nvvm_ldu_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldu.global.i.v4i32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_i4((const int4 *)p);
  __nvvm_ldu_ui4((const uint4 *)p);
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP32: call <2 x i32> @llvm.nvvm.ldu.global.i.v2i32.p0(ptr {{%[0-9]+}}, i32 8)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // LP64: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long2 __attribute__((ext_vector_type(2)));
  typedef unsigned long ulong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_l2((const long2 *)p);
  __nvvm_ldu_ul2((const ulong2 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldu.global.i.v2i64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_ll2((const longlong2 *)p);
  __nvvm_ldu_ull2((const ulonglong2 *)p);
  // CHECK: call <2 x float> @llvm.nvvm.ldu.global.f.v2f32.p0(ptr {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldu.global.f.v4f32.p0(ptr {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldu_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldu.global.f.v2f64.p0(ptr {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldu_d2((const double2 *)p);
}
// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
}
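
// Sketch, not checked: the classic warp sum on top of shfl.down. The third
// operand 0x1f is the conventional full-warp clamp; pre-sm_70 shfl without a
// member mask is assumed for brevity.
__device__ int warp_sum_example(int v) {
  for (int delta = 16; delta > 0; delta /= 2)
    v += __nvvm_shfl_down_i32(v, delta, 0x1f);
  return v;
}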
// CHECK-LABEL: nvvm_vote
__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
}
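
// Sketch, not checked: vote.ballot yields one bit per lane, so popcount of
// the ballot is the number of lanes whose predicate is true.
__device__ int count_true_example(int pred) {
  return __builtin_popcount(__nvvm_vote_ballot(pred));
}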
// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long *addr, __attribute__((address_space(3))) long long *sharedAddr, int count, long long state) {
#if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
#endif
}
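
#if __CUDA_ARCH__ >= 800
// Sketch, not checked: the mbarrier protocol is init (by one thread),
// arrive from each participant, then poll test.wait with the returned state
// token until the phase completes. Condensed for illustration only.
__device__ void mbarrier_phase_example(long long *bar, int nthreads) {
  if (__nvvm_read_ptx_sreg_tid_x() == 0)
    __nvvm_mbarrier_init(bar, nthreads);
  __nvvm_bar_sync(0); // make the initialized barrier visible CTA-wide
  long long state = __nvvm_mbarrier_arrive(bar);
  while (!__nvvm_mbarrier_test_wait(bar, state))
    ; // spin until every participant has arrived for this phase
}
#endif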
// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void *dst,
                                __attribute__((address_space(1))) const void *src,
                                long long *addr,
                                __attribute__((address_space(3))) long long *sharedAddr) {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4(
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8(
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16(
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16(
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_4(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_8(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_ca_shared_global_16(dst, src, 2);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16.s({{.*}}, i32 2)
  __nvvm_cp_async_cg_shared_global_16(dst, src, 2);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
#endif
}
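
#if __CUDA_ARCH__ >= 800
// Sketch, not checked: the usual cp.async pattern stages shared<-global
// copies, commits them as a group, waits for the group, then synchronizes
// the CTA before the staged data is read.
__device__ void cp_async_stage_example(
    __attribute__((address_space(3))) void *dst,
    __attribute__((address_space(1))) const void *src) {
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  __nvvm_cp_async_commit_group();
  __nvvm_cp_async_wait_group(0); // 0 waits for all outstanding groups
  __nvvm_bar_sync(0);
}
#endif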
// CHECK-LABEL: nvvm_cvt_sm80
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
}
#define NAN32 0x7FBFFFFF
#define NAN16 (__bf16)0x7FBF
#define BF16 (__bf16)0.1f
#define BF16_2 (__bf16)0.2f
#define NANBF16 (__bf16)0xFFC1
#define BF16X2 {(__bf16)0.1f, (__bf16)0.1f}
#define BF16X2_2 {(__bf16)0.2f, (__bf16)0.2f}
#define NANBF16X2 {NANBF16, NANBF16}
// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD)
  __nvvm_abs_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> <bfloat 0xR3DCD, bfloat 0xR3DCD>)
  __nvvm_abs_bf16x2(BF16X2);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD)
  __nvvm_neg_bf16(BF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.neg.bf16x2(<2 x bfloat> <bfloat 0xR3DCD, bfloat 0xR3DCD>)
  __nvvm_neg_bf16x2(BF16X2);
#endif
}
// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.bf16
  __nvvm_fmin_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmin.ftz.nan.bf16
  __nvvm_fmin_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.bf16x2
  __nvvm_fmin_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmin.ftz.nan.bf16x2
  __nvvm_fmin_ftz_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.bf16
  __nvvm_fmax_ftz_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fmax.ftz.nan.bf16
  __nvvm_fmax_ftz_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.bf16x2
  __nvvm_fmax_ftz_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fmax.ftz.nan.bf16x2
  __nvvm_fmax_ftz_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
#endif
}
// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(BF16, BF16_2, BF16_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
  // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(BF16X2, BF16X2_2, BF16X2_2);
#endif
}
// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call bfloat @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call <2 x bfloat> @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
}