1 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | FileCheck %s
2 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx60 | %ptxas-verify %}
; Declarations of the predicate-returning ("...p") shfl.sync intrinsics for the
; down, up, bfly and idx modes, each in an i32 and a float flavour. Every one
; takes four arguments (i32, value, i32, i32) and returns a {value, i1} pair.
; The test functions visible in this portion of the file call only the two
; "down" variants.
4 declare {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32, i32, i32, i32)
5 declare {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32, float, i32, i32)
6 declare {i32, i1} @llvm.nvvm.shfl.sync.up.i32p(i32, i32, i32, i32)
7 declare {float, i1} @llvm.nvvm.shfl.sync.up.f32p(i32, float, i32, i32)
8 declare {i32, i1} @llvm.nvvm.shfl.sync.bfly.i32p(i32, i32, i32, i32)
9 declare {float, i1} @llvm.nvvm.shfl.sync.bfly.f32p(i32, float, i32, i32)
10 declare {i32, i1} @llvm.nvvm.shfl.sync.idx.i32p(i32, i32, i32, i32)
11 declare {float, i1} @llvm.nvvm.shfl.sync.idx.f32p(i32, float, i32, i32)
; i32 value, "rrr" form. The three-letter suffix appears to describe, in order,
; the mask and the two trailing i32 operands of the intrinsic call:
; r = taken from a function argument (register), i = constant (immediate).
; Here all three are arguments, so every source operand of the expected
; instruction is a register captured from a parameter load.
13 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rrr
14 define {i32, i1} @shfl_sync_i32_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) {
15 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
16 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
17 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
18 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; The mask is the first intrinsic argument but is expected as the last operand
; of the emitted instruction; the destination is a "register|predicate" pair.
19 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
; Only the data register is matched on the store; [[OUTP]] is captured above
; but not matched again in the lines shown here.
20 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
21 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 %c)
; i32 value, "irr" form: the mask is the constant 1, while the two trailing
; i32 operands (%b, %c) are function arguments. The expected instruction must
; therefore print the mask as the literal 1 in the last operand position.
25 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_irr
26 define {i32, i1} @shfl_sync_i32_irr(i32 %a, i32 %b, i32 %c) {
27 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
28 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
29 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; Register operands for the value, %b and %c; immediate 1 for the mask.
30 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
; The shuffled data register is expected to be stored as a 32-bit param.
31 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
32 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 %c)
; i32 value, "rri" form: the mask (%mask) and the first trailing i32 operand
; (%b) are function arguments; the last intrinsic operand is the constant 1.
36 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rri
37 define {i32, i1} @shfl_sync_i32_rri(i32 %mask, i32 %a, i32 %b) {
38 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
39 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
40 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
; The constant 1 is expected as an immediate in the third source position,
; followed by the mask register.
41 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
; The shuffled data register is expected to be stored as a 32-bit param.
42 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
43 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 1)
; i32 value, "iri" form: the mask is the constant 1 and the last intrinsic
; operand is the constant 2; only %b (and the value %a) come from arguments.
; Distinct constants (1 vs 2) let the check tell the two immediates apart.
47 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iri
48 define {i32, i1} @shfl_sync_i32_iri(i32 %a, i32 %b) {
49 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
50 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
; Expected operand order: value, %b register, immediate 2, then mask 1.
51 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
; The shuffled data register is expected to be stored as a 32-bit param.
52 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
53 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 2)
; i32 value, "rir" form: the mask (%mask) and the last intrinsic operand (%c)
; are function arguments; the middle i32 operand is the constant 1.
57 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rir
58 define {i32, i1} @shfl_sync_i32_rir(i32 %mask, i32 %a, i32 %c) {
59 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
60 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
61 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; The constant 1 is expected as an immediate in the second source position,
; between the value register and the %c register.
62 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
; The shuffled data register is expected to be stored as a 32-bit param.
63 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
64 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 %c)
; i32 value, "iir" form: the mask is the constant 1 and the middle i32 operand
; is the constant 2; only the last intrinsic operand (%c) and the value (%a)
; come from function arguments.
68 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iir
69 define {i32, i1} @shfl_sync_i32_iir(i32 %a, i32 %c) {
70 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
71 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; Expected operand order: value, immediate 2, %c register, then mask 1.
72 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
; The shuffled data register is expected to be stored as a 32-bit param.
73 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
74 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 %c)
; i32 value, "rii" form: only the mask (%mask) and the value (%a) are function
; arguments; the two trailing i32 operands are the constants 1 and 2.
78 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rii
79 define {i32, i1} @shfl_sync_i32_rii(i32 %mask, i32 %a) {
80 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
81 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
; Both constants are expected as immediates, in call order (1 then 2),
; followed by the mask register.
82 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
; The shuffled data register is expected to be stored as a 32-bit param.
83 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
84 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 2)
; i32 value, "iii" form: the mask and both trailing i32 operands are constants
; (1, 2 and 3 respectively); only the shuffled value %a is a function argument.
; NOTE(review): parameter %b is declared but not referenced by the call below.
88 ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iii
89 define {i32, i1} @shfl_sync_i32_iii(i32 %a, i32 %b) {
90 ; CHECK: ld.param.u32 [[A:%r[0-9]+]]
; Expected operand order: value register, immediates 2 and 3, then mask 1.
91 ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
; The shuffled data register is expected to be stored as a 32-bit param.
92 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
93 %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 3)
97 ;; Same register/immediate operand combinations as above, but using the float (f32p) variant of the intrinsic
; float value, "rrr" form. The three-letter suffix appears to describe, in
; order, the mask and the two trailing i32 operands of the intrinsic call:
; r = taken from a function argument (register), i = constant (immediate).
; Here all three are arguments.
99 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rrr
100 define {float, i1} @shfl_sync_f32_rrr(i32 %mask, float %a, i32 %b, i32 %c) {
101 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
; The float value is expected to be loaded with ld.param.f32 into an %f register.
102 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
103 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
104 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; The instruction keeps the .b32 suffix but its data source and destination
; are %f registers; the mask register is the last operand.
105 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]];
; Only the data register is matched on the store; [[OUTP]] is captured above
; but not matched again in the lines shown here.
106 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
107 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 %c)
; float value, "irr" form: the mask is the constant 1, while the two trailing
; i32 operands (%b, %c) are function arguments.
111 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_irr
112 define {float, i1} @shfl_sync_f32_irr(float %a, i32 %b, i32 %c) {
; The float value is expected in an %f register, the i32 operands in %r registers.
113 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
114 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
115 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; Register operands for the value, %b and %c; immediate 1 for the mask.
116 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1;
; The shuffled %f data register is expected to be stored as a 32-bit param.
117 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
118 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 %c)
; float value, "rri" form: the mask (%mask) and the first trailing i32 operand
; (%b) are function arguments; the last intrinsic operand is the constant 1.
122 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rri
123 define {float, i1} @shfl_sync_f32_rri(i32 %mask, float %a, i32 %b) {
124 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
; The float value is expected to be loaded with ld.param.f32 into an %f register.
125 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
126 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
; The constant 1 is expected as an immediate in the third source position,
; followed by the mask register.
127 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]];
; The shuffled %f data register is expected to be stored as a 32-bit param.
128 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
129 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 1)
; float value, "iri" form: the mask is the constant 1 and the last intrinsic
; operand is the constant 2; only %b (and the value %a) come from arguments.
133 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iri
134 define {float, i1} @shfl_sync_f32_iri(float %a, i32 %b) {
; The float value is expected in an %f register, %b in an %r register.
135 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
136 ; CHECK: ld.param.u32 [[B:%r[0-9]+]]
; Expected operand order: value, %b register, immediate 2, then mask 1.
137 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1;
; The shuffled %f data register is expected to be stored as a 32-bit param.
138 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
139 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 2)
; float value, "rir" form: the mask (%mask) and the last intrinsic operand (%c)
; are function arguments; the middle i32 operand is the constant 1.
143 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rir
144 define {float, i1} @shfl_sync_f32_rir(i32 %mask, float %a, i32 %c) {
145 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
; The float value is expected to be loaded with ld.param.f32 into an %f register.
146 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
147 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; The constant 1 is expected as an immediate in the second source position,
; between the value register and the %c register.
148 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]];
; The shuffled %f data register is expected to be stored as a 32-bit param.
149 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
150 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 %c)
; float value, "iir" form: the mask is the constant 1 and the middle i32
; operand is the constant 2; only the last intrinsic operand (%c) and the
; value (%a) come from function arguments.
154 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iir
155 define {float, i1} @shfl_sync_f32_iir(float %a, i32 %c) {
; The float value is expected in an %f register, %c in an %r register.
156 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
157 ; CHECK: ld.param.u32 [[C:%r[0-9]+]]
; Expected operand order: value, immediate 2, %c register, then mask 1.
158 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1;
; The shuffled %f data register is expected to be stored as a 32-bit param.
159 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
160 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 %c)
; float value, "rii" form: only the mask (%mask) and the value (%a) are
; function arguments; the two trailing i32 operands are the constants 1 and 2.
164 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rii
165 define {float, i1} @shfl_sync_f32_rii(i32 %mask, float %a) {
166 ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]]
; The float value is expected to be loaded with ld.param.f32 into an %f register.
167 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
; Both constants are expected as immediates, in call order (1 then 2),
; followed by the mask register.
168 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]];
; The shuffled %f data register is expected to be stored as a 32-bit param.
169 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
170 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 2)
; float value, "iii" form: the mask and both trailing i32 operands are
; constants (1, 2 and 3 respectively); only the shuffled value %a is a
; function argument.
; NOTE(review): parameter %b is declared but not referenced by the call below.
174 ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iii
175 define {float, i1} @shfl_sync_f32_iii(float %a, i32 %b) {
; The float value is expected to be loaded with ld.param.f32 into an %f register.
176 ; CHECK: ld.param.f32 [[A:%f[0-9]+]]
; Expected operand order: value register, immediates 2 and 3, then mask 1.
177 ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1;
; The shuffled %f data register is expected to be stored as a 32-bit param.
178 ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]]
179 %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 3)