1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-OPT,GCN-OPT %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -O0 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX8-NOOPT %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GCN-OPT %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-enable-vopd=0 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11,GCN-OPT %s
; Basic i32 case: both data operands of update.dpp are scalar kernel arguments.
; The patterns expect two SGPR-to-VGPR moves (captured as DST and SRC) followed
; by a DPP mov from SRC into DST. The immediates (1, 1, 1, false) must print as
; quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1, and the end-of-line anchor
; rejects any trailing bound_ctrl operand.
6 ; GCN-LABEL: {{^}}dpp_test:
7 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
8 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
12 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
13 define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
14 %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
15 store i32 %tmp0, ptr addrspace(1) %out
; Same shape as dpp_test but with the final i1 operand set to true: the
; immediates (2, 1, 1, true) must print as quad_perm:[2,0,0,0] row_mask:0x1
; bank_mask:0x1 followed by bound_ctrl:1 at the end of the line.
19 ; GCN-LABEL: {{^}}dpp_test_bc:
20 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
21 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
25 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}}
26 define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
27 %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0
28 store i32 %tmp0, ptr addrspace(1) %out
; update.dpp fed by a VALU add: a value is loaded from the LDS array @0, a
; workgroup barrier is crossed, the value is added to itself, and the sum is
; the source operand of update.dpp with an old value of 0. The register written
; by the add is captured as REG; on GFX8 the DPP mov must read REG. The
; immediates (177, 15, 15, false) are expected to print as
; quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf.
; FileCheck does not split a directive prefix on commas (only the last name
; before the colon is recognised), so the gfx10 and gfx11 add patterns are
; written as two separate directive lines.
33 ; GCN-LABEL: {{^}}dpp_test1:
34 ; GFX10: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX11: v_add_nc_u32_e32 [[REG:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
35 ; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
36 ; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}}
37 ; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
39 ; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
40 @0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
41 define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
43 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
44 %tmp1 = zext i32 %tmp to i64
45 %tmp2 = getelementptr inbounds [448 x i32], ptr addrspace(3) @0, i32 0, i32 %tmp
46 %tmp3 = load i32, ptr addrspace(3) %tmp2, align 4
47 fence syncscope("workgroup-one-as") release
48 tail call void @llvm.amdgcn.s.barrier()
49 fence syncscope("workgroup-one-as") acquire
50 %tmp4 = add nsw i32 %tmp3, %tmp3
51 %tmp5 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp4, i32 177, i32 15, i32 15, i1 zeroext false)
52 %tmp6 = add nsw i32 %tmp5, %tmp4
53 %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
54 store i32 %tmp6, ptr %tmp7, align 4
; 64-bit update.dpp with the source loaded from global memory. The load result
; is captured as the register pair SRC_LO:SRC_HI, and the patterns expect one
; DPP mov per 32-bit half, each with quad_perm:[1,0,0,0] row_mask:0x1
; bank_mask:0x1 and no bound_ctrl. The two DPP movs may appear in either order.
58 ; GCN-LABEL: {{^}}update_dpp64_test:
59 ; GCN: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
60 ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
61 ; GCN-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
62 define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
63 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
64 %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
65 %load = load i64, ptr addrspace(1) %gep
66 %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
67 store i64 %tmp0, ptr addrspace(1) %gep
; 64-bit update.dpp whose old value is the constant 123451234512345
; (high half 0x7047, low half 0x3afaedd9) and whose source is loaded from
; global memory. In the optimized configurations each half of the constant is
; expected in a VGPR (OLD_LO / OLD_HI) that is then the destination of the DPP
; mov reading the matching half of the load (SRC_LO / SRC_HI). The -O0 tonga
; configuration instead expects the constant halves in SGPRs.
; The OLD_HI patterns use the GCN-OPT-DAG prefix, like the OLD_LO ones:
; FileCheck does not split a prefix on commas, so a comma-separated prefix list
; would only be evaluated for the last name in the list.
; NOTE(review): the two GCN-NOOPT-DAG lines below use a prefix that none of the
; RUN lines at the top of this file enable, so they are never evaluated.
71 ; GCN-LABEL: {{^}}update_dpp64_imm_old_test:
72 ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
73 ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
75 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
76 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047
77 ; GCN-DAG: load_{{dwordx2|b64}} v[[[SRC_LO:[0-9]+]]:[[SRC_HI:[0-9]+]]]
78 ; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_LO]], v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
79 ; GCN-OPT-DAG: v_mov_b32_dpp v[[OLD_HI]], v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
80 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
81 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
82 define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64 %in2) {
83 %id = tail call i32 @llvm.amdgcn.workitem.id.x()
84 %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
85 %load = load i64, ptr addrspace(1) %gep
86 %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
87 store i64 %tmp0, ptr addrspace(1) %gep
; 64-bit update.dpp whose source is the constant 123451234512345 (high half
; 0x7047, low half 0x3afaedd9) and whose old value is a kernel argument. In the
; optimized configurations each half of the constant is expected in a VGPR
; (OLD_LO / OLD_HI) that is then read as the source of a DPP mov. The -O0 tonga
; configuration expects the constant halves in SGPRs.
; NOTE(review): the last two pattern lines use the prefix GCN-NOOPT-DAG, which
; none of the RUN lines at the top of this file enable, so they are never
; evaluated. They also refer to SRC_LO / SRC_HI, which this test does not
; define. Confirm the intended prefix before enabling them.
91 ; GCN-LABEL: {{^}}update_dpp64_imm_src_test:
92 ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_LO:[0-9]+]], 0x3afaedd9
93 ; GCN-OPT-DAG: v_mov_b32_e32 v[[OLD_HI:[0-9]+]], 0x7047
94 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_LO:[0-9]+]], 0x3afaedd9
95 ; GFX8-NOOPT-DAG: s_mov_b32 s[[SOLD_HI:[0-9]+]], 0x7047
96 ; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
97 ; GCN-OPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[OLD_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
98 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
99 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
100 define amdgpu_kernel void @update_dpp64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
101 %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
102 store i64 %tmp0, ptr addrspace(1) %out
; float variant of the basic test: update.dpp.f32 on two kernel arguments with
; immediates (1, 1, 1, false) is expected to produce a 32-bit DPP mov with
; quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 and no bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
106 ; GCN-LABEL: {{^}}dpp_test_f32:
107 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
108 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
111 ; GFX8-NOOPT: s_nop 1
112 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
113 define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) {
114 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false)
115 store float %tmp0, ptr addrspace(1) %out
; float, immediates (0, 0, 0, false): the expected DPP mov prints
; quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 with no bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
119 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb1:
120 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
121 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
124 ; GFX8-NOOPT: s_nop 1
125 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
126 define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) {
127 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false)
128 store float %tmp0, ptr addrspace(1) %out
; float, immediates (3, 3, 3, false): the expected DPP mov prints
; quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 with no bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
132 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb2:
133 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
134 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
137 ; GFX8-NOOPT: s_nop 1
138 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
139 define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) {
140 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false)
141 store float %tmp0, ptr addrspace(1) %out
; float, immediates (1, 2, 3, true): three distinct values show which operand
; lands where. The expected DPP mov prints quad_perm:[1,0,0,0] row_mask:0x2
; bank_mask:0x3 bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
145 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb3:
146 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
147 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
150 ; GFX8-NOOPT: s_nop 1
151 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
152 define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) {
153 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true)
154 store float %tmp0, ptr addrspace(1) %out
; float, immediates (4, 3, 2, true): the first immediate, 4, is expected to
; print as quad_perm:[0,1,0,0], i.e. it selects the second quad_perm slot.
; The rest of the line is row_mask:0x3 bank_mask:0x2 bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
158 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb4:
159 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
160 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
163 ; GFX8-NOOPT: s_nop 1
164 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
165 define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) {
166 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true)
167 store float %tmp0, ptr addrspace(1) %out
; float, immediates (63, 62, 61, true): the expected DPP mov prints
; quad_perm:[3,3,3,0], and the masks print as 0xe and 0xd, which are the low
; four bits of 62 and 61. bound_ctrl:1 ends the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
171 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb5:
172 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
173 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
176 ; GFX8-NOOPT: s_nop 1
177 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
178 define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
179 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 62, i32 61, i1 true)
180 store float %tmp0, ptr addrspace(1) %out
; float, immediates (63, 63, 63, true): the expected DPP mov prints
; quad_perm:[3,3,3,0] with both masks at 0xf (the low four bits of 63) and
; bound_ctrl:1 at the end of the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
184 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb6:
185 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
186 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
189 ; GFX8-NOOPT: s_nop 1
190 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
191 define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
192 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 63, i32 63, i1 true)
193 store float %tmp0, ptr addrspace(1) %out
; float, immediates (64, 64, 64, true): the expected DPP mov prints
; quad_perm:[0,0,0,1], and both masks print as 0x0 because 64 has no bits set
; in its low four bits. bound_ctrl:1 ends the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
198 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb7:
199 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
200 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
203 ; GFX8-NOOPT: s_nop 1
204 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
205 define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
206 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 64, i32 64, i1 true)
207 store float %tmp0, ptr addrspace(1) %out
; float, immediates (31, 63, 128, true): the expected DPP mov prints
; quad_perm:[3,3,1,0], row_mask:0xf (low four bits of 63) and bank_mask:0x0
; (low four bits of 128), followed by bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
211 ; GCN-LABEL: {{^}}dpp_test_f32_imm_comb8:
212 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
213 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
216 ; GFX8-NOOPT: s_nop 1
217 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
218 define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
219 %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 63, i32 128, i1 true)
220 store float %tmp0, ptr addrspace(1) %out
; <2 x i16> variant of the basic test: update.dpp.v2i16 on two kernel arguments
; with immediates (1, 1, 1, false) is expected to produce a single 32-bit DPP
; mov with quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 and no bound_ctrl.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
224 ; GCN-LABEL: {{^}}dpp_test_v2i16:
225 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
226 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
229 ; GFX8-NOOPT: s_nop 1
230 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
231 define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
232 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 1, i32 1, i1 false)
233 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (0, 0, 0, false): the expected DPP mov prints
; quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 with no bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
237 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb1:
238 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
239 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
242 ; GFX8-NOOPT: s_nop 1
243 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
244 define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
245 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 0, i32 0, i32 0, i1 false)
246 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (3, 3, 3, false): the expected DPP mov prints
; quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 with no bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
250 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb2:
251 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
252 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
255 ; GFX8-NOOPT: s_nop 1
256 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
257 define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
258 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 3, i32 3, i32 3, i1 false)
259 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (1, 2, 3, true): three distinct values show which
; operand lands where. The expected DPP mov prints quad_perm:[1,0,0,0]
; row_mask:0x2 bank_mask:0x3 bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
263 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb3:
264 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
265 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
268 ; GFX8-NOOPT: s_nop 1
269 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
270 define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
271 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 1, i32 2, i32 3, i1 true)
272 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (4, 3, 2, true): the first immediate, 4, is expected to
; print as quad_perm:[0,1,0,0], i.e. it selects the second quad_perm slot.
; The rest of the line is row_mask:0x3 bank_mask:0x2 bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
276 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb4:
277 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
278 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
281 ; GFX8-NOOPT: s_nop 1
282 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
283 define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
284 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 4, i32 3, i32 2, i1 true)
285 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (63, 62, 61, true): the expected DPP mov prints
; quad_perm:[3,3,3,0], and the masks print as 0xe and 0xd, which are the low
; four bits of 62 and 61. bound_ctrl:1 ends the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
289 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb5:
290 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
291 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
294 ; GFX8-NOOPT: s_nop 1
295 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
296 define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
297 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 62, i32 61, i1 true)
298 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (63, 63, 63, true): the expected DPP mov prints
; quad_perm:[3,3,3,0] with both masks at 0xf (the low four bits of 63) and
; bound_ctrl:1 at the end of the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
302 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb6:
303 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
304 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
307 ; GFX8-NOOPT: s_nop 1
308 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
309 define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
310 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 63, i32 63, i32 63, i1 true)
311 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (64, 64, 64, true): the expected DPP mov prints
; quad_perm:[0,0,0,1], and both masks print as 0x0 because 64 has no bits set
; in its low four bits. bound_ctrl:1 ends the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
315 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb7:
316 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
317 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
320 ; GFX8-NOOPT: s_nop 1
321 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
322 define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
323 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 64, i32 64, i32 64, i1 true)
324 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x i16>, immediates (31, 63, 128, true): the expected DPP mov prints
; quad_perm:[3,3,1,0], row_mask:0xf (low four bits of 63) and bank_mask:0x0
; (low four bits of 128), followed by bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
328 ; GCN-LABEL: {{^}}dpp_test_v2i16_imm_comb8:
329 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
330 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
333 ; GFX8-NOOPT: s_nop 1
334 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
335 define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x i16> %in1, <2 x i16> %in2) {
336 %tmp0 = call <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16> %in1, <2 x i16> %in2, i32 31, i32 63, i32 128, i1 true)
337 store <2 x i16> %tmp0, ptr addrspace(1) %out
; <2 x half> variant of the basic test: update.dpp.v2f16 on two kernel
; arguments with immediates (1, 1, 1, false) is expected to produce a single
; 32-bit DPP mov with quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 and no
; bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
341 ; GCN-LABEL: {{^}}dpp_test_v2f16:
342 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
343 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
346 ; GFX8-NOOPT: s_nop 1
347 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
348 define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
349 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 1, i32 1, i1 false)
350 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (0, 0, 0, false): the expected DPP mov prints
; quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0 with no bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
354 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb1:
355 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
356 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
359 ; GFX8-NOOPT: s_nop 1
360 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
361 define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
362 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 0, i32 0, i32 0, i1 false)
363 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (3, 3, 3, false): the expected DPP mov prints
; quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3 with no bound_ctrl operand.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
367 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb2:
368 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
369 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
372 ; GFX8-NOOPT: s_nop 1
373 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
374 define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
375 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 3, i32 3, i32 3, i1 false)
376 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (1, 2, 3, true): three distinct values show which
; operand lands where. The expected DPP mov prints quad_perm:[1,0,0,0]
; row_mask:0x2 bank_mask:0x3 bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
380 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb3:
381 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
382 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
385 ; GFX8-NOOPT: s_nop 1
386 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
387 define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
388 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 1, i32 2, i32 3, i1 true)
389 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (4, 3, 2, true): the first immediate, 4, is expected
; to print as quad_perm:[0,1,0,0], i.e. it selects the second quad_perm slot.
; The rest of the line is row_mask:0x3 bank_mask:0x2 bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
393 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb4:
394 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
395 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
398 ; GFX8-NOOPT: s_nop 1
399 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
400 define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
401 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 4, i32 3, i32 2, i1 true)
402 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (63, 62, 61, true): the expected DPP mov prints
; quad_perm:[3,3,3,0], and the masks print as 0xe and 0xd, which are the low
; four bits of 62 and 61. bound_ctrl:1 ends the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
406 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb5:
407 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
408 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
411 ; GFX8-NOOPT: s_nop 1
412 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
413 define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
414 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 62, i32 61, i1 true)
415 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (63, 63, 63, true): the expected DPP mov prints
; quad_perm:[3,3,3,0] with both masks at 0xf (the low four bits of 63) and
; bound_ctrl:1 at the end of the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
419 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb6:
420 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
421 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
424 ; GFX8-NOOPT: s_nop 1
425 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
426 define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
427 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 63, i32 63, i32 63, i1 true)
428 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (64, 64, 64, true): the expected DPP mov prints
; quad_perm:[0,0,0,1], and both masks print as 0x0 because 64 has no bits set
; in its low four bits. bound_ctrl:1 ends the line.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
432 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb7:
433 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
434 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
437 ; GFX8-NOOPT: s_nop 1
438 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
439 define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
440 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 64, i32 64, i32 64, i1 true)
441 store <2 x half> %tmp0, ptr addrspace(1) %out
; <2 x half>, immediates (31, 63, 128, true): the expected DPP mov prints
; quad_perm:[3,3,1,0], row_mask:0xf (low four bits of 63) and bank_mask:0x0
; (low four bits of 128), followed by bound_ctrl:1.
; The -O0 tonga configuration also expects an s_nop 1 ahead of the DPP mov.
445 ; GCN-LABEL: {{^}}dpp_test_v2f16_imm_comb8:
446 ; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
447 ; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
450 ; GFX8-NOOPT: s_nop 1
451 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
452 define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x half> %in1, <2 x half> %in2) {
453 %tmp0 = call <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half> %in1, <2 x half> %in2, i32 31, i32 63, i32 128, i1 true)
454 store <2 x half> %tmp0, ptr addrspace(1) %out
; Declarations of the intrinsics called by the tests in this file. The five
; update.dpp overloads (i32, v2i16, v2f16, f32, i64) share one operand layout:
; two data operands of the overloaded type, three i32 operands and one i1.
; Attribute group #0, applied to every update.dpp declaration, is
; nounwind readnone convergent.
458 declare i32 @llvm.amdgcn.workitem.id.x()
459 declare void @llvm.amdgcn.s.barrier()
460 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
461 declare <2 x i16> @llvm.amdgcn.update.dpp.v2i16(<2 x i16>, <2 x i16>, i32, i32, i32, i1) #0
462 declare <2 x half> @llvm.amdgcn.update.dpp.v2f16(<2 x half>, <2 x half>, i32, i32, i32, i1) #0
463 declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
464 declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
466 attributes #0 = { nounwind readnone convergent }