1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
; v3i8_liveout: a <3 x i8> value that is live out of its defining block.
; Each work-item loads one <3 x i8> from %src1 and one from %src2 (indexed by
; workitem.id.x); a phi picks %vec1 (incoming from %entry) or %vec2 (incoming
; from %bb.1) depending on "idx < 15", and the result is stored to %dst.
; The expected gfx906 output loads each vector as a single dword, repacks it
; with SDWA shifts/ors, and stores it as a short plus a d16_hi byte at offset 2.
; The load from %src2 is expected inside %bb.1 only.
4 define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
5 ; GFX906-LABEL: v3i8_liveout:
6 ; GFX906: ; %bb.0: ; %entry
7 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
8 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
9 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
10 ; GFX906-NEXT: v_mov_b32_e32 v3, 8
11 ; GFX906-NEXT: s_mov_b32 s4, 0xff0000
12 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX906-NEXT: global_load_dword v4, v2, s[0:1]
14 ; GFX906-NEXT: v_mov_b32_e32 v1, 0
15 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
16 ; GFX906-NEXT: s_waitcnt vmcnt(0)
17 ; GFX906-NEXT: v_lshrrev_b32_sdwa v5, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
18 ; GFX906-NEXT: v_or_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
19 ; GFX906-NEXT: v_and_b32_e32 v5, 0xffff, v5
20 ; GFX906-NEXT: v_and_or_b32 v4, v4, s4, v5
21 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
22 ; GFX906-NEXT: s_cbranch_execz .LBB0_2
23 ; GFX906-NEXT: ; %bb.1: ; %bb.1
24 ; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
25 ; GFX906-NEXT: s_waitcnt vmcnt(0)
26 ; GFX906-NEXT: v_lshrrev_b32_sdwa v2, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
27 ; GFX906-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
28 ; GFX906-NEXT: v_and_b32_e32 v2, 0xffff, v2
29 ; GFX906-NEXT: v_and_or_b32 v4, v0, s4, v2
30 ; GFX906-NEXT: .LBB0_2: ; %bb.2
31 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
32 ; GFX906-NEXT: global_store_byte_d16_hi v1, v4, s[6:7] offset:2
33 ; GFX906-NEXT: global_store_short v1, v4, s[6:7]
34 ; GFX906-NEXT: s_endpgm
36 %idx = call i32 @llvm.amdgcn.workitem.id.x()
37 %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
38 %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
39 %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
40 %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
41 %cmp = icmp ult i32 %idx, 15
42 br i1 %cmp, label %bb.1, label %bb.2
; Merge point: the vector value arrives from two different predecessor blocks.
47 %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
48 store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v4i8_liveout: same shape as the other *_liveout kernels, with a <4 x i8>
; value selected by a phi (%vec1 from %entry, %vec2 from %bb.1) and stored to
; %dst. The expected gfx906 output keeps the whole vector in one VGPR (v2):
; one dword load per source, no per-byte unpacking, and a single dword store.
52 define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
53 ; GFX906-LABEL: v4i8_liveout:
54 ; GFX906: ; %bb.0: ; %entry
55 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
56 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
57 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 2, v0
58 ; GFX906-NEXT: v_mov_b32_e32 v1, 0
59 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
60 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX906-NEXT: global_load_dword v2, v3, s[0:1]
62 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
63 ; GFX906-NEXT: s_cbranch_execz .LBB1_2
64 ; GFX906-NEXT: ; %bb.1: ; %bb.1
65 ; GFX906-NEXT: global_load_dword v2, v3, s[2:3]
66 ; GFX906-NEXT: .LBB1_2: ; %bb.2
67 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
68 ; GFX906-NEXT: s_waitcnt vmcnt(0)
69 ; GFX906-NEXT: global_store_dword v1, v2, s[6:7]
70 ; GFX906-NEXT: s_endpgm
72 %idx = call i32 @llvm.amdgcn.workitem.id.x()
73 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
74 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
75 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
76 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
77 %cmp = icmp ult i32 %idx, 15
78 br i1 %cmp, label %bb.1, label %bb.2
; Merge point: the vector value arrives from two different predecessor blocks.
83 %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
84 store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v5i8_liveout: <5 x i8> variant of the cross-block live-out pattern (phi of
; %vec1 from %entry and %vec2 from %bb.1, stored to %dst).
; The expected gfx906 output loads each vector as a dwordx2 with an index
; shift of 3, masks the second dword down to its low byte (and 0xff), and
; stores the result as one dword plus one byte at offset 4.
88 define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
89 ; GFX906-LABEL: v5i8_liveout:
90 ; GFX906: ; %bb.0: ; %entry
91 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
92 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
93 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
94 ; GFX906-NEXT: v_mov_b32_e32 v3, 0
95 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
96 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1]
98 ; GFX906-NEXT: s_waitcnt vmcnt(0)
99 ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
100 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
101 ; GFX906-NEXT: s_cbranch_execz .LBB2_2
102 ; GFX906-NEXT: ; %bb.1: ; %bb.1
103 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3]
104 ; GFX906-NEXT: s_waitcnt vmcnt(0)
105 ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
106 ; GFX906-NEXT: .LBB2_2: ; %bb.2
107 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
108 ; GFX906-NEXT: global_store_byte v3, v2, s[6:7] offset:4
109 ; GFX906-NEXT: global_store_dword v3, v1, s[6:7]
110 ; GFX906-NEXT: s_endpgm
112 %idx = call i32 @llvm.amdgcn.workitem.id.x()
113 %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
114 %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
115 %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
116 %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
117 %cmp = icmp ult i32 %idx, 15
118 br i1 %cmp, label %bb.1, label %bb.2
; Merge point: the vector value arrives from two different predecessor blocks.
123 %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
124 store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v8i8_liveout: <8 x i8> variant of the cross-block live-out pattern (phi of
; %vec1 from %entry and %vec2 from %bb.1, stored to %dst).
; The expected gfx906 output carries the vector in one VGPR pair (v[1:2]):
; one dwordx2 load per source and a single dwordx2 store, with no byte
; unpacking in between.
128 define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
129 ; GFX906-LABEL: v8i8_liveout:
130 ; GFX906: ; %bb.0: ; %entry
131 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
132 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
133 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
134 ; GFX906-NEXT: v_mov_b32_e32 v3, 0
135 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
136 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
137 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1]
138 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
139 ; GFX906-NEXT: s_cbranch_execz .LBB3_2
140 ; GFX906-NEXT: ; %bb.1: ; %bb.1
141 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[2:3]
142 ; GFX906-NEXT: .LBB3_2: ; %bb.2
143 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
144 ; GFX906-NEXT: s_waitcnt vmcnt(0)
145 ; GFX906-NEXT: global_store_dwordx2 v3, v[1:2], s[6:7]
146 ; GFX906-NEXT: s_endpgm
148 %idx = call i32 @llvm.amdgcn.workitem.id.x()
149 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
150 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
151 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
152 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
153 %cmp = icmp ult i32 %idx, 15
154 br i1 %cmp, label %bb.1, label %bb.2
; Merge point: the vector value arrives from two different predecessor blocks.
159 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
160 store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v16i8_liveout: <16 x i8> variant of the cross-block live-out pattern (phi of
; %vec1 from %entry and %vec2 from %bb.1, stored to %dst).
; The expected gfx906 output carries the vector in four consecutive VGPRs
; (v[1:4]): one dwordx4 load per source and a single dwordx4 store.
164 define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
165 ; GFX906-LABEL: v16i8_liveout:
166 ; GFX906: ; %bb.0: ; %entry
167 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
168 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
169 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 4, v0
170 ; GFX906-NEXT: v_mov_b32_e32 v5, 0
171 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
172 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[0:1]
174 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
175 ; GFX906-NEXT: s_cbranch_execz .LBB4_2
176 ; GFX906-NEXT: ; %bb.1: ; %bb.1
177 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v6, s[2:3]
178 ; GFX906-NEXT: .LBB4_2: ; %bb.2
179 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
180 ; GFX906-NEXT: s_waitcnt vmcnt(0)
181 ; GFX906-NEXT: global_store_dwordx4 v5, v[1:4], s[6:7]
182 ; GFX906-NEXT: s_endpgm
184 %idx = call i32 @llvm.amdgcn.workitem.id.x()
185 %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
186 %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
187 %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
188 %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
189 %cmp = icmp ult i32 %idx, 15
190 br i1 %cmp, label %bb.1, label %bb.2
; Merge point: the vector value arrives from two different predecessor blocks.
195 %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
196 store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v32i8_liveout: <32 x i8> variant of the cross-block live-out pattern (phi of
; %vec1 from %entry and %vec2 from %bb.1, stored to %dst).
; The expected gfx906 output splits the vector into two dwordx4 halves
; (v[1:4] at offset 16 and v[5:8] at offset 0) for both the loads and the
; stores. Each store is preceded by "s_waitcnt vmcnt(1)".
200 define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
201 ; GFX906-LABEL: v32i8_liveout:
202 ; GFX906: ; %bb.0: ; %entry
203 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
204 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
205 ; GFX906-NEXT: v_lshlrev_b32_e32 v10, 5, v0
206 ; GFX906-NEXT: v_mov_b32_e32 v9, 0
207 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
208 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
209 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[0:1] offset:16
210 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[0:1]
211 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
212 ; GFX906-NEXT: s_cbranch_execz .LBB5_2
213 ; GFX906-NEXT: ; %bb.1: ; %bb.1
214 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v10, s[2:3] offset:16
215 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v10, s[2:3]
216 ; GFX906-NEXT: .LBB5_2: ; %bb.2
217 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
218 ; GFX906-NEXT: s_waitcnt vmcnt(1)
219 ; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[6:7] offset:16
220 ; GFX906-NEXT: s_waitcnt vmcnt(1)
221 ; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[6:7]
222 ; GFX906-NEXT: s_endpgm
224 %idx = call i32 @llvm.amdgcn.workitem.id.x()
225 %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
226 %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
227 %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
228 %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
229 %cmp = icmp ult i32 %idx, 15
230 br i1 %cmp, label %bb.1, label %bb.2
; Merge point: the vector value arrives from two different predecessor blocks.
235 %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
236 store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v256i8_liveout: <256 x i8> variant of the cross-block live-out pattern (phi
; of %vec1 from %entry and %vec2 from %bb.1, stored to %dst).
; The expected gfx906 output moves the vector as sixteen dwordx4 chunks
; (offsets 0..240). The chunk at offset 240 is spilled to scratch as four
; dwords ("Folded Spill") in both %entry and %bb.1, and reloaded ("Folded
; Reload") just before its store.
; NOTE(review): both GEPs below use <8 x i8> as the element type although the
; loads are <256 x i8>; the expected index shift of 3 matches that 8-byte
; stride. This looks like a leftover from the v8i8 test - confirm whether it
; is intentional before changing it (the expected output would need to be
; regenerated).
240 define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
241 ; GFX906-LABEL: v256i8_liveout:
242 ; GFX906: ; %bb.0: ; %entry
243 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
244 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
245 ; GFX906-NEXT: v_lshlrev_b32_e32 v61, 3, v0
246 ; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
247 ; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
248 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:240
250 ; GFX906-NEXT: s_mov_b32 s14, -1
251 ; GFX906-NEXT: s_mov_b32 s15, 0xe00000
252 ; GFX906-NEXT: s_add_u32 s12, s12, s11
253 ; GFX906-NEXT: s_addc_u32 s13, s13, 0
254 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
255 ; GFX906-NEXT: v_mov_b32_e32 v4, 0
256 ; GFX906-NEXT: s_waitcnt vmcnt(0)
257 ; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
258 ; GFX906-NEXT: s_nop 0
259 ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
260 ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
261 ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
262 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[0:1] offset:224
263 ; GFX906-NEXT: s_nop 0
264 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[0:1] offset:208
265 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[0:1] offset:192
266 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[0:1] offset:176
267 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[0:1] offset:160
268 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[0:1] offset:144
269 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[0:1] offset:128
270 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[0:1] offset:112
271 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[0:1] offset:96
272 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[0:1] offset:80
273 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[0:1] offset:64
274 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[0:1] offset:48
275 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[0:1] offset:32
276 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[0:1] offset:16
277 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[0:1]
278 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
279 ; GFX906-NEXT: s_cbranch_execz .LBB6_2
280 ; GFX906-NEXT: ; %bb.1: ; %bb.1
281 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3] offset:240
282 ; GFX906-NEXT: s_waitcnt vmcnt(0)
283 ; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
284 ; GFX906-NEXT: s_nop 0
285 ; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
286 ; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
287 ; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
288 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v61, s[2:3] offset:224
289 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v61, s[2:3] offset:208
290 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v61, s[2:3] offset:192
291 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v61, s[2:3] offset:176
292 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v61, s[2:3] offset:160
293 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v61, s[2:3] offset:144
294 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v61, s[2:3] offset:128
295 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v61, s[2:3] offset:112
296 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v61, s[2:3] offset:96
297 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v61, s[2:3] offset:80
298 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v61, s[2:3] offset:64
299 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v61, s[2:3] offset:48
300 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v61, s[2:3] offset:32
301 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v61, s[2:3] offset:16
302 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v61, s[2:3]
303 ; GFX906-NEXT: .LBB6_2: ; %bb.2
304 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
305 ; GFX906-NEXT: s_waitcnt vmcnt(7)
306 ; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[6:7] offset:112
307 ; GFX906-NEXT: s_waitcnt vmcnt(7)
308 ; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[6:7] offset:96
309 ; GFX906-NEXT: s_waitcnt vmcnt(7)
310 ; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[6:7] offset:80
311 ; GFX906-NEXT: s_waitcnt vmcnt(7)
312 ; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[6:7] offset:64
313 ; GFX906-NEXT: s_waitcnt vmcnt(7)
314 ; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[6:7] offset:48
315 ; GFX906-NEXT: s_waitcnt vmcnt(7)
316 ; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[6:7] offset:32
317 ; GFX906-NEXT: s_waitcnt vmcnt(7)
318 ; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[6:7] offset:16
319 ; GFX906-NEXT: s_waitcnt vmcnt(7)
320 ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
321 ; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
322 ; GFX906-NEXT: s_nop 0
323 ; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
324 ; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
325 ; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
326 ; GFX906-NEXT: s_waitcnt vmcnt(0)
327 ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:240
328 ; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[6:7] offset:224
329 ; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[6:7] offset:208
330 ; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[6:7] offset:192
331 ; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[6:7] offset:176
332 ; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[6:7] offset:160
333 ; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[6:7] offset:144
334 ; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[6:7] offset:128
335 ; GFX906-NEXT: s_endpgm
337 %idx = call i32 @llvm.amdgcn.workitem.id.x()
338 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
339 %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
340 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
341 %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
342 %cmp = icmp ult i32 %idx, 15
343 br i1 %cmp, label %bb.1, label %bb.2
; Merge point: the vector value arrives from two different predecessor blocks.
348 %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
349 store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; repeat_successor: a switch whose cases 1 and 2 both branch from %entry to
; the same successor (%return.sink.split), so the phi in that block lists
; %entry twice with the same incoming value (%vec1). A third predecessor,
; %sw.bb5, supplies %vec2; the default destination is %return.
; The expected gfx906 output uses uniform (scalar) branches on %in, loads only
; the vector that is needed on each path as one dword, and stores it with a
; single dword store.
353 define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
354 ; GFX906-LABEL: repeat_successor:
355 ; GFX906: ; %bb.0: ; %entry
356 ; GFX906-NEXT: s_load_dword s8, s[4:5], 0x24
357 ; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
358 ; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
359 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
360 ; GFX906-NEXT: s_cmp_lt_i32 s8, 3
361 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3
362 ; GFX906-NEXT: ; %bb.1: ; %LeafBlock
363 ; GFX906-NEXT: s_cmp_gt_i32 s8, 0
364 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
365 ; GFX906-NEXT: ; %bb.2:
366 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
367 ; GFX906-NEXT: global_load_dword v0, v0, s[0:1]
368 ; GFX906-NEXT: s_branch .LBB7_5
369 ; GFX906-NEXT: .LBB7_3: ; %LeafBlock5
370 ; GFX906-NEXT: s_cmp_eq_u32 s8, 3
371 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
372 ; GFX906-NEXT: ; %bb.4: ; %sw.bb5
373 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
374 ; GFX906-NEXT: global_load_dword v0, v0, s[2:3]
375 ; GFX906-NEXT: .LBB7_5: ; %return.sink.split
376 ; GFX906-NEXT: v_mov_b32_e32 v1, 0
377 ; GFX906-NEXT: s_waitcnt vmcnt(0)
378 ; GFX906-NEXT: global_store_dword v1, v0, s[6:7]
379 ; GFX906-NEXT: .LBB7_6: ; %return
380 ; GFX906-NEXT: s_endpgm
382 %idx = call i32 @llvm.amdgcn.workitem.id.x()
383 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
384 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
385 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
386 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
387 switch i32 %in, label %return [
; Two distinct case values share one successor block.
388 i32 1, label %return.sink.split
389 i32 2, label %return.sink.split
394 br label %return.sink.split
; %entry appears twice because two switch edges reach this block from it.
397 %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
398 store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v8i8_phi_chain: two chained <8 x i8> phis. %tmp5 merges %vec1 (from %entry)
; and %vec2 (from %bb.1) and is stored to %dst0; %tmp7 merges %vec2 (from
; %bb.1) and %tmp5 (from %bb.2) and is stored to %dst1.
; The expected gfx906 output keeps a single VGPR pair (v[1:2]) for the value
; through both phis and uses it for both dwordx2 stores.
405 define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
406 ; GFX906-LABEL: v8i8_phi_chain:
407 ; GFX906: ; %bb.0: ; %entry
408 ; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
409 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
410 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
411 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
412 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9]
414 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
415 ; GFX906-NEXT: s_cbranch_execz .LBB8_2
416 ; GFX906-NEXT: ; %bb.1: ; %bb.1
417 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11]
418 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
419 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
420 ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec
421 ; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
422 ; GFX906-NEXT: .LBB8_2: ; %Flow
423 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
424 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
425 ; GFX906-NEXT: s_cbranch_execz .LBB8_4
426 ; GFX906-NEXT: ; %bb.3: ; %bb.2
427 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
428 ; GFX906-NEXT: s_waitcnt vmcnt(0)
429 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13]
430 ; GFX906-NEXT: .LBB8_4: ; %bb.3
431 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
432 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
433 ; GFX906-NEXT: s_waitcnt vmcnt(0)
434 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
435 ; GFX906-NEXT: s_endpgm
437 %idx = call i32 @llvm.amdgcn.workitem.id.x()
438 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
439 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
440 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
441 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
442 %cmp = icmp ult i32 %idx, 15
443 br i1 %cmp, label %bb.1, label %bb.2
445 %cmp2 = icmp ult i32 %idx, 7
446 br i1 %cmp2, label %bb.2, label %bb.3
; First phi in the chain.
449 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
450 store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
; Second phi: one of its incoming values is the first phi.
454 %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
455 store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
; v8i8_phi_zeroinit: same control flow and phi chain as the phi-chain test,
; except that the value flowing into %tmp5 from %bb.1 is zeroinitializer
; rather than a loaded vector.
; The expected gfx906 output materializes the zero vector as two 32-bit zero
; moves into v[3:4] inside %bb.1 and still stores each result with a single
; dwordx2 store.
460 define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
461 ; GFX906-LABEL: v8i8_phi_zeroinit:
462 ; GFX906: ; %bb.0: ; %entry
463 ; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
464 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
465 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
466 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
467 ; GFX906-NEXT: ; implicit-def: $vgpr1_vgpr2
468 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
469 ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9]
470 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
471 ; GFX906-NEXT: s_cbranch_execz .LBB9_2
472 ; GFX906-NEXT: ; %bb.1: ; %bb.1
473 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11]
474 ; GFX906-NEXT: s_mov_b32 s4, 0
475 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
476 ; GFX906-NEXT: s_mov_b32 s5, s4
477 ; GFX906-NEXT: s_waitcnt vmcnt(1)
478 ; GFX906-NEXT: v_mov_b32_e32 v3, s4
479 ; GFX906-NEXT: v_mov_b32_e32 v4, s5
480 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
481 ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec
482 ; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
483 ; GFX906-NEXT: .LBB9_2: ; %Flow
484 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
485 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
486 ; GFX906-NEXT: s_cbranch_execz .LBB9_4
487 ; GFX906-NEXT: ; %bb.3: ; %bb.2
488 ; GFX906-NEXT: s_waitcnt vmcnt(0)
489 ; GFX906-NEXT: v_mov_b32_e32 v1, v3
490 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
491 ; GFX906-NEXT: v_mov_b32_e32 v2, v4
492 ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
493 ; GFX906-NEXT: .LBB9_4: ; %bb.3
494 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
495 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
496 ; GFX906-NEXT: s_waitcnt vmcnt(0)
497 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15]
498 ; GFX906-NEXT: s_endpgm
500 %idx = call i32 @llvm.amdgcn.workitem.id.x()
501 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
502 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
503 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
504 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
505 %cmp = icmp ult i32 %idx, 15
506 br i1 %cmp, label %bb.1, label %bb.2
508 %cmp2 = icmp ult i32 %idx, 7
509 br i1 %cmp2, label %bb.2, label %bb.3
; The incoming value from %bb.1 is an all-zero constant vector.
512 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ zeroinitializer, %bb.1 ]
513 store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
517 %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
518 store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
; v8i8_phi_const: same control flow and phi chain as the phi-chain test,
; except that the value flowing into %tmp5 from %bb.1 is the non-zero constant
; vector <1, 2, 3, 4, 5, 6, 7, 8>.
; Unlike the zeroinitializer case, the expected gfx906 output here splits the
; vectors into per-byte VGPRs (v_lshrrev by 8/16/24), moves the constant in
; as eight separate immediates, and repacks the bytes with SDWA ors before
; each dwordx2 store.
522 define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
523 ; GFX906-LABEL: v8i8_phi_const:
524 ; GFX906: ; %bb.0: ; %entry
525 ; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
526 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
527 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
528 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
529 ; GFX906-NEXT: ; implicit-def: $vgpr3
530 ; GFX906-NEXT: ; implicit-def: $vgpr13
531 ; GFX906-NEXT: ; implicit-def: $vgpr11
532 ; GFX906-NEXT: ; implicit-def: $vgpr14
533 ; GFX906-NEXT: ; implicit-def: $vgpr15
534 ; GFX906-NEXT: ; implicit-def: $vgpr12
535 ; GFX906-NEXT: ; implicit-def: $vgpr16
536 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
537 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[8:9]
538 ; GFX906-NEXT: s_waitcnt vmcnt(0)
539 ; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
540 ; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2
541 ; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v2
542 ; GFX906-NEXT: v_lshrrev_b32_e32 v8, 24, v1
543 ; GFX906-NEXT: v_lshrrev_b32_e32 v9, 16, v1
544 ; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1
545 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
546 ; GFX906-NEXT: s_cbranch_execz .LBB10_2
547 ; GFX906-NEXT: ; %bb.1: ; %bb.1
548 ; GFX906-NEXT: global_load_dwordx2 v[3:4], v4, s[10:11]
549 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
550 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
551 ; GFX906-NEXT: s_and_b64 s[4:5], vcc, exec
552 ; GFX906-NEXT: v_mov_b32_e32 v1, 1
553 ; GFX906-NEXT: v_mov_b32_e32 v10, 2
554 ; GFX906-NEXT: v_mov_b32_e32 v9, 3
555 ; GFX906-NEXT: v_mov_b32_e32 v8, 4
556 ; GFX906-NEXT: v_mov_b32_e32 v2, 5
557 ; GFX906-NEXT: v_mov_b32_e32 v7, 6
558 ; GFX906-NEXT: v_mov_b32_e32 v6, 7
559 ; GFX906-NEXT: v_mov_b32_e32 v5, 8
560 ; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
561 ; GFX906-NEXT: s_waitcnt vmcnt(0)
562 ; GFX906-NEXT: v_lshrrev_b32_e32 v16, 24, v4
563 ; GFX906-NEXT: v_lshrrev_b32_e32 v12, 16, v4
564 ; GFX906-NEXT: v_lshrrev_b32_e32 v15, 8, v4
565 ; GFX906-NEXT: v_lshrrev_b32_e32 v14, 24, v3
566 ; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v3
567 ; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v3
568 ; GFX906-NEXT: .LBB10_2: ; %Flow
569 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
570 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
571 ; GFX906-NEXT: s_cbranch_execz .LBB10_4
572 ; GFX906-NEXT: ; %bb.3: ; %bb.2
573 ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v10
574 ; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v8
575 ; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
576 ; GFX906-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
577 ; GFX906-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
578 ; GFX906-NEXT: v_lshlrev_b16_e32 v4, 8, v7
579 ; GFX906-NEXT: v_lshlrev_b16_e32 v11, 8, v5
580 ; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
581 ; GFX906-NEXT: v_or_b32_sdwa v11, v6, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
582 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
583 ; GFX906-NEXT: v_or_b32_sdwa v4, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
584 ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
585 ; GFX906-NEXT: v_mov_b32_e32 v3, v1
586 ; GFX906-NEXT: v_mov_b32_e32 v13, v10
587 ; GFX906-NEXT: v_mov_b32_e32 v11, v9
588 ; GFX906-NEXT: v_mov_b32_e32 v14, v8
589 ; GFX906-NEXT: v_mov_b32_e32 v4, v2
590 ; GFX906-NEXT: v_mov_b32_e32 v15, v7
591 ; GFX906-NEXT: v_mov_b32_e32 v12, v6
592 ; GFX906-NEXT: v_mov_b32_e32 v16, v5
593 ; GFX906-NEXT: .LBB10_4: ; %bb.3
594 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
595 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v13
596 ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v14
597 ; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
598 ; GFX906-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
599 ; GFX906-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
600 ; GFX906-NEXT: v_lshlrev_b16_e32 v1, 8, v15
601 ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v16
602 ; GFX906-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
603 ; GFX906-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
604 ; GFX906-NEXT: v_mov_b32_e32 v2, 0
605 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
606 ; GFX906-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15]
607 ; GFX906-NEXT: s_endpgm
609 %idx = call i32 @llvm.amdgcn.workitem.id.x()
610 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
611 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
612 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
613 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
614 %cmp = icmp ult i32 %idx, 15
615 br i1 %cmp, label %bb.1, label %bb.2
617 %cmp2 = icmp ult i32 %idx, 7
618 br i1 %cmp2, label %bb.2, label %bb.3
; The incoming value from %bb.1 is a non-zero constant vector.
621 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [<i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, %bb.1 ]
622 store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
626 %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
627 store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
; v8i8_multi_block: an <8 x i8> value used in more than one block. %vec1 is
; stored directly to %dst0 and is also an incoming value of the phi %tmp5,
; which has three incoming edges (%entry, %bb.1, %bb.2) and is stored to %dst1.
; The expected gfx906 output copies the first load (v[3:4]) into v[1:2] so
; that v[3:4] still holds %vec1 for the %dst0 store after v[1:2] is
; overwritten by the %src2 load in %bb.1.
631 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
632 ; GFX906-LABEL: v8i8_multi_block:
633 ; GFX906: ; %bb.0: ; %entry
634 ; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
635 ; GFX906-NEXT: v_lshlrev_b32_e32 v6, 3, v0
636 ; GFX906-NEXT: v_mov_b32_e32 v5, 0
637 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
638 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
639 ; GFX906-NEXT: global_load_dwordx2 v[3:4], v6, s[8:9]
640 ; GFX906-NEXT: s_waitcnt vmcnt(0)
641 ; GFX906-NEXT: v_mov_b32_e32 v1, v3
642 ; GFX906-NEXT: v_mov_b32_e32 v2, v4
643 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
644 ; GFX906-NEXT: s_cbranch_execz .LBB11_4
645 ; GFX906-NEXT: ; %bb.1: ; %bb.1
646 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v6, s[10:11]
647 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
648 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
649 ; GFX906-NEXT: s_cbranch_execz .LBB11_3
650 ; GFX906-NEXT: ; %bb.2: ; %bb.2
651 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
652 ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13]
653 ; GFX906-NEXT: .LBB11_3: ; %Flow
654 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
655 ; GFX906-NEXT: .LBB11_4: ; %bb.3
656 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
657 ; GFX906-NEXT: s_waitcnt vmcnt(0)
658 ; GFX906-NEXT: global_store_dwordx2 v5, v[1:2], s[14:15]
659 ; GFX906-NEXT: s_endpgm
661 %idx = call i32 @llvm.amdgcn.workitem.id.x()
662 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
663 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
664 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
665 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
666 %cmp = icmp ult i32 %idx, 15
667 br i1 %cmp, label %bb.1, label %bb.3
669 %cmp2 = icmp ult i32 %idx, 7
670 br i1 %cmp2, label %bb.2, label %bb.3
; Direct (non-phi) use of %vec1 outside the block that loads it.
673 store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
; Three-way merge; %vec2 arrives over two of the three edges.
677 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
678 store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
; v32i8_loop_carried: a loop-carried vector phi. %temp takes %vec1 on entry
; and %vec2 on the back edge from %bb.1; %vec2 is a shufflevector of %vec1 and
; %temp, and is stored to %dst after the loop exits. %src2 is not used by the
; visible IR.
; The expected gfx906 output keeps the carried value in one VGPR (v0) and
; implements the shuffle as a single v_perm_b32 with selector 0x2000604 inside
; the loop.
; NOTE(review): despite the name and the <32 x i8> GEP element type, the
; loaded and loop-carried value is <4 x i8> - confirm whether that is
; intentional.
682 define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
683 ; GFX906-LABEL: v32i8_loop_carried:
684 ; GFX906: ; %bb.0: ; %entry
685 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
686 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
687 ; GFX906-NEXT: v_cmp_lt_u32_e32 vcc, 14, v0
688 ; GFX906-NEXT: s_mov_b32 s2, 0x2000604
689 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
690 ; GFX906-NEXT: global_load_dword v1, v1, s[0:1]
691 ; GFX906-NEXT: s_mov_b64 s[0:1], 0
692 ; GFX906-NEXT: s_waitcnt vmcnt(0)
693 ; GFX906-NEXT: v_mov_b32_e32 v0, v1
694 ; GFX906-NEXT: .LBB12_1: ; %bb.1
695 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
696 ; GFX906-NEXT: s_and_b64 s[6:7], exec, vcc
697 ; GFX906-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
698 ; GFX906-NEXT: v_perm_b32 v0, v1, v0, s2
699 ; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1]
700 ; GFX906-NEXT: s_cbranch_execnz .LBB12_1
701 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
702 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
703 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
704 ; GFX906-NEXT: v_mov_b32_e32 v1, 0
705 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
706 ; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
707 ; GFX906-NEXT: s_endpgm
709 %idx = call i32 @llvm.amdgcn.workitem.id.x()
710 %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
711 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
; Loop-carried value: the back-edge operand (%vec2) is defined just below.
715 %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
716 %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
717 %cmp = icmp ult i32 %idx, 15
718 br i1 %cmp, label %bb.1, label %bb.2
722 store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
726 ; The following kernel should not produce any instances of "Instruction does not dominate all uses!"
; Kernel in which <8 x i8> loads (%vec1 from %src1, %vec2 from %src2, both at
; the work-item's index) feed twelve shufflevectors, %s1..%s12, each producing
; a <4 x i8> that is stored with align 4:
;   - %s1..%s4  go to %dst1; they are followed by a branch on %idx < 7
;     (unsigned) to %bb.2 or %bb.3.
;   - %s5..%s8  go to %dst2.
;   - %s9..%s12 go to %dst3.
; The first branch, on %idx < 15 (unsigned), selects %bb.1 or %bb.2.
; Every shuffle mask uses only indices 0-3, so only elements of the first
; operand (%vec1) are selected.
; The destination GEPs index each %dstN as an array of `ptr addrspace(1)`
; elements (indices 0-3); the generated stores use offsets 0, 8, 16 and 24.
; The GFX906 lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
728 define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst1, ptr addrspace(1) nocapture %dst2, ptr addrspace(1) nocapture %dst3) {
729 ; GFX906-LABEL: v8i8_multiuse_multiblock:
730 ; GFX906: ; %bb.0: ; %entry
731 ; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
732 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 3, v0
733 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44
734 ; GFX906-NEXT: v_cmp_lt_u32_e64 s[2:3], 14, v0
735 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
736 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
737 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v1, s[8:9]
738 ; GFX906-NEXT: s_waitcnt vmcnt(0)
739 ; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
740 ; GFX906-NEXT: s_and_saveexec_b64 s[4:5], vcc
741 ; GFX906-NEXT: s_cbranch_execz .LBB13_2
742 ; GFX906-NEXT: ; %bb.1: ; %bb.1
743 ; GFX906-NEXT: s_movk_i32 s6, 0xff00
744 ; GFX906-NEXT: v_mov_b32_e32 v5, 8
745 ; GFX906-NEXT: v_and_b32_sdwa v6, v1, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
746 ; GFX906-NEXT: s_mov_b32 s6, 0x6070504
747 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
748 ; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
749 ; GFX906-NEXT: v_lshlrev_b16_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
750 ; GFX906-NEXT: v_perm_b32 v7, v1, v1, s6
751 ; GFX906-NEXT: s_andn2_b64 s[2:3], s[2:3], exec
752 ; GFX906-NEXT: s_and_b64 s[6:7], vcc, exec
753 ; GFX906-NEXT: v_mov_b32_e32 v3, 0
754 ; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
755 ; GFX906-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
756 ; GFX906-NEXT: v_or_b32_sdwa v6, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
757 ; GFX906-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7]
758 ; GFX906-NEXT: v_or_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
759 ; GFX906-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
760 ; GFX906-NEXT: global_store_dword v3, v1, s[12:13]
761 ; GFX906-NEXT: global_store_dword v3, v7, s[12:13] offset:8
762 ; GFX906-NEXT: global_store_dword v3, v6, s[12:13] offset:16
763 ; GFX906-NEXT: global_store_dword v3, v4, s[12:13] offset:24
764 ; GFX906-NEXT: .LBB13_2: ; %Flow
765 ; GFX906-NEXT: s_or_b64 exec, exec, s[4:5]
766 ; GFX906-NEXT: s_and_saveexec_b64 s[4:5], s[2:3]
767 ; GFX906-NEXT: s_cbranch_execz .LBB13_4
768 ; GFX906-NEXT: ; %bb.3: ; %bb.2
769 ; GFX906-NEXT: v_lshlrev_b16_e32 v3, 8, v2
770 ; GFX906-NEXT: v_and_b32_e32 v4, 0xffffff00, v2
771 ; GFX906-NEXT: v_and_b32_e32 v5, 0xffffff00, v1
772 ; GFX906-NEXT: s_mov_b32 s2, 0xc0c0001
773 ; GFX906-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
774 ; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
775 ; GFX906-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
776 ; GFX906-NEXT: v_perm_b32 v2, 0, v2, s2
777 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
778 ; GFX906-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
779 ; GFX906-NEXT: v_perm_b32 v6, 0, v1, s2
780 ; GFX906-NEXT: s_mov_b32 s3, 0xffff0000
781 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2
782 ; GFX906-NEXT: v_and_or_b32 v7, v1, s3, v6
783 ; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
784 ; GFX906-NEXT: v_or_b32_e32 v2, v6, v2
785 ; GFX906-NEXT: global_store_dword v0, v3, s[14:15]
786 ; GFX906-NEXT: global_store_dword v0, v4, s[14:15] offset:8
787 ; GFX906-NEXT: global_store_dword v0, v7, s[14:15] offset:16
788 ; GFX906-NEXT: global_store_dword v0, v2, s[14:15] offset:24
789 ; GFX906-NEXT: .LBB13_4: ; %bb.3
790 ; GFX906-NEXT: s_or_b64 exec, exec, s[4:5]
791 ; GFX906-NEXT: s_movk_i32 s3, 0xff00
792 ; GFX906-NEXT: v_mov_b32_e32 v4, 8
793 ; GFX906-NEXT: s_movk_i32 s2, 0xff
794 ; GFX906-NEXT: v_and_b32_sdwa v2, v1, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
795 ; GFX906-NEXT: v_lshlrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
796 ; GFX906-NEXT: v_or_b32_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
797 ; GFX906-NEXT: v_or_b32_sdwa v5, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
798 ; GFX906-NEXT: v_lshlrev_b16_e32 v6, 8, v1
799 ; GFX906-NEXT: v_and_b32_sdwa v7, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
800 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
801 ; GFX906-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
802 ; GFX906-NEXT: v_or_b32_sdwa v7, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
803 ; GFX906-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
804 ; GFX906-NEXT: v_or_b32_sdwa v4, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
805 ; GFX906-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
806 ; GFX906-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
807 ; GFX906-NEXT: v_or_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
808 ; GFX906-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
809 ; GFX906-NEXT: global_store_dword v0, v3, s[0:1]
810 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1] offset:8
811 ; GFX906-NEXT: global_store_dword v0, v4, s[0:1] offset:16
812 ; GFX906-NEXT: global_store_dword v0, v2, s[0:1] offset:24
813 ; GFX906-NEXT: s_endpgm
815 %idx = call i32 @llvm.amdgcn.workitem.id.x()
816 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
817 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
818 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
819 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
820 %cmp = icmp ult i32 %idx, 15
821 br i1 %cmp, label %bb.1, label %bb.2
823 %s1 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
824 %s2 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
825 %s3 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
826 %s4 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 2, i32 3, i32 1>
827 %gep4 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 0
828 %gep5 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 1
829 %gep6 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 2
830 %gep7 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst1, i32 3
831 store <4 x i8> %s1, ptr addrspace(1) %gep4, align 4
832 store <4 x i8> %s2, ptr addrspace(1) %gep5, align 4
833 store <4 x i8> %s3, ptr addrspace(1) %gep6, align 4
834 store <4 x i8> %s4, ptr addrspace(1) %gep7, align 4
835 %cmp2 = icmp ult i32 %idx, 7
836 br i1 %cmp2, label %bb.2, label %bb.3
839 %s5 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 1, i32 2>
840 %s6 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
841 %s7 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
842 %s8 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
843 %gep8 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 0
844 %gep9 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 1
845 %gep10 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 2
846 %gep11 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst2, i32 3
847 store <4 x i8> %s5, ptr addrspace(1) %gep8, align 4
848 store <4 x i8> %s6, ptr addrspace(1) %gep9, align 4
849 store <4 x i8> %s7, ptr addrspace(1) %gep10, align 4
850 store <4 x i8> %s8, ptr addrspace(1) %gep11, align 4
854 %s9 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
855 %s10 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
856 %s11 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
857 %s12 = shufflevector <8 x i8> %vec1, <8 x i8> %vec2, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
858 %gep12 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 0
859 %gep13 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 1
860 %gep14 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 2
861 %gep15 = getelementptr ptr addrspace(1), ptr addrspace(1) %dst3, i32 3
862 store <4 x i8> %s9, ptr addrspace(1) %gep12, align 4
863 store <4 x i8> %s10, ptr addrspace(1) %gep13, align 4
864 store <4 x i8> %s11, ptr addrspace(1) %gep14, align 4
865 store <4 x i8> %s12, ptr addrspace(1) %gep15, align 4
870 declare i32 @llvm.amdgcn.workitem.id.x()