1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
; v3i8_liveout: each work-item loads a <3 x i8> from %src1 and from %src2 at
; index %idx, branches on %idx < 15, and stores the phi of the two vectors to
; %dst, so the vector value crosses a basic-block boundary.
; The checked gfx906 output loads each vector as one dword, repacks its three
; low bytes (v_and / SDWA shifts / v_or3), and writes the result as a short
; store plus one byte store at offset 2.
4 define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
5 ; GFX906-LABEL: v3i8_liveout:
6 ; GFX906: ; %bb.0: ; %entry
7 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
8 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
9 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
10 ; GFX906-NEXT: v_mov_b32_e32 v3, 8
11 ; GFX906-NEXT: v_mov_b32_e32 v5, 16
12 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
13 ; GFX906-NEXT: global_load_dword v4, v2, s[4:5]
14 ; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
15 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
16 ; GFX906-NEXT: s_waitcnt vmcnt(0)
17 ; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4
18 ; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
19 ; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
20 ; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4
21 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
22 ; GFX906-NEXT: s_cbranch_execz .LBB0_2
23 ; GFX906-NEXT: ; %bb.1: ; %bb.1
24 ; GFX906-NEXT: global_load_dword v0, v2, s[6:7]
25 ; GFX906-NEXT: s_waitcnt vmcnt(0)
26 ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
27 ; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
28 ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
29 ; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0
30 ; GFX906-NEXT: .LBB0_2: ; %bb.2
31 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
32 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4
33 ; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
34 ; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
35 ; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
36 ; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
37 ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
38 ; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
39 ; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
40 ; GFX906-NEXT: v_mov_b32_e32 v1, 0
41 ; GFX906-NEXT: global_store_short v1, v0, s[0:1]
42 ; GFX906-NEXT: global_store_byte_d16_hi v1, v0, s[0:1] offset:2
43 ; GFX906-NEXT: s_endpgm
45 %idx = call i32 @llvm.amdgcn.workitem.id.x()
46 %gep1 = getelementptr <3 x i8>, ptr addrspace(1) %src1, i32 %idx
47 %vec1 = load <3 x i8>, ptr addrspace(1) %gep1
48 %gep2 = getelementptr <3 x i8>, ptr addrspace(1) %src2, i32 %idx
49 %vec2 = load <3 x i8>, ptr addrspace(1) %gep2
50 %cmp = icmp ult i32 %idx, 15
51 br i1 %cmp, label %bb.1, label %bb.2
; %tmp5 merges %vec1 (edge from %entry) with %vec2 (edge from %bb.1).
56 %tmp5 = phi <3 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
57 store <3 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v4i8_liveout: same shape as the 3-element case but with <4 x i8>: two vector
; loads indexed by %idx, a branch on %idx < 15, and a phi whose result is
; stored to %dst.
; The checked gfx906 output keeps the vector in a single VGPR: one dword load
; per source (the second only inside %bb.1) and one dword store.
61 define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
62 ; GFX906-LABEL: v4i8_liveout:
63 ; GFX906: ; %bb.0: ; %entry
64 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
65 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
66 ; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
67 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
68 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
69 ; GFX906-NEXT: global_load_dword v1, v2, s[4:5]
70 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
71 ; GFX906-NEXT: s_cbranch_execz .LBB1_2
72 ; GFX906-NEXT: ; %bb.1: ; %bb.1
73 ; GFX906-NEXT: global_load_dword v1, v2, s[6:7]
74 ; GFX906-NEXT: .LBB1_2: ; %bb.2
75 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
76 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
77 ; GFX906-NEXT: s_waitcnt vmcnt(0)
78 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
79 ; GFX906-NEXT: s_endpgm
81 %idx = call i32 @llvm.amdgcn.workitem.id.x()
82 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
83 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
84 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
85 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
86 %cmp = icmp ult i32 %idx, 15
87 br i1 %cmp, label %bb.1, label %bb.2
; %tmp5 merges %vec1 (edge from %entry) with %vec2 (edge from %bb.1).
92 %tmp5 = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
93 store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v5i8_liveout: <5 x i8> variant of the live-out pattern: two vector loads
; indexed by %idx, a branch on %idx < 15, and a phi whose result is stored to
; %dst.
; The checked gfx906 output addresses the data with %idx << 3, loads each
; source as a dwordx2, masks the second dword down to its low byte, and writes
; the result with five separate byte stores (offsets 0..4).
97 define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
98 ; GFX906-LABEL: v5i8_liveout:
99 ; GFX906: ; %bb.0: ; %entry
100 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
101 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
102 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
103 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
104 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
105 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
106 ; GFX906-NEXT: s_waitcnt vmcnt(0)
107 ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
108 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
109 ; GFX906-NEXT: s_cbranch_execz .LBB2_2
110 ; GFX906-NEXT: ; %bb.1: ; %bb.1
111 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
112 ; GFX906-NEXT: s_waitcnt vmcnt(0)
113 ; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v2
114 ; GFX906-NEXT: .LBB2_2: ; %bb.2
115 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
116 ; GFX906-NEXT: v_mov_b32_e32 v4, 0
117 ; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v1
118 ; GFX906-NEXT: v_lshrrev_b32_e32 v3, 24, v1
119 ; GFX906-NEXT: global_store_byte v4, v1, s[0:1]
120 ; GFX906-NEXT: global_store_byte v4, v0, s[0:1] offset:1
121 ; GFX906-NEXT: global_store_byte_d16_hi v4, v1, s[0:1] offset:2
122 ; GFX906-NEXT: global_store_byte v4, v3, s[0:1] offset:3
123 ; GFX906-NEXT: global_store_byte v4, v2, s[0:1] offset:4
124 ; GFX906-NEXT: s_endpgm
126 %idx = call i32 @llvm.amdgcn.workitem.id.x()
127 %gep1 = getelementptr <5 x i8>, ptr addrspace(1) %src1, i32 %idx
128 %vec1 = load <5 x i8>, ptr addrspace(1) %gep1
129 %gep2 = getelementptr <5 x i8>, ptr addrspace(1) %src2, i32 %idx
130 %vec2 = load <5 x i8>, ptr addrspace(1) %gep2
131 %cmp = icmp ult i32 %idx, 15
132 br i1 %cmp, label %bb.1, label %bb.2
; %tmp5 merges %vec1 (edge from %entry) with %vec2 (edge from %bb.1).
137 %tmp5 = phi <5 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
138 store <5 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v8i8_liveout: <8 x i8> variant of the live-out pattern: two vector loads
; indexed by %idx, a branch on %idx < 15, and a phi whose result is stored to
; %dst.
; The checked gfx906 output carries the vector in a VGPR pair: a dwordx2 load
; per source (the second only inside %bb.1) and one dwordx2 store.
142 define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
143 ; GFX906-LABEL: v8i8_liveout:
144 ; GFX906: ; %bb.0: ; %entry
145 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
146 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
147 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
148 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
149 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
150 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
151 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
152 ; GFX906-NEXT: s_cbranch_execz .LBB3_2
153 ; GFX906-NEXT: ; %bb.1: ; %bb.1
154 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
155 ; GFX906-NEXT: .LBB3_2: ; %bb.2
156 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
157 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
158 ; GFX906-NEXT: s_waitcnt vmcnt(0)
159 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1]
160 ; GFX906-NEXT: s_endpgm
162 %idx = call i32 @llvm.amdgcn.workitem.id.x()
163 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
164 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
165 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
166 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
167 %cmp = icmp ult i32 %idx, 15
168 br i1 %cmp, label %bb.1, label %bb.2
; %tmp5 merges %vec1 (edge from %entry) with %vec2 (edge from %bb.1).
173 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
174 store <8 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v16i8_liveout: <16 x i8> variant of the live-out pattern: two vector loads
; indexed by %idx, a branch on %idx < 15, and a phi whose result is stored to
; %dst.
; The checked gfx906 output carries the vector in four VGPRs: a dwordx4 load
; per source (the second only inside %bb.1) and one dwordx4 store.
178 define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
179 ; GFX906-LABEL: v16i8_liveout:
180 ; GFX906: ; %bb.0: ; %entry
181 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
182 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
183 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 4, v0
184 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
185 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
186 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[4:5]
187 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
188 ; GFX906-NEXT: s_cbranch_execz .LBB4_2
189 ; GFX906-NEXT: ; %bb.1: ; %bb.1
190 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v5, s[6:7]
191 ; GFX906-NEXT: .LBB4_2: ; %bb.2
192 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
193 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
194 ; GFX906-NEXT: s_waitcnt vmcnt(0)
195 ; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1]
196 ; GFX906-NEXT: s_endpgm
198 %idx = call i32 @llvm.amdgcn.workitem.id.x()
199 %gep1 = getelementptr <16 x i8>, ptr addrspace(1) %src1, i32 %idx
200 %vec1 = load <16 x i8>, ptr addrspace(1) %gep1
201 %gep2 = getelementptr <16 x i8>, ptr addrspace(1) %src2, i32 %idx
202 %vec2 = load <16 x i8>, ptr addrspace(1) %gep2
203 %cmp = icmp ult i32 %idx, 15
204 br i1 %cmp, label %bb.1, label %bb.2
; %tmp5 merges %vec1 (edge from %entry) with %vec2 (edge from %bb.1).
209 %tmp5 = phi <16 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
210 store <16 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v32i8_liveout: <32 x i8> variant of the live-out pattern: two vector loads
; indexed by %idx, a branch on %idx < 15, and a phi whose result is stored to
; %dst.
; The checked gfx906 output splits the vector into two dwordx4 halves
; (offsets 0 and 16) for each load and for the final store.
214 define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
215 ; GFX906-LABEL: v32i8_liveout:
216 ; GFX906: ; %bb.0: ; %entry
217 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
218 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
219 ; GFX906-NEXT: v_lshlrev_b32_e32 v9, 5, v0
220 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
221 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[4:5]
223 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[4:5] offset:16
224 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
225 ; GFX906-NEXT: s_cbranch_execz .LBB5_2
226 ; GFX906-NEXT: ; %bb.1: ; %bb.1
227 ; GFX906-NEXT: global_load_dwordx4 v[1:4], v9, s[6:7]
228 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v9, s[6:7] offset:16
229 ; GFX906-NEXT: .LBB5_2: ; %bb.2
230 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
231 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
232 ; GFX906-NEXT: s_waitcnt vmcnt(1)
233 ; GFX906-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1]
234 ; GFX906-NEXT: s_waitcnt vmcnt(1)
235 ; GFX906-NEXT: global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
236 ; GFX906-NEXT: s_endpgm
238 %idx = call i32 @llvm.amdgcn.workitem.id.x()
239 %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
240 %vec1 = load <32 x i8>, ptr addrspace(1) %gep1
241 %gep2 = getelementptr <32 x i8>, ptr addrspace(1) %src2, i32 %idx
242 %vec2 = load <32 x i8>, ptr addrspace(1) %gep2
243 %cmp = icmp ult i32 %idx, 15
244 br i1 %cmp, label %bb.1, label %bb.2
; %tmp5 merges %vec1 (edge from %entry) with %vec2 (edge from %bb.1).
249 %tmp5 = phi <32 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
250 store <32 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v256i8_liveout: <256 x i8> variant of the live-out pattern: two vector loads
; indexed by %idx, a branch on %idx < 15, and a phi whose result is stored to
; %dst.
; The checked gfx906 output moves each vector as sixteen dwordx4 pieces
; (offsets 0..240). Some pieces are spilled to scratch with buffer_store_dword
; and reloaded with buffer_load_dword before the final stores, and the pieces
; are shuffled between registers with a long run of v_mov_b32.
; NOTE(review): both getelementptr instructions use <8 x i8> as the element
; type while the loads are <256 x i8>; the checked address shift of 3 matches
; the getelementptr type. Confirm this is intended; changing it requires
; regenerating the assertions.
254 define amdgpu_kernel void @v256i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
255 ; GFX906-LABEL: v256i8_liveout:
256 ; GFX906: ; %bb.0: ; %entry
257 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
258 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
259 ; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
260 ; GFX906-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
261 ; GFX906-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
262 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5]
264 ; GFX906-NEXT: s_mov_b32 s14, -1
265 ; GFX906-NEXT: s_mov_b32 s15, 0xe00000
266 ; GFX906-NEXT: s_add_u32 s12, s12, s9
267 ; GFX906-NEXT: s_addc_u32 s13, s13, 0
268 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
269 ; GFX906-NEXT: s_waitcnt vmcnt(0)
270 ; GFX906-NEXT: buffer_store_dword v5, off, s[12:15], 0 ; 4-byte Folded Spill
271 ; GFX906-NEXT: s_waitcnt vmcnt(0)
272 ; GFX906-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
273 ; GFX906-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
274 ; GFX906-NEXT: buffer_store_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
275 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[4:5] offset:16
276 ; GFX906-NEXT: s_nop 0
277 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[4:5] offset:32
278 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[4:5] offset:48
279 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[4:5] offset:64
280 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[4:5] offset:80
281 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[4:5] offset:96
282 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[4:5] offset:112
283 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[4:5] offset:128
284 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[4:5] offset:144
285 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[4:5] offset:160
286 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[4:5] offset:176
287 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[4:5] offset:192
288 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[4:5] offset:208
289 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[4:5] offset:224
290 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[4:5] offset:240
291 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
292 ; GFX906-NEXT: s_cbranch_execz .LBB6_2
293 ; GFX906-NEXT: ; %bb.1: ; %bb.1
294 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
295 ; GFX906-NEXT: s_waitcnt vmcnt(0)
296 ; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
297 ; GFX906-NEXT: s_waitcnt vmcnt(0)
298 ; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
299 ; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
300 ; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
301 ; GFX906-NEXT: global_load_dwordx4 v[5:8], v4, s[6:7] offset:16
302 ; GFX906-NEXT: global_load_dwordx4 v[9:12], v4, s[6:7] offset:32
303 ; GFX906-NEXT: global_load_dwordx4 v[13:16], v4, s[6:7] offset:48
304 ; GFX906-NEXT: global_load_dwordx4 v[17:20], v4, s[6:7] offset:64
305 ; GFX906-NEXT: global_load_dwordx4 v[21:24], v4, s[6:7] offset:80
306 ; GFX906-NEXT: global_load_dwordx4 v[25:28], v4, s[6:7] offset:96
307 ; GFX906-NEXT: global_load_dwordx4 v[29:32], v4, s[6:7] offset:112
308 ; GFX906-NEXT: global_load_dwordx4 v[33:36], v4, s[6:7] offset:128
309 ; GFX906-NEXT: global_load_dwordx4 v[37:40], v4, s[6:7] offset:144
310 ; GFX906-NEXT: global_load_dwordx4 v[41:44], v4, s[6:7] offset:160
311 ; GFX906-NEXT: global_load_dwordx4 v[45:48], v4, s[6:7] offset:176
312 ; GFX906-NEXT: global_load_dwordx4 v[49:52], v4, s[6:7] offset:192
313 ; GFX906-NEXT: global_load_dwordx4 v[53:56], v4, s[6:7] offset:208
314 ; GFX906-NEXT: global_load_dwordx4 v[57:60], v4, s[6:7] offset:224
315 ; GFX906-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] offset:240
316 ; GFX906-NEXT: .LBB6_2: ; %bb.2
317 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
318 ; GFX906-NEXT: s_waitcnt vmcnt(0)
319 ; GFX906-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
320 ; GFX906-NEXT: s_waitcnt vmcnt(0)
321 ; GFX906-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
322 ; GFX906-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
323 ; GFX906-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
324 ; GFX906-NEXT: v_mov_b32_e32 v0, v57
325 ; GFX906-NEXT: v_mov_b32_e32 v1, v58
326 ; GFX906-NEXT: v_mov_b32_e32 v2, v59
327 ; GFX906-NEXT: v_mov_b32_e32 v3, v60
328 ; GFX906-NEXT: v_mov_b32_e32 v60, v56
329 ; GFX906-NEXT: v_mov_b32_e32 v59, v55
330 ; GFX906-NEXT: v_mov_b32_e32 v58, v54
331 ; GFX906-NEXT: v_mov_b32_e32 v57, v53
332 ; GFX906-NEXT: v_mov_b32_e32 v56, v52
333 ; GFX906-NEXT: v_mov_b32_e32 v55, v51
334 ; GFX906-NEXT: v_mov_b32_e32 v54, v50
335 ; GFX906-NEXT: v_mov_b32_e32 v53, v49
336 ; GFX906-NEXT: v_mov_b32_e32 v52, v48
337 ; GFX906-NEXT: v_mov_b32_e32 v51, v47
338 ; GFX906-NEXT: v_mov_b32_e32 v50, v46
339 ; GFX906-NEXT: v_mov_b32_e32 v49, v45
340 ; GFX906-NEXT: v_mov_b32_e32 v48, v44
341 ; GFX906-NEXT: v_mov_b32_e32 v47, v43
342 ; GFX906-NEXT: v_mov_b32_e32 v46, v42
343 ; GFX906-NEXT: v_mov_b32_e32 v45, v41
344 ; GFX906-NEXT: v_mov_b32_e32 v44, v40
345 ; GFX906-NEXT: v_mov_b32_e32 v43, v39
346 ; GFX906-NEXT: v_mov_b32_e32 v42, v38
347 ; GFX906-NEXT: v_mov_b32_e32 v41, v37
348 ; GFX906-NEXT: v_mov_b32_e32 v40, v36
349 ; GFX906-NEXT: v_mov_b32_e32 v39, v35
350 ; GFX906-NEXT: v_mov_b32_e32 v38, v34
351 ; GFX906-NEXT: v_mov_b32_e32 v37, v33
352 ; GFX906-NEXT: v_mov_b32_e32 v36, v32
353 ; GFX906-NEXT: v_mov_b32_e32 v35, v31
354 ; GFX906-NEXT: v_mov_b32_e32 v34, v30
355 ; GFX906-NEXT: v_mov_b32_e32 v33, v29
356 ; GFX906-NEXT: v_mov_b32_e32 v32, v28
357 ; GFX906-NEXT: v_mov_b32_e32 v31, v27
358 ; GFX906-NEXT: v_mov_b32_e32 v30, v26
359 ; GFX906-NEXT: v_mov_b32_e32 v29, v25
360 ; GFX906-NEXT: v_mov_b32_e32 v28, v24
361 ; GFX906-NEXT: v_mov_b32_e32 v27, v23
362 ; GFX906-NEXT: v_mov_b32_e32 v26, v22
363 ; GFX906-NEXT: v_mov_b32_e32 v25, v21
364 ; GFX906-NEXT: v_mov_b32_e32 v24, v20
365 ; GFX906-NEXT: v_mov_b32_e32 v23, v19
366 ; GFX906-NEXT: v_mov_b32_e32 v22, v18
367 ; GFX906-NEXT: v_mov_b32_e32 v21, v17
368 ; GFX906-NEXT: v_mov_b32_e32 v20, v16
369 ; GFX906-NEXT: v_mov_b32_e32 v19, v15
370 ; GFX906-NEXT: v_mov_b32_e32 v18, v14
371 ; GFX906-NEXT: v_mov_b32_e32 v17, v13
372 ; GFX906-NEXT: v_mov_b32_e32 v16, v12
373 ; GFX906-NEXT: v_mov_b32_e32 v15, v11
374 ; GFX906-NEXT: v_mov_b32_e32 v14, v10
375 ; GFX906-NEXT: v_mov_b32_e32 v13, v9
376 ; GFX906-NEXT: v_mov_b32_e32 v12, v8
377 ; GFX906-NEXT: v_mov_b32_e32 v11, v7
378 ; GFX906-NEXT: v_mov_b32_e32 v10, v6
379 ; GFX906-NEXT: v_mov_b32_e32 v9, v5
380 ; GFX906-NEXT: buffer_load_dword v5, off, s[12:15], 0 ; 4-byte Folded Reload
381 ; GFX906-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
382 ; GFX906-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
383 ; GFX906-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
384 ; GFX906-NEXT: v_mov_b32_e32 v4, 0
385 ; GFX906-NEXT: s_waitcnt vmcnt(0)
386 ; GFX906-NEXT: global_store_dwordx4 v4, v[5:8], s[0:1]
387 ; GFX906-NEXT: global_store_dwordx4 v4, v[9:12], s[0:1] offset:16
388 ; GFX906-NEXT: global_store_dwordx4 v4, v[13:16], s[0:1] offset:32
389 ; GFX906-NEXT: global_store_dwordx4 v4, v[17:20], s[0:1] offset:48
390 ; GFX906-NEXT: global_store_dwordx4 v4, v[21:24], s[0:1] offset:64
391 ; GFX906-NEXT: global_store_dwordx4 v4, v[25:28], s[0:1] offset:80
392 ; GFX906-NEXT: global_store_dwordx4 v4, v[29:32], s[0:1] offset:96
393 ; GFX906-NEXT: global_store_dwordx4 v4, v[33:36], s[0:1] offset:112
394 ; GFX906-NEXT: global_store_dwordx4 v4, v[37:40], s[0:1] offset:128
395 ; GFX906-NEXT: global_store_dwordx4 v4, v[41:44], s[0:1] offset:144
396 ; GFX906-NEXT: global_store_dwordx4 v4, v[45:48], s[0:1] offset:160
397 ; GFX906-NEXT: global_store_dwordx4 v4, v[49:52], s[0:1] offset:176
398 ; GFX906-NEXT: global_store_dwordx4 v4, v[53:56], s[0:1] offset:192
399 ; GFX906-NEXT: global_store_dwordx4 v4, v[57:60], s[0:1] offset:208
400 ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:224
401 ; GFX906-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
402 ; GFX906-NEXT: s_nop 0
403 ; GFX906-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
404 ; GFX906-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
405 ; GFX906-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
406 ; GFX906-NEXT: s_waitcnt vmcnt(0)
407 ; GFX906-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:240
408 ; GFX906-NEXT: s_endpgm
410 %idx = call i32 @llvm.amdgcn.workitem.id.x()
411 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
412 %vec1 = load <256 x i8>, ptr addrspace(1) %gep1
413 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
414 %vec2 = load <256 x i8>, ptr addrspace(1) %gep2
415 %cmp = icmp ult i32 %idx, 15
416 br i1 %cmp, label %bb.1, label %bb.2
; %tmp5 merges %vec1 (edge from %entry) with %vec2 (edge from %bb.1).
421 %tmp5 = phi <256 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
422 store <256 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; repeat_successor: a <4 x i8> live-out case driven by a switch on the scalar
; argument %in instead of a compare on %idx. The switch names
; %return.sink.split as the successor for both case 1 and case 2, so the phi
; in that block carries two incoming entries for %entry in addition to the one
; for %sw.bb5; the default successor is %return.
; The checked gfx906 output uses scalar compares and branches (s_cmp /
; s_cbranch_scc0), loads one dword from whichever source is selected, and
; performs a single dword store.
427 define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
428 ; GFX906-LABEL: repeat_successor:
429 ; GFX906: ; %bb.0: ; %entry
430 ; GFX906-NEXT: s_load_dword s0, s[2:3], 0x24
431 ; GFX906-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x2c
432 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
433 ; GFX906-NEXT: s_cmp_lt_i32 s0, 3
434 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_3
435 ; GFX906-NEXT: ; %bb.1: ; %LeafBlock
436 ; GFX906-NEXT: s_cmp_ge_i32 s0, 1
437 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
438 ; GFX906-NEXT: ; %bb.2:
439 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
440 ; GFX906-NEXT: global_load_dword v0, v0, s[4:5]
441 ; GFX906-NEXT: s_branch .LBB7_5
442 ; GFX906-NEXT: .LBB7_3: ; %LeafBlock5
443 ; GFX906-NEXT: s_cmp_eq_u32 s0, 3
444 ; GFX906-NEXT: s_cbranch_scc0 .LBB7_6
445 ; GFX906-NEXT: ; %bb.4: ; %sw.bb5
446 ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 2, v0
447 ; GFX906-NEXT: global_load_dword v0, v0, s[6:7]
448 ; GFX906-NEXT: .LBB7_5: ; %return.sink.split
449 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x3c
450 ; GFX906-NEXT: v_mov_b32_e32 v1, 0
451 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
452 ; GFX906-NEXT: global_store_dword v1, v0, s[0:1]
453 ; GFX906-NEXT: .LBB7_6: ; %return
454 ; GFX906-NEXT: s_endpgm
456 %idx = call i32 @llvm.amdgcn.workitem.id.x()
457 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx
458 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
459 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx
460 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2
; Cases 1 and 2 deliberately share one successor block.
461 switch i32 %in, label %return [
462 i32 1, label %return.sink.split
463 i32 2, label %return.sink.split
468 br label %return.sink.split
471 %tmp5 = phi <4 x i8> [ %vec2, %sw.bb5 ], [ %vec1, %entry ], [ %vec1, %entry ]
472 store <4 x i8> %tmp5, ptr addrspace(1) %dst, align 4
; v8i8_phi_chain: two chained phis over <8 x i8>. %idx < 15 selects %bb.1 or
; %bb.2, and a second compare (%idx < 7) selects %bb.2 or %bb.3. %tmp5 merges
; %vec1 and %vec2 and is stored to %dst0; %tmp7 merges %vec2 (from %bb.1) with
; %tmp5 (from %bb.2) and is stored to %dst1, so the result of the first phi
; feeds the second.
; The checked gfx906 output keeps the value in one VGPR pair throughout and
; emits a dwordx2 store for each destination.
479 define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
480 ; GFX906-LABEL: v8i8_phi_chain:
481 ; GFX906: ; %bb.0: ; %entry
482 ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
483 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0
484 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
485 ; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1
486 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[4:5]
488 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
489 ; GFX906-NEXT: s_cbranch_execz .LBB8_2
490 ; GFX906-NEXT: ; %bb.1: ; %bb.1
491 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[6:7]
492 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
493 ; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
494 ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
495 ; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5]
496 ; GFX906-NEXT: .LBB8_2: ; %Flow
497 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
498 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
499 ; GFX906-NEXT: s_cbranch_execz .LBB8_4
500 ; GFX906-NEXT: ; %bb.3: ; %bb.2
501 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
502 ; GFX906-NEXT: s_waitcnt vmcnt(0)
503 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9]
504 ; GFX906-NEXT: .LBB8_4: ; %bb.3
505 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
506 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
507 ; GFX906-NEXT: s_waitcnt vmcnt(0)
508 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11]
509 ; GFX906-NEXT: s_endpgm
511 %idx = call i32 @llvm.amdgcn.workitem.id.x()
512 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
513 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
514 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
515 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
516 %cmp = icmp ult i32 %idx, 15
517 br i1 %cmp, label %bb.1, label %bb.2
519 %cmp2 = icmp ult i32 %idx, 7
520 br i1 %cmp2, label %bb.2, label %bb.3
; First phi in the chain; its result is both stored and forwarded to %tmp7.
523 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
524 store <8 x i8> %tmp5, ptr addrspace(1) %dst0, align 4
528 %tmp7 = phi <8 x i8> [ %vec2, %bb.1], [%tmp5, %bb.2]
529 store <8 x i8> %tmp7, ptr addrspace(1) %dst1, align 4
; v8i8_multi_block: <8 x i8> values used in more than one block. %idx < 15
; selects %bb.1 or %bb.3, and a second compare (%idx < 7) selects %bb.2 or
; %bb.3. %vec1 is stored to %dst0 directly, and a three-input phi (%vec1 from
; %entry, %vec2 from %bb.1 and from %bb.2) is stored to %dst1.
; The checked gfx906 output copies the first load into a second VGPR pair so
; that the original %vec1 registers stay available for the %dst0 store while
; the other pair carries the phi value.
533 define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst0, ptr addrspace(1) nocapture %dst1) {
534 ; GFX906-LABEL: v8i8_multi_block:
535 ; GFX906: ; %bb.0: ; %entry
536 ; GFX906-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
537 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0
538 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
539 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
540 ; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[4:5]
541 ; GFX906-NEXT: s_waitcnt vmcnt(0)
542 ; GFX906-NEXT: v_mov_b32_e32 v1, v3
543 ; GFX906-NEXT: v_mov_b32_e32 v2, v4
544 ; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
545 ; GFX906-NEXT: s_cbranch_execz .LBB9_4
546 ; GFX906-NEXT: ; %bb.1: ; %bb.1
547 ; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[6:7]
548 ; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0
549 ; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
550 ; GFX906-NEXT: s_cbranch_execz .LBB9_3
551 ; GFX906-NEXT: ; %bb.2: ; %bb.2
552 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
553 ; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[8:9]
554 ; GFX906-NEXT: .LBB9_3: ; %Flow
555 ; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
556 ; GFX906-NEXT: .LBB9_4: ; %bb.3
557 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
558 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
559 ; GFX906-NEXT: s_waitcnt vmcnt(0)
560 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[10:11]
561 ; GFX906-NEXT: s_endpgm
563 %idx = call i32 @llvm.amdgcn.workitem.id.x()
564 %gep1 = getelementptr <8 x i8>, ptr addrspace(1) %src1, i32 %idx
565 %vec1 = load <8 x i8>, ptr addrspace(1) %gep1
566 %gep2 = getelementptr <8 x i8>, ptr addrspace(1) %src2, i32 %idx
567 %vec2 = load <8 x i8>, ptr addrspace(1) %gep2
568 %cmp = icmp ult i32 %idx, 15
569 br i1 %cmp, label %bb.1, label %bb.3
571 %cmp2 = icmp ult i32 %idx, 7
572 br i1 %cmp2, label %bb.2, label %bb.3
; Direct (non-phi) use of %vec1, in addition to its use in the phi below.
575 store <8 x i8> %vec1, ptr addrspace(1) %dst0, align 4
579 %tmp5 = phi <8 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ], [ %vec2, %bb.2]
580 store <8 x i8> %tmp5, ptr addrspace(1) %dst1, align 4
; v32i8_loop_carried: a loop-carried byte vector. %vec1 is loaded once; the
; phi %temp takes %vec1 on entry and %vec2 on the back edge from %bb.1; %vec2
; is a shufflevector of %vec1 and %temp with mask <0, 2, 4, 6>. The block
; branches back to %bb.1 while %idx < 15, and %vec2 is stored to %dst after
; the loop.
; The checked gfx906 output computes the loop-invariant part of the shuffle
; before the loop, rebuilds the carried dword with SDWA shifts and v_or3
; inside it, and emits one dword store after the loop exit.
; NOTE(review): the kernel name and the getelementptr element type say
; <32 x i8>, but the load, phi, shuffle and store all operate on <4 x i8>.
; Confirm this is intended; changing it requires regenerating the assertions.
584 define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) {
585 ; GFX906-LABEL: v32i8_loop_carried:
586 ; GFX906: ; %bb.0: ; %entry
587 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
588 ; GFX906-NEXT: v_lshlrev_b32_e32 v1, 5, v0
589 ; GFX906-NEXT: v_mov_b32_e32 v3, 8
590 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xff
591 ; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0
592 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
593 ; GFX906-NEXT: global_load_dword v1, v1, s[0:1]
594 ; GFX906-NEXT: s_mov_b64 s[0:1], 0
595 ; GFX906-NEXT: s_waitcnt vmcnt(0)
596 ; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
597 ; GFX906-NEXT: v_and_or_b32 v0, v1, v2, v0
598 ; GFX906-NEXT: v_mov_b32_e32 v2, 24
599 ; GFX906-NEXT: .LBB10_1: ; %bb.1
600 ; GFX906-NEXT: ; =>This Inner Loop Header: Depth=1
601 ; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v1
602 ; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc
603 ; GFX906-NEXT: v_lshlrev_b32_e32 v3, 16, v3
604 ; GFX906-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
605 ; GFX906-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
606 ; GFX906-NEXT: v_or3_b32 v1, v0, v3, v1
607 ; GFX906-NEXT: s_andn2_b64 exec, exec, s[0:1]
608 ; GFX906-NEXT: s_cbranch_execnz .LBB10_1
609 ; GFX906-NEXT: ; %bb.2: ; %bb.2.loopexit
610 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
611 ; GFX906-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
612 ; GFX906-NEXT: v_mov_b32_e32 v0, 0
613 ; GFX906-NEXT: s_waitcnt lgkmcnt(0)
614 ; GFX906-NEXT: global_store_dword v0, v1, s[0:1]
615 ; GFX906-NEXT: s_endpgm
617 %idx = call i32 @llvm.amdgcn.workitem.id.x()
618 %gep1 = getelementptr <32 x i8>, ptr addrspace(1) %src1, i32 %idx
619 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1
; Loop-carried value: %vec2 from the previous iteration re-enters via %bb.1.
623 %temp = phi <4 x i8> [ %vec1, %entry ], [ %vec2, %bb.1 ]
624 %vec2 = shufflevector <4 x i8> %vec1, <4 x i8> %temp, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
625 %cmp = icmp ult i32 %idx, 15
626 br i1 %cmp, label %bb.1, label %bb.2
630 store <4 x i8> %vec2, ptr addrspace(1) %dst, align 4
; Intrinsic declaration; every kernel in this file calls it to obtain the i32
; %idx value used for addressing and, in most kernels, for branch conditions.
635 declare i32 @llvm.amdgcn.workitem.id.x()