; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
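
; A dword-aligned <4 x i8> copy should be done with a single dword load and
; store on both targets rather than per-byte operations.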
define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}
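
; Same <4 x i8> value copied to two destinations: one dword load feeding two
; dword stores.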
define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  ret void
}
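
; Three destinations: the single loaded dword is stored three times.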
define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  ret void
}
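
; Four destinations: only the buffer descriptors change between the four dword
; stores of the same register.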
define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s16, s8
; SI-NEXT:    s_mov_b32 s17, s9
; SI-NEXT:    s_mov_b32 s20, s10
; SI-NEXT:    s_mov_b32 s21, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s22, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s23, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s20, s6
; VI-NEXT:    s_mov_b32 s21, s7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  store <4 x i8> %val, ptr addrspace(1) %out3, align 4
  ret void
}
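
; %val also has an arithmetic use, so the add of 9 to each byte is expanded
; into shifts, masks, and adds, but the unmodified copy to %out0 is still a
; single dword store.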
define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  ret void
}
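
; <3 x i8> with dword alignment: the whole vector is loaded as one dword and
; stored as a short plus a byte.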
define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <3 x i8>, ptr addrspace(1) %gep, align 4
  store <3 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}
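
; With only 2-byte alignment, both targets copy the vector as a ushort plus a
; ubyte.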
define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, ptr addrspace(1) %in, align 2
  store <3 x i8> %val, ptr addrspace(1) %out, align 2
  ret void
}
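
; With 1-byte alignment, SI copies byte by byte; VI loads the low two bytes as
; a ushort and extracts the middle byte with a 16-bit shift.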
define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, ptr addrspace(1) %in, align 1
  store <3 x i8> %val, ptr addrspace(1) %out, align 1
  ret void
}
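
; The volatile load must be preserved (note the glc bit on the buffer load),
; but it can still be performed as a single dword load.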
define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load volatile <4 x i8>, ptr addrspace(1) %in, align 4
  store <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}
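
; With a volatile store, the copy is not combined into a single dword: both
; targets emit four byte loads and four byte stores, with a wait after each
; volatile store.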
define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_store:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %val = load <4 x i8>, ptr addrspace(1) %in, align 4
  store volatile <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}