1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=VI %s
5 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; Copies one <4 x i8> element: loads it (align 4) from %in indexed by the
; workitem id x and stores it (align 4) to %out.
; Both targets are expected to use a single dword load followed by a single
; buffer_store_dword.
8 define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
9 ; SI-LABEL: test_copy_v4i8:
11 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
12 ; SI-NEXT: s_mov_b32 s3, 0xf000
13 ; SI-NEXT: s_mov_b32 s10, 0
14 ; SI-NEXT: s_mov_b32 s11, s3
15 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
16 ; SI-NEXT: s_waitcnt lgkmcnt(0)
17 ; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
18 ; SI-NEXT: v_mov_b32_e32 v1, 0
19 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
20 ; SI-NEXT: s_mov_b32 s2, -1
21 ; SI-NEXT: s_mov_b32 s0, s4
22 ; SI-NEXT: s_mov_b32 s1, s5
23 ; SI-NEXT: s_waitcnt vmcnt(0)
24 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
27 ; VI-LABEL: test_copy_v4i8:
29 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
30 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
31 ; VI-NEXT: s_mov_b32 s3, 0xf000
32 ; VI-NEXT: s_mov_b32 s2, -1
33 ; VI-NEXT: s_waitcnt lgkmcnt(0)
34 ; VI-NEXT: v_mov_b32_e32 v1, s7
35 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
36 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
37 ; VI-NEXT: flat_load_dword v0, v[0:1]
38 ; VI-NEXT: s_mov_b32 s0, s4
39 ; VI-NEXT: s_mov_b32 s1, s5
40 ; VI-NEXT: s_waitcnt vmcnt(0)
41 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
43 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
44 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
45 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
46 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
; Loads one <4 x i8> element (align 4) from %in indexed by the workitem id x
; and stores the same value to both %out0 and %out1.
; Both targets are expected to use one dword load and two buffer_store_dword
; instructions that reuse the loaded register v0.
50 define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
51 ; SI-LABEL: test_copy_v4i8_x2:
53 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
54 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
55 ; SI-NEXT: s_mov_b32 s11, 0xf000
56 ; SI-NEXT: s_mov_b32 s2, 0
57 ; SI-NEXT: s_mov_b32 s3, s11
58 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
59 ; SI-NEXT: v_mov_b32_e32 v1, 0
60 ; SI-NEXT: s_waitcnt lgkmcnt(0)
61 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
62 ; SI-NEXT: s_mov_b32 s10, -1
63 ; SI-NEXT: s_mov_b32 s8, s4
64 ; SI-NEXT: s_mov_b32 s9, s5
65 ; SI-NEXT: s_mov_b32 s2, s10
66 ; SI-NEXT: s_mov_b32 s0, s6
67 ; SI-NEXT: s_mov_b32 s1, s7
68 ; SI-NEXT: s_waitcnt vmcnt(0)
69 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
70 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
73 ; VI-LABEL: test_copy_v4i8_x2:
75 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
76 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
77 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
78 ; VI-NEXT: s_mov_b32 s3, 0xf000
79 ; VI-NEXT: s_mov_b32 s2, -1
80 ; VI-NEXT: s_mov_b32 s10, s2
81 ; VI-NEXT: s_waitcnt lgkmcnt(0)
82 ; VI-NEXT: v_mov_b32_e32 v1, s1
83 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
84 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
85 ; VI-NEXT: flat_load_dword v0, v[0:1]
86 ; VI-NEXT: s_mov_b32 s0, s4
87 ; VI-NEXT: s_mov_b32 s1, s5
88 ; VI-NEXT: s_mov_b32 s11, s3
89 ; VI-NEXT: s_mov_b32 s8, s6
90 ; VI-NEXT: s_mov_b32 s9, s7
91 ; VI-NEXT: s_waitcnt vmcnt(0)
92 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
93 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
95 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
96 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
97 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
98 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
99 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
; Loads one <4 x i8> element (align 4) from %in indexed by the workitem id x
; and stores the same value to %out0, %out1 and %out2.
; Both targets are expected to use one dword load and three
; buffer_store_dword instructions that reuse the loaded register v0.
103 define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
104 ; SI-LABEL: test_copy_v4i8_x3:
106 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
107 ; SI-NEXT: s_mov_b32 s11, 0xf000
108 ; SI-NEXT: s_mov_b32 s14, 0
109 ; SI-NEXT: s_mov_b32 s15, s11
110 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
111 ; SI-NEXT: s_waitcnt lgkmcnt(0)
112 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
113 ; SI-NEXT: v_mov_b32_e32 v1, 0
114 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
115 ; SI-NEXT: s_mov_b32 s10, -1
116 ; SI-NEXT: s_mov_b32 s8, s0
117 ; SI-NEXT: s_mov_b32 s9, s1
118 ; SI-NEXT: s_mov_b32 s14, s10
119 ; SI-NEXT: s_mov_b32 s6, s10
120 ; SI-NEXT: s_mov_b32 s7, s11
121 ; SI-NEXT: s_mov_b32 s12, s2
122 ; SI-NEXT: s_mov_b32 s13, s3
123 ; SI-NEXT: s_waitcnt vmcnt(0)
124 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
125 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
126 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
129 ; VI-LABEL: test_copy_v4i8_x3:
131 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
132 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
133 ; VI-NEXT: s_mov_b32 s11, 0xf000
134 ; VI-NEXT: s_mov_b32 s10, -1
135 ; VI-NEXT: s_mov_b32 s14, s10
136 ; VI-NEXT: s_waitcnt lgkmcnt(0)
137 ; VI-NEXT: v_mov_b32_e32 v1, s7
138 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
139 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
140 ; VI-NEXT: flat_load_dword v0, v[0:1]
141 ; VI-NEXT: s_mov_b32 s8, s0
142 ; VI-NEXT: s_mov_b32 s9, s1
143 ; VI-NEXT: s_mov_b32 s15, s11
144 ; VI-NEXT: s_mov_b32 s6, s10
145 ; VI-NEXT: s_mov_b32 s7, s11
146 ; VI-NEXT: s_mov_b32 s12, s2
147 ; VI-NEXT: s_mov_b32 s13, s3
148 ; VI-NEXT: s_waitcnt vmcnt(0)
149 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
150 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
151 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
153 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
154 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
155 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
156 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
157 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
158 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
; Loads one <4 x i8> element (align 4) from %in indexed by the workitem id x
; and stores the same value to %out0 through %out3.
; Both targets are expected to use one dword load and four
; buffer_store_dword instructions that reuse the loaded register v0. The
; checked output loads the %in pointer first and the four output pointers
; with a separate s_load_dwordx8 after the vector load has been issued.
162 define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
163 ; SI-LABEL: test_copy_v4i8_x4:
165 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11
166 ; SI-NEXT: s_mov_b32 s11, 0xf000
167 ; SI-NEXT: s_mov_b32 s6, 0
168 ; SI-NEXT: s_mov_b32 s7, s11
169 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
170 ; SI-NEXT: v_mov_b32_e32 v1, 0
171 ; SI-NEXT: s_waitcnt lgkmcnt(0)
172 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
173 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
174 ; SI-NEXT: s_mov_b32 s10, -1
175 ; SI-NEXT: s_mov_b32 s14, s10
176 ; SI-NEXT: s_mov_b32 s15, s11
177 ; SI-NEXT: s_mov_b32 s18, s10
178 ; SI-NEXT: s_waitcnt lgkmcnt(0)
179 ; SI-NEXT: s_mov_b32 s8, s0
180 ; SI-NEXT: s_mov_b32 s9, s1
181 ; SI-NEXT: s_mov_b32 s19, s11
182 ; SI-NEXT: s_mov_b32 s22, s10
183 ; SI-NEXT: s_mov_b32 s23, s11
184 ; SI-NEXT: s_mov_b32 s12, s2
185 ; SI-NEXT: s_mov_b32 s13, s3
186 ; SI-NEXT: s_mov_b32 s16, s4
187 ; SI-NEXT: s_mov_b32 s17, s5
188 ; SI-NEXT: s_mov_b32 s20, s6
189 ; SI-NEXT: s_mov_b32 s21, s7
190 ; SI-NEXT: s_waitcnt vmcnt(0)
191 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
192 ; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
193 ; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
194 ; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
197 ; VI-LABEL: test_copy_v4i8_x4:
199 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
200 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
201 ; VI-NEXT: s_mov_b32 s11, 0xf000
202 ; VI-NEXT: s_mov_b32 s10, -1
203 ; VI-NEXT: s_mov_b32 s14, s10
204 ; VI-NEXT: s_waitcnt lgkmcnt(0)
205 ; VI-NEXT: v_mov_b32_e32 v1, s3
206 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
207 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
208 ; VI-NEXT: flat_load_dword v0, v[0:1]
209 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
210 ; VI-NEXT: s_mov_b32 s15, s11
211 ; VI-NEXT: s_mov_b32 s18, s10
212 ; VI-NEXT: s_mov_b32 s19, s11
213 ; VI-NEXT: s_mov_b32 s22, s10
214 ; VI-NEXT: s_waitcnt lgkmcnt(0)
215 ; VI-NEXT: s_mov_b32 s8, s0
216 ; VI-NEXT: s_mov_b32 s9, s1
217 ; VI-NEXT: s_mov_b32 s23, s11
218 ; VI-NEXT: s_mov_b32 s12, s2
219 ; VI-NEXT: s_mov_b32 s13, s3
220 ; VI-NEXT: s_mov_b32 s16, s4
221 ; VI-NEXT: s_mov_b32 s17, s5
222 ; VI-NEXT: s_mov_b32 s20, s6
223 ; VI-NEXT: s_mov_b32 s21, s7
224 ; VI-NEXT: s_waitcnt vmcnt(0)
225 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
226 ; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
227 ; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
228 ; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0
230 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
231 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
232 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
233 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
234 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
235 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
236 store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
; Loads one <4 x i8> element (align 4) from %in indexed by the workitem id x,
; stores the unmodified value to %out0 and the value plus <9, 9, 9, 9> to
; %out1.
; Both targets are expected to use one dword load, a mask/shift/or sequence
; that performs the per-byte add on the packed dword, and two
; buffer_store_dword instructions (v0 = original value, v1 = sum).
240 define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
241 ; SI-LABEL: test_copy_v4i8_extra_use:
243 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
244 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
245 ; SI-NEXT: s_mov_b32 s11, 0xf000
246 ; SI-NEXT: s_mov_b32 s2, 0
247 ; SI-NEXT: s_mov_b32 s3, s11
248 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
249 ; SI-NEXT: v_mov_b32_e32 v1, 0
250 ; SI-NEXT: s_waitcnt lgkmcnt(0)
251 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
252 ; SI-NEXT: s_mov_b32 s12, 0xff00
253 ; SI-NEXT: s_movk_i32 s13, 0xff
254 ; SI-NEXT: s_mov_b32 s10, -1
255 ; SI-NEXT: s_mov_b32 s8, s4
256 ; SI-NEXT: s_mov_b32 s9, s5
257 ; SI-NEXT: s_mov_b32 s2, s10
258 ; SI-NEXT: s_mov_b32 s0, s6
259 ; SI-NEXT: s_mov_b32 s1, s7
260 ; SI-NEXT: s_waitcnt vmcnt(0)
261 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
262 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
263 ; SI-NEXT: v_and_b32_e32 v4, s12, v1
264 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
265 ; SI-NEXT: v_and_b32_e32 v2, s12, v0
266 ; SI-NEXT: v_and_b32_e32 v3, s13, v3
267 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
268 ; SI-NEXT: v_and_b32_e32 v1, s13, v1
269 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
270 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
271 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
272 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
273 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
274 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
275 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
276 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
279 ; VI-LABEL: test_copy_v4i8_extra_use:
281 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
282 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
283 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
284 ; VI-NEXT: s_movk_i32 s12, 0xff00
285 ; VI-NEXT: s_movk_i32 s13, 0xff
286 ; VI-NEXT: s_movk_i32 s14, 0x900
287 ; VI-NEXT: s_waitcnt lgkmcnt(0)
288 ; VI-NEXT: v_mov_b32_e32 v1, s1
289 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
290 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
291 ; VI-NEXT: flat_load_dword v0, v[0:1]
292 ; VI-NEXT: s_mov_b32 s3, 0xf000
293 ; VI-NEXT: s_mov_b32 s2, -1
294 ; VI-NEXT: s_mov_b32 s0, s4
295 ; VI-NEXT: s_mov_b32 s1, s5
296 ; VI-NEXT: s_mov_b32 s10, s2
297 ; VI-NEXT: s_mov_b32 s11, s3
298 ; VI-NEXT: s_mov_b32 s8, s6
299 ; VI-NEXT: s_mov_b32 s9, s7
300 ; VI-NEXT: s_waitcnt vmcnt(0)
301 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
302 ; VI-NEXT: v_and_b32_e32 v4, s12, v1
303 ; VI-NEXT: v_add_u16_e32 v1, 9, v1
304 ; VI-NEXT: v_add_u16_e32 v3, 9, v0
305 ; VI-NEXT: v_and_b32_e32 v1, s13, v1
306 ; VI-NEXT: v_or_b32_e32 v1, v4, v1
307 ; VI-NEXT: v_and_b32_e32 v2, s12, v0
308 ; VI-NEXT: v_and_b32_e32 v3, s13, v3
309 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
310 ; VI-NEXT: v_add_u16_e32 v1, s14, v1
311 ; VI-NEXT: v_add_u16_e32 v2, s14, v2
312 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
313 ; VI-NEXT: v_or_b32_e32 v1, v2, v1
314 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
315 ; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
317 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
318 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
319 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
320 %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
321 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
322 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
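326 ; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the function below loads through a gep indexed by the workitem id, so the "load without gep" wording may be stale -- confirm.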
; Loads one <4 x i8> element (align 4) from %in indexed by the workitem id x,
; stores the unmodified value to %out0 and %out2, and the value plus
; <9, 9, 9, 9> to %out1.
; Both targets are expected to use one dword load, a mask/shift/or sequence
; that performs the per-byte add on the packed dword, and three
; buffer_store_dword instructions (v0, v1, v0).
327 define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
328 ; SI-LABEL: test_copy_v4i8_x2_extra_use:
330 ; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
331 ; SI-NEXT: s_mov_b32 s11, 0xf000
332 ; SI-NEXT: s_mov_b32 s14, 0
333 ; SI-NEXT: s_mov_b32 s15, s11
334 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
335 ; SI-NEXT: s_waitcnt lgkmcnt(0)
336 ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
337 ; SI-NEXT: v_mov_b32_e32 v1, 0
338 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
339 ; SI-NEXT: s_mov_b32 s16, 0xff00
340 ; SI-NEXT: s_movk_i32 s17, 0xff
341 ; SI-NEXT: s_mov_b32 s10, -1
342 ; SI-NEXT: s_mov_b32 s14, s10
343 ; SI-NEXT: s_mov_b32 s8, s0
344 ; SI-NEXT: s_mov_b32 s9, s1
345 ; SI-NEXT: s_mov_b32 s12, s2
346 ; SI-NEXT: s_mov_b32 s13, s3
347 ; SI-NEXT: s_mov_b32 s6, s10
348 ; SI-NEXT: s_mov_b32 s7, s11
349 ; SI-NEXT: s_waitcnt vmcnt(0)
350 ; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
351 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
352 ; SI-NEXT: v_and_b32_e32 v4, s16, v1
353 ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
354 ; SI-NEXT: v_and_b32_e32 v2, s16, v0
355 ; SI-NEXT: v_and_b32_e32 v3, s17, v3
356 ; SI-NEXT: v_or_b32_e32 v2, v2, v3
357 ; SI-NEXT: v_and_b32_e32 v1, s17, v1
358 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
359 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
360 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
361 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
362 ; SI-NEXT: v_or_b32_e32 v1, v1, v2
363 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
364 ; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
365 ; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0
366 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
369 ; VI-LABEL: test_copy_v4i8_x2_extra_use:
371 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
372 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
373 ; VI-NEXT: s_movk_i32 s16, 0xff00
374 ; VI-NEXT: s_movk_i32 s17, 0xff
375 ; VI-NEXT: s_movk_i32 s18, 0x900
376 ; VI-NEXT: s_waitcnt lgkmcnt(0)
377 ; VI-NEXT: v_mov_b32_e32 v1, s7
378 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
379 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
380 ; VI-NEXT: flat_load_dword v0, v[0:1]
381 ; VI-NEXT: s_mov_b32 s11, 0xf000
382 ; VI-NEXT: s_mov_b32 s10, -1
383 ; VI-NEXT: s_mov_b32 s14, s10
384 ; VI-NEXT: s_mov_b32 s15, s11
385 ; VI-NEXT: s_mov_b32 s8, s0
386 ; VI-NEXT: s_mov_b32 s9, s1
387 ; VI-NEXT: s_mov_b32 s12, s2
388 ; VI-NEXT: s_mov_b32 s13, s3
389 ; VI-NEXT: s_mov_b32 s6, s10
390 ; VI-NEXT: s_mov_b32 s7, s11
391 ; VI-NEXT: s_waitcnt vmcnt(0)
392 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
393 ; VI-NEXT: v_and_b32_e32 v4, s16, v1
394 ; VI-NEXT: v_add_u16_e32 v1, 9, v1
395 ; VI-NEXT: v_add_u16_e32 v3, 9, v0
396 ; VI-NEXT: v_and_b32_e32 v1, s17, v1
397 ; VI-NEXT: v_or_b32_e32 v1, v4, v1
398 ; VI-NEXT: v_and_b32_e32 v2, s16, v0
399 ; VI-NEXT: v_and_b32_e32 v3, s17, v3
400 ; VI-NEXT: v_or_b32_e32 v2, v2, v3
401 ; VI-NEXT: v_add_u16_e32 v1, s18, v1
402 ; VI-NEXT: v_add_u16_e32 v2, s18, v2
403 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
404 ; VI-NEXT: v_or_b32_e32 v1, v2, v1
405 ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
406 ; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
407 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
409 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
410 %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
411 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
412 %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
413 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
414 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
415 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
; Copies one <3 x i8> element with 4-byte alignment: loads it from %in
; indexed by the workitem id x and stores it to %out.
; Both targets are expected to load a full dword, then write the result as a
; buffer_store_short (low two bytes) plus a buffer_store_byte at offset 2
; (third byte, obtained by shifting the dword right by 16).
419 define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
420 ; SI-LABEL: test_copy_v3i8_align4:
422 ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
423 ; SI-NEXT: s_mov_b32 s3, 0xf000
424 ; SI-NEXT: s_mov_b32 s6, 0
425 ; SI-NEXT: s_mov_b32 s7, s3
426 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
427 ; SI-NEXT: s_waitcnt lgkmcnt(0)
428 ; SI-NEXT: s_mov_b64 s[4:5], s[10:11]
429 ; SI-NEXT: v_mov_b32_e32 v1, 0
430 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
431 ; SI-NEXT: s_mov_b32 s2, -1
432 ; SI-NEXT: s_mov_b32 s0, s8
433 ; SI-NEXT: s_mov_b32 s1, s9
434 ; SI-NEXT: s_waitcnt vmcnt(0)
435 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
436 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
437 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
440 ; VI-LABEL: test_copy_v3i8_align4:
442 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
443 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
444 ; VI-NEXT: s_waitcnt lgkmcnt(0)
445 ; VI-NEXT: v_mov_b32_e32 v1, s3
446 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
447 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
448 ; VI-NEXT: flat_load_dword v0, v[0:1]
449 ; VI-NEXT: s_mov_b32 s3, 0xf000
450 ; VI-NEXT: s_mov_b32 s2, -1
451 ; VI-NEXT: s_waitcnt vmcnt(0)
452 ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
453 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
454 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
456 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
457 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
458 %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
459 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
; Copies one <3 x i8> value with 2-byte alignment, loading directly from %in
; (no workitem-id indexing) and storing to %out.
; Both targets are expected to split the access into a ushort load plus a
; ubyte load at offset 2, and to write it back as a byte store at offset 2
; followed by a short store.
463 define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
464 ; SI-LABEL: test_copy_v3i8_align2:
466 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
467 ; SI-NEXT: s_mov_b32 s3, 0xf000
468 ; SI-NEXT: s_mov_b32 s2, -1
469 ; SI-NEXT: s_mov_b32 s10, s2
470 ; SI-NEXT: s_mov_b32 s11, s3
471 ; SI-NEXT: s_waitcnt lgkmcnt(0)
472 ; SI-NEXT: s_mov_b32 s8, s6
473 ; SI-NEXT: s_mov_b32 s9, s7
474 ; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
475 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
476 ; SI-NEXT: s_mov_b32 s0, s4
477 ; SI-NEXT: s_mov_b32 s1, s5
478 ; SI-NEXT: s_waitcnt vmcnt(0)
479 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
480 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
483 ; VI-LABEL: test_copy_v3i8_align2:
485 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
486 ; VI-NEXT: s_mov_b32 s3, 0xf000
487 ; VI-NEXT: s_mov_b32 s2, -1
488 ; VI-NEXT: s_mov_b32 s10, s2
489 ; VI-NEXT: s_mov_b32 s11, s3
490 ; VI-NEXT: s_waitcnt lgkmcnt(0)
491 ; VI-NEXT: s_mov_b32 s8, s6
492 ; VI-NEXT: s_mov_b32 s9, s7
493 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
494 ; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
495 ; VI-NEXT: s_mov_b32 s0, s4
496 ; VI-NEXT: s_mov_b32 s1, s5
497 ; VI-NEXT: s_waitcnt vmcnt(0)
498 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
499 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
501 %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
502 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
; Copies one <3 x i8> value with 1-byte alignment, loading directly from %in
; (no workitem-id indexing) and storing to %out.
; Both targets are expected to split the access into three ubyte loads
; (offsets 0, 1, 2) and three byte stores, each store preceded by its own
; s_waitcnt vmcnt(2).
506 define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
507 ; SI-LABEL: test_copy_v3i8_align1:
509 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
510 ; SI-NEXT: s_mov_b32 s3, 0xf000
511 ; SI-NEXT: s_mov_b32 s2, -1
512 ; SI-NEXT: s_mov_b32 s10, s2
513 ; SI-NEXT: s_mov_b32 s11, s3
514 ; SI-NEXT: s_waitcnt lgkmcnt(0)
515 ; SI-NEXT: s_mov_b32 s8, s6
516 ; SI-NEXT: s_mov_b32 s9, s7
517 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
518 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
519 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
520 ; SI-NEXT: s_mov_b32 s0, s4
521 ; SI-NEXT: s_mov_b32 s1, s5
522 ; SI-NEXT: s_waitcnt vmcnt(2)
523 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
524 ; SI-NEXT: s_waitcnt vmcnt(2)
525 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
526 ; SI-NEXT: s_waitcnt vmcnt(2)
527 ; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
530 ; VI-LABEL: test_copy_v3i8_align1:
532 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
533 ; VI-NEXT: s_mov_b32 s3, 0xf000
534 ; VI-NEXT: s_mov_b32 s2, -1
535 ; VI-NEXT: s_mov_b32 s10, s2
536 ; VI-NEXT: s_mov_b32 s11, s3
537 ; VI-NEXT: s_waitcnt lgkmcnt(0)
538 ; VI-NEXT: s_mov_b32 s8, s6
539 ; VI-NEXT: s_mov_b32 s9, s7
540 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
541 ; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
542 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
543 ; VI-NEXT: s_mov_b32 s0, s4
544 ; VI-NEXT: s_mov_b32 s1, s5
545 ; VI-NEXT: s_waitcnt vmcnt(2)
546 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
547 ; VI-NEXT: s_waitcnt vmcnt(2)
548 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
549 ; VI-NEXT: s_waitcnt vmcnt(2)
550 ; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
552 %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
553 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
; Copies one <4 x i8> value where the load is volatile: loads directly from
; %in (align 4, no workitem-id indexing) and stores to %out.
; Both targets are expected to keep a single buffer_load_dword carrying the
; glc modifier, followed by a single buffer_store_dword.
557 define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
558 ; SI-LABEL: test_copy_v4i8_volatile_load:
560 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
561 ; SI-NEXT: s_mov_b32 s3, 0xf000
562 ; SI-NEXT: s_mov_b32 s2, -1
563 ; SI-NEXT: s_waitcnt lgkmcnt(0)
564 ; SI-NEXT: s_mov_b32 s0, s4
565 ; SI-NEXT: s_mov_b32 s1, s5
566 ; SI-NEXT: s_mov_b32 s4, s6
567 ; SI-NEXT: s_mov_b32 s5, s7
568 ; SI-NEXT: s_mov_b32 s6, s2
569 ; SI-NEXT: s_mov_b32 s7, s3
570 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
571 ; SI-NEXT: s_waitcnt vmcnt(0)
572 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
575 ; VI-LABEL: test_copy_v4i8_volatile_load:
577 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
578 ; VI-NEXT: s_mov_b32 s3, 0xf000
579 ; VI-NEXT: s_mov_b32 s2, -1
580 ; VI-NEXT: s_waitcnt lgkmcnt(0)
581 ; VI-NEXT: s_mov_b32 s0, s4
582 ; VI-NEXT: s_mov_b32 s1, s5
583 ; VI-NEXT: s_mov_b32 s4, s6
584 ; VI-NEXT: s_mov_b32 s5, s7
585 ; VI-NEXT: s_mov_b32 s6, s2
586 ; VI-NEXT: s_mov_b32 s7, s3
587 ; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
588 ; VI-NEXT: s_waitcnt vmcnt(0)
589 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
591 %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
592 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
; Copies one <4 x i8> value where the store is volatile: loads directly from
; %in (align 4, no workitem-id indexing) and stores to %out.
; Both targets are expected to split the copy into four ubyte loads and four
; byte stores (written from offset 3 down to 0), with an s_waitcnt vmcnt(0)
; before the first store and after every store.
596 define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
597 ; SI-LABEL: test_copy_v4i8_volatile_store:
599 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
600 ; SI-NEXT: s_mov_b32 s3, 0xf000
601 ; SI-NEXT: s_mov_b32 s2, -1
602 ; SI-NEXT: s_mov_b32 s10, s2
603 ; SI-NEXT: s_mov_b32 s11, s3
604 ; SI-NEXT: s_waitcnt lgkmcnt(0)
605 ; SI-NEXT: s_mov_b32 s8, s6
606 ; SI-NEXT: s_mov_b32 s9, s7
607 ; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
608 ; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
609 ; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
610 ; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3
611 ; SI-NEXT: s_mov_b32 s0, s4
612 ; SI-NEXT: s_mov_b32 s1, s5
613 ; SI-NEXT: s_waitcnt vmcnt(0)
614 ; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3
615 ; SI-NEXT: s_waitcnt vmcnt(0)
616 ; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
617 ; SI-NEXT: s_waitcnt vmcnt(0)
618 ; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
619 ; SI-NEXT: s_waitcnt vmcnt(0)
620 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
621 ; SI-NEXT: s_waitcnt vmcnt(0)
624 ; VI-LABEL: test_copy_v4i8_volatile_store:
626 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
627 ; VI-NEXT: s_mov_b32 s3, 0xf000
628 ; VI-NEXT: s_mov_b32 s2, -1
629 ; VI-NEXT: s_mov_b32 s10, s2
630 ; VI-NEXT: s_mov_b32 s11, s3
631 ; VI-NEXT: s_waitcnt lgkmcnt(0)
632 ; VI-NEXT: s_mov_b32 s8, s6
633 ; VI-NEXT: s_mov_b32 s9, s7
634 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
635 ; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1
636 ; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
637 ; VI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3
638 ; VI-NEXT: s_mov_b32 s0, s4
639 ; VI-NEXT: s_mov_b32 s1, s5
640 ; VI-NEXT: s_waitcnt vmcnt(0)
641 ; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0 offset:3
642 ; VI-NEXT: s_waitcnt vmcnt(0)
643 ; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:2
644 ; VI-NEXT: s_waitcnt vmcnt(0)
645 ; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:1
646 ; VI-NEXT: s_waitcnt vmcnt(0)
647 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
648 ; VI-NEXT: s_waitcnt vmcnt(0)
650 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
651 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4