1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
3 ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
5 ; FIXME: Broken on evergreen
6 ; FIXME: For some reason the 8- and 16-element vectors are being stored as
7 ; individual elements instead of 128-bit stores.
9 define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
10 ; SI-LABEL: insertelement_v2f32_0:
12 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
14 ; SI-NEXT: s_mov_b32 s3, 0x100f000
15 ; SI-NEXT: s_mov_b32 s2, -1
16 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
17 ; SI-NEXT: s_waitcnt lgkmcnt(0)
18 ; SI-NEXT: v_mov_b32_e32 v1, s5
19 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
22 ; VI-LABEL: insertelement_v2f32_0:
24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
25 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
26 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
27 ; VI-NEXT: s_mov_b32 s2, -1
28 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
29 ; VI-NEXT: s_waitcnt lgkmcnt(0)
30 ; VI-NEXT: v_mov_b32_e32 v1, s5
31 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
33 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
34 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
38 define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
39 ; SI-LABEL: insertelement_v2f32_1:
41 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
42 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
43 ; SI-NEXT: s_mov_b32 s3, 0x100f000
44 ; SI-NEXT: s_mov_b32 s2, -1
45 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
46 ; SI-NEXT: s_waitcnt lgkmcnt(0)
47 ; SI-NEXT: v_mov_b32_e32 v0, s4
48 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
51 ; VI-LABEL: insertelement_v2f32_1:
53 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
54 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
55 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
56 ; VI-NEXT: s_mov_b32 s2, -1
57 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
58 ; VI-NEXT: s_waitcnt lgkmcnt(0)
59 ; VI-NEXT: v_mov_b32_e32 v0, s4
60 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
62 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
63 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
67 define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
68 ; SI-LABEL: insertelement_v2i32_0:
70 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
71 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
72 ; SI-NEXT: s_mov_b32 s3, 0x100f000
73 ; SI-NEXT: s_mov_b32 s2, -1
74 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
75 ; SI-NEXT: s_waitcnt lgkmcnt(0)
76 ; SI-NEXT: v_mov_b32_e32 v1, s5
77 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
80 ; VI-LABEL: insertelement_v2i32_0:
82 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
83 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
84 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
85 ; VI-NEXT: s_mov_b32 s2, -1
86 ; VI-NEXT: v_mov_b32_e32 v0, 0x3e7
87 ; VI-NEXT: s_waitcnt lgkmcnt(0)
88 ; VI-NEXT: v_mov_b32_e32 v1, s5
89 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
91 %vecins = insertelement <2 x i32> %a, i32 999, i32 0
92 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
96 define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
97 ; SI-LABEL: insertelement_v2i32_1:
99 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
100 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
101 ; SI-NEXT: s_mov_b32 s3, 0x100f000
102 ; SI-NEXT: s_mov_b32 s2, -1
103 ; SI-NEXT: v_mov_b32_e32 v1, 0x3e7
104 ; SI-NEXT: s_waitcnt lgkmcnt(0)
105 ; SI-NEXT: v_mov_b32_e32 v0, s4
106 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
109 ; VI-LABEL: insertelement_v2i32_1:
111 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
112 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
113 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
114 ; VI-NEXT: s_mov_b32 s2, -1
115 ; VI-NEXT: v_mov_b32_e32 v1, 0x3e7
116 ; VI-NEXT: s_waitcnt lgkmcnt(0)
117 ; VI-NEXT: v_mov_b32_e32 v0, s4
118 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
120 %vecins = insertelement <2 x i32> %a, i32 999, i32 1
121 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
125 ; FIXME: Why is the constant moved into the intermediate register and
126 ; not just directly into the vector component?
127 define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
128 ; SI-LABEL: insertelement_v4f32_0:
130 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
131 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
132 ; SI-NEXT: s_waitcnt lgkmcnt(0)
133 ; SI-NEXT: s_mov_b32 s4, 0x40a00000
134 ; SI-NEXT: s_mov_b32 s3, 0x100f000
135 ; SI-NEXT: s_mov_b32 s2, -1
136 ; SI-NEXT: v_mov_b32_e32 v0, s4
137 ; SI-NEXT: v_mov_b32_e32 v1, s5
138 ; SI-NEXT: v_mov_b32_e32 v2, s6
139 ; SI-NEXT: v_mov_b32_e32 v3, s7
140 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
143 ; VI-LABEL: insertelement_v4f32_0:
145 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
146 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
147 ; VI-NEXT: s_waitcnt lgkmcnt(0)
148 ; VI-NEXT: s_mov_b32 s4, 0x40a00000
149 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
150 ; VI-NEXT: s_mov_b32 s2, -1
151 ; VI-NEXT: v_mov_b32_e32 v0, s4
152 ; VI-NEXT: v_mov_b32_e32 v1, s5
153 ; VI-NEXT: v_mov_b32_e32 v2, s6
154 ; VI-NEXT: v_mov_b32_e32 v3, s7
155 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
157 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
158 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
162 define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
163 ; SI-LABEL: insertelement_v4f32_1:
165 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
166 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
167 ; SI-NEXT: s_waitcnt lgkmcnt(0)
168 ; SI-NEXT: s_mov_b32 s5, 0x40a00000
169 ; SI-NEXT: s_mov_b32 s3, 0x100f000
170 ; SI-NEXT: s_mov_b32 s2, -1
171 ; SI-NEXT: v_mov_b32_e32 v0, s4
172 ; SI-NEXT: v_mov_b32_e32 v1, s5
173 ; SI-NEXT: v_mov_b32_e32 v2, s6
174 ; SI-NEXT: v_mov_b32_e32 v3, s7
175 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
178 ; VI-LABEL: insertelement_v4f32_1:
180 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
181 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
182 ; VI-NEXT: s_waitcnt lgkmcnt(0)
183 ; VI-NEXT: s_mov_b32 s5, 0x40a00000
184 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
185 ; VI-NEXT: s_mov_b32 s2, -1
186 ; VI-NEXT: v_mov_b32_e32 v0, s4
187 ; VI-NEXT: v_mov_b32_e32 v1, s5
188 ; VI-NEXT: v_mov_b32_e32 v2, s6
189 ; VI-NEXT: v_mov_b32_e32 v3, s7
190 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
192 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
193 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
197 define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
198 ; SI-LABEL: insertelement_v4f32_2:
200 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
201 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
202 ; SI-NEXT: s_waitcnt lgkmcnt(0)
203 ; SI-NEXT: s_mov_b32 s6, 0x40a00000
204 ; SI-NEXT: s_mov_b32 s3, 0x100f000
205 ; SI-NEXT: s_mov_b32 s2, -1
206 ; SI-NEXT: v_mov_b32_e32 v0, s4
207 ; SI-NEXT: v_mov_b32_e32 v1, s5
208 ; SI-NEXT: v_mov_b32_e32 v2, s6
209 ; SI-NEXT: v_mov_b32_e32 v3, s7
210 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
213 ; VI-LABEL: insertelement_v4f32_2:
215 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
216 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
217 ; VI-NEXT: s_waitcnt lgkmcnt(0)
218 ; VI-NEXT: s_mov_b32 s6, 0x40a00000
219 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
220 ; VI-NEXT: s_mov_b32 s2, -1
221 ; VI-NEXT: v_mov_b32_e32 v0, s4
222 ; VI-NEXT: v_mov_b32_e32 v1, s5
223 ; VI-NEXT: v_mov_b32_e32 v2, s6
224 ; VI-NEXT: v_mov_b32_e32 v3, s7
225 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
227 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
228 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
232 define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
233 ; SI-LABEL: insertelement_v4f32_3:
235 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
236 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
237 ; SI-NEXT: s_waitcnt lgkmcnt(0)
238 ; SI-NEXT: s_mov_b32 s7, 0x40a00000
239 ; SI-NEXT: s_mov_b32 s3, 0x100f000
240 ; SI-NEXT: s_mov_b32 s2, -1
241 ; SI-NEXT: v_mov_b32_e32 v0, s4
242 ; SI-NEXT: v_mov_b32_e32 v1, s5
243 ; SI-NEXT: v_mov_b32_e32 v2, s6
244 ; SI-NEXT: v_mov_b32_e32 v3, s7
245 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
248 ; VI-LABEL: insertelement_v4f32_3:
250 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
251 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
252 ; VI-NEXT: s_waitcnt lgkmcnt(0)
253 ; VI-NEXT: s_mov_b32 s7, 0x40a00000
254 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
255 ; VI-NEXT: s_mov_b32 s2, -1
256 ; VI-NEXT: v_mov_b32_e32 v0, s4
257 ; VI-NEXT: v_mov_b32_e32 v1, s5
258 ; VI-NEXT: v_mov_b32_e32 v2, s6
259 ; VI-NEXT: v_mov_b32_e32 v3, s7
260 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
262 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
263 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
267 define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
268 ; SI-LABEL: insertelement_v4i32_0:
270 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
271 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
272 ; SI-NEXT: s_waitcnt lgkmcnt(0)
273 ; SI-NEXT: s_movk_i32 s4, 0x3e7
274 ; SI-NEXT: s_mov_b32 s3, 0x100f000
275 ; SI-NEXT: s_mov_b32 s2, -1
276 ; SI-NEXT: v_mov_b32_e32 v0, s4
277 ; SI-NEXT: v_mov_b32_e32 v1, s5
278 ; SI-NEXT: v_mov_b32_e32 v2, s6
279 ; SI-NEXT: v_mov_b32_e32 v3, s7
280 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
283 ; VI-LABEL: insertelement_v4i32_0:
285 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
286 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
287 ; VI-NEXT: s_waitcnt lgkmcnt(0)
288 ; VI-NEXT: s_movk_i32 s4, 0x3e7
289 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
290 ; VI-NEXT: s_mov_b32 s2, -1
291 ; VI-NEXT: v_mov_b32_e32 v0, s4
292 ; VI-NEXT: v_mov_b32_e32 v1, s5
293 ; VI-NEXT: v_mov_b32_e32 v2, s6
294 ; VI-NEXT: v_mov_b32_e32 v3, s7
295 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
297 %vecins = insertelement <4 x i32> %a, i32 999, i32 0
298 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
302 define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
303 ; SI-LABEL: insertelement_v3f32_1:
305 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
306 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
307 ; SI-NEXT: s_mov_b32 s3, 0x100f000
308 ; SI-NEXT: s_mov_b32 s2, -1
309 ; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
310 ; SI-NEXT: s_waitcnt lgkmcnt(0)
311 ; SI-NEXT: v_mov_b32_e32 v0, s4
312 ; SI-NEXT: v_mov_b32_e32 v2, s6
313 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
316 ; VI-LABEL: insertelement_v3f32_1:
318 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
319 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
320 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
321 ; VI-NEXT: s_mov_b32 s2, -1
322 ; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
323 ; VI-NEXT: s_waitcnt lgkmcnt(0)
324 ; VI-NEXT: v_mov_b32_e32 v0, s4
325 ; VI-NEXT: v_mov_b32_e32 v2, s6
326 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
328 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
329 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
333 define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
334 ; SI-LABEL: insertelement_v3f32_2:
336 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
337 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
338 ; SI-NEXT: s_mov_b32 s3, 0x100f000
339 ; SI-NEXT: s_mov_b32 s2, -1
340 ; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
341 ; SI-NEXT: s_waitcnt lgkmcnt(0)
342 ; SI-NEXT: v_mov_b32_e32 v0, s4
343 ; SI-NEXT: v_mov_b32_e32 v1, s5
344 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
347 ; VI-LABEL: insertelement_v3f32_2:
349 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
350 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
351 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
352 ; VI-NEXT: s_mov_b32 s2, -1
353 ; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
354 ; VI-NEXT: s_waitcnt lgkmcnt(0)
355 ; VI-NEXT: v_mov_b32_e32 v0, s4
356 ; VI-NEXT: v_mov_b32_e32 v1, s5
357 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
359 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
360 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
364 define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
365 ; GCN-LABEL: insertelement_v3f32_3:
368 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
369 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
373 define <4 x float> @insertelement_to_sgpr() nounwind {
374 ; GCN-LABEL: insertelement_to_sgpr:
376 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
377 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
378 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
379 ; GCN-NEXT: s_mov_b32 s12, 0
380 ; GCN-NEXT: s_mov_b32 s4, s12
381 ; GCN-NEXT: s_mov_b32 s5, s12
382 ; GCN-NEXT: s_mov_b32 s6, s12
383 ; GCN-NEXT: s_mov_b32 s7, s12
384 ; GCN-NEXT: s_mov_b32 s8, s12
385 ; GCN-NEXT: s_mov_b32 s9, s12
386 ; GCN-NEXT: s_mov_b32 s10, s12
387 ; GCN-NEXT: s_mov_b32 s11, s12
388 ; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1
389 ; GCN-NEXT: s_waitcnt vmcnt(0)
390 ; GCN-NEXT: s_setpc_b64 s[30:31]
391 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
392 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
393 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
394 ret <4 x float> %tmp2
397 define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
398 ; SI-LABEL: dynamic_insertelement_v2f32:
400 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
401 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
402 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
403 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
404 ; SI-NEXT: s_mov_b32 s3, 0x100f000
405 ; SI-NEXT: s_mov_b32 s2, -1
406 ; SI-NEXT: s_waitcnt lgkmcnt(0)
407 ; SI-NEXT: v_mov_b32_e32 v1, s7
408 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
409 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
410 ; SI-NEXT: v_mov_b32_e32 v2, s6
411 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
412 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
413 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
416 ; VI-LABEL: dynamic_insertelement_v2f32:
418 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
419 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
420 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
421 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
422 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
423 ; VI-NEXT: s_mov_b32 s2, -1
424 ; VI-NEXT: s_waitcnt lgkmcnt(0)
425 ; VI-NEXT: v_mov_b32_e32 v1, s7
426 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
427 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
428 ; VI-NEXT: v_mov_b32_e32 v2, s6
429 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
430 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
431 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
433 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
434 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
438 define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
439 ; SI-LABEL: dynamic_insertelement_v3f32:
441 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
442 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
443 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
444 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
445 ; SI-NEXT: s_mov_b32 s3, 0x100f000
446 ; SI-NEXT: s_mov_b32 s2, -1
447 ; SI-NEXT: s_waitcnt lgkmcnt(0)
448 ; SI-NEXT: v_mov_b32_e32 v1, s10
449 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
450 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
451 ; SI-NEXT: v_mov_b32_e32 v1, s9
452 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
453 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
454 ; SI-NEXT: v_mov_b32_e32 v3, s8
455 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
456 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
457 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
460 ; VI-LABEL: dynamic_insertelement_v3f32:
462 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
463 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
464 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
465 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
466 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
467 ; VI-NEXT: s_mov_b32 s2, -1
468 ; VI-NEXT: s_waitcnt lgkmcnt(0)
469 ; VI-NEXT: v_mov_b32_e32 v1, s10
470 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
471 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
472 ; VI-NEXT: v_mov_b32_e32 v1, s9
473 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
474 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
475 ; VI-NEXT: v_mov_b32_e32 v3, s8
476 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
477 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
478 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
480 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
481 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
485 define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
486 ; SI-LABEL: dynamic_insertelement_v4f32:
488 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
489 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
490 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
491 ; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
492 ; SI-NEXT: s_mov_b32 s3, 0x100f000
493 ; SI-NEXT: s_mov_b32 s2, -1
494 ; SI-NEXT: s_waitcnt lgkmcnt(0)
495 ; SI-NEXT: v_mov_b32_e32 v1, s11
496 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
497 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
498 ; SI-NEXT: v_mov_b32_e32 v1, s10
499 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
500 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
501 ; SI-NEXT: v_mov_b32_e32 v1, s9
502 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
503 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
504 ; SI-NEXT: v_mov_b32_e32 v4, s8
505 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
506 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
507 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
510 ; VI-LABEL: dynamic_insertelement_v4f32:
512 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
513 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
514 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
515 ; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
516 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
517 ; VI-NEXT: s_mov_b32 s2, -1
518 ; VI-NEXT: s_waitcnt lgkmcnt(0)
519 ; VI-NEXT: v_mov_b32_e32 v1, s11
520 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
521 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
522 ; VI-NEXT: v_mov_b32_e32 v1, s10
523 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
524 ; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
525 ; VI-NEXT: v_mov_b32_e32 v1, s9
526 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
527 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
528 ; VI-NEXT: v_mov_b32_e32 v4, s8
529 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
530 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
531 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
533 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
534 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
538 define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
539 ; SI-LABEL: dynamic_insertelement_v8f32:
541 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
542 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
543 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
544 ; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000
545 ; SI-NEXT: s_mov_b32 s3, 0x100f000
546 ; SI-NEXT: s_mov_b32 s2, -1
547 ; SI-NEXT: s_waitcnt lgkmcnt(0)
548 ; SI-NEXT: v_mov_b32_e32 v0, s11
549 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
550 ; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
551 ; SI-NEXT: v_mov_b32_e32 v0, s10
552 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
553 ; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
554 ; SI-NEXT: v_mov_b32_e32 v0, s9
555 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
556 ; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
557 ; SI-NEXT: v_mov_b32_e32 v0, s8
558 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
559 ; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
560 ; SI-NEXT: v_mov_b32_e32 v5, s15
561 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
562 ; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
563 ; SI-NEXT: v_mov_b32_e32 v5, s14
564 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
565 ; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
566 ; SI-NEXT: v_mov_b32_e32 v5, s13
567 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
568 ; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
569 ; SI-NEXT: v_mov_b32_e32 v8, s12
570 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
571 ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
572 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
573 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
576 ; VI-LABEL: dynamic_insertelement_v8f32:
578 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
579 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
580 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
581 ; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000
582 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
583 ; VI-NEXT: s_mov_b32 s2, -1
584 ; VI-NEXT: s_waitcnt lgkmcnt(0)
585 ; VI-NEXT: v_mov_b32_e32 v0, s11
586 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
587 ; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
588 ; VI-NEXT: v_mov_b32_e32 v0, s10
589 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
590 ; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
591 ; VI-NEXT: v_mov_b32_e32 v0, s9
592 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
593 ; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
594 ; VI-NEXT: v_mov_b32_e32 v0, s8
595 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
596 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
597 ; VI-NEXT: v_mov_b32_e32 v5, s15
598 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
599 ; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc
600 ; VI-NEXT: v_mov_b32_e32 v5, s14
601 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
602 ; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc
603 ; VI-NEXT: v_mov_b32_e32 v5, s13
604 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
605 ; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc
606 ; VI-NEXT: v_mov_b32_e32 v8, s12
607 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
608 ; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
609 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
610 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
612 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
613 store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
617 define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
618 ; SI-LABEL: dynamic_insertelement_v16f32:
620 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
621 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
622 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
623 ; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000
624 ; SI-NEXT: s_mov_b32 s3, 0x100f000
625 ; SI-NEXT: s_mov_b32 s2, -1
626 ; SI-NEXT: s_waitcnt lgkmcnt(0)
627 ; SI-NEXT: v_mov_b32_e32 v0, s8
628 ; SI-NEXT: v_mov_b32_e32 v1, s9
629 ; SI-NEXT: v_mov_b32_e32 v2, s10
630 ; SI-NEXT: v_mov_b32_e32 v3, s11
631 ; SI-NEXT: v_mov_b32_e32 v4, s12
632 ; SI-NEXT: v_mov_b32_e32 v5, s13
633 ; SI-NEXT: v_mov_b32_e32 v6, s14
634 ; SI-NEXT: v_mov_b32_e32 v7, s15
635 ; SI-NEXT: v_mov_b32_e32 v8, s16
636 ; SI-NEXT: v_mov_b32_e32 v9, s17
637 ; SI-NEXT: v_mov_b32_e32 v10, s18
638 ; SI-NEXT: v_mov_b32_e32 v11, s19
639 ; SI-NEXT: v_mov_b32_e32 v12, s20
640 ; SI-NEXT: v_mov_b32_e32 v13, s21
641 ; SI-NEXT: v_mov_b32_e32 v14, s22
642 ; SI-NEXT: v_mov_b32_e32 v15, s23
643 ; SI-NEXT: s_mov_b32 m0, s4
644 ; SI-NEXT: v_movreld_b32_e32 v0, v16
645 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
646 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
647 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
648 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
651 ; VI-LABEL: dynamic_insertelement_v16f32:
653 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
654 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
655 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
656 ; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000
657 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
658 ; VI-NEXT: s_mov_b32 s2, -1
659 ; VI-NEXT: s_waitcnt lgkmcnt(0)
660 ; VI-NEXT: v_mov_b32_e32 v0, s8
661 ; VI-NEXT: v_mov_b32_e32 v1, s9
662 ; VI-NEXT: v_mov_b32_e32 v2, s10
663 ; VI-NEXT: v_mov_b32_e32 v3, s11
664 ; VI-NEXT: v_mov_b32_e32 v4, s12
665 ; VI-NEXT: v_mov_b32_e32 v5, s13
666 ; VI-NEXT: v_mov_b32_e32 v6, s14
667 ; VI-NEXT: v_mov_b32_e32 v7, s15
668 ; VI-NEXT: v_mov_b32_e32 v8, s16
669 ; VI-NEXT: v_mov_b32_e32 v9, s17
670 ; VI-NEXT: v_mov_b32_e32 v10, s18
671 ; VI-NEXT: v_mov_b32_e32 v11, s19
672 ; VI-NEXT: v_mov_b32_e32 v12, s20
673 ; VI-NEXT: v_mov_b32_e32 v13, s21
674 ; VI-NEXT: v_mov_b32_e32 v14, s22
675 ; VI-NEXT: v_mov_b32_e32 v15, s23
676 ; VI-NEXT: s_mov_b32 m0, s4
677 ; VI-NEXT: v_movreld_b32_e32 v0, v16
678 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
679 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
680 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
681 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
683 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
684 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
688 define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
689 ; SI-LABEL: dynamic_insertelement_v2i32:
691 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
692 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
693 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
694 ; SI-NEXT: s_mov_b32 s3, 0x100f000
695 ; SI-NEXT: s_mov_b32 s2, -1
696 ; SI-NEXT: s_waitcnt lgkmcnt(0)
697 ; SI-NEXT: v_mov_b32_e32 v0, s7
698 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
699 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
700 ; SI-NEXT: v_mov_b32_e32 v0, s6
701 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
702 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
703 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
706 ; VI-LABEL: dynamic_insertelement_v2i32:
708 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
709 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
710 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
711 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
712 ; VI-NEXT: s_mov_b32 s2, -1
713 ; VI-NEXT: s_waitcnt lgkmcnt(0)
714 ; VI-NEXT: s_cmp_lg_u32 s4, 1
715 ; VI-NEXT: s_cselect_b32 s5, s7, 5
716 ; VI-NEXT: s_cmp_lg_u32 s4, 0
717 ; VI-NEXT: s_cselect_b32 s4, s6, 5
718 ; VI-NEXT: v_mov_b32_e32 v0, s4
719 ; VI-NEXT: v_mov_b32_e32 v1, s5
720 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
722 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
723 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
727 define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
728 ; SI-LABEL: dynamic_insertelement_v3i32:
730 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
731 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
732 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
733 ; SI-NEXT: s_mov_b32 s3, 0x100f000
734 ; SI-NEXT: s_mov_b32 s2, -1
735 ; SI-NEXT: s_waitcnt lgkmcnt(0)
736 ; SI-NEXT: v_mov_b32_e32 v0, s10
737 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
738 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
739 ; SI-NEXT: v_mov_b32_e32 v0, s9
740 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
741 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
742 ; SI-NEXT: v_mov_b32_e32 v0, s8
743 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
744 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
745 ; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
748 ; VI-LABEL: dynamic_insertelement_v3i32:
750 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
751 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
752 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
753 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
754 ; VI-NEXT: s_mov_b32 s2, -1
755 ; VI-NEXT: s_waitcnt lgkmcnt(0)
756 ; VI-NEXT: s_cmp_lg_u32 s4, 2
757 ; VI-NEXT: s_cselect_b32 s5, s10, 5
758 ; VI-NEXT: s_cmp_lg_u32 s4, 1
759 ; VI-NEXT: s_cselect_b32 s6, s9, 5
760 ; VI-NEXT: s_cmp_lg_u32 s4, 0
761 ; VI-NEXT: s_cselect_b32 s4, s8, 5
762 ; VI-NEXT: v_mov_b32_e32 v0, s4
763 ; VI-NEXT: v_mov_b32_e32 v1, s6
764 ; VI-NEXT: v_mov_b32_e32 v2, s5
765 ; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
767 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
768 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
772 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
773 ; SI-LABEL: dynamic_insertelement_v4i32:
775 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
776 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
777 ; SI-NEXT: s_load_dword s6, s[4:5], 0x8
778 ; SI-NEXT: s_load_dword s4, s[4:5], 0x11
779 ; SI-NEXT: s_mov_b32 s3, 0x100f000
780 ; SI-NEXT: s_mov_b32 s2, -1
781 ; SI-NEXT: s_waitcnt lgkmcnt(0)
782 ; SI-NEXT: v_mov_b32_e32 v0, s11
783 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 3
784 ; SI-NEXT: v_mov_b32_e32 v4, s4
785 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
786 ; SI-NEXT: v_mov_b32_e32 v0, s10
787 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2
788 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
789 ; SI-NEXT: v_mov_b32_e32 v0, s9
790 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1
791 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
792 ; SI-NEXT: v_mov_b32_e32 v0, s8
793 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0
794 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
795 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
798 ; VI-LABEL: dynamic_insertelement_v4i32:
800 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
801 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
802 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20
803 ; VI-NEXT: s_load_dword s4, s[4:5], 0x44
804 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
805 ; VI-NEXT: s_mov_b32 s2, -1
806 ; VI-NEXT: s_waitcnt lgkmcnt(0)
807 ; VI-NEXT: s_cmp_eq_u32 s6, 3
808 ; VI-NEXT: s_cselect_b32 s5, s4, s11
809 ; VI-NEXT: s_cmp_eq_u32 s6, 2
810 ; VI-NEXT: s_cselect_b32 s7, s4, s10
811 ; VI-NEXT: s_cmp_eq_u32 s6, 1
812 ; VI-NEXT: s_cselect_b32 s9, s4, s9
813 ; VI-NEXT: s_cmp_eq_u32 s6, 0
814 ; VI-NEXT: s_cselect_b32 s4, s4, s8
815 ; VI-NEXT: v_mov_b32_e32 v0, s4
816 ; VI-NEXT: v_mov_b32_e32 v1, s9
817 ; VI-NEXT: v_mov_b32_e32 v2, s7
818 ; VI-NEXT: v_mov_b32_e32 v3, s5
819 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
821 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
822 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py
; (see RUN lines at top of file) — regenerate instead of hand-editing them.
; Inserts constant 5 at dynamic index %b into an <8 x i32>; SI selects per lane
; with v_cmp/v_cndmask, VI with scalar s_cmp/s_cselect, then two dwordx4 stores.
826 define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
827 ; SI-LABEL: dynamic_insertelement_v8i32:
829 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
830 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
831 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
832 ; SI-NEXT: s_mov_b32 s3, 0x100f000
833 ; SI-NEXT: s_mov_b32 s2, -1
834 ; SI-NEXT: s_waitcnt lgkmcnt(0)
835 ; SI-NEXT: v_mov_b32_e32 v0, s11
836 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
837 ; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc
838 ; SI-NEXT: v_mov_b32_e32 v0, s10
839 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
840 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc
841 ; SI-NEXT: v_mov_b32_e32 v0, s9
842 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
843 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc
844 ; SI-NEXT: v_mov_b32_e32 v0, s8
845 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
846 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
847 ; SI-NEXT: v_mov_b32_e32 v4, s15
848 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
849 ; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc
850 ; SI-NEXT: v_mov_b32_e32 v4, s14
851 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
852 ; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc
853 ; SI-NEXT: v_mov_b32_e32 v4, s13
854 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
855 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc
856 ; SI-NEXT: v_mov_b32_e32 v4, s12
857 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
858 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
859 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
860 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
863 ; VI-LABEL: dynamic_insertelement_v8i32:
865 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
866 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
867 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
868 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
869 ; VI-NEXT: s_mov_b32 s2, -1
870 ; VI-NEXT: s_waitcnt lgkmcnt(0)
871 ; VI-NEXT: s_cmp_lg_u32 s4, 3
872 ; VI-NEXT: s_cselect_b32 s5, s11, 5
873 ; VI-NEXT: s_cmp_lg_u32 s4, 2
874 ; VI-NEXT: s_cselect_b32 s6, s10, 5
875 ; VI-NEXT: s_cmp_lg_u32 s4, 1
876 ; VI-NEXT: s_cselect_b32 s7, s9, 5
877 ; VI-NEXT: s_cmp_lg_u32 s4, 0
878 ; VI-NEXT: s_cselect_b32 s8, s8, 5
879 ; VI-NEXT: s_cmp_lg_u32 s4, 7
880 ; VI-NEXT: s_cselect_b32 s9, s15, 5
881 ; VI-NEXT: s_cmp_lg_u32 s4, 6
882 ; VI-NEXT: s_cselect_b32 s10, s14, 5
883 ; VI-NEXT: s_cmp_lg_u32 s4, 5
884 ; VI-NEXT: s_cselect_b32 s11, s13, 5
885 ; VI-NEXT: s_cmp_lg_u32 s4, 4
886 ; VI-NEXT: s_cselect_b32 s4, s12, 5
887 ; VI-NEXT: v_mov_b32_e32 v0, s4
888 ; VI-NEXT: v_mov_b32_e32 v1, s11
889 ; VI-NEXT: v_mov_b32_e32 v2, s10
890 ; VI-NEXT: v_mov_b32_e32 v3, s9
891 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
893 ; VI-NEXT: v_mov_b32_e32 v0, s8
894 ; VI-NEXT: v_mov_b32_e32 v1, s7
895 ; VI-NEXT: v_mov_b32_e32 v2, s6
896 ; VI-NEXT: v_mov_b32_e32 v3, s5
897 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
899 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
900 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts constant 5 at dynamic index %b into a <16 x i32>; both targets move the
; vector into v0-v15 and use m0 + v_movreld_b32 for the variable-index write,
; then store 512 bits as four dwordx4 stores.
904 define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
905 ; SI-LABEL: dynamic_insertelement_v16i32:
907 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
908 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
909 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
910 ; SI-NEXT: s_mov_b32 s3, 0x100f000
911 ; SI-NEXT: s_mov_b32 s2, -1
912 ; SI-NEXT: s_waitcnt lgkmcnt(0)
913 ; SI-NEXT: v_mov_b32_e32 v0, s8
914 ; SI-NEXT: v_mov_b32_e32 v1, s9
915 ; SI-NEXT: v_mov_b32_e32 v2, s10
916 ; SI-NEXT: v_mov_b32_e32 v3, s11
917 ; SI-NEXT: v_mov_b32_e32 v4, s12
918 ; SI-NEXT: v_mov_b32_e32 v5, s13
919 ; SI-NEXT: v_mov_b32_e32 v6, s14
920 ; SI-NEXT: v_mov_b32_e32 v7, s15
921 ; SI-NEXT: v_mov_b32_e32 v8, s16
922 ; SI-NEXT: v_mov_b32_e32 v9, s17
923 ; SI-NEXT: v_mov_b32_e32 v10, s18
924 ; SI-NEXT: v_mov_b32_e32 v11, s19
925 ; SI-NEXT: v_mov_b32_e32 v12, s20
926 ; SI-NEXT: v_mov_b32_e32 v13, s21
927 ; SI-NEXT: v_mov_b32_e32 v14, s22
928 ; SI-NEXT: v_mov_b32_e32 v15, s23
929 ; SI-NEXT: s_mov_b32 m0, s4
930 ; SI-NEXT: v_movreld_b32_e32 v0, 5
931 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
932 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
933 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
934 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
937 ; VI-LABEL: dynamic_insertelement_v16i32:
939 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
940 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
941 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
942 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
943 ; VI-NEXT: s_mov_b32 s2, -1
944 ; VI-NEXT: s_waitcnt lgkmcnt(0)
945 ; VI-NEXT: v_mov_b32_e32 v0, s8
946 ; VI-NEXT: v_mov_b32_e32 v1, s9
947 ; VI-NEXT: v_mov_b32_e32 v2, s10
948 ; VI-NEXT: v_mov_b32_e32 v3, s11
949 ; VI-NEXT: v_mov_b32_e32 v4, s12
950 ; VI-NEXT: v_mov_b32_e32 v5, s13
951 ; VI-NEXT: v_mov_b32_e32 v6, s14
952 ; VI-NEXT: v_mov_b32_e32 v7, s15
953 ; VI-NEXT: v_mov_b32_e32 v8, s16
954 ; VI-NEXT: v_mov_b32_e32 v9, s17
955 ; VI-NEXT: v_mov_b32_e32 v10, s18
956 ; VI-NEXT: v_mov_b32_e32 v11, s19
957 ; VI-NEXT: v_mov_b32_e32 v12, s20
958 ; VI-NEXT: v_mov_b32_e32 v13, s21
959 ; VI-NEXT: v_mov_b32_e32 v14, s22
960 ; VI-NEXT: v_mov_b32_e32 v15, s23
961 ; VI-NEXT: s_mov_b32 m0, s4
962 ; VI-NEXT: v_movreld_b32_e32 v0, 5
963 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
964 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
965 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
966 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
968 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
969 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i16 5 at dynamic index %b into a <2 x i16> packed in one dword: build a
; 16-bit lane mask (0xffff << (b*16)), clear the lane with andn2, OR in 0x50005
; masked to the lane, and store a single dword.
973 define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
974 ; SI-LABEL: dynamic_insertelement_v2i16:
976 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
977 ; SI-NEXT: s_load_dword s6, s[4:5], 0x2
978 ; SI-NEXT: s_load_dword s4, s[4:5], 0x3
979 ; SI-NEXT: s_mov_b32 s3, 0x100f000
980 ; SI-NEXT: s_mov_b32 s2, -1
981 ; SI-NEXT: s_waitcnt lgkmcnt(0)
982 ; SI-NEXT: s_lshl_b32 s4, s4, 4
983 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
984 ; SI-NEXT: s_andn2_b32 s5, s6, s4
985 ; SI-NEXT: s_and_b32 s4, s4, 0x50005
986 ; SI-NEXT: s_or_b32 s4, s4, s5
987 ; SI-NEXT: v_mov_b32_e32 v0, s4
988 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
991 ; VI-LABEL: dynamic_insertelement_v2i16:
993 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
994 ; VI-NEXT: s_load_dword s6, s[4:5], 0x8
995 ; VI-NEXT: s_load_dword s4, s[4:5], 0xc
996 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
997 ; VI-NEXT: s_mov_b32 s2, -1
998 ; VI-NEXT: s_waitcnt lgkmcnt(0)
999 ; VI-NEXT: s_lshl_b32 s4, s4, 4
1000 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
1001 ; VI-NEXT: s_andn2_b32 s5, s6, s4
1002 ; VI-NEXT: s_and_b32 s4, s4, 0x50005
1003 ; VI-NEXT: s_or_b32 s4, s4, s5
1004 ; VI-NEXT: v_mov_b32_e32 v0, s4
1005 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1007 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
1008 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i16 5 at dynamic index %b into a <3 x i16> held in a 64-bit scalar pair:
; same mask/andn2/or pattern as v2i16 but on s[4:5], then a short store for the
; high half (offset 4) and a dword store for the low 32 bits.
1012 define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
1013 ; SI-LABEL: dynamic_insertelement_v3i16:
1015 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1016 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
1017 ; SI-NEXT: s_load_dword s4, s[4:5], 0x4
1018 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1019 ; SI-NEXT: s_mov_b32 s2, -1
1020 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1021 ; SI-NEXT: s_lshl_b32 s8, s4, 4
1022 ; SI-NEXT: s_mov_b64 s[4:5], 0xffff
1023 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
1024 ; SI-NEXT: s_mov_b32 s8, 0x50005
1025 ; SI-NEXT: s_and_b32 s9, s5, s8
1026 ; SI-NEXT: s_and_b32 s8, s4, s8
1027 ; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5]
1028 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
1029 ; SI-NEXT: v_mov_b32_e32 v0, s5
1030 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
1031 ; SI-NEXT: v_mov_b32_e32 v0, s4
1032 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1035 ; VI-LABEL: dynamic_insertelement_v3i16:
1037 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1038 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
1039 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10
1040 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1041 ; VI-NEXT: s_mov_b32 s2, -1
1042 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1043 ; VI-NEXT: s_lshl_b32 s8, s4, 4
1044 ; VI-NEXT: s_mov_b64 s[4:5], 0xffff
1045 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
1046 ; VI-NEXT: s_mov_b32 s8, 0x50005
1047 ; VI-NEXT: s_mov_b32 s9, s8
1048 ; VI-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5]
1049 ; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
1050 ; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
1051 ; VI-NEXT: v_mov_b32_e32 v0, s5
1052 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4
1053 ; VI-NEXT: v_mov_b32_e32 v0, s4
1054 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1056 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
1057 store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i8 5 at dynamic index %b into a <2 x i8>; the [8 x i32] padding args force
; the operands onto distinct kernarg dwords. SI uses scalar 8-bit lane masking
; (mask 0x505), VI uses 16-bit VALU ops; result is one short store.
1061 define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
1062 ; SI-LABEL: dynamic_insertelement_v2i8:
1064 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1065 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
1066 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
1067 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1068 ; SI-NEXT: s_mov_b32 s2, -1
1069 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1070 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1071 ; SI-NEXT: s_lshl_b32 s4, -1, s4
1072 ; SI-NEXT: s_andn2_b32 s5, s6, s4
1073 ; SI-NEXT: s_and_b32 s4, s4, 0x505
1074 ; SI-NEXT: s_or_b32 s4, s4, s5
1075 ; SI-NEXT: v_mov_b32_e32 v0, s4
1076 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1079 ; VI-LABEL: dynamic_insertelement_v2i8:
1081 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1082 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
1083 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
1084 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1085 ; VI-NEXT: s_mov_b32 s2, -1
1086 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1087 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1088 ; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1
1089 ; VI-NEXT: v_not_b32_e32 v1, v0
1090 ; VI-NEXT: v_and_b32_e32 v1, s6, v1
1091 ; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
1092 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
1093 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
1095 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
1096 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
1100 ; FIXME: post legalize i16 and i32 shifts aren't merged because of
1101 ; isTypeDesirableForOp in SimplifyDemandedBits
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i8 5 at dynamic index %b into a <3 x i8> packed in a dword (mask constant
; 0x5050505); stored as a short (bytes 0-1) plus a byte at offset 2.
1102 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
1103 ; SI-LABEL: dynamic_insertelement_v3i8:
1105 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1106 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
1107 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
1108 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1109 ; SI-NEXT: s_mov_b32 s2, -1
1110 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1111 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1112 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
1113 ; SI-NEXT: s_andn2_b32 s5, s6, s4
1114 ; SI-NEXT: s_and_b32 s4, s4, 0x5050505
1115 ; SI-NEXT: s_or_b32 s4, s4, s5
1116 ; SI-NEXT: v_mov_b32_e32 v0, s4
1117 ; SI-NEXT: s_lshr_b32 s5, s4, 16
1118 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1119 ; SI-NEXT: v_mov_b32_e32 v0, s5
1120 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
1123 ; VI-LABEL: dynamic_insertelement_v3i8:
1125 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1126 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
1127 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
1128 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1129 ; VI-NEXT: s_mov_b32 s2, -1
1130 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1131 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1132 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
1133 ; VI-NEXT: s_andn2_b32 s5, s6, s4
1134 ; VI-NEXT: s_and_b32 s4, s4, 0x5050505
1135 ; VI-NEXT: s_or_b32 s4, s4, s5
1136 ; VI-NEXT: v_mov_b32_e32 v0, s4
1137 ; VI-NEXT: s_lshr_b32 s5, s4, 16
1138 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
1139 ; VI-NEXT: v_mov_b32_e32 v0, s5
1140 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
1142 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1143 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i8 5 at dynamic index %b into a <4 x i8> packed in one dword; SI and VI
; produce identical scalar mask/andn2/or sequences and one dword store.
1147 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1148 ; SI-LABEL: dynamic_insertelement_v4i8:
1150 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1151 ; SI-NEXT: s_load_dword s6, s[4:5], 0xa
1152 ; SI-NEXT: s_load_dword s4, s[4:5], 0x13
1153 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1154 ; SI-NEXT: s_mov_b32 s2, -1
1155 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1156 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1157 ; SI-NEXT: s_lshl_b32 s4, 0xffff, s4
1158 ; SI-NEXT: s_andn2_b32 s5, s6, s4
1159 ; SI-NEXT: s_and_b32 s4, s4, 0x5050505
1160 ; SI-NEXT: s_or_b32 s4, s4, s5
1161 ; SI-NEXT: v_mov_b32_e32 v0, s4
1162 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1165 ; VI-LABEL: dynamic_insertelement_v4i8:
1167 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1168 ; VI-NEXT: s_load_dword s6, s[4:5], 0x28
1169 ; VI-NEXT: s_load_dword s4, s[4:5], 0x4c
1170 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1171 ; VI-NEXT: s_mov_b32 s2, -1
1172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1173 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1174 ; VI-NEXT: s_lshl_b32 s4, 0xffff, s4
1175 ; VI-NEXT: s_andn2_b32 s5, s6, s4
1176 ; VI-NEXT: s_and_b32 s4, s4, 0x5050505
1177 ; VI-NEXT: s_or_b32 s4, s4, s5
1178 ; VI-NEXT: v_mov_b32_e32 v0, s4
1179 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1181 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1182 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Loads a <8 x i8> from constant memory (addrspace 4) so it stays in SGPRs, then
; inserts i8 5 at dynamic index %b via a 64-bit byte-lane mask and stores 64 bits.
1186 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind {
1187 ; SI-LABEL: s_dynamic_insertelement_v8i8:
1189 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
1190 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
1191 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1192 ; SI-NEXT: s_mov_b32 s2, -1
1193 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1194 ; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
1195 ; SI-NEXT: s_mov_b32 s0, s8
1196 ; SI-NEXT: s_lshl_b32 s8, s6, 3
1197 ; SI-NEXT: s_mov_b64 s[6:7], 0xffff
1198 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
1199 ; SI-NEXT: s_mov_b32 s8, 0x5050505
1200 ; SI-NEXT: s_mov_b32 s1, s9
1201 ; SI-NEXT: s_and_b32 s9, s7, s8
1202 ; SI-NEXT: s_and_b32 s8, s6, s8
1203 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1204 ; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
1205 ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
1206 ; SI-NEXT: v_mov_b32_e32 v0, s4
1207 ; SI-NEXT: v_mov_b32_e32 v1, s5
1208 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1211 ; VI-LABEL: s_dynamic_insertelement_v8i8:
1213 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
1214 ; VI-NEXT: s_load_dword s6, s[4:5], 0x10
1215 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1216 ; VI-NEXT: s_mov_b32 s2, -1
1217 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1218 ; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0
1219 ; VI-NEXT: s_mov_b32 s0, s8
1220 ; VI-NEXT: s_lshl_b32 s8, s6, 3
1221 ; VI-NEXT: s_mov_b64 s[6:7], 0xffff
1222 ; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
1223 ; VI-NEXT: s_mov_b32 s8, 0x5050505
1224 ; VI-NEXT: s_mov_b32 s1, s9
1225 ; VI-NEXT: s_and_b32 s9, s7, s8
1226 ; VI-NEXT: s_and_b32 s8, s6, s8
1227 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1228 ; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
1229 ; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
1230 ; VI-NEXT: v_mov_b32_e32 v0, s4
1231 ; VI-NEXT: v_mov_b32_e32 v1, s5
1232 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1234 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4
1235 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1236 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i8 5 at dynamic index %b into a <16 x i8>: each of the 16 bytes is
; individually selected (cndmask per byte) and repacked into four dwords — SI with
; shift/and/or, VI with sdwa byte/word selects — then one dwordx4 store.
; This is the per-element expansion the file-header FIXME refers to.
1240 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
1241 ; SI-LABEL: dynamic_insertelement_v16i8:
1243 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1244 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
1245 ; SI-NEXT: s_load_dword s4, s[4:5], 0x8
1246 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1247 ; SI-NEXT: s_mov_b32 s2, -1
1248 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1249 ; SI-NEXT: s_lshr_b32 s5, s11, 24
1250 ; SI-NEXT: v_mov_b32_e32 v0, s5
1251 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
1252 ; SI-NEXT: s_lshr_b32 s5, s11, 16
1253 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1254 ; SI-NEXT: v_mov_b32_e32 v1, s5
1255 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
1256 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1257 ; SI-NEXT: s_movk_i32 s5, 0xff
1258 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1259 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1260 ; SI-NEXT: s_lshr_b32 s6, s11, 8
1261 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1262 ; SI-NEXT: v_mov_b32_e32 v1, s6
1263 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
1264 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1265 ; SI-NEXT: v_mov_b32_e32 v2, s11
1266 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
1267 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1268 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1269 ; SI-NEXT: v_and_b32_e32 v2, s5, v2
1270 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
1271 ; SI-NEXT: s_mov_b32 s6, 0xffff
1272 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1273 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1274 ; SI-NEXT: s_lshr_b32 s7, s10, 24
1275 ; SI-NEXT: v_or_b32_e32 v3, v1, v0
1276 ; SI-NEXT: v_mov_b32_e32 v0, s7
1277 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
1278 ; SI-NEXT: s_lshr_b32 s7, s10, 16
1279 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1280 ; SI-NEXT: v_mov_b32_e32 v1, s7
1281 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
1282 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1283 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1284 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1285 ; SI-NEXT: s_lshr_b32 s7, s10, 8
1286 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1287 ; SI-NEXT: v_mov_b32_e32 v1, s7
1288 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
1289 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1290 ; SI-NEXT: v_mov_b32_e32 v2, s10
1291 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
1292 ; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1293 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1294 ; SI-NEXT: v_and_b32_e32 v2, s5, v2
1295 ; SI-NEXT: v_or_b32_e32 v1, v2, v1
1296 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1297 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1298 ; SI-NEXT: s_lshr_b32 s7, s9, 24
1299 ; SI-NEXT: v_or_b32_e32 v2, v1, v0
1300 ; SI-NEXT: v_mov_b32_e32 v0, s7
1301 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
1302 ; SI-NEXT: s_lshr_b32 s7, s9, 16
1303 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1304 ; SI-NEXT: v_mov_b32_e32 v1, s7
1305 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
1306 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1307 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1308 ; SI-NEXT: v_and_b32_e32 v1, s5, v1
1309 ; SI-NEXT: s_lshr_b32 s7, s9, 8
1310 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
1311 ; SI-NEXT: v_mov_b32_e32 v1, s7
1312 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
1313 ; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1314 ; SI-NEXT: v_mov_b32_e32 v4, s9
1315 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
1316 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1317 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
1318 ; SI-NEXT: v_and_b32_e32 v4, s5, v4
1319 ; SI-NEXT: v_or_b32_e32 v1, v4, v1
1320 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1321 ; SI-NEXT: v_and_b32_e32 v1, s6, v1
1322 ; SI-NEXT: s_lshr_b32 s7, s8, 24
1323 ; SI-NEXT: v_or_b32_e32 v1, v1, v0
1324 ; SI-NEXT: v_mov_b32_e32 v0, s7
1325 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
1326 ; SI-NEXT: s_lshr_b32 s7, s8, 16
1327 ; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1328 ; SI-NEXT: v_mov_b32_e32 v4, s7
1329 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
1330 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1331 ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
1332 ; SI-NEXT: v_and_b32_e32 v4, s5, v4
1333 ; SI-NEXT: s_lshr_b32 s7, s8, 8
1334 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
1335 ; SI-NEXT: v_mov_b32_e32 v4, s7
1336 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
1337 ; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1338 ; SI-NEXT: v_mov_b32_e32 v5, s8
1339 ; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
1340 ; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
1341 ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
1342 ; SI-NEXT: v_and_b32_e32 v5, s5, v5
1343 ; SI-NEXT: v_or_b32_e32 v4, v5, v4
1344 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
1345 ; SI-NEXT: v_and_b32_e32 v4, s6, v4
1346 ; SI-NEXT: v_or_b32_e32 v0, v4, v0
1347 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1350 ; VI-LABEL: dynamic_insertelement_v16i8:
1352 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1353 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
1354 ; VI-NEXT: s_load_dword s4, s[4:5], 0x20
1355 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1356 ; VI-NEXT: s_mov_b32 s2, -1
1357 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1358 ; VI-NEXT: s_lshr_b32 s5, s11, 24
1359 ; VI-NEXT: v_mov_b32_e32 v0, s5
1360 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 15
1361 ; VI-NEXT: s_lshr_b32 s5, s11, 16
1362 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1363 ; VI-NEXT: v_mov_b32_e32 v1, s5
1364 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 14
1365 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1366 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1367 ; VI-NEXT: s_lshr_b32 s5, s11, 8
1368 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1369 ; VI-NEXT: v_mov_b32_e32 v1, s5
1370 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 13
1371 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1372 ; VI-NEXT: v_mov_b32_e32 v2, s11
1373 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 12
1374 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1375 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1376 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1377 ; VI-NEXT: s_lshr_b32 s5, s10, 24
1378 ; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1379 ; VI-NEXT: v_mov_b32_e32 v0, s5
1380 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 11
1381 ; VI-NEXT: s_lshr_b32 s5, s10, 16
1382 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1383 ; VI-NEXT: v_mov_b32_e32 v1, s5
1384 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 10
1385 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1386 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1387 ; VI-NEXT: s_lshr_b32 s5, s10, 8
1388 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1389 ; VI-NEXT: v_mov_b32_e32 v1, s5
1390 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 9
1391 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1392 ; VI-NEXT: v_mov_b32_e32 v2, s10
1393 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 8
1394 ; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc
1395 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1396 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1397 ; VI-NEXT: s_lshr_b32 s5, s9, 24
1398 ; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1399 ; VI-NEXT: v_mov_b32_e32 v0, s5
1400 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 7
1401 ; VI-NEXT: s_lshr_b32 s5, s9, 16
1402 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1403 ; VI-NEXT: v_mov_b32_e32 v1, s5
1404 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 6
1405 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1406 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1407 ; VI-NEXT: s_lshr_b32 s5, s9, 8
1408 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1409 ; VI-NEXT: v_mov_b32_e32 v1, s5
1410 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 5
1411 ; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
1412 ; VI-NEXT: v_mov_b32_e32 v4, s9
1413 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 4
1414 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1415 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
1416 ; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1417 ; VI-NEXT: s_lshr_b32 s5, s8, 24
1418 ; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1419 ; VI-NEXT: v_mov_b32_e32 v0, s5
1420 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
1421 ; VI-NEXT: s_lshr_b32 s5, s8, 16
1422 ; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc
1423 ; VI-NEXT: v_mov_b32_e32 v4, s5
1424 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
1425 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1426 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
1427 ; VI-NEXT: s_lshr_b32 s5, s8, 8
1428 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1429 ; VI-NEXT: v_mov_b32_e32 v4, s5
1430 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
1431 ; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc
1432 ; VI-NEXT: v_mov_b32_e32 v5, s8
1433 ; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
1434 ; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4
1435 ; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc
1436 ; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
1437 ; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1438 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1440 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1441 store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
1445 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
1446 ; the compiler doesn't crash.
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Control-flow regression test (see comment above): the insertelement in %entry is
; completed in two different successor blocks, exercising INSERT_SUBREG handling in
; SIFixSGPRCopies; the checks pin the branch structure, not specific lane math.
1447 define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
1448 ; SI-LABEL: insert_split_bb:
1449 ; SI: ; %bb.0: ; %entry
1450 ; SI-NEXT: s_load_dword s6, s[4:5], 0x4
1451 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1452 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1453 ; SI-NEXT: s_cmp_lg_u32 s6, 0
1454 ; SI-NEXT: s_cbranch_scc0 BB30_2
1455 ; SI-NEXT: ; %bb.1: ; %else
1456 ; SI-NEXT: s_load_dword s7, s[2:3], 0x1
1457 ; SI-NEXT: s_mov_b64 s[4:5], 0
1458 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
1459 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1460 ; SI-NEXT: s_mov_b64 vcc, vcc
1461 ; SI-NEXT: s_cbranch_vccz BB30_3
1462 ; SI-NEXT: s_branch BB30_4
1464 ; SI-NEXT: BB30_3: ; %if
1465 ; SI-NEXT: s_load_dword s7, s[2:3], 0x0
1466 ; SI-NEXT: BB30_4: ; %endif
1467 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1468 ; SI-NEXT: v_mov_b32_e32 v0, s6
1469 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1470 ; SI-NEXT: s_mov_b32 s2, -1
1471 ; SI-NEXT: v_mov_b32_e32 v1, s7
1472 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1475 ; VI-LABEL: insert_split_bb:
1476 ; VI: ; %bb.0: ; %entry
1477 ; VI-NEXT: s_load_dword s6, s[4:5], 0x10
1478 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1479 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1480 ; VI-NEXT: s_cmp_lg_u32 s6, 0
1481 ; VI-NEXT: s_cbranch_scc0 BB30_2
1482 ; VI-NEXT: ; %bb.1: ; %else
1483 ; VI-NEXT: s_load_dword s7, s[2:3], 0x4
1484 ; VI-NEXT: s_cbranch_execz BB30_3
1485 ; VI-NEXT: s_branch BB30_4
1487 ; VI-NEXT: BB30_3: ; %if
1488 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1489 ; VI-NEXT: s_load_dword s7, s[2:3], 0x0
1490 ; VI-NEXT: BB30_4: ; %endif
1491 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1492 ; VI-NEXT: v_mov_b32_e32 v0, s6
1493 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1494 ; VI-NEXT: s_mov_b32 s2, -1
1495 ; VI-NEXT: v_mov_b32_e32 v1, s7
1496 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1499 %0 = insertelement <2 x i32> undef, i32 %a, i32 0
1500 %1 = icmp eq i32 %a, 0
1501 br i1 %1, label %if, label %else
1504 %2 = load i32, i32 addrspace(1)* %in
1505 %3 = insertelement <2 x i32> %0, i32 %2, i32 1
1509 %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
1510 %5 = load i32, i32 addrspace(1)* %4
1511 %6 = insertelement <2 x i32> %0, i32 %5, i32 1
1515 %7 = phi <2 x i32> [%3, %if], [%6, %else]
1516 store <2 x i32> %7, <2 x i32> addrspace(1)* %out
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts double 8.0 (hi dword 0x40200000, lo dword 0) at dynamic index %b into a
; <2 x double>: one compare per element, two cndmasks per 64-bit lane, dwordx4 store.
1520 define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
1521 ; SI-LABEL: dynamic_insertelement_v2f64:
1523 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1524 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc
1525 ; SI-NEXT: s_load_dword s4, s[4:5], 0x18
1526 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
1527 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1528 ; SI-NEXT: s_mov_b32 s2, -1
1529 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1530 ; SI-NEXT: v_mov_b32_e32 v0, s11
1531 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1532 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1533 ; SI-NEXT: v_mov_b32_e32 v0, s10
1534 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1535 ; SI-NEXT: v_mov_b32_e32 v0, s9
1536 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1537 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
1538 ; SI-NEXT: v_mov_b32_e32 v0, s8
1539 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1540 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1543 ; VI-LABEL: dynamic_insertelement_v2f64:
1545 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1546 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30
1547 ; VI-NEXT: s_load_dword s4, s[4:5], 0x60
1548 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
1549 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1550 ; VI-NEXT: s_mov_b32 s2, -1
1551 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1552 ; VI-NEXT: v_mov_b32_e32 v0, s11
1553 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1554 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
1555 ; VI-NEXT: v_mov_b32_e32 v0, s10
1556 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1557 ; VI-NEXT: v_mov_b32_e32 v0, s9
1558 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1559 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
1560 ; VI-NEXT: v_mov_b32_e32 v0, s8
1561 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1562 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1564 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
1565 store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i64 5 (lo dword 5, hi dword 0) at dynamic index %b into a <2 x i64>;
; the compare result is kept in s[4:5] rather than vcc, two cndmasks per lane.
1569 define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
1570 ; SI-LABEL: dynamic_insertelement_v2i64:
1572 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1573 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
1574 ; SI-NEXT: s_load_dword s6, s[4:5], 0x8
1575 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1576 ; SI-NEXT: s_mov_b32 s2, -1
1577 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1578 ; SI-NEXT: v_mov_b32_e32 v0, s11
1579 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1580 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1581 ; SI-NEXT: v_mov_b32_e32 v0, s10
1582 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1583 ; SI-NEXT: v_mov_b32_e32 v0, s9
1584 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1585 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1586 ; SI-NEXT: v_mov_b32_e32 v0, s8
1587 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1588 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1591 ; VI-LABEL: dynamic_insertelement_v2i64:
1593 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1594 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
1595 ; VI-NEXT: s_load_dword s6, s[4:5], 0x20
1596 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1597 ; VI-NEXT: s_mov_b32 s2, -1
1598 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1599 ; VI-NEXT: v_mov_b32_e32 v0, s11
1600 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 1
1601 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1602 ; VI-NEXT: v_mov_b32_e32 v0, s10
1603 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1604 ; VI-NEXT: v_mov_b32_e32 v0, s9
1605 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, 0
1606 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1607 ; VI-NEXT: v_mov_b32_e32 v0, s8
1608 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1609 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1611 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
1612 store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
; NOTE(review): the check lines below are autogenerated by update_llc_test_checks.py —
; regenerate instead of hand-editing them.
; Inserts i64 5 at dynamic index %b into a <3 x i64>; same per-lane cndmask scheme
; as v2i64 plus a third compare for element 2, stored as dwordx2 (offset 16) + dwordx4.
1616 define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
1617 ; SI-LABEL: dynamic_insertelement_v3i64:
1619 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1620 ; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
1621 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc
1622 ; SI-NEXT: s_load_dword s12, s[4:5], 0x10
1623 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1624 ; SI-NEXT: s_mov_b32 s2, -1
1625 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1626 ; SI-NEXT: v_mov_b32_e32 v0, s11
1627 ; SI-NEXT: v_mov_b32_e32 v4, s7
1628 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
1629 ; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1630 ; SI-NEXT: v_mov_b32_e32 v0, s10
1631 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1632 ; SI-NEXT: v_mov_b32_e32 v0, s9
1633 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
1634 ; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1635 ; SI-NEXT: v_mov_b32_e32 v0, s8
1636 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1637 ; SI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
1638 ; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
1639 ; SI-NEXT: v_mov_b32_e32 v4, s6
1640 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
1641 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1642 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1645 ; VI-LABEL: dynamic_insertelement_v3i64:
1647 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1648 ; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
1649 ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30
1650 ; VI-NEXT: s_load_dword s12, s[4:5], 0x40
1651 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1652 ; VI-NEXT: s_mov_b32 s2, -1
1653 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1654 ; VI-NEXT: v_mov_b32_e32 v0, s11
1655 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 1
1656 ; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
1657 ; VI-NEXT: v_mov_b32_e32 v0, s10
1658 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
1659 ; VI-NEXT: v_mov_b32_e32 v0, s9
1660 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 0
1661 ; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
1662 ; VI-NEXT: v_mov_b32_e32 v0, s8
1663 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
1664 ; VI-NEXT: v_mov_b32_e32 v4, s7
1665 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 2
1666 ; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
1667 ; VI-NEXT: v_mov_b32_e32 v4, s6
1668 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
1669 ; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
1670 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1672 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
1673 store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
; Insert the constant double 8.0 at a run-time index %b into a <4 x double>.
; 8.0 = 0x4020000000000000: v4 holds the high dword (0x40200000) and the low
; dword is the inline-constant 0. Each lane is formed by a v_cndmask pair
; (high half from v4, low half from 0) predicated on v_cmp_eq_u32 of %b
; against the lane number, using vcc rather than an SGPR pair.
; NOTE: check lines are autogenerated — regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
1677 define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
1678 ; SI-LABEL: dynamic_insertelement_v4f64:
1680 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1681 ; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
1682 ; SI-NEXT: s_load_dword s4, s[4:5], 0x10
1683 ; SI-NEXT: v_mov_b32_e32 v4, 0x40200000
1684 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1685 ; SI-NEXT: s_mov_b32 s2, -1
1686 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1687 ; SI-NEXT: v_mov_b32_e32 v0, s11
1688 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1689 ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
1690 ; SI-NEXT: v_mov_b32_e32 v0, s10
1691 ; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1692 ; SI-NEXT: v_mov_b32_e32 v0, s9
1693 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1694 ; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
1695 ; SI-NEXT: v_mov_b32_e32 v0, s8
1696 ; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1697 ; SI-NEXT: v_mov_b32_e32 v5, s15
1698 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
1699 ; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
1700 ; SI-NEXT: v_mov_b32_e32 v5, s14
1701 ; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
1702 ; SI-NEXT: v_mov_b32_e32 v5, s13
1703 ; SI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
1704 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
1705 ; SI-NEXT: v_mov_b32_e32 v4, s12
1706 ; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
1707 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1708 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1711 ; VI-LABEL: dynamic_insertelement_v4f64:
1713 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1714 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
1715 ; VI-NEXT: s_load_dword s4, s[4:5], 0x40
1716 ; VI-NEXT: v_mov_b32_e32 v4, 0x40200000
1717 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1718 ; VI-NEXT: s_mov_b32 s2, -1
1719 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1720 ; VI-NEXT: v_mov_b32_e32 v0, s11
1721 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1
1722 ; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
1723 ; VI-NEXT: v_mov_b32_e32 v0, s10
1724 ; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
1725 ; VI-NEXT: v_mov_b32_e32 v0, s9
1726 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0
1727 ; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
1728 ; VI-NEXT: v_mov_b32_e32 v0, s8
1729 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
1730 ; VI-NEXT: v_mov_b32_e32 v5, s15
1731 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 3
1732 ; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
1733 ; VI-NEXT: v_mov_b32_e32 v5, s14
1734 ; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
1735 ; VI-NEXT: v_mov_b32_e32 v5, s13
1736 ; VI-NEXT: v_cmp_eq_u32_e64 vcc, s4, 2
1737 ; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
1738 ; VI-NEXT: v_mov_b32_e32 v4, s12
1739 ; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
1740 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1741 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1743 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
1744 store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
; Insert the constant double 8.0 at a run-time index %b into a <8 x double>.
; Unlike the smaller vectors above (lowered to cndmask chains), this wide
; case uses the indexed-register-write path: the whole vector is copied to
; v0-v15, the index is scaled by 2 (s_lshl_b32 — two dwords per double) into
; m0, and two v_movreld_b32 writes place the low dword (0) and high dword
; (v16 = 0x40200000, the top half of 8.0) of the element.
; NOTE: check lines are autogenerated — regenerate with
; utils/update_llc_test_checks.py rather than editing by hand.
1748 define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
1749 ; SI-LABEL: dynamic_insertelement_v8f64:
1751 ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1752 ; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
1753 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20
1754 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000
1755 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1756 ; SI-NEXT: s_mov_b32 s2, -1
1757 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1758 ; SI-NEXT: v_mov_b32_e32 v0, s8
1759 ; SI-NEXT: s_lshl_b32 s4, s4, 1
1760 ; SI-NEXT: v_mov_b32_e32 v1, s9
1761 ; SI-NEXT: v_mov_b32_e32 v2, s10
1762 ; SI-NEXT: v_mov_b32_e32 v3, s11
1763 ; SI-NEXT: v_mov_b32_e32 v4, s12
1764 ; SI-NEXT: v_mov_b32_e32 v5, s13
1765 ; SI-NEXT: v_mov_b32_e32 v6, s14
1766 ; SI-NEXT: v_mov_b32_e32 v7, s15
1767 ; SI-NEXT: v_mov_b32_e32 v8, s16
1768 ; SI-NEXT: v_mov_b32_e32 v9, s17
1769 ; SI-NEXT: v_mov_b32_e32 v10, s18
1770 ; SI-NEXT: v_mov_b32_e32 v11, s19
1771 ; SI-NEXT: v_mov_b32_e32 v12, s20
1772 ; SI-NEXT: v_mov_b32_e32 v13, s21
1773 ; SI-NEXT: v_mov_b32_e32 v14, s22
1774 ; SI-NEXT: v_mov_b32_e32 v15, s23
1775 ; SI-NEXT: s_mov_b32 m0, s4
1776 ; SI-NEXT: v_movreld_b32_e32 v0, 0
1777 ; SI-NEXT: v_movreld_b32_e32 v1, v16
1778 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1779 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1780 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1781 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1784 ; VI-LABEL: dynamic_insertelement_v8f64:
1786 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1787 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
1788 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80
1789 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000
1790 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1791 ; VI-NEXT: s_mov_b32 s2, -1
1792 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1793 ; VI-NEXT: v_mov_b32_e32 v0, s8
1794 ; VI-NEXT: s_lshl_b32 s4, s4, 1
1795 ; VI-NEXT: v_mov_b32_e32 v1, s9
1796 ; VI-NEXT: v_mov_b32_e32 v2, s10
1797 ; VI-NEXT: v_mov_b32_e32 v3, s11
1798 ; VI-NEXT: v_mov_b32_e32 v4, s12
1799 ; VI-NEXT: v_mov_b32_e32 v5, s13
1800 ; VI-NEXT: v_mov_b32_e32 v6, s14
1801 ; VI-NEXT: v_mov_b32_e32 v7, s15
1802 ; VI-NEXT: v_mov_b32_e32 v8, s16
1803 ; VI-NEXT: v_mov_b32_e32 v9, s17
1804 ; VI-NEXT: v_mov_b32_e32 v10, s18
1805 ; VI-NEXT: v_mov_b32_e32 v11, s19
1806 ; VI-NEXT: v_mov_b32_e32 v12, s20
1807 ; VI-NEXT: v_mov_b32_e32 v13, s21
1808 ; VI-NEXT: v_mov_b32_e32 v14, s22
1809 ; VI-NEXT: v_mov_b32_e32 v15, s23
1810 ; VI-NEXT: s_mov_b32 m0, s4
1811 ; VI-NEXT: v_movreld_b32_e32 v0, 0
1812 ; VI-NEXT: v_movreld_b32_e32 v1, v16
1813 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
1814 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
1815 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1816 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1818 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
1819 store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
1823 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
1825 attributes #0 = { nounwind }
1826 attributes #1 = { nounwind readnone }