; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
; individual elements instead of 128-bit stores.
define amdgpu_kernel void @insertelement_v2f32_0(ptr addrspace(1) %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-LABEL: insertelement_v2f32_0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
store <2 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v2f32_1(ptr addrspace(1) %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_1:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-LABEL: insertelement_v2f32_1:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
store <2 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v2i32_0(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-LABEL: insertelement_v2i32_0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x3e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
%vecins = insertelement <2 x i32> %a, i32 999, i32 0
store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v2i32_1(ptr addrspace(1) %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_1:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v1, 0x3e7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-LABEL: insertelement_v2i32_1:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0x3e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
%vecins = insertelement <2 x i32> %a, i32 999, i32 1
store <2 x i32> %vecins, ptr addrspace(1) %out, align 16
; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?
define amdgpu_kernel void @insertelement_v4f32_0(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-LABEL: insertelement_v4f32_0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
store <4 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v4f32_1(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_1:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s1, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-LABEL: insertelement_v4f32_1:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s1, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
store <4 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v4f32_2(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_2:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s2, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-LABEL: insertelement_v4f32_2:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s2, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
store <4 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v4f32_3(ptr addrspace(1) %out, <4 x float> %a) nounwind {
; SI-LABEL: insertelement_v4f32_3:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s3, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-LABEL: insertelement_v4f32_3:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
store <4 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v4i32_0(ptr addrspace(1) %out, <4 x i32> %a) nounwind {
; SI-LABEL: insertelement_v4i32_0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_movk_i32 s0, 0x3e7
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-LABEL: insertelement_v4i32_0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_movk_i32 s0, 0x3e7
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
%vecins = insertelement <4 x i32> %a, i32 999, i32 0
store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v3f32_1(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_1:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-LABEL: insertelement_v3f32_1:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
store <3 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v3f32_2(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; SI-LABEL: insertelement_v3f32_2:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-LABEL: insertelement_v3f32_2:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
store <3 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @insertelement_v3f32_3(ptr addrspace(1) %out, <3 x float> %a) nounwind {
; GCN-LABEL: insertelement_v3f32_3:
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
store <3 x float> %vecins, ptr addrspace(1) %out, align 16
define <4 x float> @insertelement_to_sgpr() nounwind {
; GCN-LABEL: insertelement_to_sgpr:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0
; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%tmp = load <4 x i32>, ptr addrspace(4) undef
%tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
%tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
ret <4 x float> %tmp2
define <9 x float> @insertelement_to_v9f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v9f32_undef:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0x40a00000
; GCN-NEXT: v_mov_b32_e32 v2, 0xc0a00000
; GCN-NEXT: v_mov_b32_e32 v7, 0x41880000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s4
; GCN-NEXT: s_setpc_b64 s[30:31]
%tmp = load <9 x float>, ptr addrspace(4) undef
%tmp1 = insertelement <9 x float> %tmp, float 5.000, i32 0
%tmp2 = insertelement <9 x float> %tmp1, float -5.000, i32 2
%tmp3 = insertelement <9 x float> %tmp2, float 17.000, i32 7
ret <9 x float> %tmp3
define <10 x float> @insertelement_to_v10f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v10f32_undef:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 2.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s12
; GCN-NEXT: v_mov_b32_e32 v9, s13
; GCN-NEXT: s_setpc_b64 s[30:31]
%tmp = load <10 x float>, ptr addrspace(4) undef
%tmp1 = insertelement <10 x float> %tmp, float 2.0, i32 0
ret <10 x float> %tmp1
define <11 x float> @insertelement_to_v11f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v11f32_undef:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s12
; GCN-NEXT: v_mov_b32_e32 v9, s13
; GCN-NEXT: v_mov_b32_e32 v10, s14
; GCN-NEXT: s_setpc_b64 s[30:31]
%tmp = load <11 x float>, ptr addrspace(4) undef
%tmp1 = insertelement <11 x float> %tmp, float 1.000, i32 0
ret <11 x float> %tmp1
define <12 x float> @insertelement_to_v12f32_undef() nounwind {
; GCN-LABEL: insertelement_to_v12f32_undef:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 4.0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v8, s12
; GCN-NEXT: v_mov_b32_e32 v9, s13
; GCN-NEXT: v_mov_b32_e32 v10, s14
; GCN-NEXT: v_mov_b32_e32 v11, s15
; GCN-NEXT: s_setpc_b64 s[30:31]
%tmp = load <12 x float>, ptr addrspace(4) undef
%tmp1 = insertelement <12 x float> %tmp, float 4.0, i32 0
ret <12 x float> %tmp1
define amdgpu_kernel void @dynamic_insertelement_v2f32(ptr addrspace(1) %out, <2 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f32:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 1
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-LABEL: dynamic_insertelement_v2f32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 1
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
store <2 x float> %vecins, ptr addrspace(1) %out, align 8
define amdgpu_kernel void @dynamic_insertelement_v3f32(ptr addrspace(1) %out, <3 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3f32:
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s10, 2
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: s_cmp_lg_u32 s10, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s10, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v3, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; VI-LABEL: dynamic_insertelement_v3f32:
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s10, 2
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_cmp_lg_u32 s10, 1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s10, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
store <3 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @dynamic_insertelement_v4f32(ptr addrspace(1) %out, <4 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f32:
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4
; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s10, 3
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_cmp_lg_u32 s10, 2
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s6
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s10, 1
; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_lg_u32 s10, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-LABEL: dynamic_insertelement_v4f32:
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10
; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s10, 3
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: s_cmp_lg_u32 s10, 2
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s10, 1
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_cmp_lg_u32 s10, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
store <4 x float> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @dynamic_insertelement_v8f32(ptr addrspace(1) %out, <8 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8f32:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
; SI-NEXT: s_load_dword s8, s[8:9], 0x10
; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000
; SI-NEXT: s_mov_b32 s15, 0x100f000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: v_mov_b32_e32 v6, s6
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, v8
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-LABEL: dynamic_insertelement_v8f32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x40
; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, v8
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
%vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
store <8 x float> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v9f32(ptr addrspace(1) %out, <9 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v9f32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x18
; SI-NEXT: s_load_dword s5, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v9, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: s_mov_b32 m0, s5
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v9
; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-LABEL: dynamic_insertelement_v9f32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x60
; VI-NEXT: s_load_dword s5, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v9, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: s_mov_b32 m0, s5
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v9
; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
%vecins = insertelement <9 x float> %a, float 5.000000e+00, i32 %b
store <9 x float> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v10f32(ptr addrspace(1) %out, <10 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v10f32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
; SI-NEXT: s_load_dword s6, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v10, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: s_mov_b32 m0, s6
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v10
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; VI-LABEL: dynamic_insertelement_v10f32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x60
; VI-NEXT: s_load_dword s6, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v10, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: s_mov_b32 m0, s6
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v10
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
%vecins = insertelement <10 x float> %a, float 5.000000e+00, i32 %b
store <10 x float> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v11f32(ptr addrspace(1) %out, <11 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v11f32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v11, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 m0, s7
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v11
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; VI-LABEL: dynamic_insertelement_v11f32:
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v11, 0x40a00000
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 m0, s7
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v11
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
%vecins = insertelement <11 x float> %a, float 5.000000e+00, i32 %b
store <11 x float> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v12f32(ptr addrspace(1) %out, <12 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v12f32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_load_dword s8, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v12, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: v_mov_b32_e32 v11, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_movreld_b32_e32 v0, v12
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-LABEL: dynamic_insertelement_v12f32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dword s8, s[8:9], 0x80
; VI-NEXT: v_mov_b32_e32 v12, 0x40a00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: v_mov_b32_e32 v11, s7
; VI-NEXT: s_mov_b32 m0, s8
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, v12
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
%vecins = insertelement <12 x float> %a, float 5.000000e+00, i32 %b
store <12 x float> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v16f32(ptr addrspace(1) %out, <16 x float> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16f32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x20
; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s20
; SI-NEXT: v_mov_b32_e32 v9, s21
; SI-NEXT: v_mov_b32_e32 v10, s22
; SI-NEXT: v_mov_b32_e32 v11, s23
; SI-NEXT: v_mov_b32_e32 v12, s24
; SI-NEXT: v_mov_b32_e32 v13, s25
; SI-NEXT: v_mov_b32_e32 v14, s26
; SI-NEXT: v_mov_b32_e32 v15, s27
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_movreld_b32_e32 v0, v16
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-LABEL: dynamic_insertelement_v16f32:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x80
; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s20
; VI-NEXT: v_mov_b32_e32 v9, s21
; VI-NEXT: v_mov_b32_e32 v10, s22
; VI-NEXT: v_mov_b32_e32 v11, s23
; VI-NEXT: v_mov_b32_e32 v12, s24
; VI-NEXT: v_mov_b32_e32 v13, s25
; VI-NEXT: v_mov_b32_e32 v14, s26
; VI-NEXT: v_mov_b32_e32 v15, s27
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, v16
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
%vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
store <16 x float> %vecins, ptr addrspace(1) %out, align 64
define amdgpu_kernel void @dynamic_insertelement_v2i32(ptr addrspace(1) %out, <2 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i32:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 1
; SI-NEXT: s_cselect_b32 s1, s1, 5
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cselect_b32 s0, s0, 5
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-LABEL: dynamic_insertelement_v2i32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 1
; VI-NEXT: s_cselect_b32 s1, s1, 5
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b32 s0, s0, 5
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
%vecins = insertelement <2 x i32> %a, i32 5, i32 %b
store <2 x i32> %vecins, ptr addrspace(1) %out, align 8
define amdgpu_kernel void @dynamic_insertelement_v3i32(ptr addrspace(1) %out, <3 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i32:
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s10, 2
; SI-NEXT: s_cselect_b32 s2, s2, 5
; SI-NEXT: s_cmp_lg_u32 s10, 1
; SI-NEXT: s_cselect_b32 s1, s1, 5
; SI-NEXT: s_cmp_lg_u32 s10, 0
; SI-NEXT: s_cselect_b32 s0, s0, 5
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; VI-LABEL: dynamic_insertelement_v3i32:
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s10, 2
; VI-NEXT: s_cselect_b32 s2, s2, 5
; VI-NEXT: s_cmp_lg_u32 s10, 1
; VI-NEXT: s_cselect_b32 s1, s1, 5
; VI-NEXT: s_cmp_lg_u32 s10, 0
; VI-NEXT: s_cselect_b32 s0, s0, 5
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
%vecins = insertelement <3 x i32> %a, i32 5, i32 %b
store <3 x i32> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @dynamic_insertelement_v4i32(ptr addrspace(1) %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
; SI-LABEL: dynamic_insertelement_v4i32:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dword s10, s[8:9], 0x8
; SI-NEXT: s_load_dword s11, s[8:9], 0x11
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s7, 0x100f000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s10, 3
; SI-NEXT: s_cselect_b32 s3, s11, s3
; SI-NEXT: s_cmp_eq_u32 s10, 2
; SI-NEXT: s_cselect_b32 s2, s11, s2
; SI-NEXT: s_cmp_eq_u32 s10, 1
; SI-NEXT: s_cselect_b32 s1, s11, s1
; SI-NEXT: s_cmp_eq_u32 s10, 0
; SI-NEXT: s_cselect_b32 s0, s11, s0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-LABEL: dynamic_insertelement_v4i32:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dword s10, s[8:9], 0x20
; VI-NEXT: s_load_dword s11, s[8:9], 0x44
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s10, 3
; VI-NEXT: s_cselect_b32 s3, s11, s3
; VI-NEXT: s_cmp_eq_u32 s10, 2
; VI-NEXT: s_cselect_b32 s2, s11, s2
; VI-NEXT: s_cmp_eq_u32 s10, 1
; VI-NEXT: s_cselect_b32 s1, s11, s1
; VI-NEXT: s_cmp_eq_u32 s10, 0
; VI-NEXT: s_cselect_b32 s0, s11, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
%vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
store <4 x i32> %vecins, ptr addrspace(1) %out, align 16
define amdgpu_kernel void @dynamic_insertelement_v8i32(ptr addrspace(1) %out, <8 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v8i32:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
; SI-NEXT: s_load_dword s8, s[8:9], 0x10
; SI-NEXT: s_mov_b32 s15, 0x100f000
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: v_mov_b32_e32 v6, s6
; SI-NEXT: v_mov_b32_e32 v7, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-LABEL: dynamic_insertelement_v8i32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x40
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
%vecins = insertelement <8 x i32> %a, i32 5, i32 %b
store <8 x i32> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v9i32(ptr addrspace(1) %out, <9 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v9i32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x18
; SI-NEXT: s_load_dword s5, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: s_mov_b32 m0, s5
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-LABEL: dynamic_insertelement_v9i32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x60
; VI-NEXT: s_load_dword s5, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: s_mov_b32 m0, s5
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
%vecins = insertelement <9 x i32> %a, i32 5, i32 %b
store <9 x i32> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v10i32(ptr addrspace(1) %out, <10 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v10i32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18
; SI-NEXT: s_load_dword s6, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: s_mov_b32 m0, s6
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; VI-LABEL: dynamic_insertelement_v10i32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x60
; VI-NEXT: s_load_dword s6, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: s_mov_b32 m0, s6
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
%vecins = insertelement <10 x i32> %a, i32 5, i32 %b
store <10 x i32> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v11i32(ptr addrspace(1) %out, <11 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v11i32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 m0, s7
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; VI-LABEL: dynamic_insertelement_v11i32:
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s7, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 m0, s7
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
%vecins = insertelement <11 x i32> %a, i32 5, i32 %b
store <11 x i32> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v12i32(ptr addrspace(1) %out, <12 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v12i32:
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x18
; SI-NEXT: s_load_dword s8, s[8:9], 0x20
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s4
; SI-NEXT: v_mov_b32_e32 v9, s5
; SI-NEXT: v_mov_b32_e32 v10, s6
; SI-NEXT: v_mov_b32_e32 v11, s7
; SI-NEXT: s_mov_b32 m0, s8
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-LABEL: dynamic_insertelement_v12i32:
; VI-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x60
; VI-NEXT: s_load_dword s8, s[8:9], 0x80
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v10, s6
; VI-NEXT: v_mov_b32_e32 v11, s7
; VI-NEXT: s_mov_b32 m0, s8
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
%vecins = insertelement <12 x i32> %a, i32 5, i32 %b
store <12 x i32> %vecins, ptr addrspace(1) %out, align 32
define amdgpu_kernel void @dynamic_insertelement_v16i32(ptr addrspace(1) %out, <16 x i32> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v16i32:
; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10
; SI-NEXT: s_load_dword s4, s[8:9], 0x20
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: v_mov_b32_e32 v1, s13
; SI-NEXT: v_mov_b32_e32 v2, s14
; SI-NEXT: v_mov_b32_e32 v3, s15
; SI-NEXT: v_mov_b32_e32 v4, s16
; SI-NEXT: v_mov_b32_e32 v5, s17
; SI-NEXT: v_mov_b32_e32 v6, s18
; SI-NEXT: v_mov_b32_e32 v7, s19
; SI-NEXT: v_mov_b32_e32 v8, s20
; SI-NEXT: v_mov_b32_e32 v9, s21
; SI-NEXT: v_mov_b32_e32 v10, s22
; SI-NEXT: v_mov_b32_e32 v11, s23
; SI-NEXT: v_mov_b32_e32 v12, s24
; SI-NEXT: v_mov_b32_e32 v13, s25
; SI-NEXT: v_mov_b32_e32 v14, s26
; SI-NEXT: v_mov_b32_e32 v15, s27
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_movreld_b32_e32 v0, 5
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-LABEL: dynamic_insertelement_v16i32:
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
; VI-NEXT: s_load_dword s4, s[8:9], 0x80
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: v_mov_b32_e32 v4, s16
; VI-NEXT: v_mov_b32_e32 v5, s17
; VI-NEXT: v_mov_b32_e32 v6, s18
; VI-NEXT: v_mov_b32_e32 v7, s19
; VI-NEXT: v_mov_b32_e32 v8, s20
; VI-NEXT: v_mov_b32_e32 v9, s21
; VI-NEXT: v_mov_b32_e32 v10, s22
; VI-NEXT: v_mov_b32_e32 v11, s23
; VI-NEXT: v_mov_b32_e32 v12, s24
; VI-NEXT: v_mov_b32_e32 v13, s25
; VI-NEXT: v_mov_b32_e32 v14, s26
; VI-NEXT: v_mov_b32_e32 v15, s27
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, 5
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
%vecins = insertelement <16 x i32> %a, i32 5, i32 %b
store <16 x i32> %vecins, ptr addrspace(1) %out, align 64
1505 define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 x i16> %a, i32 %b) nounwind {
1506 ; SI-LABEL: dynamic_insertelement_v2i16:
1508 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1509 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1510 ; SI-NEXT: s_mov_b32 s6, -1
1511 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1512 ; SI-NEXT: s_mov_b32 s4, s0
1513 ; SI-NEXT: s_lshl_b32 s0, s3, 4
1514 ; SI-NEXT: s_lshl_b32 s0, 0xffff, s0
1515 ; SI-NEXT: s_mov_b32 s5, s1
1516 ; SI-NEXT: s_andn2_b32 s1, s2, s0
1517 ; SI-NEXT: s_and_b32 s0, s0, 0x50005
1518 ; SI-NEXT: s_or_b32 s0, s0, s1
1519 ; SI-NEXT: v_mov_b32_e32 v0, s0
1520 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1523 ; VI-LABEL: dynamic_insertelement_v2i16:
1525 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1526 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1527 ; VI-NEXT: s_mov_b32 s6, -1
1528 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1529 ; VI-NEXT: s_mov_b32 s4, s0
1530 ; VI-NEXT: s_lshl_b32 s0, s3, 4
1531 ; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
1532 ; VI-NEXT: s_mov_b32 s5, s1
1533 ; VI-NEXT: s_andn2_b32 s1, s2, s0
1534 ; VI-NEXT: s_and_b32 s0, s0, 0x50005
1535 ; VI-NEXT: s_or_b32 s0, s0, s1
1536 ; VI-NEXT: v_mov_b32_e32 v0, s0
1537 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1539 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
1540 store <2 x i16> %vecins, ptr addrspace(1) %out, align 8
1541 ret void
1542 }
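; For the sub-dword element tests above and below, the vector stays in SGPRs:
; the index is scaled to a bit offset (shl 4 for i16, shl 3 for i8), an
; element-wide mask (0xffff or 0xff) is shifted into place, the old lane is
; cleared with s_andn2, and the splatted constant (0x50005 / 0x505) is masked
; to the same position and ORed back in.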
1544 define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 x i16> %a, i32 %b) nounwind {
1545 ; SI-LABEL: dynamic_insertelement_v3i16:
1547 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1548 ; SI-NEXT: s_load_dword s8, s[8:9], 0x4
1549 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1550 ; SI-NEXT: s_mov_b32 s6, -1
1551 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1552 ; SI-NEXT: s_mov_b32 s4, s0
1553 ; SI-NEXT: s_lshl_b32 s0, s8, 4
1554 ; SI-NEXT: s_mov_b32 s5, s1
1555 ; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
1556 ; SI-NEXT: s_and_b32 s9, s1, 0x50005
1557 ; SI-NEXT: s_and_b32 s8, s0, 0x50005
1558 ; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1]
1559 ; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
1560 ; SI-NEXT: v_mov_b32_e32 v0, s1
1561 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
1562 ; SI-NEXT: v_mov_b32_e32 v0, s0
1563 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1566 ; VI-LABEL: dynamic_insertelement_v3i16:
1568 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1569 ; VI-NEXT: s_load_dword s8, s[8:9], 0x10
1570 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1571 ; VI-NEXT: s_mov_b32 s6, -1
1572 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1573 ; VI-NEXT: s_mov_b32 s4, s0
1574 ; VI-NEXT: s_lshl_b32 s0, s8, 4
1575 ; VI-NEXT: s_mov_b32 s8, 0x50005
1576 ; VI-NEXT: s_mov_b32 s5, s1
1577 ; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0
1578 ; VI-NEXT: s_mov_b32 s9, s8
1579 ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
1580 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9]
1581 ; VI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
1582 ; VI-NEXT: v_mov_b32_e32 v0, s1
1583 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
1584 ; VI-NEXT: v_mov_b32_e32 v0, s0
1585 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
1587 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
1588 store <3 x i16> %vecins, ptr addrspace(1) %out, align 8
1589 ret void
1590 }
1592 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
1593 ; SI-LABEL: dynamic_insertelement_v2i8:
1595 ; SI-NEXT: s_load_dword s4, s[8:9], 0x13
1596 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1597 ; SI-NEXT: s_load_dword s5, s[8:9], 0xa
1598 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1599 ; SI-NEXT: s_mov_b32 s2, -1
1600 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1601 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1602 ; SI-NEXT: s_lshl_b32 s4, 0xff, s4
1603 ; SI-NEXT: s_andn2_b32 s5, s5, s4
1604 ; SI-NEXT: s_and_b32 s4, s4, 0x505
1605 ; SI-NEXT: s_or_b32 s4, s4, s5
1606 ; SI-NEXT: v_mov_b32_e32 v0, s4
1607 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1610 ; VI-LABEL: dynamic_insertelement_v2i8:
1612 ; VI-NEXT: s_load_dword s4, s[8:9], 0x4c
1613 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1614 ; VI-NEXT: s_load_dword s5, s[8:9], 0x28
1615 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1616 ; VI-NEXT: s_mov_b32 s2, -1
1617 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1618 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1619 ; VI-NEXT: s_lshl_b32 s4, 0xff, s4
1620 ; VI-NEXT: s_and_b32 s6, s4, 0x505
1621 ; VI-NEXT: s_xor_b32 s4, s4, 0xffff
1622 ; VI-NEXT: s_and_b32 s4, s4, s5
1623 ; VI-NEXT: s_or_b32 s4, s6, s4
1624 ; VI-NEXT: v_mov_b32_e32 v0, s4
1625 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
1627 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
1628 store <2 x i8> %vecins, ptr addrspace(1) %out, align 8
1629 ret void
1630 }
1632 ; FIXME: The post-legalization i16 and i32 shifts aren't merged because of
1633 ; isTypeDesirableForOp in SimplifyDemandedBits.
1634 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
1635 ; SI-LABEL: dynamic_insertelement_v3i8:
1637 ; SI-NEXT: s_load_dword s4, s[8:9], 0x13
1638 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1639 ; SI-NEXT: s_load_dword s5, s[8:9], 0xa
1640 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1641 ; SI-NEXT: s_mov_b32 s2, -1
1642 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1643 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1644 ; SI-NEXT: s_lshl_b32 s4, 0xff, s4
1645 ; SI-NEXT: s_andn2_b32 s5, s5, s4
1646 ; SI-NEXT: s_and_b32 s4, s4, 0x5050505
1647 ; SI-NEXT: s_or_b32 s4, s4, s5
1648 ; SI-NEXT: s_lshr_b32 s5, s4, 16
1649 ; SI-NEXT: v_mov_b32_e32 v0, s4
1650 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
1651 ; SI-NEXT: v_mov_b32_e32 v0, s5
1652 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
1655 ; VI-LABEL: dynamic_insertelement_v3i8:
1657 ; VI-NEXT: s_load_dword s4, s[8:9], 0x4c
1658 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1659 ; VI-NEXT: s_load_dword s5, s[8:9], 0x28
1660 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1661 ; VI-NEXT: s_mov_b32 s2, -1
1662 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1663 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1664 ; VI-NEXT: s_lshl_b32 s4, 0xff, s4
1665 ; VI-NEXT: s_andn2_b32 s5, s5, s4
1666 ; VI-NEXT: s_and_b32 s4, s4, 0x5050505
1667 ; VI-NEXT: s_or_b32 s4, s4, s5
1668 ; VI-NEXT: s_lshr_b32 s5, s4, 16
1669 ; VI-NEXT: v_mov_b32_e32 v0, s4
1670 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
1671 ; VI-NEXT: v_mov_b32_e32 v0, s5
1672 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
1674 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
1675 store <3 x i8> %vecins, ptr addrspace(1) %out, align 4
1676 ret void
1677 }
1679 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
1680 ; SI-LABEL: dynamic_insertelement_v4i8:
1682 ; SI-NEXT: s_load_dword s4, s[8:9], 0x13
1683 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1684 ; SI-NEXT: s_load_dword s5, s[8:9], 0xa
1685 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1686 ; SI-NEXT: s_mov_b32 s2, -1
1687 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1688 ; SI-NEXT: s_lshl_b32 s4, s4, 3
1689 ; SI-NEXT: s_lshl_b32 s4, 0xff, s4
1690 ; SI-NEXT: s_andn2_b32 s5, s5, s4
1691 ; SI-NEXT: s_and_b32 s4, s4, 0x5050505
1692 ; SI-NEXT: s_or_b32 s4, s4, s5
1693 ; SI-NEXT: v_mov_b32_e32 v0, s4
1694 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1697 ; VI-LABEL: dynamic_insertelement_v4i8:
1699 ; VI-NEXT: s_load_dword s4, s[8:9], 0x4c
1700 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1701 ; VI-NEXT: s_load_dword s5, s[8:9], 0x28
1702 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1703 ; VI-NEXT: s_mov_b32 s2, -1
1704 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1705 ; VI-NEXT: s_lshl_b32 s4, s4, 3
1706 ; VI-NEXT: s_lshl_b32 s4, 0xff, s4
1707 ; VI-NEXT: s_andn2_b32 s5, s5, s4
1708 ; VI-NEXT: s_and_b32 s4, s4, 0x5050505
1709 ; VI-NEXT: s_or_b32 s4, s4, s5
1710 ; VI-NEXT: v_mov_b32_e32 v0, s4
1711 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1713 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
1714 store <4 x i8> %vecins, ptr addrspace(1) %out, align 4
1715 ret void
1716 }
1718 define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %a.ptr, i32 %b) nounwind {
1719 ; SI-LABEL: s_dynamic_insertelement_v8i8:
1721 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1722 ; SI-NEXT: s_load_dword s8, s[8:9], 0x4
1723 ; SI-NEXT: s_mov_b32 s7, 0x100f000
1724 ; SI-NEXT: s_mov_b32 s6, -1
1725 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1726 ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1727 ; SI-NEXT: s_mov_b32 s4, s0
1728 ; SI-NEXT: s_lshl_b32 s0, s8, 3
1729 ; SI-NEXT: s_mov_b32 s5, s1
1730 ; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
1731 ; SI-NEXT: s_and_b32 s9, s1, 0x5050505
1732 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1733 ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
1734 ; SI-NEXT: s_and_b32 s8, s0, 0x5050505
1735 ; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
1736 ; SI-NEXT: v_mov_b32_e32 v0, s0
1737 ; SI-NEXT: v_mov_b32_e32 v1, s1
1738 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1741 ; VI-LABEL: s_dynamic_insertelement_v8i8:
1743 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1744 ; VI-NEXT: s_load_dword s8, s[8:9], 0x10
1745 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
1746 ; VI-NEXT: s_mov_b32 s6, -1
1747 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1748 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1749 ; VI-NEXT: s_mov_b32 s4, s0
1750 ; VI-NEXT: s_lshl_b32 s0, s8, 3
1751 ; VI-NEXT: s_mov_b32 s5, s1
1752 ; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
1753 ; VI-NEXT: s_and_b32 s9, s1, 0x5050505
1754 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1755 ; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
1756 ; VI-NEXT: s_and_b32 s8, s0, 0x5050505
1757 ; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
1758 ; VI-NEXT: v_mov_b32_e32 v0, s0
1759 ; VI-NEXT: v_mov_b32_e32 v1, s1
1760 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1762 %a = load <8 x i8>, ptr addrspace(4) %a.ptr, align 4
1763 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
1764 store <8 x i8> %vecins, ptr addrspace(1) %out, align 8
1765 ret void
1766 }
1768 define amdgpu_kernel void @dynamic_insertelement_v16i8(ptr addrspace(1) %out, <16 x i8> %a, i32 %b) nounwind {
1769 ; SI-LABEL: dynamic_insertelement_v16i8:
1771 ; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x4
1772 ; SI-NEXT: s_load_dword s10, s[8:9], 0x8
1773 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1774 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1775 ; SI-NEXT: s_mov_b32 s2, -1
1776 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1777 ; SI-NEXT: s_lshr_b32 s8, s7, 24
1778 ; SI-NEXT: s_cmp_lg_u32 s10, 15
1779 ; SI-NEXT: s_cselect_b32 s8, s8, 5
1780 ; SI-NEXT: s_lshl_b32 s8, s8, 24
1781 ; SI-NEXT: s_lshr_b32 s9, s7, 16
1782 ; SI-NEXT: s_cmp_lg_u32 s10, 14
1783 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1784 ; SI-NEXT: s_and_b32 s9, s9, 0xff
1785 ; SI-NEXT: s_lshl_b32 s9, s9, 16
1786 ; SI-NEXT: s_or_b32 s8, s8, s9
1787 ; SI-NEXT: s_lshr_b32 s9, s7, 8
1788 ; SI-NEXT: s_cmp_lg_u32 s10, 13
1789 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1790 ; SI-NEXT: s_lshl_b32 s9, s9, 8
1791 ; SI-NEXT: s_cmp_lg_u32 s10, 12
1792 ; SI-NEXT: s_cselect_b32 s7, s7, 5
1793 ; SI-NEXT: s_and_b32 s7, s7, 0xff
1794 ; SI-NEXT: s_or_b32 s7, s7, s9
1795 ; SI-NEXT: s_and_b32 s7, s7, 0xffff
1796 ; SI-NEXT: s_or_b32 s7, s7, s8
1797 ; SI-NEXT: s_lshr_b32 s8, s6, 24
1798 ; SI-NEXT: s_cmp_lg_u32 s10, 11
1799 ; SI-NEXT: s_cselect_b32 s8, s8, 5
1800 ; SI-NEXT: s_lshl_b32 s8, s8, 24
1801 ; SI-NEXT: s_lshr_b32 s9, s6, 16
1802 ; SI-NEXT: s_cmp_lg_u32 s10, 10
1803 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1804 ; SI-NEXT: s_and_b32 s9, s9, 0xff
1805 ; SI-NEXT: s_lshl_b32 s9, s9, 16
1806 ; SI-NEXT: s_or_b32 s8, s8, s9
1807 ; SI-NEXT: s_lshr_b32 s9, s6, 8
1808 ; SI-NEXT: s_cmp_lg_u32 s10, 9
1809 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1810 ; SI-NEXT: s_lshl_b32 s9, s9, 8
1811 ; SI-NEXT: s_cmp_lg_u32 s10, 8
1812 ; SI-NEXT: s_cselect_b32 s6, s6, 5
1813 ; SI-NEXT: s_and_b32 s6, s6, 0xff
1814 ; SI-NEXT: s_or_b32 s6, s6, s9
1815 ; SI-NEXT: s_and_b32 s6, s6, 0xffff
1816 ; SI-NEXT: s_or_b32 s6, s6, s8
1817 ; SI-NEXT: s_lshr_b32 s8, s5, 24
1818 ; SI-NEXT: s_cmp_lg_u32 s10, 7
1819 ; SI-NEXT: s_cselect_b32 s8, s8, 5
1820 ; SI-NEXT: s_lshl_b32 s8, s8, 24
1821 ; SI-NEXT: s_lshr_b32 s9, s5, 16
1822 ; SI-NEXT: s_cmp_lg_u32 s10, 6
1823 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1824 ; SI-NEXT: s_and_b32 s9, s9, 0xff
1825 ; SI-NEXT: s_lshl_b32 s9, s9, 16
1826 ; SI-NEXT: s_or_b32 s8, s8, s9
1827 ; SI-NEXT: s_lshr_b32 s9, s5, 8
1828 ; SI-NEXT: s_cmp_lg_u32 s10, 5
1829 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1830 ; SI-NEXT: s_lshl_b32 s9, s9, 8
1831 ; SI-NEXT: s_cmp_lg_u32 s10, 4
1832 ; SI-NEXT: s_cselect_b32 s5, s5, 5
1833 ; SI-NEXT: s_and_b32 s5, s5, 0xff
1834 ; SI-NEXT: s_or_b32 s5, s5, s9
1835 ; SI-NEXT: s_and_b32 s5, s5, 0xffff
1836 ; SI-NEXT: s_or_b32 s5, s5, s8
1837 ; SI-NEXT: s_lshr_b32 s8, s4, 24
1838 ; SI-NEXT: s_cmp_lg_u32 s10, 3
1839 ; SI-NEXT: s_cselect_b32 s8, s8, 5
1840 ; SI-NEXT: s_lshl_b32 s8, s8, 24
1841 ; SI-NEXT: s_lshr_b32 s9, s4, 16
1842 ; SI-NEXT: s_cmp_lg_u32 s10, 2
1843 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1844 ; SI-NEXT: s_and_b32 s9, s9, 0xff
1845 ; SI-NEXT: s_lshl_b32 s9, s9, 16
1846 ; SI-NEXT: s_or_b32 s8, s8, s9
1847 ; SI-NEXT: s_lshr_b32 s9, s4, 8
1848 ; SI-NEXT: s_cmp_lg_u32 s10, 1
1849 ; SI-NEXT: s_cselect_b32 s9, s9, 5
1850 ; SI-NEXT: s_lshl_b32 s9, s9, 8
1851 ; SI-NEXT: s_cmp_lg_u32 s10, 0
1852 ; SI-NEXT: s_cselect_b32 s4, s4, 5
1853 ; SI-NEXT: s_and_b32 s4, s4, 0xff
1854 ; SI-NEXT: s_or_b32 s4, s4, s9
1855 ; SI-NEXT: s_and_b32 s4, s4, 0xffff
1856 ; SI-NEXT: s_or_b32 s4, s4, s8
1857 ; SI-NEXT: v_mov_b32_e32 v0, s4
1858 ; SI-NEXT: v_mov_b32_e32 v1, s5
1859 ; SI-NEXT: v_mov_b32_e32 v2, s6
1860 ; SI-NEXT: v_mov_b32_e32 v3, s7
1861 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1864 ; VI-LABEL: dynamic_insertelement_v16i8:
1866 ; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x10
1867 ; VI-NEXT: s_load_dword s10, s[8:9], 0x20
1868 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
1869 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
1870 ; VI-NEXT: s_mov_b32 s2, -1
1871 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1872 ; VI-NEXT: s_lshr_b32 s8, s7, 24
1873 ; VI-NEXT: s_cmp_lg_u32 s10, 15
1874 ; VI-NEXT: s_cselect_b32 s8, s8, 5
1875 ; VI-NEXT: s_lshl_b32 s8, s8, 8
1876 ; VI-NEXT: s_lshr_b32 s9, s7, 16
1877 ; VI-NEXT: s_cmp_lg_u32 s10, 14
1878 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1879 ; VI-NEXT: s_and_b32 s9, s9, 0xff
1880 ; VI-NEXT: s_or_b32 s8, s9, s8
1881 ; VI-NEXT: s_lshl_b32 s8, s8, 16
1882 ; VI-NEXT: s_lshr_b32 s9, s7, 8
1883 ; VI-NEXT: s_cmp_lg_u32 s10, 13
1884 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1885 ; VI-NEXT: s_lshl_b32 s9, s9, 8
1886 ; VI-NEXT: s_cmp_lg_u32 s10, 12
1887 ; VI-NEXT: s_cselect_b32 s7, s7, 5
1888 ; VI-NEXT: s_and_b32 s7, s7, 0xff
1889 ; VI-NEXT: s_or_b32 s7, s7, s9
1890 ; VI-NEXT: s_and_b32 s7, s7, 0xffff
1891 ; VI-NEXT: s_or_b32 s7, s7, s8
1892 ; VI-NEXT: s_lshr_b32 s8, s6, 24
1893 ; VI-NEXT: s_cmp_lg_u32 s10, 11
1894 ; VI-NEXT: s_cselect_b32 s8, s8, 5
1895 ; VI-NEXT: s_lshl_b32 s8, s8, 8
1896 ; VI-NEXT: s_lshr_b32 s9, s6, 16
1897 ; VI-NEXT: s_cmp_lg_u32 s10, 10
1898 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1899 ; VI-NEXT: s_and_b32 s9, s9, 0xff
1900 ; VI-NEXT: s_or_b32 s8, s9, s8
1901 ; VI-NEXT: s_lshl_b32 s8, s8, 16
1902 ; VI-NEXT: s_lshr_b32 s9, s6, 8
1903 ; VI-NEXT: s_cmp_lg_u32 s10, 9
1904 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1905 ; VI-NEXT: s_lshl_b32 s9, s9, 8
1906 ; VI-NEXT: s_cmp_lg_u32 s10, 8
1907 ; VI-NEXT: s_cselect_b32 s6, s6, 5
1908 ; VI-NEXT: s_and_b32 s6, s6, 0xff
1909 ; VI-NEXT: s_or_b32 s6, s6, s9
1910 ; VI-NEXT: s_and_b32 s6, s6, 0xffff
1911 ; VI-NEXT: s_or_b32 s6, s6, s8
1912 ; VI-NEXT: s_lshr_b32 s8, s5, 24
1913 ; VI-NEXT: s_cmp_lg_u32 s10, 7
1914 ; VI-NEXT: s_cselect_b32 s8, s8, 5
1915 ; VI-NEXT: s_lshl_b32 s8, s8, 8
1916 ; VI-NEXT: s_lshr_b32 s9, s5, 16
1917 ; VI-NEXT: s_cmp_lg_u32 s10, 6
1918 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1919 ; VI-NEXT: s_and_b32 s9, s9, 0xff
1920 ; VI-NEXT: s_or_b32 s8, s9, s8
1921 ; VI-NEXT: s_lshl_b32 s8, s8, 16
1922 ; VI-NEXT: s_lshr_b32 s9, s5, 8
1923 ; VI-NEXT: s_cmp_lg_u32 s10, 5
1924 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1925 ; VI-NEXT: s_lshl_b32 s9, s9, 8
1926 ; VI-NEXT: s_cmp_lg_u32 s10, 4
1927 ; VI-NEXT: s_cselect_b32 s5, s5, 5
1928 ; VI-NEXT: s_and_b32 s5, s5, 0xff
1929 ; VI-NEXT: s_or_b32 s5, s5, s9
1930 ; VI-NEXT: s_and_b32 s5, s5, 0xffff
1931 ; VI-NEXT: s_or_b32 s5, s5, s8
1932 ; VI-NEXT: s_lshr_b32 s8, s4, 24
1933 ; VI-NEXT: s_cmp_lg_u32 s10, 3
1934 ; VI-NEXT: s_cselect_b32 s8, s8, 5
1935 ; VI-NEXT: s_lshl_b32 s8, s8, 8
1936 ; VI-NEXT: s_lshr_b32 s9, s4, 16
1937 ; VI-NEXT: s_cmp_lg_u32 s10, 2
1938 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1939 ; VI-NEXT: s_and_b32 s9, s9, 0xff
1940 ; VI-NEXT: s_or_b32 s8, s9, s8
1941 ; VI-NEXT: s_lshl_b32 s8, s8, 16
1942 ; VI-NEXT: s_lshr_b32 s9, s4, 8
1943 ; VI-NEXT: s_cmp_lg_u32 s10, 1
1944 ; VI-NEXT: s_cselect_b32 s9, s9, 5
1945 ; VI-NEXT: s_lshl_b32 s9, s9, 8
1946 ; VI-NEXT: s_cmp_lg_u32 s10, 0
1947 ; VI-NEXT: s_cselect_b32 s4, s4, 5
1948 ; VI-NEXT: s_and_b32 s4, s4, 0xff
1949 ; VI-NEXT: s_or_b32 s4, s4, s9
1950 ; VI-NEXT: s_and_b32 s4, s4, 0xffff
1951 ; VI-NEXT: s_or_b32 s4, s4, s8
1952 ; VI-NEXT: v_mov_b32_e32 v0, s4
1953 ; VI-NEXT: v_mov_b32_e32 v1, s5
1954 ; VI-NEXT: v_mov_b32_e32 v2, s6
1955 ; VI-NEXT: v_mov_b32_e32 v3, s7
1956 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1958 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
1959 store <16 x i8> %vecins, ptr addrspace(1) %out, align 16
1960 ret void
1961 }
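; The v16i8 case above is fully scalarized: each byte is compared against the
; index with s_cmp_lg_u32, chosen between its original value and 5 with
; s_cselect_b32, and then re-packed into four dwords with shifts and ORs.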
1963 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that
1964 ; the compiler doesn't crash.
1965 define amdgpu_kernel void @insert_split_bb(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %a, i32 %b) {
1966 ; SI-LABEL: insert_split_bb:
1967 ; SI: ; %bb.0: ; %entry
1968 ; SI-NEXT: s_load_dword s4, s[8:9], 0x4
1969 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1970 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1971 ; SI-NEXT: s_cmp_lg_u32 s4, 0
1972 ; SI-NEXT: s_cbranch_scc0 .LBB42_4
1973 ; SI-NEXT: ; %bb.1: ; %else
1974 ; SI-NEXT: s_load_dword s5, s[2:3], 0x1
1975 ; SI-NEXT: s_mov_b64 s[6:7], 0
1976 ; SI-NEXT: s_andn2_b64 vcc, exec, s[6:7]
1977 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1978 ; SI-NEXT: s_mov_b64 vcc, vcc
1979 ; SI-NEXT: s_cbranch_vccnz .LBB42_3
1980 ; SI-NEXT: .LBB42_2: ; %if
1981 ; SI-NEXT: s_load_dword s5, s[2:3], 0x0
1982 ; SI-NEXT: .LBB42_3: ; %endif
1983 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1984 ; SI-NEXT: v_mov_b32_e32 v0, s4
1985 ; SI-NEXT: s_mov_b32 s3, 0x100f000
1986 ; SI-NEXT: s_mov_b32 s2, -1
1987 ; SI-NEXT: v_mov_b32_e32 v1, s5
1988 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1990 ; SI-NEXT: .LBB42_4:
1991 ; SI-NEXT: s_branch .LBB42_2
1993 ; VI-LABEL: insert_split_bb:
1994 ; VI: ; %bb.0: ; %entry
1995 ; VI-NEXT: s_load_dword s4, s[8:9], 0x10
1996 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
1997 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1998 ; VI-NEXT: s_cmp_lg_u32 s4, 0
1999 ; VI-NEXT: s_cbranch_scc0 .LBB42_4
2000 ; VI-NEXT: ; %bb.1: ; %else
2001 ; VI-NEXT: s_load_dword s5, s[2:3], 0x4
2002 ; VI-NEXT: s_cbranch_execnz .LBB42_3
2003 ; VI-NEXT: .LBB42_2: ; %if
2004 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2005 ; VI-NEXT: s_load_dword s5, s[2:3], 0x0
2006 ; VI-NEXT: .LBB42_3: ; %endif
2007 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2008 ; VI-NEXT: v_mov_b32_e32 v0, s4
2009 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
2010 ; VI-NEXT: s_mov_b32 s2, -1
2011 ; VI-NEXT: v_mov_b32_e32 v1, s5
2012 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2014 ; VI-NEXT: .LBB42_4:
2015 ; VI-NEXT: s_branch .LBB42_2
2016 entry:
2017 %0 = insertelement <2 x i32> undef, i32 %a, i32 0
2018 %1 = icmp eq i32 %a, 0
2019 br i1 %1, label %if, label %else
2021 if:
2022 %2 = load i32, ptr addrspace(1) %in
2023 %3 = insertelement <2 x i32> %0, i32 %2, i32 1
2024 br label %endif
2026 else:
2027 %4 = getelementptr i32, ptr addrspace(1) %in, i32 1
2028 %5 = load i32, ptr addrspace(1) %4
2029 %6 = insertelement <2 x i32> %0, i32 %5, i32 1
2030 br label %endif
2032 endif:
2033 %7 = phi <2 x i32> [%3, %if], [%6, %else]
2034 store <2 x i32> %7, ptr addrspace(1) %out
2035 ret void
2036 }
2038 define amdgpu_kernel void @dynamic_insertelement_v2f64(ptr addrspace(1) %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
2039 ; SI-LABEL: dynamic_insertelement_v2f64:
2041 ; SI-NEXT: s_load_dword s10, s[8:9], 0x18
2042 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0xc
2043 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2044 ; SI-NEXT: s_mov_b32 s7, 0x100f000
2045 ; SI-NEXT: s_mov_b32 s6, -1
2046 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2047 ; SI-NEXT: s_cmp_eq_u32 s10, 1
2048 ; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3
2049 ; SI-NEXT: s_cselect_b32 s2, 0, s2
2050 ; SI-NEXT: s_cmp_eq_u32 s10, 0
2051 ; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1
2052 ; SI-NEXT: s_cselect_b32 s0, 0, s0
2053 ; SI-NEXT: v_mov_b32_e32 v0, s0
2054 ; SI-NEXT: v_mov_b32_e32 v1, s1
2055 ; SI-NEXT: v_mov_b32_e32 v2, s2
2056 ; SI-NEXT: v_mov_b32_e32 v3, s3
2057 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2060 ; VI-LABEL: dynamic_insertelement_v2f64:
2062 ; VI-NEXT: s_load_dword s10, s[8:9], 0x60
2063 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x30
2064 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2065 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
2066 ; VI-NEXT: s_mov_b32 s6, -1
2067 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2068 ; VI-NEXT: s_cmp_eq_u32 s10, 1
2069 ; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3
2070 ; VI-NEXT: s_cselect_b32 s2, 0, s2
2071 ; VI-NEXT: s_cmp_eq_u32 s10, 0
2072 ; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1
2073 ; VI-NEXT: s_cselect_b32 s0, 0, s0
2074 ; VI-NEXT: v_mov_b32_e32 v0, s0
2075 ; VI-NEXT: v_mov_b32_e32 v1, s1
2076 ; VI-NEXT: v_mov_b32_e32 v2, s2
2077 ; VI-NEXT: v_mov_b32_e32 v3, s3
2078 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2080 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
2081 store <2 x double> %vecins, ptr addrspace(1) %out, align 16
2082 ret void
2083 }
2085 define amdgpu_kernel void @dynamic_insertelement_v2i64(ptr addrspace(1) %out, <2 x i64> %a, i32 %b) nounwind {
2086 ; SI-LABEL: dynamic_insertelement_v2i64:
2088 ; SI-NEXT: s_load_dword s10, s[8:9], 0x8
2089 ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
2090 ; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2091 ; SI-NEXT: s_mov_b32 s7, 0x100f000
2092 ; SI-NEXT: s_mov_b32 s6, -1
2093 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2094 ; SI-NEXT: s_cmp_eq_u32 s10, 1
2095 ; SI-NEXT: s_cselect_b32 s3, 0, s3
2096 ; SI-NEXT: s_cselect_b32 s2, 5, s2
2097 ; SI-NEXT: s_cmp_eq_u32 s10, 0
2098 ; SI-NEXT: s_cselect_b32 s1, 0, s1
2099 ; SI-NEXT: s_cselect_b32 s0, 5, s0
2100 ; SI-NEXT: v_mov_b32_e32 v0, s0
2101 ; SI-NEXT: v_mov_b32_e32 v1, s1
2102 ; SI-NEXT: v_mov_b32_e32 v2, s2
2103 ; SI-NEXT: v_mov_b32_e32 v3, s3
2104 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2107 ; VI-LABEL: dynamic_insertelement_v2i64:
2109 ; VI-NEXT: s_load_dword s10, s[8:9], 0x20
2110 ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
2111 ; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2112 ; VI-NEXT: s_mov_b32 s7, 0x1100f000
2113 ; VI-NEXT: s_mov_b32 s6, -1
2114 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2115 ; VI-NEXT: s_cmp_eq_u32 s10, 1
2116 ; VI-NEXT: s_cselect_b32 s3, 0, s3
2117 ; VI-NEXT: s_cselect_b32 s2, 5, s2
2118 ; VI-NEXT: s_cmp_eq_u32 s10, 0
2119 ; VI-NEXT: s_cselect_b32 s1, 0, s1
2120 ; VI-NEXT: s_cselect_b32 s0, 5, s0
2121 ; VI-NEXT: v_mov_b32_e32 v0, s0
2122 ; VI-NEXT: v_mov_b32_e32 v1, s1
2123 ; VI-NEXT: v_mov_b32_e32 v2, s2
2124 ; VI-NEXT: v_mov_b32_e32 v3, s3
2125 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2127 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
2128 store <2 x i64> %vecins, ptr addrspace(1) %out, align 8
2129 ret void
2130 }
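; For the 64-bit element tests above and below, the index is compared against
; each element position with s_cmp_eq_u32 and each 32-bit half is chosen with
; s_cselect_b32 between the original half and the corresponding half of the
; inserted constant (5 -> low 5 / high 0; 8.0 -> low 0 / high 0x40200000).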
2132 define amdgpu_kernel void @dynamic_insertelement_v3i64(ptr addrspace(1) %out, <3 x i64> %a, i32 %b) nounwind {
2133 ; SI-LABEL: dynamic_insertelement_v3i64:
2135 ; SI-NEXT: s_load_dword s10, s[8:9], 0x10
2136 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2137 ; SI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x8
2138 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0xc
2139 ; SI-NEXT: s_mov_b32 s3, 0x100f000
2140 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2141 ; SI-NEXT: s_cmp_eq_u32 s10, 1
2142 ; SI-NEXT: s_mov_b32 s2, -1
2143 ; SI-NEXT: s_cselect_b32 s7, 0, s7
2144 ; SI-NEXT: s_cselect_b32 s6, 5, s6
2145 ; SI-NEXT: s_cmp_eq_u32 s10, 0
2146 ; SI-NEXT: s_cselect_b32 s5, 0, s5
2147 ; SI-NEXT: s_cselect_b32 s4, 5, s4
2148 ; SI-NEXT: s_cmp_eq_u32 s10, 2
2149 ; SI-NEXT: s_cselect_b32 s9, 0, s9
2150 ; SI-NEXT: s_cselect_b32 s8, 5, s8
2151 ; SI-NEXT: v_mov_b32_e32 v0, s8
2152 ; SI-NEXT: v_mov_b32_e32 v1, s9
2153 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
2154 ; SI-NEXT: v_mov_b32_e32 v0, s4
2155 ; SI-NEXT: v_mov_b32_e32 v1, s5
2156 ; SI-NEXT: v_mov_b32_e32 v2, s6
2157 ; SI-NEXT: v_mov_b32_e32 v3, s7
2158 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2161 ; VI-LABEL: dynamic_insertelement_v3i64:
2163 ; VI-NEXT: s_load_dword s10, s[8:9], 0x40
2164 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2165 ; VI-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x20
2166 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x30
2167 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
2168 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2169 ; VI-NEXT: s_cmp_eq_u32 s10, 1
2170 ; VI-NEXT: s_mov_b32 s2, -1
2171 ; VI-NEXT: s_cselect_b32 s7, 0, s7
2172 ; VI-NEXT: s_cselect_b32 s6, 5, s6
2173 ; VI-NEXT: s_cmp_eq_u32 s10, 0
2174 ; VI-NEXT: s_cselect_b32 s5, 0, s5
2175 ; VI-NEXT: s_cselect_b32 s4, 5, s4
2176 ; VI-NEXT: s_cmp_eq_u32 s10, 2
2177 ; VI-NEXT: s_cselect_b32 s9, 0, s9
2178 ; VI-NEXT: s_cselect_b32 s8, 5, s8
2179 ; VI-NEXT: v_mov_b32_e32 v0, s8
2180 ; VI-NEXT: v_mov_b32_e32 v1, s9
2181 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
2182 ; VI-NEXT: v_mov_b32_e32 v0, s4
2183 ; VI-NEXT: v_mov_b32_e32 v1, s5
2184 ; VI-NEXT: v_mov_b32_e32 v2, s6
2185 ; VI-NEXT: v_mov_b32_e32 v3, s7
2186 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2188 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
2189 store <3 x i64> %vecins, ptr addrspace(1) %out, align 32
2190 ret void
2191 }
2193 define amdgpu_kernel void @dynamic_insertelement_v4f64(ptr addrspace(1) %out, <4 x double> %a, i32 %b) nounwind {
2194 ; SI-LABEL: dynamic_insertelement_v4f64:
2196 ; SI-NEXT: s_load_dword s12, s[8:9], 0x10
2197 ; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
2198 ; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
2199 ; SI-NEXT: s_mov_b32 s11, 0x100f000
2200 ; SI-NEXT: s_mov_b32 s10, -1
2201 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2202 ; SI-NEXT: s_cmp_eq_u32 s12, 1
2203 ; SI-NEXT: s_cselect_b32 s3, 0x40200000, s3
2204 ; SI-NEXT: s_cselect_b32 s2, 0, s2
2205 ; SI-NEXT: s_cmp_eq_u32 s12, 0
2206 ; SI-NEXT: s_cselect_b32 s1, 0x40200000, s1
2207 ; SI-NEXT: s_cselect_b32 s0, 0, s0
2208 ; SI-NEXT: s_cmp_eq_u32 s12, 3
2209 ; SI-NEXT: s_cselect_b32 s7, 0x40200000, s7
2210 ; SI-NEXT: s_cselect_b32 s6, 0, s6
2211 ; SI-NEXT: s_cmp_eq_u32 s12, 2
2212 ; SI-NEXT: s_cselect_b32 s5, 0x40200000, s5
2213 ; SI-NEXT: s_cselect_b32 s4, 0, s4
2214 ; SI-NEXT: v_mov_b32_e32 v0, s4
2215 ; SI-NEXT: v_mov_b32_e32 v1, s5
2216 ; SI-NEXT: v_mov_b32_e32 v2, s6
2217 ; SI-NEXT: v_mov_b32_e32 v3, s7
2218 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
2220 ; SI-NEXT: v_mov_b32_e32 v0, s0
2221 ; SI-NEXT: v_mov_b32_e32 v1, s1
2222 ; SI-NEXT: v_mov_b32_e32 v2, s2
2223 ; SI-NEXT: v_mov_b32_e32 v3, s3
2224 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2227 ; VI-LABEL: dynamic_insertelement_v4f64:
2229 ; VI-NEXT: s_load_dword s12, s[8:9], 0x40
2230 ; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
2231 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
2232 ; VI-NEXT: s_mov_b32 s11, 0x1100f000
2233 ; VI-NEXT: s_mov_b32 s10, -1
2234 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2235 ; VI-NEXT: s_cmp_eq_u32 s12, 1
2236 ; VI-NEXT: s_cselect_b32 s3, 0x40200000, s3
2237 ; VI-NEXT: s_cselect_b32 s2, 0, s2
2238 ; VI-NEXT: s_cmp_eq_u32 s12, 0
2239 ; VI-NEXT: s_cselect_b32 s1, 0x40200000, s1
2240 ; VI-NEXT: s_cselect_b32 s0, 0, s0
2241 ; VI-NEXT: s_cmp_eq_u32 s12, 3
2242 ; VI-NEXT: s_cselect_b32 s7, 0x40200000, s7
2243 ; VI-NEXT: s_cselect_b32 s6, 0, s6
2244 ; VI-NEXT: s_cmp_eq_u32 s12, 2
2245 ; VI-NEXT: s_cselect_b32 s5, 0x40200000, s5
2246 ; VI-NEXT: s_cselect_b32 s4, 0, s4
2247 ; VI-NEXT: v_mov_b32_e32 v0, s4
2248 ; VI-NEXT: v_mov_b32_e32 v1, s5
2249 ; VI-NEXT: v_mov_b32_e32 v2, s6
2250 ; VI-NEXT: v_mov_b32_e32 v3, s7
2251 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
2253 ; VI-NEXT: v_mov_b32_e32 v0, s0
2254 ; VI-NEXT: v_mov_b32_e32 v1, s1
2255 ; VI-NEXT: v_mov_b32_e32 v2, s2
2256 ; VI-NEXT: v_mov_b32_e32 v3, s3
2257 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2259 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
2260 store <4 x double> %vecins, ptr addrspace(1) %out, align 16
2261 ret void
2262 }
2264 define amdgpu_kernel void @dynamic_insertelement_v8f64(ptr addrspace(1) %out, <8 x double> %a, i32 %b) #0 {
2265 ; SI-LABEL: dynamic_insertelement_v8f64:
2267 ; SI-NEXT: s_load_dword s4, s[8:9], 0x20
2268 ; SI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x10
2269 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2270 ; SI-NEXT: v_mov_b32_e32 v16, 0x40200000
2271 ; SI-NEXT: s_mov_b32 s3, 0x100f000
2272 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2273 ; SI-NEXT: s_lshl_b32 s4, s4, 1
2274 ; SI-NEXT: v_mov_b32_e32 v0, s12
2275 ; SI-NEXT: v_mov_b32_e32 v1, s13
2276 ; SI-NEXT: v_mov_b32_e32 v2, s14
2277 ; SI-NEXT: v_mov_b32_e32 v3, s15
2278 ; SI-NEXT: v_mov_b32_e32 v4, s16
2279 ; SI-NEXT: v_mov_b32_e32 v5, s17
2280 ; SI-NEXT: v_mov_b32_e32 v6, s18
2281 ; SI-NEXT: v_mov_b32_e32 v7, s19
2282 ; SI-NEXT: v_mov_b32_e32 v8, s20
2283 ; SI-NEXT: v_mov_b32_e32 v9, s21
2284 ; SI-NEXT: v_mov_b32_e32 v10, s22
2285 ; SI-NEXT: v_mov_b32_e32 v11, s23
2286 ; SI-NEXT: v_mov_b32_e32 v12, s24
2287 ; SI-NEXT: v_mov_b32_e32 v13, s25
2288 ; SI-NEXT: v_mov_b32_e32 v14, s26
2289 ; SI-NEXT: v_mov_b32_e32 v15, s27
2290 ; SI-NEXT: s_mov_b32 m0, s4
2291 ; SI-NEXT: v_movreld_b32_e32 v0, 0
2292 ; SI-NEXT: s_mov_b32 s2, -1
2293 ; SI-NEXT: v_movreld_b32_e32 v1, v16
2294 ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
2295 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
2296 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2297 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2300 ; VI-LABEL: dynamic_insertelement_v8f64:
2302 ; VI-NEXT: s_load_dword s4, s[8:9], 0x80
2303 ; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x40
2304 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
2305 ; VI-NEXT: v_mov_b32_e32 v16, 0x40200000
2306 ; VI-NEXT: s_mov_b32 s3, 0x1100f000
2307 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2308 ; VI-NEXT: s_lshl_b32 s4, s4, 1
2309 ; VI-NEXT: v_mov_b32_e32 v0, s12
2310 ; VI-NEXT: v_mov_b32_e32 v1, s13
2311 ; VI-NEXT: v_mov_b32_e32 v2, s14
2312 ; VI-NEXT: v_mov_b32_e32 v3, s15
2313 ; VI-NEXT: v_mov_b32_e32 v4, s16
2314 ; VI-NEXT: v_mov_b32_e32 v5, s17
2315 ; VI-NEXT: v_mov_b32_e32 v6, s18
2316 ; VI-NEXT: v_mov_b32_e32 v7, s19
2317 ; VI-NEXT: v_mov_b32_e32 v8, s20
2318 ; VI-NEXT: v_mov_b32_e32 v9, s21
2319 ; VI-NEXT: v_mov_b32_e32 v10, s22
2320 ; VI-NEXT: v_mov_b32_e32 v11, s23
2321 ; VI-NEXT: v_mov_b32_e32 v12, s24
2322 ; VI-NEXT: v_mov_b32_e32 v13, s25
2323 ; VI-NEXT: v_mov_b32_e32 v14, s26
2324 ; VI-NEXT: v_mov_b32_e32 v15, s27
2325 ; VI-NEXT: s_mov_b32 m0, s4
2326 ; VI-NEXT: v_movreld_b32_e32 v0, 0
2327 ; VI-NEXT: s_mov_b32 s2, -1
2328 ; VI-NEXT: v_movreld_b32_e32 v1, v16
2329 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
2330 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
2331 ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
2332 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2334 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
2335 store <8 x double> %vecins, ptr addrspace(1) %out, align 16
2336 ret void
2337 }
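; For v8f64 above, the element index is doubled to a dword index
; (s_lshl_b32 ... 1) and placed in m0; two v_movreld_b32 writes then store the
; low dword (0) and high dword (0x40200000) of 8.0 into the selected element's
; pair of VGPRs.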
2339 declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
2341 attributes #0 = { nounwind }
2342 attributes #1 = { nounwind readnone }